From abecbec9c7887ef045fdfcad61593d28d9dcca5b Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 24 Oct 2023 13:19:10 +0200 Subject: [PATCH] stencil tensor ok on m1 arm --- benchmarks/stencil_tensor.py | 8 ++++---- lib/cgpt/lib/stencil.cc | 4 ++-- lib/cgpt/lib/stencil/tensor.h | 20 ++++++++++---------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/benchmarks/stencil_tensor.py b/benchmarks/stencil_tensor.py index d80b672b..e1a1cef4 100755 --- a/benchmarks/stencil_tensor.py +++ b/benchmarks/stencil_tensor.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 import gpt as g #grid = g.grid([64,64,64,64], g.double) -grid = g.grid([32,32,32,32], g.double) +#grid = g.grid([32,32,32,32], g.double) #grid = g.grid([32,16,16,16], g.double) -#grid = g.grid([16,16,16,32], g.double) +grid = g.grid([16,16,16,32], g.double) #grid = g.grid([2*4,4*3,3*4,3*3*4], g.double) m1 = g.mcolor(grid) m2 = g.mcolor(grid) @@ -30,7 +30,7 @@ for osites_per_instruction in [1,4,8,16,32,64]: - for osites_per_cache_block in [2048*4, 4096*4, 8192*4]: + for osites_per_cache_block in [2048*4, 4096*4, 8192*4, grid.gsites]: ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block) g.message(osites_per_instruction, osites_per_cache_block) @@ -89,7 +89,7 @@ # # D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]]) for osites_per_instruction in [1,4,8,16,32,64]: - for osites_per_cache_block in [2048*4, 4096*4, 8192*4]: + for osites_per_cache_block in [2048*4, 4096*4, 8192*4, grid.gsites]: ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block) g.message(osites_per_instruction, osites_per_cache_block) diff --git a/lib/cgpt/lib/stencil.cc b/lib/cgpt/lib/stencil.cc index d58646d8..73766636 100644 --- a/lib/cgpt/lib/stencil.cc +++ b/lib/cgpt/lib/stencil.cc @@ -139,8 +139,8 @@ EXPORT(stencil_tensor_execute,{ cgpt_stencil_tensor_execute_params_t params = { - osites_per_instruction, - osites_per_cache_block + (int)osites_per_instruction, + (int)osites_per_cache_block }; stencil->execute(__fields, params); diff --git a/lib/cgpt/lib/stencil/tensor.h b/lib/cgpt/lib/stencil/tensor.h index c0d8a316..5d0e3b06 100644 --- a/lib/cgpt/lib/stencil/tensor.h +++ b/lib/cgpt/lib/stencil/tensor.h @@ -144,13 +144,7 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { delete sm; } } - /* - TODO: - - stencils should return different options for current hardware for performance (including max _npb) - - */ template void block_execute(const std::vector& fields, int osites_per_cache_block) { @@ -163,7 +157,6 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { #endif typedef typename T::scalar_type coeff_t; - VECTOR_ELEMENT_VIEW_OPEN(element_t, fields, fields_v, AcceleratorWrite); int n_code = code.size(); @@ -225,8 +218,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { uint64_t ocache_blocks = (osites + osites_per_cache_block - 1) / osites_per_cache_block; for (uint64_t ocache_block = 0;ocache_block < ocache_blocks;ocache_block++) { - uint64_t osites0 = min(ocache_block * osites_per_cache_block, osites); - uint64_t osites1 = min(osites0 + osites_per_cache_block, osites); + uint64_t osites0 = std::min(ocache_block * osites_per_cache_block, osites); + uint64_t osites1 = std::min(osites0 + osites_per_cache_block, osites); uint64_t osites_in_cache_block = osites1 - osites0; @@ -237,11 +230,16 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { uint64_t osites_extra = osites_in_cache_block - osites_extra_start; //std::cout << GridLogMessage<< "Group " << osites0 << " to " << osites1 << " has oblocks " << oblocks << " and extra " << osites_extra << " from " << osites_extra_start << " compare to " << osites << std::endl; - +#ifdef GRID_HAS_ACCELERATOR int coffset = 0; for (auto & segment : segments) { int _npb = segment.number_of_blocks; int _npbs = segment.block_size; +#else + #define _npb 1 + #define _npbs n_code + #define coffset 0 +#endif accelerator_forNB(ss_block, oblocks * _npb, T::Nsimd(), { @@ -290,8 +288,10 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { }); } +#ifdef GRID_HAS_ACCELERATOR coffset += _npb * _npbs; } +#endif } accelerator_barrier();