Skip to content

Commit

Permalink
stencil tensor ok on m1 arm
Browse files Browse the repository at this point in the history
  • Loading branch information
lehner committed Oct 24, 2023
1 parent 830cc51 commit abecbec
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 16 deletions.
8 changes: 4 additions & 4 deletions benchmarks/stencil_tensor.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/usr/bin/env python3
import gpt as g
#grid = g.grid([64,64,64,64], g.double)
grid = g.grid([32,32,32,32], g.double)
#grid = g.grid([32,32,32,32], g.double)
#grid = g.grid([32,16,16,16], g.double)
#grid = g.grid([16,16,16,32], g.double)
grid = g.grid([16,16,16,32], g.double)
#grid = g.grid([2*4,4*3,3*4,3*3*4], g.double)
m1 = g.mcolor(grid)
m2 = g.mcolor(grid)
Expand All @@ -30,7 +30,7 @@


for osites_per_instruction in [1,4,8,16,32,64]:
for osites_per_cache_block in [2048*4, 4096*4, 8192*4]:
for osites_per_cache_block in [2048*4, 4096*4, 8192*4, grid.gsites]:
ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)

g.message(osites_per_instruction, osites_per_cache_block)
Expand Down Expand Up @@ -89,7 +89,7 @@
#
# D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])
for osites_per_instruction in [1,4,8,16,32,64]:
for osites_per_cache_block in [2048*4, 4096*4, 8192*4]:
for osites_per_cache_block in [2048*4, 4096*4, 8192*4, grid.gsites]:
ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)

g.message(osites_per_instruction, osites_per_cache_block)
Expand Down
4 changes: 2 additions & 2 deletions lib/cgpt/lib/stencil.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ EXPORT(stencil_tensor_execute,{

cgpt_stencil_tensor_execute_params_t params =
{
osites_per_instruction,
osites_per_cache_block
(int)osites_per_instruction,
(int)osites_per_cache_block
};
stencil->execute(__fields, params);

Expand Down
20 changes: 10 additions & 10 deletions lib/cgpt/lib/stencil/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,13 +144,7 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
delete sm;
}
}
/*

TODO:
stencils should report the performance-tuning options available on the current hardware (including the maximum _npb)
*/
template<int osites_per_instruction>
void block_execute(const std::vector<cgpt_Lattice_base*>& fields, int osites_per_cache_block) {

Expand All @@ -163,7 +157,6 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
#endif
typedef typename T::scalar_type coeff_t;


VECTOR_ELEMENT_VIEW_OPEN(element_t, fields, fields_v, AcceleratorWrite);

int n_code = code.size();
Expand Down Expand Up @@ -225,8 +218,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {

uint64_t ocache_blocks = (osites + osites_per_cache_block - 1) / osites_per_cache_block;
for (uint64_t ocache_block = 0;ocache_block < ocache_blocks;ocache_block++) {
uint64_t osites0 = min(ocache_block * osites_per_cache_block, osites);
uint64_t osites1 = min(osites0 + osites_per_cache_block, osites);
uint64_t osites0 = std::min(ocache_block * osites_per_cache_block, osites);
uint64_t osites1 = std::min(osites0 + osites_per_cache_block, osites);

uint64_t osites_in_cache_block = osites1 - osites0;

Expand All @@ -237,11 +230,16 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
uint64_t osites_extra = osites_in_cache_block - osites_extra_start;

//std::cout << GridLogMessage<< "Group " << osites0 << " to " << osites1 << " has oblocks " << oblocks << " and extra " << osites_extra << " from " << osites_extra_start << " compare to " << osites << std::endl;

#ifdef GRID_HAS_ACCELERATOR
int coffset = 0;
for (auto & segment : segments) {
int _npb = segment.number_of_blocks;
int _npbs = segment.block_size;
#else
#define _npb 1
#define _npbs n_code
#define coffset 0
#endif

accelerator_forNB(ss_block, oblocks * _npb, T::Nsimd(), {

Expand Down Expand Up @@ -290,8 +288,10 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
});
}

#ifdef GRID_HAS_ACCELERATOR
coffset += _npb * _npbs;
}
#endif
}

accelerator_barrier();
Expand Down

0 comments on commit abecbec

Please sign in to comment.