From abecbec9c7887ef045fdfcad61593d28d9dcca5b Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Tue, 24 Oct 2023 13:19:10 +0200
Subject: [PATCH] stencil tensor: make it build and run on M1 ARM

---
 benchmarks/stencil_tensor.py  |  8 ++++----
 lib/cgpt/lib/stencil.cc       |  4 ++--
 lib/cgpt/lib/stencil/tensor.h | 20 ++++++++++----------
 3 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/benchmarks/stencil_tensor.py b/benchmarks/stencil_tensor.py
index d80b672b..e1a1cef4 100755
--- a/benchmarks/stencil_tensor.py
+++ b/benchmarks/stencil_tensor.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 import gpt as g
 #grid = g.grid([64,64,64,64], g.double)
-grid = g.grid([32,32,32,32], g.double)
+#grid = g.grid([32,32,32,32], g.double)
 #grid = g.grid([32,16,16,16], g.double)
-#grid = g.grid([16,16,16,32], g.double)
+grid = g.grid([16,16,16,32], g.double)
 #grid = g.grid([2*4,4*3,3*4,3*3*4], g.double)
 m1 = g.mcolor(grid)
 m2 = g.mcolor(grid)
@@ -30,7 +30,7 @@
 
 
 for osites_per_instruction in [1,4,8,16,32,64]:
-    for osites_per_cache_block in [2048*4, 4096*4, 8192*4]:
+    for osites_per_cache_block in [2048*4, 4096*4, 8192*4, grid.gsites]:
         ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
 
         g.message(osites_per_instruction, osites_per_cache_block)
@@ -89,7 +89,7 @@
 #
 #            D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])
 for osites_per_instruction in [1,4,8,16,32,64]:
-    for osites_per_cache_block in [2048*4, 4096*4, 8192*4]:
+    for osites_per_cache_block in [2048*4, 4096*4, 8192*4, grid.gsites]:
         ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
 
         g.message(osites_per_instruction, osites_per_cache_block)
diff --git a/lib/cgpt/lib/stencil.cc b/lib/cgpt/lib/stencil.cc
index d58646d8..73766636 100644
--- a/lib/cgpt/lib/stencil.cc
+++ b/lib/cgpt/lib/stencil.cc
@@ -139,8 +139,8 @@ EXPORT(stencil_tensor_execute,{
 
     cgpt_stencil_tensor_execute_params_t params =
       {
-       osites_per_instruction,
-       osites_per_cache_block
+	(int)osites_per_instruction,
+	(int)osites_per_cache_block
       };
     stencil->execute(__fields, params);
 
diff --git a/lib/cgpt/lib/stencil/tensor.h b/lib/cgpt/lib/stencil/tensor.h
index c0d8a316..5d0e3b06 100644
--- a/lib/cgpt/lib/stencil/tensor.h
+++ b/lib/cgpt/lib/stencil/tensor.h
@@ -144,13 +144,7 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
       delete sm;
     }
   }
-  /*
 
-    TODO:
-    
-    stencils should return different options for current hardware for performance (including max _npb)
-
-  */
   template<int osites_per_instruction>
   void block_execute(const std::vector<cgpt_Lattice_base*>& fields, int osites_per_cache_block) {
 
@@ -163,7 +157,6 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
 #endif
     typedef typename T::scalar_type coeff_t;
 
-      
     VECTOR_ELEMENT_VIEW_OPEN(element_t, fields, fields_v, AcceleratorWrite);
 
     int n_code = code.size();
@@ -225,8 +218,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
       
       uint64_t ocache_blocks = (osites + osites_per_cache_block - 1) / osites_per_cache_block;
       for (uint64_t ocache_block = 0;ocache_block < ocache_blocks;ocache_block++) {
-	uint64_t osites0 = min(ocache_block * osites_per_cache_block, osites);
-	uint64_t osites1 = min(osites0 + osites_per_cache_block, osites);
+	uint64_t osites0 = std::min(ocache_block * osites_per_cache_block, osites);
+	uint64_t osites1 = std::min(osites0 + osites_per_cache_block, osites);
 
 	uint64_t osites_in_cache_block = osites1 - osites0;
 	
@@ -237,11 +230,16 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
 	uint64_t osites_extra = osites_in_cache_block - osites_extra_start;
 
 	//std::cout << GridLogMessage<< "Group " << osites0 << " to " << osites1 << " has oblocks " << oblocks << " and extra " << osites_extra << " from " << osites_extra_start << " compare to " << osites << std::endl;
-
+#ifdef GRID_HAS_ACCELERATOR
 	int coffset = 0;
 	for (auto & segment : segments) {
 	  int _npb = segment.number_of_blocks;
 	  int _npbs = segment.block_size;
+#else
+	  #define _npb 1
+	  #define _npbs n_code
+	  #define coffset 0
+#endif
 	  
 	  accelerator_forNB(ss_block, oblocks * _npb, T::Nsimd(), {
 	      
@@ -290,8 +288,10 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
 	      });
 	  }
 
+#ifdef GRID_HAS_ACCELERATOR
 	  coffset += _npb * _npbs;
 	}
+#endif
       }
       
       accelerator_barrier();