matrix_vector progress

lehner · Oct 8, 2023 · 0aa1bd7 · 0aa1bd7
1 parent 36d7034
commit 0aa1bd7
Show file tree

Hide file tree

Showing 9 changed files with 132 additions and 29 deletions.
diff --git a/benchmarks/stencil.py b/benchmarks/stencil.py
@@ -45,8 +45,6 @@
     st = g.stencil.matrix(
         U[0],
         [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
-        [0],
-        [1, 2, 3, 4],
         code,
     )
     # test plaquette
@@ -82,3 +80,66 @@
     Total performance           : {GFlopsPerSec:.2f} GFlops/s
     Effective memory bandwidth  : {GBPerSec:.2f} GB/s"""
     )
+
+    # covariant laplacian stencil
+    src = g.vspincolor(grid)
+    rng.cnormal(src)
+    evec = [(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]
+    nevec = [tuple([-x for x in y]) for y in evec]
+    UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
+    _U = [0,1,2,3]
+    _UdagShift = [4,5,6,7]
+    _X = 0
+    _Xp = [1,2,3,4]
+    _Xm = [5,6,7,8]
+    code = [(0,1,_X,-1,-8.0,[])]
+    for mu in range(4):
+        code.append((0,1,_Xp[mu], 0, 1.0,[(_U[mu], _X, 0)]))
+        code.append((0,1,_Xm[mu], 0, 1.0,[(_UdagShift[mu], _X, 0)]))
+        # can switch last line to next one
+        #code.append((0,1,_Xm[mu], 0, 1.0,[(_U[mu], _Xm[mu], 1)]))
+    st = g.stencil.matrix_vector(
+        U[0],
+        src,
+        [(0, 0, 0, 0)] + evec + nevec,
+        code
+    )
+    # test laplace
+    dst = g.lattice(src)
+    st(U + UdagShift, [dst,src])
+
+    lap = g.create.smear.laplace(g.covariant.shift(U, boundary_phases=[1]*4), [0,1,2,3])
+    dst2 = lap(src)
+    eps2 = g.norm2(dst - dst2) / g.norm2(dst)
+    assert eps2 ** 0.5 < precision.eps * 100
+
+    # Flops
+    gauge_otype = U[0].otype
+    Nc = gauge_otype.shape[0]
+    Ns = 4
+    flops_per_matrix_vector_multiply = Ns * (Nc**2 * 6 + (Nc - 1) * Nc * 2)
+    flops_per_vector_add = Ns * Nc * 2
+    flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
+    flops = flops_per_site * src.grid.gsites * N
+    nbytes = (8 * Nc * Nc * 2 + Nc * Ns * 2) * precision.nbytes * src.grid.gsites * N
+
+    # Warmup
+    for n in range(5):
+        st(U + UdagShift, [dst,src])
+
+    # Time
+    t0 = g.time()
+    for n in range(N):
+        st(U + UdagShift, [dst,src])
+    t1 = g.time()
+
+    # Report
+    GFlopsPerSec = flops / (t1 - t0) / 1e9
+    GBPerSec = nbytes / (t1 - t0) / 1e9
+    g.message(
+        f"""
+{N} applications of laplace stencil
+    Time to complete            : {t1-t0:.2f} s
+    Total performance           : {GFlopsPerSec:.2f} GFlops/s
+    Effective memory bandwidth  : {GBPerSec:.2f} GB/s"""
+    )
diff --git a/lib/cgpt/lib/stencil/matrix_vector.h b/lib/cgpt/lib/stencil/matrix_vector.h
@@ -275,9 +275,6 @@ cgpt_stencil_matrix_vector_create(cgpt_Lattice_base* __matrix, GridBase* grid, P
   std::vector<cgpt_stencil_matrix_vector_code_t> code;
   cgpt_convert(_code,code);
 
-  // for now only local is implemented
-  ASSERT(local);
-
   // test __matrix type against matrix in spin space,
   // color space spin+color space, and singlet space
   if (is_compatible<typename matrixFromTypeAtLevel<V,2>::type>(__matrix)) {

diff --git a/lib/gpt/core/local_stencil/matrix.py b/lib/gpt/core/local_stencil/matrix.py
@@ -42,3 +42,6 @@ def __call__(self, *fields):
 
     def __del__(self):
         cgpt.stencil_matrix_delete(self.obj)
+
+    def data_access_hints(self, *hints):
+        pass
diff --git a/lib/gpt/core/local_stencil/matrix_vector.py b/lib/gpt/core/local_stencil/matrix_vector.py
@@ -34,7 +34,7 @@ def parse(c):
 
 
 class matrix_vector:
-    def __init__(self, lat_matrix, lat_vector, points, code, code_parallel_block_size=None):
+    def __init__(self, lat_matrix, lat_vector, points, code, code_parallel_block_size=None, local=1):
         self.points = points
         self.code = [parse(c) for c in code]
         self.code_parallel_block_size = code_parallel_block_size
@@ -47,11 +47,14 @@ def __init__(self, lat_matrix, lat_vector, points, code, code_parallel_block_siz
             points,
             self.code,
             code_parallel_block_size,
-            1
+            local
         )
 
     def __call__(self, matrix_fields, vector_fields):
         cgpt.stencil_matrix_vector_execute(self.obj, matrix_fields, vector_fields)
 
     def __del__(self):
         cgpt.stencil_matrix_vector_delete(self.obj)
+
+    def data_access_hints(self, *hints):
+        pass
diff --git a/lib/gpt/core/parallel_transport/matrix.py b/lib/gpt/core/parallel_transport/matrix.py
@@ -107,7 +107,8 @@ def __init__(self, U, code, n_target):
         write_fields = list(range(Ntarget))
         read_fields = list(range(Ntarget + Ntemporary, Ntarget + Ntemporary + Nd))
 
-        self.stencil = g.stencil.matrix(U[0], points.points, write_fields, read_fields, self.code)
+        self.stencil = g.stencil.matrix(U[0], points.points, self.code)
+        self.stencil.data_access_hints(write_fields, read_fields, [])
 
     def __call__(self, U):
         T = [g.lattice(U[0]) for i in range(self.Ntarget)]

diff --git a/lib/gpt/core/stencil/__init__.py b/lib/gpt/core/stencil/__init__.py
@@ -17,3 +17,4 @@
 #    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 #
 from gpt.core.stencil.matrix import matrix
+from gpt.core.stencil.matrix_vector import matrix_vector
diff --git a/lib/gpt/core/stencil/matrix.py b/lib/gpt/core/stencil/matrix.py
@@ -20,7 +20,7 @@
 
 
 class matrix_padded:
-    def __init__(self, lat, points, write_fields, read_fields, code, code_parallel_block_size=None):
+    def __init__(self, lat, points, code, code_parallel_block_size=None):
         margin = [0] * lat.grid.nd
         for p in points:
             for i in range(lat.grid.nd):
@@ -32,11 +32,17 @@ def __init__(self, lat, points, write_fields, read_fields, code, code_parallel_b
         self.local_stencil = g.local_stencil.matrix(
             self.padding(lat), points, code, code_parallel_block_size
         )
-        self.write_fields = write_fields
-        self.read_fields = read_fields
+        self.write_fields = None
         self.verbose_performance = g.default.is_verbose("stencil_performance")
 
+    def data_access_hints(self, write_fields, read_fields, cache_fields):
+        self.write_fields = write_fields
+        self.read_fields = read_fields
+        self.cache_fields = cache_fields
+
     def __call__(self, *fields):
+        if self.write_fields is None:
+            raise Exception("Generalized matrix stencil needs more information.  Call stencil.data_access_hints.")
         if self.verbose_performance:
             t = g.timer("stencil.matrix")
             t("create fields")
@@ -62,13 +68,14 @@ def __call__(self, *fields):
         if self.verbose_performance:
             t()
             g.message(t)
+        # todo: make use of cache_fields
 
 
-def matrix(lat, points, write_fields, read_fields, code, code_parallel_block_size=None):
+def matrix(lat, points, code, code_parallel_block_size=None):
     # check if all points are cartesian
     for p in points:
         if len([s for s in p if s != 0]) > 1:
             return matrix_padded(
-                lat, points, write_fields, read_fields, code, code_parallel_block_size
+                lat, points, code, code_parallel_block_size
             )
     return g.local_stencil.matrix(lat, points, code, code_parallel_block_size, local=0)
diff --git a/lib/gpt/core/stencil/matrix_vector.py b/lib/gpt/core/stencil/matrix_vector.py
@@ -0,0 +1,27 @@
+#
+#    GPT - Grid Python Toolkit
+#    Copyright (C) 2023  Christoph Lehner ([email protected], https://github.com/lehner/gpt)
+#
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License along
+#    with this program; if not, write to the Free Software Foundation, Inc.,
+#    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+import gpt as g
+
+
+def matrix_vector(lat_matrix, lat_vector, points, code, code_parallel_block_size=None):
+    # check if all points are cartesian
+    for p in points:
+        if len([s for s in p if s != 0]) > 1:
+            raise Exception("General stencil matrix_vector not yet implemented")
+    return g.local_stencil.matrix_vector(lat_matrix, lat_vector, points, code, code_parallel_block_size, local=0)
diff --git a/tests/core/stencil.py b/tests/core/stencil.py
@@ -26,14 +26,13 @@ def stencil_cshift(src, direction1, direction2):
     stencil = g.stencil.matrix(
         src,
         [direction1, direction2, (0, 0, 0, 0)],
-        [0],
-        [1, 2],
         [
             {"target": 0, "accumulate": -1, "weight": 1.0, "factor": [(1, 0, 0)]},
             {"target": 0, "accumulate": 0, "weight": 1.0, "factor": [(2, 1, 0)]},
             {"target": 0, "accumulate": 0, "weight": 1.0, "factor": [(2, 2, 0)]},
         ],
     )
+    stencil.data_access_hints([0], [1,2,3,4], [])
     dst = g.lattice(src)
     stencil(dst, src, src)
     return dst
@@ -103,11 +102,25 @@ def stencil_cshift(src, direction1, direction2):
 
 pval = 2 * g.sum(g.trace(Ps)).real / P.grid.gsites / 4 / 3 / 3
 
+eps = abs(Pref - pval)
+g.message(f"Stencil plaquette (local + padding): {pval} versus reference {Pref}: {eps}")
+assert eps < 1e-14
+
+stencil_plaquette = g.stencil.matrix(
+    P,
+    [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
+    code,
+)
+
+stencil_plaquette(Ps, *U)
+pval = 2 * g.sum(g.trace(Ps)).real / P.grid.gsites / 4 / 3 / 3
+
 eps = abs(Pref - pval)
 g.message(f"Stencil plaquette: {pval} versus reference {Pref}: {eps}")
 assert eps < 1e-14
 
 
+
 # run again for benchmark:
 # t = g.timer("test")
 # t("halo exchange")
@@ -165,15 +178,9 @@ def stencil_cshift(src, direction1, direction2):
 rng.cnormal(src)
 cov = g.covariant.shift(U, boundary_phases=[1.0, 1.0, 1.0, 1.0])
 for mu in range(4):
-    pad_U = g.padded_local_fields(U, list(evec[mu]))
-    pad_src = g.padded_local_fields(src, list(evec[mu]))
-
-    p_U = pad_U(U)
-    p_src = pad_src(src)
-
-    st = g.local_stencil.matrix_vector(
-        p_U[0],
-        p_src,
+    st = g.stencil.matrix_vector(
+        U[0],
+        src,
         [(0, 0, 0, 0), evec[mu], nevec[mu]],
         [
             {
@@ -206,15 +213,11 @@ def stencil_cshift(src, direction1, direction2):
     def lap(dst, src):
         dst @= -2.0 * src + cov.forward[mu] * src + cov.backward[mu] * src
 
-    p_dst = g.lattice(p_src)
-    dst = g.lattice(src)
-
     ref = g.lattice(src)
     stv = g.lattice(src)
 
     lap(ref, src)
-    st(p_U, [p_dst, p_src])
-    pad_src.extract(stv, p_dst)
+    st(U, [stv, src])
 
     eps2 = g.norm2(stv - ref)
     g.message(f"Stencil covariant laplace versus cshift version: {eps2}")