From 9e519176439f312034e17b881013d96095ecd9aa Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Fri, 30 Jun 2023 17:26:11 +0200
Subject: [PATCH] faster iwasaki force

---
 .../gauge/action/improved_with_rectangle.py   | 96 ++++++++++++-------
 1 file changed, 61 insertions(+), 35 deletions(-)

diff --git a/lib/gpt/qcd/gauge/action/improved_with_rectangle.py b/lib/gpt/qcd/gauge/action/improved_with_rectangle.py
index 68efe476..7f0a6d77 100644
--- a/lib/gpt/qcd/gauge/action/improved_with_rectangle.py
+++ b/lib/gpt/qcd/gauge/action/improved_with_rectangle.py
@@ -27,8 +27,7 @@ def __init__(self, beta, c1, c0=None):
         if c0 is None:
             c0 = 1.0 - 8 * c1
         self.c0 = c0
-        self.staple_1x1 = {}
-        self.staple_2x1 = {}
+        self.cache = {}
         self.__name__ = f"improved_with_rectangle({beta},{c1})"
 
     def __call__(self, U):
@@ -47,40 +46,67 @@ def __call__(self, U):
     def staples(self, U, mu_target=None):
         Nd = len(U)
 
-        ret = []
-        for mu in range(Nd):
-
-            if mu_target is not None and mu_target != mu:
-                continue
-
-            st = g.lattice(U[0])
-            st[:] = 0
-
-            O = [nu for nu in range(Nd) if nu != mu]
+        if mu_target not in self.cache:
+            code = []
+            Nret = 0
             path = g.path
-            if (Nd, mu) not in self.staple_1x1:
-                p = []
-                p += [path().f(nu).f(mu).b(nu) for nu in O]
-                p += [path().b(nu).f(mu).f(nu) for nu in O]
-                self.staple_1x1[(Nd, mu)] = g.parallel_transport(U, p)
-
-            if (Nd, mu) not in self.staple_2x1:
-                p = []
-                p += [path().f(nu, 2).f(mu).b(nu, 2) for nu in O]
-                p += [path().b(nu, 2).f(mu).f(nu, 2) for nu in O]
-                p += [path().f(nu).f(mu, 2).b(nu).b(mu) for nu in O]
-                p += [path().b(nu).f(mu, 2).f(nu).b(mu) for nu in O]
-                p += [path().b(mu).b(nu).f(mu, 2).f(nu) for nu in O]
-                p += [path().b(mu).f(nu).f(mu, 2).b(nu) for nu in O]
-                self.staple_2x1[(Nd, mu)] = g.parallel_transport(U, p)
-
-            for s in self.staple_1x1[(Nd, mu)](U):
-                st += (self.beta * self.c0 / U[0].otype.shape[0]) * s
-
-            for s in self.staple_2x1[(Nd, mu)](U):
-                st += (self.beta * self.c1 / U[0].otype.shape[0]) * s
-
-            ret.append(st)
+            for mu in range(Nd):
+
+                if mu_target is not None and mu_target != mu:
+                    continue
+
+                O = [nu for nu in range(Nd) if nu != mu]
+                code_mu = []
+                for nu in O:
+                    # 1x1 staples
+                    s0 = self.beta * self.c0 / U[0].otype.shape[0]
+                    code_mu.append(
+                        (Nret, -1 if len(code_mu) == 0 else Nret, s0, path().f(nu).f(mu).b(nu))
+                    )
+                    code_mu.append(
+                        (Nret, -1 if len(code_mu) == 0 else Nret, s0, path().b(nu).f(mu).f(nu))
+                    )
+
+                    # 2x1 staples
+                    s1 = self.beta * self.c1 / U[0].otype.shape[0]
+                    dst = -1 if len(code_mu) == 0 else Nret
+                    code_mu.append(
+                        (
+                            Nret,
+                            dst,
+                            s1,
+                            path().f(nu, 2).f(mu).b(nu, 2),
+                        )
+                    )
+                    dst = Nret
+                    code_mu.append(
+                        (
+                            Nret,
+                            Nret,
+                            s1,
+                            path().b(nu, 2).f(mu).f(nu, 2),
+                        )
+                    )
+                    code_mu.append(
+                        (
+                            Nret,
+                            Nret,
+                            s1,
+                            path().f(nu).f(mu, 2).b(nu).b(mu),
+                        )
+                    )
+                    code_mu.append((Nret, Nret, s1, path().b(nu).f(mu, 2).f(nu).b(mu)))
+                    code_mu.append((Nret, Nret, s1, path().b(mu).b(nu).f(mu, 2).f(nu)))
+                    code_mu.append((Nret, Nret, s1, path().b(mu).f(nu).f(mu, 2).b(nu)))
+
+                code = code + code_mu
+                Nret += 1
+
+            self.cache[mu_target] = g.parallel_transport_matrix(U, code, Nret)
+
+        ret = self.cache[mu_target](U)
+        if mu_target is not None:
+            ret = [ret]
         return ret