From d295a6290f09b5455797ccf53dd3e47924864083 Mon Sep 17 00:00:00 2001
From: Pratyai Mazumder <pratyai.mazumder@gmail.com>
Date: Thu, 7 Nov 2024 08:03:37 +0100
Subject: [PATCH] Fix pure reduce expansion for squeezed output memlets.
 (#1709)

The pure expansion of the Reduce node produced wrong indices for the
initialization kernel when the output memlet was squeezed, so it did not
work for some simple, valid SDFGs (see the demo in the test).
---
 dace/libraries/standard/nodes/reduce.py |  2 +-
 tests/library/reduce_test.py            | 74 ++++++++++++++++++++-----
 2 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/dace/libraries/standard/nodes/reduce.py b/dace/libraries/standard/nodes/reduce.py
index fa231c07f2..970dfcef3a 100644
--- a/dace/libraries/standard/nodes/reduce.py
+++ b/dace/libraries/standard/nodes/reduce.py
@@ -103,7 +103,7 @@ def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
                 'reduce_init', {'_o%d' % i: '0:%s' % symstr(d)
                                 for i, d in enumerate(outedge.data.subset.size())}, {},
                 '__out = %s' % node.identity,
-                {'__out': dace.Memlet.simple('_out', ','.join(['_o%d' % i for i in range(output_dims)]))},
+                {'__out': dace.Memlet.simple('_out', ','.join(['_o%d' % i for i in osqdim]))},
                 external_edges=True)
         else:
             nstate = nsdfg.add_state()
diff --git a/tests/library/reduce_test.py b/tests/library/reduce_test.py
index 8e4b2153f0..bb448226db 100644
--- a/tests/library/reduce_test.py
+++ b/tests/library/reduce_test.py
@@ -1,35 +1,82 @@
 # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
-import dace
 import numpy as np
 import pytest
+
+import dace
 import dace.libraries.standard as std
+from dace import SDFG, Memlet
+
+C_in, C_out, H, K, N, W = (dace.symbol(s, dace.int64) for s in ('C_in', 'C_out', 'H', 'K', 'N', 'W'))
+
+
+def make_sdfg():  # Build the demo SDFG for the squeezed-output reduce; returns (sdfg, reduce_node).
+    g = SDFG('prog')
+    g.add_array('A', (N, 1, 1, C_in, C_out), dace.float32,
+                strides=(C_in * C_out, C_in * C_out, C_in * C_out, C_out, 1))
+    g.add_array('C', (N, H, W, C_out), dace.float32,
+                strides=(C_out * H * W, C_out * W, C_out, 1))
+    # Everything lives in a single start state.
+    st0 = g.add_state('st0', is_start_block=True)
+    st = st0
+    # Graph: A -> Reduce -> C, connected with plain (connector-less) edges.
+    A = st.add_access('A')
+    C = st.add_access('C')
+    R = st.add_reduce('lambda x, y: x + y', [1, 2, 3], 0)  # sum; presumably axes=[1, 2, 3], identity=0
+    st.add_nedge(A, R, Memlet(expr='A[0:N, 0, 0, 0:C_in, 0:C_out]'))
+    st.add_nedge(R, C, Memlet(expr='C[0:N, 5, 5, 0:C_out]'))  # fixed indices 5, 5: the squeezed output dims
+
+    return g, R
+
+
+def test_library_node_expand_reduce_pure():  # 'pure' expansion must match 'pure-seq' on make_sdfg().
+    n, cin, cout = 7, 7, 7
+    h, k, w = 25, 35, 45
+    A = np.ones((n, 1, 1, cin, cout), np.float32)
+    # Reference run: the 'pure-seq' implementation.
+    g, R = make_sdfg()
+    R.implementation = 'pure-seq'
+    g.validate()
+    g.compile()
+    # C starts at 42 in both runs, so any entry a run does not write keeps the same value.
+    wantC = np.ones((n, h, w, cout), np.float32) * 42
+    g(A=A, C=wantC, N=n, C_in=cin, C_out=cout, H=h, K=k, W=w)  # K does not appear in make_sdfg()'s arrays
+    # Run under test: the 'pure' implementation, on a freshly built SDFG.
+    g, R = make_sdfg()
+    R.implementation = 'pure'
+    g.validate()
+    g.compile()
+
+    gotC = np.ones((n, h, w, cout), np.float32) * 42
+    g(A=A, C=gotC, N=n, C_in=cin, C_out=cout, H=h, K=k, W=w)
+    assert np.allclose(wantC, gotC)
+
 
 _params = ['pure', 'CUDA (device)', 'pure-seq', 'GPUAuto']
-  
 
 
 @pytest.mark.gpu
 @pytest.mark.parametrize('impl', _params)
 def test_multidim_gpu(impl):
-
     test_cases = [([1, 64, 60, 60], (0, 2, 3), [64], np.float32),
-                    ([8, 512, 4096], (0,1), [4096], np.float32),
-                    ([8, 512, 4096], (0,1), [4096], np.float64),
-                    ([1024, 8], (0), [8], np.float32),
-                    ([111, 111, 111], (0,1), [111], np.float64),
-                    ([111, 111, 111], (1,2), [111], np.float64),
-                    ([1000000], (0), [1], np.float64),
-                    ([1111111], (0), [1], np.float64),
-                    ([123,21,26,8], (1,2), [123,8], np.float32),
-                    ([2, 512, 2], (0,2), [512], np.float32),
-                    ([512, 555, 257], (0,2), [555], np.float64)]
+                  ([8, 512, 4096], (0, 1), [4096], np.float32),
+                  ([8, 512, 4096], (0, 1), [4096], np.float64),
+                  ([1024, 8], (0), [8], np.float32),
+                  ([111, 111, 111], (0, 1), [111], np.float64),
+                  ([111, 111, 111], (1, 2), [111], np.float64),
+                  ([1000000], (0), [1], np.float64),
+                  ([1111111], (0), [1], np.float64),
+                  ([123, 21, 26, 8], (1, 2), [123, 8], np.float32),
+                  ([2, 512, 2], (0, 2), [512], np.float32),
+                  ([512, 555, 257], (0, 2), [555], np.float64)]
 
     for in_shape, ax, out_shape, dtype in test_cases:
         print(in_shape, ax, out_shape, dtype)
         axes = ax
+
         @dace.program
         def multidimred(a, b):
             b[:] = np.sum(a, axis=axes)
+
         a = np.random.rand(*in_shape).astype(dtype)
         b = np.random.rand(*out_shape).astype(dtype)
         sdfg = multidimred.to_sdfg(a, b)
@@ -45,3 +92,4 @@ def multidimred(a, b):
 if __name__ == '__main__':
     for p in _params:
         test_multidim_gpu(p)
+    test_library_node_expand_reduce_pure()