@@ -156,7 +156,7 @@ def ind_arr(shape, columns=False):
     q = create_full(F_shape, 0.0, dtype)
 
     # bathymetry
-    h = create_full(T_shape, 1.0, dtype)  # HACK init with 1
+    h = create_full(T_shape, 0.0, dtype)
 
     hu = create_full(U_shape, 0.0, dtype)
     hv = create_full(V_shape, 0.0, dtype)
@@ -165,7 +165,7 @@ def ind_arr(shape, columns=False):
     dvdx = create_full(F_shape, 0.0, dtype)
 
     # vector invariant form
-    H_at_f = create_full(F_shape, 0.0, dtype)
+    H_at_f = create_full(F_shape, 1.0, dtype)  # HACK init with 1
 
     # auxiliary variables for RK time integration
     e1 = create_full(T_shape, 0.0, dtype)
@@ -205,15 +205,14 @@ def bathymetry(x_t_2d, y_t_2d, lx, ly):
         return bath * create_full(T_shape, 1.0, dtype)
 
     # set bathymetry
-    # h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly).to_device(device)
+    h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly).to_device(device)
     # steady state potential energy
-    # pe_offset = 0.5 * g * float(np.sum(h**2.0, all_axes)) / nx / ny
-    pe_offset = 0.5 * g * float(1.0) / nx / ny
+    h2sum = np.sum(h**2.0, all_axes).to_device()
+    pe_offset = 0.5 * g * float(np.sum(h2sum, all_axes)) / nx / ny
 
     # compute time step
     alpha = 0.5
-    # h_max = float(np.max(h, all_axes))
-    h_max = float(1.0)
+    h_max = float(np.max(h, all_axes).to_device())
     c = (g * h_max) ** 0.5
     dt = alpha * dx / c
     dt = t_export / int(math.ceil(t_export / dt))
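
For orientation, the reduce-then-copy pattern introduced above (reduce on the device with np.max/np.sum over all_axes, move the scalar to the host with .to_device(), then convert with float()) mirrors the ordinary NumPy computation of the wave-speed-limited time step. A minimal NumPy-only sketch, with illustrative grid values standing in for the example's nx, dx, t_export, and bathymetry:

import math
import numpy

g = 9.81                           # gravity, as in the example
dx = 2.0 / 128                     # illustrative grid spacing
t_export = 0.02                    # illustrative export interval
h = numpy.full((128, 128), 1.0)    # stand-in bathymetry

alpha = 0.5
h_max = float(numpy.max(h))        # deepest water column
c = (g * h_max) ** 0.5             # shallow-water gravity wave speed
dt = alpha * dx / c                # CFL-limited time step
dt = t_export / int(math.ceil(t_export / dt))  # shrink dt so t_export is a whole number of steps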
@@ -253,10 +252,11 @@ def rhs(u, v, e):
         H_at_f[-1, 1:-1] = 0.5 * (H[-1, 1:] + H[-1, :-1])
         H_at_f[1:-1, 0] = 0.5 * (H[1:, 0] + H[:-1, 0])
         H_at_f[1:-1, -1] = 0.5 * (H[1:, -1] + H[:-1, -1])
-        H_at_f[0, 0] = H[0, 0]
-        H_at_f[0, -1] = H[0, -1]
-        H_at_f[-1, 0] = H[-1, 0]
-        H_at_f[-1, -1] = H[-1, -1]
+        # NOTE causes gpu.memcpy error, non-identity layout
+        # H_at_f[0, 0] = H[0, 0]
+        # H_at_f[0, -1] = H[0, -1]
+        # H_at_f[-1, 0] = H[-1, 0]
+        # H_at_f[-1, -1] = H[-1, -1]
 
         # potential vorticity
         dudy[:, 1:-1] = (u[:, 1:] - u[:, :-1]) / dy
@@ -346,41 +346,36 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
         t = i * dt
 
         if t >= next_t_export - 1e-8:
-            if device:
-                # FIXME gpu.memcpy to host requires identity layout
-                # FIXME reduction on gpu
-                elev_max = 0
-                u_max = 0
-                q_max = 0
-                diff_e = 0
-                diff_v = 0
-                total_pe = 0
-                total_ke = 0
-            else:
-                _elev_max = np.max(e, all_axes)
-                _u_max = np.max(u, all_axes)
-                _q_max = np.max(q, all_axes)
-                _total_v = np.sum(e + h, all_axes)
-
-                # potential energy
-                _pe = 0.5 * g * (e + h) * (e - h) + pe_offset
-                _total_pe = np.sum(_pe, all_axes)
-
-                # kinetic energy
-                u2 = u * u
-                v2 = v * v
-                u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :])
-                v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1])
-                _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h)
-                _total_ke = np.sum(_ke, all_axes)
-
-                total_pe = float(_total_pe) * dx * dy
-                total_ke = float(_total_ke) * dx * dy
-                total_e = total_ke + total_pe
-                elev_max = float(_elev_max)
-                u_max = float(_u_max)
-                q_max = float(_q_max)
-                total_v = float(_total_v) * dx * dy
+            sync()
+            # NOTE must precompute reduction operands to single field
+            H_tmp = e + h
+            # potential energy
+            _pe = 0.5 * g * (e + h) * (e - h) + pe_offset
+            # kinetic energy
+            u2 = u * u
+            v2 = v * v
+            u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :])
+            v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1])
+            _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h)
+            sync()
+            _elev_max = np.max(e, all_axes).to_device()
+            # NOTE max(u) segfaults, shape (n+1, n) too large for tiling
+            _u_max = np.max(u[1:, :], all_axes).to_device()
+            _q_max = np.max(q[1:, 1:], all_axes).to_device()
+            _total_v = np.sum(H_tmp, all_axes).to_device()
+            _total_pe = np.sum(_pe, all_axes).to_device()
+            _total_ke = np.sum(_ke, all_axes).to_device()
+
+            total_pe = float(_total_pe) * dx * dy
+            total_ke = float(_total_ke) * dx * dy
+            total_e = total_ke + total_pe
+            elev_max = float(_elev_max)
+            u_max = float(_u_max)
+            q_max = float(_q_max)
+            total_v = float(_total_v) * dx * dy
+
+            diff_e = 0
+            diff_v = 0
 
             if i_export == 0:
                 initial_v = total_v
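
The diagnostics above rely on the staggered grid used throughout this example: u is stored on (nx + 1, ny) faces (hence the tiling NOTE) and v on (nx, ny + 1) faces, so their squares are averaged back to the (nx, ny) cell centres before the kinetic energy is summed. A NumPy-only sketch of the same energy reduction, with illustrative shapes and values:

import numpy

nx = ny = 4                    # illustrative grid size
dx = dy = 0.5
g = 9.81
e = numpy.zeros((nx, ny))      # elevation at cell centres
h = numpy.ones((nx, ny))       # bathymetry at cell centres
u = numpy.ones((nx + 1, ny))   # x-velocity on cell faces
v = numpy.ones((nx, ny + 1))   # y-velocity on cell faces
pe_offset = 0.5 * g * float(numpy.sum(h**2.0)) / nx / ny

# potential energy relative to the steady state
pe = 0.5 * g * (e + h) * (e - h) + pe_offset
# average squared face velocities to cell centres, then form kinetic energy
u2_at_t = 0.5 * (u[1:, :] ** 2 + u[:-1, :] ** 2)
v2_at_t = 0.5 * (v[:, 1:] ** 2 + v[:, :-1] ** 2)
ke = 0.5 * (u2_at_t + v2_at_t) * (e + h)

total_pe = float(numpy.sum(pe)) * dx * dy
total_ke = float(numpy.sum(ke)) * dx * dy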
@@ -415,40 +410,36 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
     duration = time_mod.perf_counter() - tic
     info(f"Duration: {duration:.2f} s")
 
-    if device:
-        # FIXME gpu.memcpy to host requires identity layout
-        # FIXME reduction on gpu
-        pass
-    else:
-        e_exact = exact_solution(
-            t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d
-        )[2]
-        err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly
-        err_L2 = math.sqrt(float(np.sum(err2, all_axes)))
-        info(f"L2 error: {err_L2:7.15e}")
-
-        if nx < 128 or ny < 128:
-            info("Skipping correctness test due to small problem size.")
-        elif not benchmark_mode:
-            tolerance_ene = 1e-7 if datatype == "f32" else 1e-9
-            assert (
-                diff_e < tolerance_ene
-            ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}"
-            if nx == 128 and ny == 128:
-                if datatype == "f32":
-                    assert numpy.allclose(
-                        err_L2, 4.3127859e-05, rtol=1e-5
-                    ), "L2 error does not match"
-                else:
-                    assert numpy.allclose(
-                        err_L2, 4.315799035627906e-05
-                    ), "L2 error does not match"
+    e_exact = exact_solution(t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d)[
+        2
+    ].to_device(device)
+    err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly
+    err2sum = np.sum(err2, all_axes).to_device()
+    err_L2 = math.sqrt(float(err2sum))
+    info(f"L2 error: {err_L2:7.15e}")
+
+    if nx < 128 or ny < 128:
+        info("Skipping correctness test due to small problem size.")
+    elif not benchmark_mode:
+        tolerance_ene = 1e-7 if datatype == "f32" else 1e-9
+        assert (
+            diff_e < tolerance_ene
+        ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}"
+        if nx == 128 and ny == 128:
+            if datatype == "f32":
+                assert numpy.allclose(
+                    err_L2, 4.3127859e-05, rtol=1e-5
+                ), "L2 error does not match"
             else:
-                tolerance_l2 = 1e-4
-                assert (
-                    err_L2 < tolerance_l2
-                ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}"
-            info("SUCCESS")
+                assert numpy.allclose(
+                    err_L2, 4.315799035627906e-05
+                ), "L2 error does not match"
+        else:
+            tolerance_l2 = 1e-4
+            assert (
+                err_L2 < tolerance_l2
+            ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}"
+        info("SUCCESS")
 
     fini()
 