From 2be3a2fba269359d14b58af1f4738989b3688b15 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 27 Nov 2023 17:12:49 +0800 Subject: [PATCH 001/162] ultra-fast forward sum kernel --- .gitignore | 2 + src/pyjuice/layer/sum_layer.py | 6 + tests/layer/sum_block_sparse_test.py | 399 +++++++++++++++++++++++++++ tests/structures/hclt_test.py | 2 +- 4 files changed, 408 insertions(+), 1 deletion(-) create mode 100644 tests/layer/sum_block_sparse_test.py diff --git a/.gitignore b/.gitignore index c2f82463..95b3f888 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ __pycache__/ # C extensions *.so +temp.npz + # Distribution / packaging .Python build/ diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 1736e0b3..59ab5869 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -384,6 +384,12 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, BLOCK_M = triton.next_power_of_2(n_edges) BLOCK_N = max(BLOCK_SIZE // BLOCK_M, 1) + # import numpy as np + # np.savez("temp.npz", node_mars = node_mars.cpu().numpy(), element_mars = element_mars.cpu().numpy(), params = params.cpu().numpy(), + # nids = nids.cpu().numpy(), cids = cids.cpu().numpy(), pids = pids.cpu().numpy(), tot_n_nodes = tot_n_nodes, tot_n_eles = tot_n_eles, n_nodes = n_nodes, + # n_edges = n_edges, batch_size = batch_size, BLOCK_M = BLOCK_M, BLOCK_N = BLOCK_N) + # import pdb; pdb.set_trace() + grid = (triton.cdiv(n_nodes * n_edges, BLOCK_M), triton.cdiv(batch_size, BLOCK_N), 1) self._forward_triton_kernel[grid]( diff --git a/tests/layer/sum_block_sparse_test.py b/tests/layer/sum_block_sparse_test.py new file mode 100644 index 00000000..84bc9b61 --- /dev/null +++ b/tests/layer/sum_block_sparse_test.py @@ -0,0 +1,399 @@ +import triton +import triton.language as tl +import torch +import numpy as np +import time + + +@triton.jit +def _forward_triton_kernel(node_mars_ptr, element_mars_ptr, params_ptr, + nids_ptr, cids_ptr, pids_ptr, tot_n_nodes, + tot_n_eles, n_nodes, n_edges: tl.constexpr, + batch_size, n_nodes_per_block_m: tl.constexpr, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): + + # We use BLOCK_M to index over edges, and BLOCK_N to index over batches + pid0 = tl.program_id(axis = 0) + pid1 = tl.program_id(axis = 1) + ne_start = pid0 * BLOCK_M + b_start = pid1 * BLOCK_N + + # Id of edges processed by the current block (0.081ms) + ne_offsets = ne_start + tl.arange(0, BLOCK_M) + # Batch ids processed by the current block + b_offsets = b_start + tl.arange(0, BLOCK_N) + + # Get node ids from `nids` + n_start = ne_start // n_edges + nid_offsets = n_start + tl.arange(0, n_nodes_per_block_m) + n_ids = tl.load(nids_ptr + nid_offsets) + + # Get edge ids from `cids` + cid_offsets = tl.view(ne_offsets, (n_edges, n_nodes_per_block_m)) + ch_ids = tl.load(cids_ptr + cid_offsets) + # Use `ch_ids` to retrieve the corresponding element mars + ele_offsets = ch_ids[None,:,:] * batch_size + b_offsets[:,None,None] + ch_logps = tl.load(element_mars_ptr + ele_offsets) # `element_mars[cids]` + + # Get param ids from `pids` + # Here we reuse `cid_offsets` and `cid_mask` thank to their similar structure + par_ids = tl.load(pids_ptr + cid_offsets) + + # Use `par_ids` to retrieve the corresponding parameters + ch_pars = tl.load(params_ptr + par_ids) # `params[pids]` + + # Take the max of the child mars + ch_max_logp = tl.max(ch_logps, axis = 1) # `maxval` + # Subtract the max from child mars + ch_logps_sub_max = ch_logps - ch_max_logp[:,None,:] + # Take exp + ch_ps_sub_max = 
tl.exp(ch_logps_sub_max) + + # Sum node marginals (unnormalized) + n_ps = tl.sum(ch_ps_sub_max * ch_pars[None,:,:], axis = 1) + + # Take log and subtract max vals + n_logps = tl.log(tl.maximum(n_ps, 1e-10)) + ch_max_logp + + # Read out the target indices for `node_mars` + nmar_offsets = n_ids[None,:] * batch_size + b_offsets[:,None] + + # Reshape seems to be necessary for certain combinations of (BLOCK_N, n_nodes_per_block_m) + nmar_offsets = tl.view(nmar_offsets, (BLOCK_N * n_nodes_per_block_m,)) + n_logps = tl.view(n_logps, (BLOCK_N * n_nodes_per_block_m,)) + tl.store(node_mars_ptr + nmar_offsets, n_logps) + + +@triton.jit +def block_sparse_kernel(ddd, node_mars, element_mars, params, nids, cids_start, cids_increment, pids_start, pids_increment, + tot_n_nodes, tot_n_eles, layer_n_nodes, layer_n_edge_groups, batch_size, + BLOCK_M: tl.constexpr, GROUP_SIZE: tl.constexpr): + + pid_m = tl.program_id(0) + pid_b = tl.program_id(1) # batch id + + # initialize pointers to `element_mars` + node_start = tl.multiple_of(pid_m * layer_n_edge_groups * GROUP_SIZE, 8) # compiler hint + offs_node = tl.arange(0, BLOCK_M) + node_start + mask_node = offs_node < layer_n_nodes + offs_edge = tl.arange(0, GROUP_SIZE) + edge_start = tl.load(cids_start + offs_node, mask = mask_node, other = 0) + emars_ptr = element_mars + pid_b * tot_n_eles + edge_start[:,None] + offs_edge[None,:] + emars_ptr = tl.view(emars_ptr, (BLOCK_M, GROUP_SIZE)) + + # initialize pointers to `params` + param_start = tl.load(pids_start + offs_node, mask = mask_node, other = 0) + params_ptr = params + param_start[:,None] + offs_edge[None,:] + # params_ptr = params + offs_edge[:,None] + param_start[None,:] + params_ptr = tl.view(params_ptr, (BLOCK_M, GROUP_SIZE)) + + # Inner loop + acc = tl.zeros((BLOCK_M,), dtype = tl.float32) - float("inf") + + cids_inc_ptr = cids_increment + offs_node + pids_inc_ptr = pids_increment + offs_node + for k in range(0, layer_n_edge_groups): + emars = tl.load(emars_ptr, mask = mask_node[:,None]) + epars = tl.load(params_ptr, mask = mask_node[:,None]) + emars_max = tl.max(emars, axis = 1) + emars = tl.exp(emars - emars_max[:,None]) + + # nmars = tl.dot(emars, params) + nmars = tl.sum(emars * epars, axis = 1) + + acc = tl.where(emars_max > acc, + tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, + tl.log(tl.exp(emars_max - acc) * nmars + 1.0) + acc + ) + + cids_inc = tl.load(cids_inc_ptr, mask = mask_node) + pids_inc = tl.load(pids_inc_ptr, mask = mask_node) + emars_ptr += cids_inc + params_ptr += pids_inc + cids_inc_ptr += 1 + pids_inc_ptr += 1 + + # Write back + ns = tl.load(nids + offs_node, mask = mask_node) + tl.store(node_mars + ns + pid_b * tot_n_nodes, tl.ravel(acc), mask = mask_node) + + +def main_baseline(): + data = np.load("temp.npz") + + device = torch.device("cuda:0") + + node_mars = torch.from_numpy(data["node_mars"]).to(device) + node_mars2 = node_mars.clone() + element_mars = torch.from_numpy(data["element_mars"]).to(device) + params = torch.from_numpy(data["params"]).to(device) + nids = torch.from_numpy(data["nids"]).to(device) + cids = torch.from_numpy(data["cids"]).to(device) + pids = torch.from_numpy(data["pids"]).to(device) + tot_n_nodes = int(data["tot_n_nodes"]) + tot_n_eles = int(data["tot_n_eles"]) + n_nodes = int(data["n_nodes"]) + n_edges = int(data["n_edges"]) + batch_size = int(data["batch_size"]) + BLOCK_M = int(data["BLOCK_M"]) + BLOCK_N = int(data["BLOCK_N"]) + + # ddd = torch.zeros([n_nodes * n_edges]).to(device) + + BLOCK_M = 128 + BLOCK_N = 64 + + grid = (triton.cdiv(n_nodes * 
n_edges, BLOCK_M), triton.cdiv(batch_size, BLOCK_N), 1) + + ts = [] + for i in range(5): + t0 = time.time() + _forward_triton_kernel[grid]( + node_mars_ptr = node_mars, + element_mars_ptr = element_mars, + params_ptr = params, + nids_ptr = nids, + cids_ptr = cids, + pids_ptr = pids, + tot_n_nodes = tot_n_nodes, + tot_n_eles = tot_n_eles, + n_nodes = n_nodes, + n_edges = n_edges, + batch_size = batch_size, + n_nodes_per_block_m = BLOCK_M // n_edges, + BLOCK_M = BLOCK_M, + BLOCK_N = BLOCK_N + ) + torch.cuda.synchronize() + t1 = time.time() + + if i > 0: + ts.append(t1 - t0) + + aveg_t, std_t = torch.tensor(ts).mean().item() * 1000, torch.tensor(ts).std().item() * 1000 + print(f"{aveg_t:.3f}±{std_t:.3f}ms") + + # node_mars_gt = node_mars.clone() + # ch_mars = element_mars[cids] + # maxval = ch_mars.max(dim = 1, keepdim = True).values + # aaa = (((ch_mars - maxval).exp() * params[pids].unsqueeze(-1)).sum( + # dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) + + # bbb = node_mars[nids] + + # print(torch.max((aaa - bbb).abs())) + + +def main_blocksparse(): + + GROUP_SIZE = 128 + + data = np.load("temp.npz") + + device = torch.device("cuda:0") + + node_mars = torch.from_numpy(data["node_mars"]).permute(1, 0).contiguous().to(device) + element_mars = torch.from_numpy(data["element_mars"]).permute(1, 0).contiguous().to(device) + params = torch.from_numpy(data["params"]).to(device) + + # Convert `nids`, `cids`, and `pids` into block sparse format + nids = torch.from_numpy(data["nids"]).to(device) + cids = torch.from_numpy(data["cids"]) + pids = torch.from_numpy(data["pids"]) + + cids = cids[:,::GROUP_SIZE].contiguous() + pids = pids[:,::GROUP_SIZE].contiguous() + + cids_start = cids[:,0].contiguous().to(device) + pids_start = pids[:,0].contiguous().to(device) + cids_increment = torch.cat((cids[:,1:] - cids[:,:-1], cids[:,0:1] * 0), dim = 1).contiguous().to(device) + pids_increment = torch.cat((pids[:,1:] - pids[:,:-1], pids[:,0:1] * 0), dim = 1).contiguous().to(device) + + tot_n_nodes = int(data["tot_n_nodes"]) + tot_n_eles = int(data["tot_n_eles"]) + layer_n_nodes = int(data["n_nodes"]) + layer_n_edges = int(data["n_edges"]) + batch_size = int(data["batch_size"]) + + BLOCK_M = 16 + + grid = (triton.cdiv(layer_n_nodes, BLOCK_M), batch_size) + + ddd = torch.zeros([layer_n_nodes, batch_size], dtype = torch.long, device = device) + + ts = [] + for i in range(5): + t0 = time.time() + block_sparse_kernel[grid]( + ddd, + node_mars, + element_mars, + params, + nids, + cids_start, + cids_increment, + pids_start, + pids_increment, + tot_n_nodes, + tot_n_eles, + layer_n_nodes, + layer_n_edge_groups = layer_n_edges // GROUP_SIZE, + batch_size = batch_size, + BLOCK_M = BLOCK_M, + GROUP_SIZE = GROUP_SIZE + ) + torch.cuda.synchronize() + t1 = time.time() + + if i > 0: + ts.append(t1 - t0) + + aveg_t, std_t = torch.tensor(ts).mean().item() * 1000, torch.tensor(ts).std().item() * 1000 + print(f"{aveg_t:.3f}±{std_t:.3f}ms") + + # import pdb; pdb.set_trace() + + +@triton.jit +def block_sparse_2d_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, pids_start, + layer_n_edge_groups, batch_size, stride_pa, stride_pb, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + ntile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + 
# initialize pointers to `params` + offs_node = tl.arange(0, TILE_SIZE_M) + offs_edge = tl.arange(0, TILE_SIZE_K) + par_start = tl.load(pids_start + ngroup_id * stride_pa + ntile_id * TILE_SIZE_M * stride_pb + offs_node * stride_pb) + epars_ptr = params + par_start[:,None] + offs_edge[None,:] + + # initialize pointers to `element_mars` + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + edge_start = tl.load(cids_start + ngroup_id * TILE_SIZE_K + offs_edge) + emars_ptr = element_mars + \ + edge_start[:,None] * batch_size + \ + offs_batch[None,:] + + # Inner loop + acc = tl.zeros((TILE_SIZE_M, BLOCK_B), dtype = tl.float32) - float("inf") + + cids_inc_ptr = cids_increment + ngroup_id * (layer_n_edge_groups * TILE_SIZE_K) + offs_edge + for k in range(0, layer_n_edge_groups): + epars = tl.load(epars_ptr) + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) + + emars_max = tl.max(emars, axis = 0)[None,:] + emars = tl.exp(emars - emars_max) + nmars = tl.dot(epars.to(tl.float16), emars.to(tl.float16)).to(tl.float32) + + acc = tl.where(emars_max > acc, + tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, + tl.log(tl.exp(emars_max - acc) * nmars + 1.0) + acc + ) + + cids_inc = tl.load(cids_inc_ptr) + emars_ptr += cids_inc[:,None] * batch_size + cids_inc += TILE_SIZE_K + + epars_ptr += TILE_SIZE_K + + # Write back + offs_nids = tl.load(nids + ngroup_id * GROUP_SIZE_M + ntile_id * TILE_SIZE_M + offs_node) + offs_nmars = offs_nids[:,None] * batch_size + offs_batch[None,:] + tl.store(node_mars + offs_nmars, acc, mask = mask_batch[None,:]) + + +def main_blocksparse_2d(): + + GROUP_SIZE_M = 64 + + TILE_SIZE_M = 16 + TILE_SIZE_K = 32 + + BLOCK_B = 128 + + data = np.load("temp.npz") + + device = torch.device("cuda:0") + + node_mars = torch.from_numpy(data["node_mars"]).to(device) + element_mars = torch.from_numpy(data["element_mars"]).to(device) + params = torch.from_numpy(data["params"]).to(device) + + # Convert `nids`, `cids`, and `pids` into block sparse format + nids = torch.from_numpy(data["nids"])# .to(device) + cids = torch.from_numpy(data["cids"])# .to(device) + pids = torch.from_numpy(data["pids"])# .to(device) + + # node_mars_gt = node_mars.clone() + # ch_mars = element_mars[cids] + # maxval = ch_mars.max(dim = 1, keepdim = True).values + # aaa = (((ch_mars - maxval).exp() * params[pids].unsqueeze(-1)).sum( + # dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) + + nids = nids.reshape(-1, GROUP_SIZE_M).contiguous().to(device) + cids = cids[::GROUP_SIZE_M,:].reshape(nids.size(0), -1, TILE_SIZE_K).contiguous() + pids_start = pids.reshape(nids.size(0), GROUP_SIZE_M, -1)[:,:,0].contiguous().to(device) + + cids_start = cids[:,0,:].contiguous().to(device) + cids_increment = torch.cat((cids[:,1:,:] - cids[:,:-1,:], cids[:,0:1,:] * 0), dim = 1).contiguous().to(device) + + tot_n_nodes = int(data["tot_n_nodes"]) + tot_n_eles = int(data["tot_n_eles"]) + layer_n_nodes = int(data["n_nodes"]) + layer_n_edges = int(data["n_edges"]) + batch_size = int(data["batch_size"]) + + layer_n_node_groups = layer_n_nodes // GROUP_SIZE_M + layer_n_edge_groups = layer_n_edges // TILE_SIZE_K + + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + + ts = [] + for i in range(50): + # print("enter") + t0 = time.time() + block_sparse_2d_kernel[grid]( + node_mars, + element_mars, + params, + nids, + cids_start, + cids_increment, + pids_start, + layer_n_edge_groups, + batch_size, + stride_pa = pids_start.stride(0), + stride_pb = 
pids_start.stride(1), # Do not provide pids.stride(2) since it is 1 + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = GROUP_SIZE_M + ) + torch.cuda.synchronize() + t1 = time.time() + + if i > 0: + ts.append(t1 - t0) + + aveg_t, std_t = torch.tensor(ts).mean().item() * 1000, torch.tensor(ts).std().item() * 1000 + print(f"{aveg_t:.3f}±{std_t:.3f}ms") + + # bbb = node_mars[nids] + + # print(torch.max((aaa - bbb.flatten(0, 1)).abs())) + + # import pdb; pdb.set_trace() + + +if __name__ == "__main__": + # main_baseline() + # main_blocksparse() + main_blocksparse_2d() \ No newline at end of file diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index e8c0d4d3..562c304a 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -94,7 +94,7 @@ def hclt_test(): train_data.float().to(device), num_bins = 32, sigma = 0.5 / 32, - num_latents = 64, + num_latents = 128, chunk_size = 32 ) pc = juice.TensorCircuit(ns) From 56942ca650109ae80a69b0f948bf06dd2e3d8361 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 30 Nov 2023 22:49:10 +0800 Subject: [PATCH 002/162] refactor: add `group_size` option into node construction --- src/pyjuice/__init__.py | 2 +- src/pyjuice/functional/normalize.py | 120 +++++++++++------- src/pyjuice/nodes/__init__.py | 2 +- src/pyjuice/nodes/construction.py | 46 +++++-- src/pyjuice/nodes/input_nodes.py | 8 +- src/pyjuice/nodes/nodes.py | 25 +++- src/pyjuice/nodes/prod_nodes.py | 20 +-- src/pyjuice/nodes/sum_nodes.py | 65 ++++++---- tests/layer/sum_block_sparse_test.py | 29 +++-- ...nput_nodes_test.py => input_dists_test.py} | 0 tests/nodes/nodes_test.py | 37 ++++-- 11 files changed, 232 insertions(+), 122 deletions(-) rename tests/nodes/{input_nodes_test.py => input_dists_test.py} (100%) diff --git a/src/pyjuice/__init__.py b/src/pyjuice/__init__.py index 9f9b16b9..fd280dc8 100644 --- a/src/pyjuice/__init__.py +++ b/src/pyjuice/__init__.py @@ -12,7 +12,7 @@ from pyjuice.model import TensorCircuit # Construction methods -from pyjuice.nodes import multiply, summate, inputs +from pyjuice.nodes import multiply, summate, inputs, set_group_size # LVD from pyjuice.nodes.methods.lvd import LVDistiller diff --git a/src/pyjuice/functional/normalize.py b/src/pyjuice/functional/normalize.py index ef0b5478..8eddd32e 100644 --- a/src/pyjuice/functional/normalize.py +++ b/src/pyjuice/functional/normalize.py @@ -6,93 +6,125 @@ @triton.jit -def _cum_params_kernel(params_ptr, cum_params_ptr, node_ids_ptr, tot_num_params, batch_size, BLOCK_SIZE: tl.constexpr): +def _cum_params_kernel(params_ptr, cum_params_ptr, node_ids_ptr, num_param_blocks, group_size, batch_size, + BLOCK_M: tl.constexpr, BLOCK_K: tl.constexpr, BLOCK_B: tl.constexpr): - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE + b_pid = tl.program_id(axis = 0) + k_pid = tl.program_id(axis = 1) + m_pid = tl.program_id(axis = 2) - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < tot_num_params * batch_size + m_offsets = m_pid * BLOCK_M + tl.arange(0, BLOCK_M) + m_mask = offsets < num_param_blocks - param_offsets = offsets // batch_size - batch_offsets = offsets % batch_size + group_size = k_pid * BLOCK_K + tl.arange(0, BLOCK_K) - n_offsets = tl.load(node_ids_ptr + param_offsets, mask = mask, other = 0) - n_offsets = n_offsets * batch_size + batch_offsets + b_offsets = b_pid * BLOCK_B + tl.arange(0, BLOCK_B) + b_mask = offsets < batch_size - params = tl.load(params_ptr + offsets, mask = mask, other = 0) + n_offsets = 
tl.load(node_ids_ptr + m_offsets, mask = m_mask, other = 0) + reuse_offs = group_size[None,:,None] * batch_size + b_offsets[None,None,:] + + n_offsets = n_offsets[:,None,None] * (batch_size * group_size) + reuse_offs + p_offsets = m_offsets[:,None,None] * reuse_offs + + mask = m_mask[:,None,None] & b_mask[None,None,:] + params = tl.load(params_ptr + p_offsets, mask = mask, other = 0) tl.atomic_add(cum_params_ptr + n_offsets, params, mask = mask) @triton.jit -def _norm_params_kernel(params_ptr, cum_params_ptr, node_ids_ptr, node_nchs_ptr, tot_num_params, - batch_size, pseudocount, BLOCK_SIZE: tl.constexpr): +def _norm_params_kernel(params_ptr, cum_params_ptr, node_ids_ptr, node_nchs_ptr, num_param_blocks, group_size, + batch_size, pseudocount, BLOCK_M: tl.constexpr, BLOCK_K: tl.constexpr, BLOCK_B: tl.constexpr): + + b_pid = tl.program_id(axis = 0) + k_pid = tl.program_id(axis = 1) + m_pid = tl.program_id(axis = 2) - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE + m_offsets = m_pid * BLOCK_M + tl.arange(0, BLOCK_M) + m_mask = offsets < num_param_blocks - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < tot_num_params * batch_size + group_size = k_pid * BLOCK_K + tl.arange(0, BLOCK_K) - param_offsets = offsets // batch_size - batch_offsets = offsets % batch_size + b_offsets = b_pid * BLOCK_B + tl.arange(0, BLOCK_B) + b_mask = offsets < batch_size - n_offsets = tl.load(node_ids_ptr + param_offsets, mask = mask, other = 0) - n_offsets = n_offsets * batch_size + batch_offsets + n_offsets = tl.load(node_ids_ptr + m_offsets, mask = m_mask, other = 0) + reuse_offs = group_size[None,:,None] * batch_size + b_offsets[None,None,:] - params = tl.load(params_ptr + offsets, mask = mask, other = 0) - cum_params = tl.load(cum_params_ptr + n_offsets, mask = mask, other = 1) - nchs = tl.load(node_nchs_ptr + n_offsets, mask = mask, other = 1) + nb_offsets = n_offsets[:,None,None] * (batch_size * group_size) + reuse_offs + p_offsets = m_offsets[:,None,None] * reuse_offs + mask = m_mask[:,None,None] & b_mask[None,None,:] + params = tl.load(params_ptr + p_offsets, mask = mask, other = 0) + cum_params = tl.load(cum_params_ptr + nb_offsets, mask = mask, other = 1) + nchs = tl.load(node_nchs_ptr + n_offsets, mask = m_mask, other = 1)[:,None,None] + normed_params = (params + pseudocount / nchs) / (cum_params + pseudocount) - tl.store(params_ptr + offsets, normed_params, mask = mask) + tl.store(params_ptr + p_offsets, normed_params, mask = mask) -def normalize_parameters(params: torch.Tensor, node_ids: torch.Tensor, node_nchs: Optional[torch.Tensor] = None, pseudocount: float = 0.0): +def normalize_parameters(params: torch.Tensor, node_ids: torch.Tensor, group_size: int, ch_group_size: int, + node_nchs: Optional[torch.Tensor] = None, pseudocount: float = 0.0): - num_params = params.size(0) - num_nodes = torch.max(node_ids).detach().cpu().item() + 1 + assert 3 <= params.dim() <= 4 and params.size(1) == group_size and params.size(2) == ch_group_size + + num_param_blocks = params.size(0) + num_node_groups = torch.max(node_ids).detach().cpu().item() + 1 if node_nchs is None: - node_nchs = torch.bincount(node_ids) + node_nchs = torch.bincount(node_ids) * ch_group_size if node_ids.is_cuda: assert params.is_cuda, "Input `params` should be on GPU." 
- if params.dim() == 1: - params = params.unsqueeze(1) + if params.dim() == 3: + params = params.unsqueeze(3) + + batch_size = params.size(3) - batch_size = params.size(1) + cum_params = torch.zeros([num_node_groups, group_size, batch_size], dtype = torch.float32, device = params.device) - cum_params = torch.zeros([num_nodes, batch_size], dtype = torch.float32, device = params.device) + grouped_params = params.sum(2).contiguous() - grid1 = lambda meta: (triton.cdiv(num_params * batch_size, meta['BLOCK_SIZE']),) - grid2 = lambda meta: (triton.cdiv(num_params * batch_size, meta['BLOCK_SIZE']),) + BLOCK_B = min(batch_size, 128) + BLOCK_K = min(1024 // BLOCK_B, triton.next_power_of_2(group_size)) + BLOCK_M = min(1024 // (BLOCK_B * BLOCK_K), triton.next_power_of_2(num_param_blocks)) - _cum_params_kernel[grid1](params, cum_params, node_ids, num_params, batch_size, BLOCK_SIZE = 1024) - _norm_params_kernel[grid2](params, cum_params, node_ids, node_nchs, num_params, batch_size, pseudocount, BLOCK_SIZE = 1024) + grid = lambda meta: (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(group_size, BLOCK_K), triton.cdiv(num_param_blocks, BLOCK_M)) + + _cum_params_kernel[grid](grouped_params, cum_params, node_ids, num_param_blocks, group_size, batch_size, BLOCK_M, BLOCK_K, BLOCK_B) + _norm_params_kernel[grid2](grouped_params, cum_params, node_ids, node_nchs, num_param_blocks, group_size, batch_size, pseudocount, BLOCK_M, BLOCK_K, BLOCK_B) + + params *= (grouped_params / params.sum(2)).unsqueeze(2) else: - assert params.dim() == 1, "CPU version of `normalize_parameters` does not support `batch_size > 1` for now." + assert params.dim() == 3, "CPU version of `normalize_parameters` does not support `batch_size > 1` for now." with torch.no_grad(): params = params.float() - param_ids = torch.arange(0, num_params, dtype = torch.long, device = params.device) + grouped_params = params.sum(dim = 2).contiguous() + + param_ids = torch.arange(0, num_param_blocks, dtype = torch.long, device = params.device) cum_matrix1 = torch.sparse_coo_tensor( torch.stack((node_ids, param_ids), dim = 0), - params, (num_nodes, num_params) + torch.ones([num_param_blocks], device = params.device), + (num_node_groups, num_param_blocks) ) - node_buffer = torch.sparse.mm(cum_matrix1, torch.ones([num_params, 1], dtype = torch.float32, device = params.device)) + pseudocount + node_buffer = torch.sparse.mm(cum_matrix1, grouped_params) + pseudocount node_buffer.reciprocal_() + node_buffer = node_buffer.reshape(num_node_groups * group_size, 1) + + param_ids = torch.arange(0, num_param_blocks * group_size, dtype = torch.long, device = params.device) cum_matrix2 = torch.sparse_coo_tensor( - torch.stack((param_ids, node_ids), dim = 0), - params + pseudocount / node_nchs[node_ids], (num_params, num_nodes) + torch.stack((param_ids, node_ids.unsqueeze(1).repeat(1, group_size).reshape(-1)), dim = 0), + (grouped_params + pseudocount / node_nchs[node_ids].unsqueeze(1)).reshape(-1), (num_param_blocks * group_size, num_node_groups) ) - params_buffer = torch.sparse.mm(cum_matrix2, node_buffer) - params.data[:] = params_buffer[:,0] \ No newline at end of file + params_buffer = torch.sparse.mm(cum_matrix2, node_buffer).reshape(num_param_blocks, group_size) + + params *= (params_buffer / grouped_params).unsqueeze(2) \ No newline at end of file diff --git a/src/pyjuice/nodes/__init__.py b/src/pyjuice/nodes/__init__.py index 4814c660..6ade00e2 100644 --- a/src/pyjuice/nodes/__init__.py +++ b/src/pyjuice/nodes/__init__.py @@ -2,5 +2,5 @@ from .input_nodes import 
InputNodes from .prod_nodes import ProdNodes from .sum_nodes import SumNodes -from .construction import multiply, summate, inputs +from .construction import multiply, summate, inputs, set_group_size from .methods.traversal import foreach, foldup_aggregate \ No newline at end of file diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index e407332f..21b46d51 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -5,6 +5,7 @@ from typing import Union, Sequence from copy import deepcopy +from pyjuice.utils.context_manager import _DecoratorContextManager from pyjuice.utils import BitSet from .nodes import CircuitNodes from .input_nodes import InputNodes @@ -17,48 +18,50 @@ SumNodesChs = Union[ProdNodes,InputNodes] -def inputs(var: Union[int,Sequence[int]], num_nodes: int, dist: Distribution, params: Optional[Tensor] = None, **kwargs): +def inputs(var: Union[int,Sequence[int]], num_node_groups: int, dist: Distribution, params: Optional[Tensor] = None, + group_size: int = 0, **kwargs): return InputNodes( - num_nodes = num_nodes, + num_node_groups = num_node_groups, scope = [var] if isinstance(var, int) else var, dist = dist, params = params, + group_size = group_size, **kwargs ) -def multiply(nodes1: ProdNodesChs, *args, - edge_ids: Optional[Tensor] = None, **kwargs): +def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, **kwargs): assert isinstance(nodes1, SumNodes) or isinstance(nodes1, InputNodes), "Children of product nodes must be input or sum nodes." chs = [nodes1] - num_nodes = nodes1.num_nodes + num_node_groups = nodes1.num_node_groups + group_size = nodes1.group_size scope = deepcopy(nodes1.scope) for nodes in args: assert isinstance(nodes, SumNodes) or isinstance(nodes, InputNodes), f"Children of product nodes must be input or sum nodes, but found input of type {type(nodes)}." if edge_ids is None: - assert nodes.num_nodes == num_nodes, "Input nodes should have the same `num_nodes`." + assert nodes.num_node_groups == num_node_groups, "Input nodes should have the same `num_node_groups`." + assert nodes.group_size == group_size, "Input nodes should have the same `num_node_groups`." assert len(nodes.scope & scope) == 0, "Children of a `ProdNodes` should have disjoint scopes." chs.append(nodes) scope |= nodes.scope if edge_ids is not None: - num_nodes = edge_ids.shape[0] + num_node_groups = edge_ids.shape[0] - return ProdNodes(num_nodes, chs, edge_ids, **kwargs) + return ProdNodes(num_node_groups, chs, edge_ids, group_size = group_size, **kwargs) -def summate(nodes1: SumNodesChs, *args, num_nodes: int = 0, - edge_ids: Optional[Tensor] = None, **kwargs): +def summate(nodes1: SumNodesChs, *args, num_node_groups: int = 0, edge_ids: Optional[Tensor] = None, group_size: int = 0, **kwargs): assert isinstance(nodes1, ProdNodes) or isinstance(nodes1, InputNodes), f"Children of sum nodes must be input or product nodes, but found input of type {type(nodes1)}." - if edge_ids is not None and num_nodes == 0: - num_nodes = edge_ids[0,:].max().item() + 1 + if edge_ids is not None and num_node_groups == 0: + num_node_groups = edge_ids[0,:].max().item() + 1 - assert num_nodes > 0, "Number of nodes should be greater than 0." + assert num_node_groups > 0, "Number of node groups should be greater than 0." chs = [nodes1] scope = deepcopy(nodes1.scope) @@ -67,4 +70,19 @@ def summate(nodes1: SumNodesChs, *args, num_nodes: int = 0, assert nodes.scope == scope, "Children of a `SumNodes` should have the same scope." 
chs.append(nodes) - return SumNodes(num_nodes, chs, edge_ids, **kwargs) \ No newline at end of file + return SumNodes(num_node_groups, chs, edge_ids, group_size = group_size, **kwargs) + + +class set_group_size(_DecoratorContextManager): + def __init__(self, group_size: int = 1): + + self.group_size = group_size + + self.original_group_size = None + + def __enter__(self) -> None: + self.original_group_size = CircuitNodes.DEFAULT_GROUP_SIZE + CircuitNodes.DEFAULT_GROUP_SIZE = self.group_size + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + CircuitNodes.DEFAULT_GROUP_SIZE = self.original_group_size \ No newline at end of file diff --git a/src/pyjuice/nodes/input_nodes.py b/src/pyjuice/nodes/input_nodes.py index a1556e5b..eb34f474 100644 --- a/src/pyjuice/nodes/input_nodes.py +++ b/src/pyjuice/nodes/input_nodes.py @@ -11,11 +11,11 @@ class InputNodes(CircuitNodes): - def __init__(self, num_nodes: int, scope: Union[Sequence,BitSet], dist: Distribution, - params: Optional[torch.Tensor] = None, **kwargs) -> None: + def __init__(self, num_node_groups: int, scope: Union[Sequence,BitSet], dist: Distribution, + params: Optional[torch.Tensor] = None, group_size: int = 0, **kwargs) -> None: rg_node = InputRegionNode(scope) - super(InputNodes, self).__init__(num_nodes, rg_node, **kwargs) + super(InputNodes, self).__init__(num_node_groups, rg_node, group_size = group_size, **kwargs) self.chs = [] # InputNodes has no children @@ -43,7 +43,7 @@ def duplicate(self, scope: Optional[Union[int,Sequence,BitSet]] = None, tie_para dist = deepcopy(self.dist) - ns = InputNodes(self.num_nodes, scope = scope, dist = dist, source_node = self if tie_params else None) + ns = InputNodes(self.num_node_groups, scope = scope, dist = dist, group_size = self.group_size, source_node = self if tie_params else None) if hasattr(self, "_params") and self._params is not None and not tie_params: ns._params = self._params.clone() diff --git a/src/pyjuice/nodes/nodes.py b/src/pyjuice/nodes/nodes.py index 88195fca..3024ef07 100644 --- a/src/pyjuice/nodes/nodes.py +++ b/src/pyjuice/nodes/nodes.py @@ -38,8 +38,19 @@ class CircuitNodes(): # add anything here. INIT_CALLBACKS = [] - def __init__(self, num_nodes: int, region_node: RegionGraph, source_node: Optional[CircuitNodes] = None, **kwargs): - self.num_nodes = num_nodes + # Default `group_size`. Used by the context managers. + DEFAULT_GROUP_SIZE = 1 + + def __init__(self, num_node_groups: int, region_node: RegionGraph, group_size: int = 0, source_node: Optional[CircuitNodes] = None, **kwargs): + + if group_size == 0: + group_size = self.DEFAULT_GROUP_SIZE + + assert num_node_groups > 0 + assert group_size > 0 and (group_size & (group_size - 1)) == 0, f"`group_size` must be a power of 2, but got `group_size={group_size}`." + + self.num_node_groups = num_node_groups + self.group_size = group_size self.region_node = region_node self.chs = [] @@ -77,6 +88,14 @@ def is_input(self): def num_chs(self): return len(self.chs) + @property + def num_nodes(self): + return self.num_node_groups * self.group_size + + @property + def num_edges(self): + raise NotImplementedError() + def duplicate(self, *args, **kwargs): raise ValueError(f"{type(self)} does not support `duplicate`.") @@ -116,6 +135,8 @@ def set_source_ns(self, source_ns: CircuitNodes): assert type(source_ns) == type(self), f"Node type of the source ns ({type(source_ns)}) does not match that of self ({type(self)})." assert len(source_ns.chs) == len(self.chs), "Number of children does not match." 
assert not hasattr(self, "_params") or self._params is None, "The current node should not have parameters to avoid confusion." + assert source_ns.num_node_groups == self.num_node_groups, "`num_node_groups` does not match." + assert source_ns.group_size == self.group_size, "`group_size` does not match." self._source_node = source_ns diff --git a/src/pyjuice/nodes/prod_nodes.py b/src/pyjuice/nodes/prod_nodes.py index 9eba0482..1f9b5212 100644 --- a/src/pyjuice/nodes/prod_nodes.py +++ b/src/pyjuice/nodes/prod_nodes.py @@ -13,10 +13,10 @@ class ProdNodes(CircuitNodes): - def __init__(self, num_nodes: int, chs: Sequence[CircuitNodes], edge_ids: Optional[Tensor] = None, **kwargs) -> None: + def __init__(self, num_node_groups: int, chs: Sequence[CircuitNodes], edge_ids: Optional[Tensor] = None, group_size: int = 0, **kwargs) -> None: rg_node = PartitionNode([ch.region_node for ch in chs]) - super(ProdNodes, self).__init__(num_nodes, rg_node, **kwargs) + super(ProdNodes, self).__init__(num_node_groups, rg_node, group_size = group_size, **kwargs) # Child layers self.chs = chs @@ -30,24 +30,25 @@ def __init__(self, num_nodes: int, chs: Sequence[CircuitNodes], edge_ids: Option def _construct_edges(self, edge_ids: Optional[Tensor]): if edge_ids is None: for c in self.chs: - assert self.num_nodes == c.num_nodes, "Cannot create edges implicitly since # nodes do not match." + assert self.num_node_groups == c.num_node_groups and self.group_size == c.group_size, \ + "Cannot create edges implicitly since # nodes do not match." - edge_ids = torch.arange(self.num_nodes).unsqueeze(1).repeat(1, self.num_chs) + edge_ids = torch.arange(self.num_node_groups).unsqueeze(1).repeat(1, self.num_chs) if isinstance(edge_ids, np.ndarray): edge_ids = torch.from_numpy(edge_ids) # Sanity checks - assert edge_ids.size(0) == self.num_nodes and edge_ids.size(1) == self.num_chs, f"Expect edge_ids.size() == ({self.num_nodes}, {self.num_chs})." + assert edge_ids.size(0) == self.num_node_groups and edge_ids.size(1) == self.num_chs, f"Expect edge_ids.size() == ({self.num_node_groups}, {self.num_chs})." for cid in range(self.num_chs): assert torch.all(edge_ids[:,cid] >= 0), "Edge index underflow." - assert torch.all(edge_ids[:,cid] < self.chs[cid].num_nodes), "Edge index overflow." + assert torch.all(edge_ids[:,cid] < self.chs[cid].num_node_groups), "Edge index overflow." self.edge_ids = edge_ids @property def num_edges(self): - return self.edge_ids.size(0) * self.edge_ids.size(1) + return self.edge_ids.size(0) * self.edge_ids.size(1) * self.group_size def duplicate(self, *args, tie_params: bool = False): chs = [] @@ -61,11 +62,12 @@ def duplicate(self, *args, tie_params: bool = False): assert self.num_chs == len(chs), f"Number of new children ({len(chs)}) must match the number of original children ({self.num_chs})." for old_c, new_c in zip(self.chs, chs): assert type(old_c) == type(new_c), f"Child type not match: ({type(new_c)} != {type(old_c)})." - assert old_c.num_nodes == new_c.num_nodes, f"Child node size not match: ({new_c.num_nodes} != {old_c.num_nodes})." + assert old_c.num_node_groups == new_c.num_node_groups, f"Child node size not match: (`num_node_groups`: {new_c.num_node_groups} != {old_c.num_node_groups})." + assert old_c.group_size == new_c.group_size, f"Child node size not match: (`group_size`: {new_c.group_size} != {old_c.group_size})." 
edge_ids = self.edge_ids.clone() - return ProdNodes(self.num_nodes, chs, edge_ids, source_node = self if tie_params else None) + return ProdNodes(self.num_node_groups, chs, edge_ids, group_size = self.group_size, source_node = self if tie_params else None) def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_root: bool = True, **kwargs): super(ProdNodes, self).init_parameters( diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index bc504802..380ad3a2 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -15,17 +15,24 @@ class SumNodes(CircuitNodes): - def __init__(self, num_nodes: int, chs: Sequence[CircuitNodes], edge_ids: Optional[Union[Tensor,Sequence[Tensor]]] = None, - params: Optional[Tensor] = None, **kwargs) -> None: + def __init__(self, num_node_groups: int, chs: Sequence[CircuitNodes], edge_ids: Optional[Union[Tensor,Sequence[Tensor]]] = None, + params: Optional[Tensor] = None, group_size: int = 0, **kwargs) -> None: + + assert len(chs) > 0, "`SumNodes` must have at least one child." + for i in range(1, len(chs)): + assert chs[0].group_size == chs[i].group_size, "`group_size` of the children of a `SumNodes` should be the same." rg_node = InnerRegionNode([ch.region_node for ch in chs]) - super(SumNodes, self).__init__(num_nodes, rg_node, **kwargs) + super(SumNodes, self).__init__(num_node_groups, rg_node, group_size = group_size, **kwargs) # Child layers self.chs = self._standardize_chs(chs) - # Total number of child circuit nodes - self.num_ch_nodes = reduce(lambda m, n: m + n, map(lambda n: n.num_nodes, chs)) + # Total number of child circuit node groups + self.num_ch_node_groups = reduce(lambda m, n: m + n, map(lambda n: n.num_node_groups, chs)) + + # Group size of the children + self.ch_group_size = self.chs[0].group_size # Construct sum edges self._construct_edges(edge_ids) @@ -39,7 +46,7 @@ def __init__(self, num_nodes: int, chs: Sequence[CircuitNodes], edge_ids: Option @property def num_edges(self): - return self.edge_ids.size(1) + return self.edge_ids.size(1) * self.group_size * self.ch_group_size def duplicate(self, *args, tie_params: bool = False): chs = [] @@ -53,7 +60,8 @@ def duplicate(self, *args, tie_params: bool = False): assert self.num_chs == len(chs), f"Number of new children ({len(chs)}) must match the number of original children ({self.num_chs})." for old_c, new_c in zip(self.chs, chs): assert type(old_c) == type(new_c), f"Child type not match: ({type(new_c)} != {type(old_c)})." - assert old_c.num_nodes == new_c.num_nodes, f"Child node size not match: ({new_c.num_nodes} != {old_c.num_nodes})." + assert old_c.num_node_groups == new_c.num_node_groups, f"Child node size not match: (`num_node_groups`: {new_c.num_node_groups} != {old_c.num_node_groups})." + assert old_c.group_size == new_c.group_size, f"Child node size not match: (`group_size`: {new_c.group_size} != {old_c.group_size})." 
edge_ids = self.edge_ids.clone() @@ -63,7 +71,7 @@ def duplicate(self, *args, tie_params: bool = False): # We also do not copy parameters explicitly if this is a tied node params = None - return SumNodes(self.num_nodes, chs, edge_ids, params = params, source_node = self if tie_params else None) + return SumNodes(self.num_node_groups, chs, edge_ids, params = params, group_size = self.group_size, source_node = self if tie_params else None) def get_params(self): if not hasattr(self, "_params"): @@ -78,17 +86,25 @@ def set_params(self, params: torch.Tensor, normalize: bool = True, pseudocount: return None if params.dim() == 1: + assert self.group_size == 1 and self.ch_group_size == 1 assert self.edge_ids.size(1) == params.size(0) + self._params = params.clone().view(-1, 1, 1) + + elif params.dim() == 3: + assert self.edge_ids.size(1) == params.size(0) and self.group_size == params.size(1) and self.ch_group_size == params.size(2) + self._params = params.clone() - elif params.dim() == 2: - assert params.size(0) == self.num_nodes and params.size(1) == self.num_ch_nodes + elif params.dim() == 4: + assert params.size(0) == self.num_node_groups and params.size(1) == self.num_ch_node_groups and \ + self.group_size == params.size(2) and self.ch_group_size == params.size(3) - self._params = params[self.edge_ids[0,:],self.edge_ids[1,:]].clone().contiguous() + self._params = params[self.edge_ids[0,:],self.edge_ids[1,:],:,:].clone().contiguous() if normalize: - normalize_parameters(self._params, self.edge_ids[0,:], pseudocount = pseudocount) + normalize_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, + ch_group_size = self.ch_group_size, pseudocount = pseudocount) def set_edges(self, edge_ids: Union[Tensor,Sequence[Tensor]]): self._construct_edges(edge_ids) @@ -97,7 +113,7 @@ def set_edges(self, edge_ids: Union[Tensor,Sequence[Tensor]]): def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_root: bool = True, **kwargs): if self._source_node is None: - self._params = torch.exp(torch.rand([self.edge_ids.size(1)]) * -perturbation) + self._params = torch.exp(torch.rand([self.edge_ids.size(1), self.group_size, self.ch_group_size]) * -perturbation) normalize_parameters(self._params, self.edge_ids[0,:], pseudocount = 0.0) @@ -109,7 +125,7 @@ def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_ ) def _get_edges_as_mask(self): - mask = torch.zeros([self.num_nodes, self.num_ch_nodes], dtype = torch.bool) + mask = torch.zeros([self.num_node_groups, self.num_ch_nodes], dtype = torch.bool) mask[self.edge_ids[0,:], self.edge_ids[1,:]] = True return mask @@ -119,9 +135,10 @@ def _standardize_chs(self, chs): for cs in chs: if cs.is_input(): new_cs = ProdNodes( - num_nodes = cs.num_nodes, + num_node_groups = cs.num_node_groups, chs = [cs], - edge_ids = torch.arange(0, cs.num_nodes).reshape(-1, 1) + edge_ids = torch.arange(0, cs.num_node_groups).reshape(-1, 1), + group_size = cs.group_size ) new_chs.append(new_cs) else: @@ -132,22 +149,22 @@ def _standardize_chs(self, chs): def _construct_edges(self, edge_ids: Optional[Union[Tensor,Sequence[Tensor]]]): if edge_ids is None: edge_ids = torch.cat( - (torch.arange(self.num_nodes).unsqueeze(1).repeat(1, self.num_ch_nodes).reshape(1, -1), - torch.arange(self.num_ch_nodes).unsqueeze(0).repeat(self.num_nodes, 1).reshape(1, -1)), + (torch.arange(self.num_node_groups).unsqueeze(1).repeat(1, self.num_ch_node_groups).reshape(1, -1), + 
torch.arange(self.num_ch_node_groups).unsqueeze(0).repeat(self.num_node_groups, 1).reshape(1, -1)), dim = 0 ) elif isinstance(edge_ids, Sequence): assert len(edge_ids) == len(self.chs) per_ns_edge_ids = edge_ids - ch_nid_start = 0 + ch_gid_start = 0 edge_ids = [] for cs_id in range(len(self.chs)): curr_edge_ids = per_ns_edge_ids[cs_id] - curr_edge_ids[1,:] += ch_nid_start + curr_edge_ids[1,:] += ch_gid_start edge_ids.append(curr_edge_ids) - ch_nid_start += self.chs[cs_id].num_nodes + ch_nid_start += self.chs[cs_id].num_node_groups edge_ids = torch.cat(edge_ids, dim = 1) @@ -155,15 +172,15 @@ def _construct_edges(self, edge_ids: Optional[Union[Tensor,Sequence[Tensor]]]): edge_ids = torch.from_numpy(edge_ids) if edge_ids.dim() == 2 and edge_ids.type() == torch.bool: - assert edge_ids.size(0) == self.num_nodes and edge_ids.size(1) == self.num_ch_nodes + assert edge_ids.size(0) == self.num_node_groups and edge_ids.size(1) == self.num_ch_node_groups x_ids, y_ids = torch.where(edge_ids) edge_ids = torch.stack((x_ids, y_ids), dim = 0) # Sanity checks assert edge_ids.size(0) == 2, "Expect `edge_ids.size(0) == 2`." assert torch.all(edge_ids[0,:] >= 0) and torch.all(edge_ids[1,:] >= 0), "Edge index underflow." - assert torch.all(edge_ids[0,:] < self.num_nodes) and torch.all(edge_ids[1,:] < self.num_ch_nodes), "Edge index overflow." + assert torch.all(edge_ids[0,:] < self.num_node_groups) and torch.all(edge_ids[1,:] < self.num_ch_node_groups), "Edge index overflow." par_ns = torch.unique(edge_ids[0,:]) - assert par_ns.size(0) == self.num_nodes and par_ns.max() == self.num_nodes - 1, "Some node has no edge." + assert par_ns.size(0) == self.num_node_groups and par_ns.max() == self.num_node_groups - 1, "Some node has no edge." self.edge_ids = edge_ids diff --git a/tests/layer/sum_block_sparse_test.py b/tests/layer/sum_block_sparse_test.py index 84bc9b61..0185bb09 100644 --- a/tests/layer/sum_block_sparse_test.py +++ b/tests/layer/sum_block_sparse_test.py @@ -291,7 +291,14 @@ def block_sparse_2d_kernel(node_mars, element_mars, params, nids, cids_start, ci emars_max = tl.max(emars, axis = 0)[None,:] emars = tl.exp(emars - emars_max) - nmars = tl.dot(epars.to(tl.float16), emars.to(tl.float16)).to(tl.float32) + epars = epars.to(tl.float16) + emars = emars.to(tl.float16) + nmars = tl.dot(epars, emars).to(tl.float32) + + # if TILE_SIZE_M < 16: + # epars = tl.view(tl.broadcast_to(epars[:,None,:], (TILE_SIZE_M, 16 // TILE_SIZE_M, TILE_SIZE_K)), (16, TILE_SIZE_K)) + # nmars = tl.dot(epars, emars).to(tl.float32) + # nmars = tl.max(tl.view(nmars, (TILE_SIZE_M, 16 // TILE_SIZE_M, BLOCK_B)), axis = 1) acc = tl.where(emars_max > acc, tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, @@ -312,12 +319,12 @@ def block_sparse_2d_kernel(node_mars, element_mars, params, nids, cids_start, ci def main_blocksparse_2d(): - GROUP_SIZE_M = 64 + GROUP_SIZE_M = 32 TILE_SIZE_M = 16 - TILE_SIZE_K = 32 + TILE_SIZE_K = 64 - BLOCK_B = 128 + BLOCK_B = max(128, 16) data = np.load("temp.npz") @@ -332,11 +339,11 @@ def main_blocksparse_2d(): cids = torch.from_numpy(data["cids"])# .to(device) pids = torch.from_numpy(data["pids"])# .to(device) - # node_mars_gt = node_mars.clone() - # ch_mars = element_mars[cids] - # maxval = ch_mars.max(dim = 1, keepdim = True).values - # aaa = (((ch_mars - maxval).exp() * params[pids].unsqueeze(-1)).sum( - # dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) + node_mars_gt = node_mars.clone() + ch_mars = element_mars[cids] + maxval = ch_mars.max(dim = 1, keepdim = True).values + aaa = (((ch_mars - 
maxval).exp() * params[pids].unsqueeze(-1)).sum( + dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) nids = nids.reshape(-1, GROUP_SIZE_M).contiguous().to(device) cids = cids[::GROUP_SIZE_M,:].reshape(nids.size(0), -1, TILE_SIZE_K).contiguous() @@ -386,9 +393,9 @@ def main_blocksparse_2d(): aveg_t, std_t = torch.tensor(ts).mean().item() * 1000, torch.tensor(ts).std().item() * 1000 print(f"{aveg_t:.3f}±{std_t:.3f}ms") - # bbb = node_mars[nids] + bbb = node_mars[nids] - # print(torch.max((aaa - bbb.flatten(0, 1)).abs())) + print(torch.max((aaa - bbb.flatten(0, 1)).abs())) # import pdb; pdb.set_trace() diff --git a/tests/nodes/input_nodes_test.py b/tests/nodes/input_dists_test.py similarity index 100% rename from tests/nodes/input_nodes_test.py rename to tests/nodes/input_dists_test.py diff --git a/tests/nodes/nodes_test.py b/tests/nodes/nodes_test.py index adefa954..69a46c9a 100644 --- a/tests/nodes/nodes_test.py +++ b/tests/nodes/nodes_test.py @@ -10,18 +10,31 @@ def nodes_test(): - num_nodes = 8 - - n0 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - n1 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - n2 = inputs(2, num_nodes, dists.Categorical(num_cats = 5)) - assert n0.num_nodes == 8 and n1.num_nodes == 8 and n2.num_nodes == 8 - - m = multiply(n0, n1, n2) - assert m.num_nodes == 8 - assert m.scope == BitSet.from_array([0,1,2]) - n = summate(m, num_nodes = 1) - assert n.num_nodes == 1 + num_node_groups_candidates = [4, 8, 12] + group_size_candidates = [1, 2, 4, 16] + + for num_node_groups in num_node_groups_candidates: + for group_size in group_size_candidates: + + num_nodes = num_node_groups * group_size + + with juice.set_group_size(group_size): + n0 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + n1 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + n2 = inputs(2, num_node_groups, dists.Categorical(num_cats = 5)) + + assert n0.num_nodes == num_nodes and n1.num_nodes == num_nodes and n2.num_nodes == num_nodes + + m = multiply(n0, n1, n2) + + assert m.num_nodes == num_nodes + assert m.scope == BitSet.from_array([0,1,2]) + assert m.num_edges == num_nodes * 3 + + n = summate(m, num_node_groups = 1) + + assert n.num_nodes == group_size + assert n.num_edges == num_node_groups * (group_size ** 2) if __name__ == "__main__": From 1083cc3395b4dba52880450ebb8fb9d23816ccd2 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 02:53:35 +0800 Subject: [PATCH 003/162] fix `normalize_parameters` --- src/pyjuice/functional/normalize.py | 28 +++++++++++++++------------- src/pyjuice/nodes/sum_nodes.py | 3 ++- tests/nodes/nodes_test.py | 16 ++++++++++++++++ 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/pyjuice/functional/normalize.py b/src/pyjuice/functional/normalize.py index 8eddd32e..6778d1fd 100644 --- a/src/pyjuice/functional/normalize.py +++ b/src/pyjuice/functional/normalize.py @@ -14,18 +14,18 @@ def _cum_params_kernel(params_ptr, cum_params_ptr, node_ids_ptr, num_param_block m_pid = tl.program_id(axis = 2) m_offsets = m_pid * BLOCK_M + tl.arange(0, BLOCK_M) - m_mask = offsets < num_param_blocks + m_mask = m_offsets < num_param_blocks - group_size = k_pid * BLOCK_K + tl.arange(0, BLOCK_K) + k_offsets = k_pid * BLOCK_K + tl.arange(0, BLOCK_K) b_offsets = b_pid * BLOCK_B + tl.arange(0, BLOCK_B) - b_mask = offsets < batch_size + b_mask = b_offsets < batch_size n_offsets = tl.load(node_ids_ptr + m_offsets, mask = m_mask, other = 0) - reuse_offs = group_size[None,:,None] * batch_size + b_offsets[None,None,:] + reuse_offs 
= k_offsets[None,:,None] * batch_size + b_offsets[None,None,:] n_offsets = n_offsets[:,None,None] * (batch_size * group_size) + reuse_offs - p_offsets = m_offsets[:,None,None] * reuse_offs + p_offsets = m_offsets[:,None,None] * (batch_size * group_size) + reuse_offs mask = m_mask[:,None,None] & b_mask[None,None,:] params = tl.load(params_ptr + p_offsets, mask = mask, other = 0) @@ -42,18 +42,18 @@ def _norm_params_kernel(params_ptr, cum_params_ptr, node_ids_ptr, node_nchs_ptr, m_pid = tl.program_id(axis = 2) m_offsets = m_pid * BLOCK_M + tl.arange(0, BLOCK_M) - m_mask = offsets < num_param_blocks + m_mask = m_offsets < num_param_blocks - group_size = k_pid * BLOCK_K + tl.arange(0, BLOCK_K) + k_offsets = k_pid * BLOCK_K + tl.arange(0, BLOCK_K) b_offsets = b_pid * BLOCK_B + tl.arange(0, BLOCK_B) - b_mask = offsets < batch_size + b_mask = b_offsets < batch_size n_offsets = tl.load(node_ids_ptr + m_offsets, mask = m_mask, other = 0) - reuse_offs = group_size[None,:,None] * batch_size + b_offsets[None,None,:] + reuse_offs = k_offsets[None,:,None] * batch_size + b_offsets[None,None,:] nb_offsets = n_offsets[:,None,None] * (batch_size * group_size) + reuse_offs - p_offsets = m_offsets[:,None,None] * reuse_offs + p_offsets = m_offsets[:,None,None] * (batch_size * group_size) + reuse_offs mask = m_mask[:,None,None] & b_mask[None,None,:] params = tl.load(params_ptr + p_offsets, mask = mask, other = 0) @@ -94,7 +94,7 @@ def normalize_parameters(params: torch.Tensor, node_ids: torch.Tensor, group_siz grid = lambda meta: (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(group_size, BLOCK_K), triton.cdiv(num_param_blocks, BLOCK_M)) _cum_params_kernel[grid](grouped_params, cum_params, node_ids, num_param_blocks, group_size, batch_size, BLOCK_M, BLOCK_K, BLOCK_B) - _norm_params_kernel[grid2](grouped_params, cum_params, node_ids, node_nchs, num_param_blocks, group_size, batch_size, pseudocount, BLOCK_M, BLOCK_K, BLOCK_B) + _norm_params_kernel[grid](grouped_params, cum_params, node_ids, node_nchs, num_param_blocks, group_size, batch_size, pseudocount, BLOCK_M, BLOCK_K, BLOCK_B) params *= (grouped_params / params.sum(2)).unsqueeze(2) @@ -120,10 +120,12 @@ def normalize_parameters(params: torch.Tensor, node_ids: torch.Tensor, group_siz node_buffer = node_buffer.reshape(num_node_groups * group_size, 1) param_ids = torch.arange(0, num_param_blocks * group_size, dtype = torch.long, device = params.device) + flattened_node_ids = (node_ids.unsqueeze(1).repeat(1, group_size) * group_size + torch.arange(0, group_size, device = params.device)).reshape(-1) cum_matrix2 = torch.sparse_coo_tensor( - torch.stack((param_ids, node_ids.unsqueeze(1).repeat(1, group_size).reshape(-1)), dim = 0), - (grouped_params + pseudocount / node_nchs[node_ids].unsqueeze(1)).reshape(-1), (num_param_blocks * group_size, num_node_groups) + torch.stack((param_ids, flattened_node_ids), dim = 0), + (grouped_params + pseudocount / node_nchs[node_ids].unsqueeze(1)).reshape(-1), + (num_param_blocks * group_size, num_node_groups * group_size) ) params_buffer = torch.sparse.mm(cum_matrix2, node_buffer).reshape(num_param_blocks, group_size) diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index 380ad3a2..f5710907 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -115,7 +115,8 @@ def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_ if self._source_node is None: self._params = torch.exp(torch.rand([self.edge_ids.size(1), self.group_size, self.ch_group_size]) * 
-perturbation) - normalize_parameters(self._params, self.edge_ids[0,:], pseudocount = 0.0) + normalize_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, + ch_group_size = self.ch_group_size, pseudocount = 0.0) super(SumNodes, self).init_parameters( perturbation = perturbation, diff --git a/tests/nodes/nodes_test.py b/tests/nodes/nodes_test.py index 69a46c9a..ec570bc9 100644 --- a/tests/nodes/nodes_test.py +++ b/tests/nodes/nodes_test.py @@ -5,11 +5,15 @@ import pyjuice.nodes.distributions as dists from pyjuice.utils import BitSet from pyjuice.nodes import multiply, summate, inputs +from pyjuice.functional.normalize import normalize_parameters import pytest def nodes_test(): + + device = torch.device("cuda:0") + num_node_groups_candidates = [4, 8, 12] group_size_candidates = [1, 2, 4, 16] @@ -36,6 +40,18 @@ def nodes_test(): assert n.num_nodes == group_size assert n.num_edges == num_node_groups * (group_size ** 2) + n.init_parameters() + + assert torch.all(torch.abs(n._params.sum(dim = 2).sum(dim = 0) - 1.0) < 1e-4) + + n._params = n._params.to(device) + n.edge_ids = n.edge_ids.to(device) + + normalize_parameters(n._params, n.edge_ids[0,:].contiguous(), group_size = n.group_size, + ch_group_size = n.ch_group_size, pseudocount = 0.0) + + assert torch.all(torch.abs(n._params.sum(dim = 2).sum(dim = 0) - 1.0) < 1e-4) + if __name__ == "__main__": nodes_test() \ No newline at end of file From 751c1658530a2e52f597bef57ecb963b835684f0 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 02:58:24 +0800 Subject: [PATCH 004/162] rm empty file --- src/pyjuice/nodes/methods/tying.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/pyjuice/nodes/methods/tying.py diff --git a/src/pyjuice/nodes/methods/tying.py b/src/pyjuice/nodes/methods/tying.py deleted file mode 100644 index c9b65936..00000000 --- a/src/pyjuice/nodes/methods/tying.py +++ /dev/null @@ -1,3 +0,0 @@ -from __future__ import annotations - - From a8b19a3b2ea4af9db0b32a01e2c58ddc179001fa Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 03:16:17 +0800 Subject: [PATCH 005/162] fix lvd & unstable normalization --- src/pyjuice/functional/normalize.py | 4 ++-- src/pyjuice/nodes/construction.py | 12 +++++++++++- src/pyjuice/nodes/methods/lvd_backend/counting.py | 1 + src/pyjuice/nodes/sum_nodes.py | 3 +++ tests/lvd/counting_lvd_test.py | 4 ++-- 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/pyjuice/functional/normalize.py b/src/pyjuice/functional/normalize.py index 6778d1fd..27809b8a 100644 --- a/src/pyjuice/functional/normalize.py +++ b/src/pyjuice/functional/normalize.py @@ -96,7 +96,7 @@ def normalize_parameters(params: torch.Tensor, node_ids: torch.Tensor, group_siz _cum_params_kernel[grid](grouped_params, cum_params, node_ids, num_param_blocks, group_size, batch_size, BLOCK_M, BLOCK_K, BLOCK_B) _norm_params_kernel[grid](grouped_params, cum_params, node_ids, node_nchs, num_param_blocks, group_size, batch_size, pseudocount, BLOCK_M, BLOCK_K, BLOCK_B) - params *= (grouped_params / params.sum(2)).unsqueeze(2) + params *= (grouped_params / (params.sum(2) + 1e-12)).unsqueeze(2) else: assert params.dim() == 3, "CPU version of `normalize_parameters` does not support `batch_size > 1` for now." 
@@ -129,4 +129,4 @@ def normalize_parameters(params: torch.Tensor, node_ids: torch.Tensor, group_siz ) params_buffer = torch.sparse.mm(cum_matrix2, node_buffer).reshape(num_param_blocks, group_size) - params *= (params_buffer / grouped_params).unsqueeze(2) \ No newline at end of file + params *= (params_buffer / (grouped_params + 1e-12)).unsqueeze(2) \ No newline at end of file diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index 21b46d51..b7cddc7b 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -54,7 +54,17 @@ def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, **k return ProdNodes(num_node_groups, chs, edge_ids, group_size = group_size, **kwargs) -def summate(nodes1: SumNodesChs, *args, num_node_groups: int = 0, edge_ids: Optional[Tensor] = None, group_size: int = 0, **kwargs): +def summate(nodes1: SumNodesChs, *args, num_nodes: int = 0, num_node_groups: int = 0, edge_ids: Optional[Tensor] = None, group_size: int = 0, **kwargs): + + if num_nodes > 0: + assert edge_ids is None + assert num_node_groups == 0, "Only one of `num_nodes` and `num_node_groups` can be set at the same time." + if group_size == 0: + group_size = CircuitNodes.DEFAULT_GROUP_SIZE + + assert num_nodes % group_size == 0 + + num_node_groups = num_nodes // group_size assert isinstance(nodes1, ProdNodes) or isinstance(nodes1, InputNodes), f"Children of sum nodes must be input or product nodes, but found input of type {type(nodes1)}." diff --git a/src/pyjuice/nodes/methods/lvd_backend/counting.py b/src/pyjuice/nodes/methods/lvd_backend/counting.py index c12a1db5..3d2ef90b 100644 --- a/src/pyjuice/nodes/methods/lvd_backend/counting.py +++ b/src/pyjuice/nodes/methods/lvd_backend/counting.py @@ -141,5 +141,6 @@ def lvd_by_counting(lvdistiller, ns: CircuitNodes): ns._construct_edges(edge_mask) edge_params /= edge_params.sum(dim = 1, keepdim = True) + 1e-8 + edge_params = edge_params[:,:,None,None] ns.set_params(edge_params, pseudocount = lvdistiller.pseudocount) \ No newline at end of file diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index f5710907..2a1f6226 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -102,6 +102,9 @@ def set_params(self, params: torch.Tensor, normalize: bool = True, pseudocount: self._params = params[self.edge_ids[0,:],self.edge_ids[1,:],:,:].clone().contiguous() + else: + raise ValueError("Unsupported parameter input.") + if normalize: normalize_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, ch_group_size = self.ch_group_size, pseudocount = pseudocount) diff --git a/tests/lvd/counting_lvd_test.py b/tests/lvd/counting_lvd_test.py index 5a8ff56e..71aac6e6 100644 --- a/tests/lvd/counting_lvd_test.py +++ b/tests/lvd/counting_lvd_test.py @@ -27,8 +27,8 @@ def counting_lvd_test(): m = multiply(n0, n1, lv_dataset = torch.tensor([0,0,1,1])) n = summate(m, num_nodes = 1) - assert torch.abs(n0._params - torch.tensor([1.0, 0.0, 0.0, 1.0])).max() < 1e-6 - assert torch.abs(n1._params - torch.tensor([0.0, 1.0, 1.0, 0.0])).max() < 1e-6 + assert torch.abs(n0._params.view(-1) - torch.tensor([1.0, 0.0, 0.0, 1.0])).max() < 1e-6 + assert torch.abs(n1._params.view(-1) - torch.tensor([0.0, 1.0, 1.0, 0.0])).max() < 1e-6 if __name__ == "__main__": From ace4488253428def4f663467b22c607086fd01f9 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 03:22:33 +0800 Subject: [PATCH 006/162] refactor: `deepcopy` --- 
src/pyjuice/transformations/copy.py | 15 +++--- tests/transformations/copy_test.py | 72 ++++++++++++++++------------- 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/src/pyjuice/transformations/copy.py b/src/pyjuice/transformations/copy.py index 4df1e691..cb48c4a7 100644 --- a/src/pyjuice/transformations/copy.py +++ b/src/pyjuice/transformations/copy.py @@ -29,9 +29,10 @@ def dfs(ns: CircuitNodes): if ns.is_sum(): if not tie_params: new_ns = SumNodes( - ns.num_nodes, + ns.num_node_groups, new_chs, - ns.edge_ids.clone() + ns.edge_ids.clone(), + group_size = ns.group_size ) params = ns.get_params() if params is not None: @@ -41,9 +42,10 @@ def dfs(ns: CircuitNodes): elif ns.is_prod(): new_ns = ProdNodes( - ns.num_nodes, + ns.num_node_groups, new_chs, - ns.edge_ids.clone() + ns.edge_ids.clone(), + group_size = ns.group_size ) else: @@ -61,9 +63,10 @@ def dfs(ns: CircuitNodes): if not tie_params: new_ns = InputNodes( - num_nodes = ns.num_nodes, + num_node_groups = ns.num_node_groups, scope = pydeepcopy(scope), - dist = pydeepcopy(ns.dist) + dist = pydeepcopy(ns.dist), + group_size = ns.group_size ) params = ns.get_params() if params is not None: diff --git a/tests/transformations/copy_test.py b/tests/transformations/copy_test.py index 7dab7eea..ffcf1da3 100644 --- a/tests/transformations/copy_test.py +++ b/tests/transformations/copy_test.py @@ -11,52 +11,58 @@ def copy_test(): - num_nodes = 2 + num_node_groups_candidates = [2, 4, 7] + group_size_candidates = [1, 4, 8] - i00 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i10 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i11 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - - ms0 = multiply(i00, i10) - ms1 = multiply(i00, i11) + for num_node_groups in num_node_groups_candidates: + for group_size in group_size_candidates: - ns = summate(ms0, ms1, num_nodes = 1) + with juice.set_group_size(group_size): - ns.init_parameters() + i00 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i10 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + i11 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + + ms0 = multiply(i00, i10) + ms1 = multiply(i00, i11) - ## Copy without parameter tying ## + ns = summate(ms0, ms1, num_node_groups = 1) - new_ns = deepcopy(ns) + ns.init_parameters() - assert ns.num_nodes == new_ns.num_nodes - assert torch.all(ns.edge_ids == new_ns.edge_ids) - assert torch.all(ns._params == new_ns._params) + ## Copy without parameter tying ## - assert torch.all(ns.chs[0].edge_ids == new_ns.chs[0].edge_ids) - assert torch.all(ns.chs[1].edge_ids == new_ns.chs[1].edge_ids) + new_ns = deepcopy(ns) - assert new_ns.chs[0].chs[0] == new_ns.chs[1].chs[0] - assert torch.all((ns.chs[0].chs[0].get_params() - new_ns.chs[0].chs[0].get_params()).abs() < 1e-6) - assert torch.all((ns.chs[0].chs[1].get_params() - new_ns.chs[0].chs[1].get_params()).abs() < 1e-6) - assert torch.all((ns.chs[1].chs[1].get_params() - new_ns.chs[1].chs[1].get_params()).abs() < 1e-6) + assert ns.num_nodes == new_ns.num_nodes + assert torch.all(ns.edge_ids == new_ns.edge_ids) + assert torch.all(ns._params == new_ns._params) - ## Copy with parameter tying ## + assert torch.all(ns.chs[0].edge_ids == new_ns.chs[0].edge_ids) + assert torch.all(ns.chs[1].edge_ids == new_ns.chs[1].edge_ids) - new_ns = deepcopy(ns, tie_params = True, var_mapping = {0: 2, 1: 3}) + assert new_ns.chs[0].chs[0] == new_ns.chs[1].chs[0] + assert torch.all((ns.chs[0].chs[0].get_params() - new_ns.chs[0].chs[0].get_params()).abs() < 
1e-6) + assert torch.all((ns.chs[0].chs[1].get_params() - new_ns.chs[0].chs[1].get_params()).abs() < 1e-6) + assert torch.all((ns.chs[1].chs[1].get_params() - new_ns.chs[1].chs[1].get_params()).abs() < 1e-6) - assert ns.num_nodes == new_ns.num_nodes - assert torch.all(ns.edge_ids == new_ns.edge_ids) - assert new_ns.get_source_ns() == ns + ## Copy with parameter tying ## - assert torch.all(ns.chs[0].edge_ids == new_ns.chs[0].edge_ids) - assert torch.all(ns.chs[1].edge_ids == new_ns.chs[1].edge_ids) + new_ns = deepcopy(ns, tie_params = True, var_mapping = {0: 2, 1: 3}) - assert new_ns.chs[0].chs[0].get_source_ns() == ns.chs[0].chs[0] - assert new_ns.chs[0].chs[1].get_source_ns() == ns.chs[0].chs[1] - assert new_ns.chs[1].chs[1].get_source_ns() == ns.chs[1].chs[1] - assert tuple(new_ns.chs[0].chs[0].scope.to_list()) == (2,) - assert tuple(new_ns.chs[0].chs[1].scope.to_list()) == (3,) - assert tuple(new_ns.chs[1].chs[1].scope.to_list()) == (3,) + assert ns.num_nodes == new_ns.num_nodes + assert torch.all(ns.edge_ids == new_ns.edge_ids) + assert new_ns.get_source_ns() == ns + + assert torch.all(ns.chs[0].edge_ids == new_ns.chs[0].edge_ids) + assert torch.all(ns.chs[1].edge_ids == new_ns.chs[1].edge_ids) + + assert new_ns.chs[0].chs[0].get_source_ns() == ns.chs[0].chs[0] + assert new_ns.chs[0].chs[1].get_source_ns() == ns.chs[0].chs[1] + assert new_ns.chs[1].chs[1].get_source_ns() == ns.chs[1].chs[1] + assert tuple(new_ns.chs[0].chs[0].scope.to_list()) == (2,) + assert tuple(new_ns.chs[0].chs[1].scope.to_list()) == (3,) + assert tuple(new_ns.chs[1].chs[1].scope.to_list()) == (3,) if __name__ == "__main__": From 3d252a1843ff2f0dbcbb535ab1e5b4cee8197cd9 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 03:52:35 +0800 Subject: [PATCH 007/162] backward compatibility: support `num_nodes` input in `inputs` --- src/pyjuice/nodes/construction.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index b7cddc7b..9cc72d09 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -19,7 +19,17 @@ def inputs(var: Union[int,Sequence[int]], num_node_groups: int, dist: Distribution, params: Optional[Tensor] = None, - group_size: int = 0, **kwargs): + num_nodes: int = 0, group_size: int = 0, **kwargs): + + if num_nodes > 0: + assert num_node_groups == 0, "Only one of `num_nodes` and `num_node_groups` can be set at the same time." + if group_size == 0: + group_size = CircuitNodes.DEFAULT_GROUP_SIZE + + assert num_nodes % group_size == 0 + + num_node_groups = num_nodes // group_size + return InputNodes( num_node_groups = num_node_groups, scope = [var] if isinstance(var, int) else var, From 158db547421054bf26dea66a0ac0bfd3c11c1102 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 04:01:02 +0800 Subject: [PATCH 008/162] refactor: `merge` --- src/pyjuice/transformations/merge.py | 52 +++++--- tests/transformations/merge_test.py | 174 ++++++++++++++------------- 2 files changed, 127 insertions(+), 99 deletions(-) diff --git a/src/pyjuice/transformations/merge.py b/src/pyjuice/transformations/merge.py index 41245c5c..5802f27c 100644 --- a/src/pyjuice/transformations/merge.py +++ b/src/pyjuice/transformations/merge.py @@ -12,10 +12,12 @@ def merge_sum_nodes(ns1: SumNodes, ns2: SumNodes, *args) -> SumNodes: all_ns = [ns1, ns2, *args] for idx, ns in enumerate(all_ns): assert ns1.scope == ns.scope, "Sum nodes to be merged should have the same scope." 
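Note: the `num_nodes` backward-compatibility arguments re-introduced in `summate` and `inputs` above are translated into group counts. A standalone sketch of that mapping (the helper name below is illustrative only):

def to_num_node_groups(num_nodes: int, group_size: int) -> int:
    # Mirrors the shim above: `num_nodes` must be a multiple of `group_size`.
    assert num_nodes % group_size == 0
    return num_nodes // group_size

# e.g. with group_size = 4, summate(m, num_nodes = 8) builds 2 node groups
assert to_num_node_groups(8, 4) == 2
assert to_num_node_groups(8, 1) == 8   # group_size = 1 recovers the old per-node behavior
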
+ assert ns1.group_size == ns.group_size, "To-be-merged sum nodes must have the same group size." if not isinstance(ns, SumNodes): - edge_ids = torch.arange(0, ns.num_nodes).unsqueeze(0).repeat(2, 1) - params = torch.ones([ns.num_nodes]) - new_ns = SumNodes(ns.num_nodes, [ns], edge_ids, params = params) + edge_ids = torch.arange(0, ns.num_node_groups).unsqueeze(0).repeat(2, 1) + group_size = ns.group_size + params = torch.eye(ns.group_size).unsqueeze(0).repeat(ns.num_node_groups, 1, 1) + new_ns = SumNodes(ns.num_node_groups, [ns], edge_ids, params = params, group_size = group_size) all_ns[idx] = new_ns sum_edge_ids = [] @@ -23,12 +25,18 @@ def merge_sum_nodes(ns1: SumNodes, ns2: SumNodes, *args) -> SumNodes: cs2start_id = dict() ns_start_id = 0 global_cs_start_id = 0 + ch_group_size = None for ns in all_ns: - ns_end_id = ns_start_id + ns.num_nodes + ns_end_id = ns_start_id + ns.num_node_groups curr_cs_sid = 0 edge_ids = ns.edge_ids.clone() for cs in ns.chs: - curr_cs_eid = curr_cs_sid + cs.num_nodes + if ch_group_size is None: + ch_group_size = cs.group_size + else: + assert ch_group_size == cs.group_size, "Children must have the same group size." + + curr_cs_eid = curr_cs_sid + cs.num_node_groups if cs in cs2start_id: cs_start_id = cs2start_id[cs] else: @@ -40,7 +48,7 @@ def merge_sum_nodes(ns1: SumNodes, ns2: SumNodes, *args) -> SumNodes: curr_cs_sid = curr_cs_eid if cs not in cs2start_id: cs2start_id[cs] = global_cs_start_id - global_cs_start_id += cs.num_nodes + global_cs_start_id += cs.num_node_groups sum_chs.append(cs) edge_ids[0,:] += ns_start_id @@ -48,14 +56,14 @@ def merge_sum_nodes(ns1: SumNodes, ns2: SumNodes, *args) -> SumNodes: ns_start_id = ns_end_id - num_nodes = ns_start_id + num_node_groups = ns_start_id edge_ids = torch.cat(sum_edge_ids, dim = 1) if all([hasattr(ns, "_params") and ns._params is not None for ns in all_ns]): params = torch.cat([ns._params for ns in all_ns], dim = 0) else: params = None - return SumNodes(num_nodes, sum_chs, edge_ids, params = params) + return SumNodes(num_node_groups, sum_chs, edge_ids, params = params, group_size = ns1.group_size) def merge_prod_nodes(ns1: ProdNodes, ns2: ProdNodes, *args) -> ProdNodes: @@ -65,18 +73,26 @@ def merge_prod_nodes(ns1: ProdNodes, ns2: ProdNodes, *args) -> ProdNodes: for ns in all_ns: assert isinstance(ns, ProdNodes), "Inputs should all be ProdNodes." assert ns1.scope == ns.scope, "Product nodes to be merged should have the same scope." + assert ns1.group_size == ns.group_size, "To-be-merged product nodes must have the same group size." for cs, scope in zip(ns.chs, ch_scopes): assert cs.scope == scope cs2start_id = dict() sum_chs = [[] for _ in range(num_scopes)] global_start_ids = [0 for _ in range(num_scopes)] + ch_group_size = None for ns in all_ns: for scope_id in range(num_scopes): cs = ns.chs[scope_id] + + if ch_group_size is None: + ch_group_size = cs.group_size + else: + assert ch_group_size == cs.group_size, "Children must have the same group size." 
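Note: when a non-sum child is wrapped by `merge_sum_nodes` above, its pass-through parameters are now one identity block per node group rather than a vector of ones. A standalone sketch of that tensor (assumes the wrapped node's group size equals its child group size, as in the patch):

import torch

num_node_groups, group_size = 3, 4
# One [group_size, group_size] identity block per group: the inserted sum node
# forwards each child node's value unchanged.
params = torch.eye(group_size).unsqueeze(0).repeat(num_node_groups, 1, 1)
assert params.shape == (3, 4, 4)
x = torch.rand(group_size)
assert torch.allclose(params[0] @ x, x)
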
+ if cs not in cs2start_id: cs2start_id[cs] = global_start_ids[scope_id] - global_start_ids[scope_id] += cs.num_nodes + global_start_ids[scope_id] += cs.num_node_groups sum_chs[scope_id].append(cs) new_sum_chs = [] @@ -97,9 +113,9 @@ def merge_prod_nodes(ns1: ProdNodes, ns2: ProdNodes, *args) -> ProdNodes: prod_edge_ids.append(edge_ids) edge_ids = torch.cat(prod_edge_ids, dim = 0) - num_nodes = edge_ids.size(0) + num_node_groups = edge_ids.size(0) - return ProdNodes(num_nodes, new_sum_chs, edge_ids) + return ProdNodes(num_node_groups, new_sum_chs, edge_ids, group_size = ns1.group_size) def merge_by_region_node(root_ns: CircuitNodes) -> CircuitNodes: @@ -124,7 +140,7 @@ def merge_by_region_node(root_ns: CircuitNodes) -> CircuitNodes: rg_hash = hash(rg) if isinstance(rg, InputRegionNode): for ns in rg2nodes[rg_hash]: - ns_old2new[ns] = (ns, (0, ns.num_nodes)) + ns_old2new[ns] = (ns, (0, ns.num_node_groups)) elif isinstance(rg, PartitionNode): prod_ns = [] for ns in rg2nodes[rg_hash]: @@ -135,7 +151,7 @@ def merge_by_region_node(root_ns: CircuitNodes) -> CircuitNodes: edge_ids[:,scope_id] += sid chs.append(new_cs) - prod_ns.append(ProdNodes(ns.num_nodes, chs, edge_ids)) + prod_ns.append(ProdNodes(ns.num_node_groups, chs, edge_ids, group_size = ns.group_size)) if len(prod_ns) == 1: new_ns = prod_ns[0] @@ -143,7 +159,7 @@ def merge_by_region_node(root_ns: CircuitNodes) -> CircuitNodes: new_ns = merge_prod_nodes(*prod_ns) sid = 0 for ns in rg2nodes[rg_hash]: - nid = sid + ns.num_nodes + nid = sid + ns.num_node_groups ns_old2new[ns] = (new_ns, (sid, nid)) sid = nid @@ -156,7 +172,7 @@ def merge_by_region_node(root_ns: CircuitNodes) -> CircuitNodes: global_sid = 0 origin_sid = 0 for scope_id, cs in enumerate(ns.chs): - origin_eid = origin_sid + cs.num_nodes + origin_eid = origin_sid + cs.num_node_groups new_cs, (offset_sid, offset_eid) = ns_old2new[cs] if new_cs in ch2sid: sid = ch2sid[new_cs] @@ -169,7 +185,7 @@ def merge_by_region_node(root_ns: CircuitNodes) -> CircuitNodes: if new_cs not in ch2sid: chs.append(new_cs) ch2sid[new_cs] = global_sid - global_sid += new_cs.num_nodes + global_sid += new_cs.num_node_groups origin_sid = origin_eid @@ -177,7 +193,7 @@ def merge_by_region_node(root_ns: CircuitNodes) -> CircuitNodes: params = ns._params else: params = None - sum_ns.append(SumNodes(ns.num_nodes, chs, edge_ids, params = params)) + sum_ns.append(SumNodes(ns.num_node_groups, chs, edge_ids, params = params, group_size = ns.group_size)) if len(sum_ns) == 1: new_ns = sum_ns[0] @@ -185,7 +201,7 @@ def merge_by_region_node(root_ns: CircuitNodes) -> CircuitNodes: new_ns = merge_sum_nodes(*sum_ns) sid = 0 for ns in rg2nodes[rg_hash]: - nid = sid + ns.num_nodes + nid = sid + ns.num_node_groups ns_old2new[ns] = (new_ns, (sid, nid)) sid = nid diff --git a/tests/transformations/merge_test.py b/tests/transformations/merge_test.py index d7174396..4cb2f8a6 100644 --- a/tests/transformations/merge_test.py +++ b/tests/transformations/merge_test.py @@ -11,101 +11,113 @@ def sum_nodes_merge_test(): - num_nodes = 2 + num_node_groups = 2 - i00 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i01 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i10 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i11 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - - m00 = multiply(i00, i10) - m01 = multiply(i01, i11) + for group_size in [1, 2, 4, 8]: + + with juice.set_group_size(group_size): - n0 = summate(m00, num_nodes = num_nodes) - n1 = summate(m01, num_nodes = num_nodes) - n2 = 
summate(m00, num_nodes = num_nodes) + i00 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i01 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i10 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + i11 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + + m00 = multiply(i00, i10) + m01 = multiply(i01, i11) - n_new = merge_sum_nodes(n0, n1) - assert (n_new.edge_ids == torch.Tensor([[0,0,1,1,2,2,3,3],[0,1,0,1,2,3,2,3]])).all() - assert len(n_new.chs) == 2 - assert n_new.chs[0] == m00 - assert n_new.chs[1] == m01 + n0 = summate(m00, num_node_groups = num_node_groups) + n1 = summate(m01, num_node_groups = num_node_groups) + n2 = summate(m00, num_node_groups = num_node_groups) - n_new = merge_sum_nodes(n0, n2) - assert (n_new.edge_ids == torch.Tensor([[0,0,1,1,2,2,3,3],[0,1,0,1,0,1,0,1]])).all() - assert len(n_new.chs) == 1 - assert n_new.chs[0] == m00 + n_new = merge_sum_nodes(n0, n1) + assert (n_new.edge_ids == torch.Tensor([[0,0,1,1,2,2,3,3],[0,1,0,1,2,3,2,3]])).all() + assert len(n_new.chs) == 2 + assert n_new.chs[0] == m00 + assert n_new.chs[1] == m01 + + n_new = merge_sum_nodes(n0, n2) + assert (n_new.edge_ids == torch.Tensor([[0,0,1,1,2,2,3,3],[0,1,0,1,0,1,0,1]])).all() + assert len(n_new.chs) == 1 + assert n_new.chs[0] == m00 def prod_nodes_merge_test(): - num_nodes = 2 + num_node_groups = 2 + + for group_size in [1, 2, 4, 8]: + + with juice.set_group_size(group_size): - i00 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i01 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i10 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i11 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) + i00 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i01 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i10 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + i11 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) - m00 = multiply(i00, i10) - m01 = multiply(i01, i11) - m02 = multiply(i00, i10) + m00 = multiply(i00, i10) + m01 = multiply(i01, i11) + m02 = multiply(i00, i10) - m_new = merge_prod_nodes(m00, m01) - assert (m_new.edge_ids == torch.Tensor([[0,0],[1,1],[2,2],[3,3]])).all() - assert m_new.chs[0].chs[0].chs[0] == i00 - assert m_new.chs[0].chs[1].chs[0] == i01 - assert m_new.chs[1].chs[0].chs[0] == i10 - assert m_new.chs[1].chs[1].chs[0] == i11 + m_new = merge_prod_nodes(m00, m01) + assert (m_new.edge_ids == torch.Tensor([[0,0],[1,1],[2,2],[3,3]])).all() + assert m_new.chs[0].chs[0].chs[0] == i00 + assert m_new.chs[0].chs[1].chs[0] == i01 + assert m_new.chs[1].chs[0].chs[0] == i10 + assert m_new.chs[1].chs[1].chs[0] == i11 - m_new = merge_prod_nodes(m00, m02) - assert (m_new.edge_ids == torch.Tensor([[0,0],[1,1],[0,0],[1,1]])).all() - assert m_new.chs[0] == i00 - assert m_new.chs[1] == i10 + m_new = merge_prod_nodes(m00, m02) + assert (m_new.edge_ids == torch.Tensor([[0,0],[1,1],[0,0],[1,1]])).all() + assert m_new.chs[0] == i00 + assert m_new.chs[1] == i10 def merge_by_region_node_test(): - num_nodes = 2 - - i00 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i01 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i10 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i11 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i20 = inputs(2, num_nodes, dists.Categorical(num_cats = 5)) - i30 = inputs(3, num_nodes, dists.Categorical(num_cats = 5)) - - m00 = multiply(i00, i10) - m01 = multiply(i01, i11) - m02 = multiply(i00, i10) - m10 = 
multiply(i20, i30) - - n00 = summate(m00, num_nodes = num_nodes) - n01 = summate(m01, m02, num_nodes = num_nodes) - n10 = summate(m10, num_nodes = num_nodes) - - m20 = multiply(n00, n10) - m21 = multiply(n01, n10) - - n = summate(m20, m21, num_nodes = 1) - - new_n = merge_by_region_node(n) - - assert (new_n.edge_ids == torch.Tensor([[0,0,0,0],[0,1,2,3]])).all() - assert len(new_n.chs) == 1 - assert (new_n.chs[0].edge_ids == torch.Tensor([[0,0],[1,1],[2,0],[3,1]])).all() - assert len(new_n.chs[0].chs) == 2 - assert (new_n.chs[0].chs[0].edge_ids == torch.Tensor([[0,0,1,1,2,2,2,2,3,3,3,3],[0,1,0,1,2,3,4,5,2,3,4,5]])).all() - assert len(new_n.chs[0].chs[0].chs) == 1 - assert (new_n.chs[0].chs[1].edge_ids == torch.Tensor([[0,0,1,1],[0,1,0,1]])).all() - assert len(new_n.chs[0].chs[1].chs) == 1 - assert (new_n.chs[0].chs[0].chs[0].edge_ids == torch.Tensor([[0,0],[1,1],[2,2],[3,3],[0,0],[1,1]])).all() - assert len(new_n.chs[0].chs[0].chs[0].chs) == 2 - assert (new_n.chs[0].chs[1].chs[0].edge_ids == torch.Tensor([[0,0],[1,1]])).all() - assert len(new_n.chs[0].chs[1].chs[0].chs) == 2 - assert new_n.chs[0].chs[0].chs[0].chs[0].chs[0].chs[0] == i00 - assert new_n.chs[0].chs[0].chs[0].chs[0].chs[1].chs[0] == i01 - assert new_n.chs[0].chs[0].chs[0].chs[1].chs[0].chs[0] == i10 - assert new_n.chs[0].chs[0].chs[0].chs[1].chs[1].chs[0] == i11 - assert new_n.chs[0].chs[1].chs[0].chs[0] == i20 - assert new_n.chs[0].chs[1].chs[0].chs[1] == i30 + num_node_groups = 2 + + for group_size in [1, 2, 4, 8]: + + with juice.set_group_size(group_size): + + i00 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i01 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i10 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + i11 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + i20 = inputs(2, num_node_groups, dists.Categorical(num_cats = 5)) + i30 = inputs(3, num_node_groups, dists.Categorical(num_cats = 5)) + + m00 = multiply(i00, i10) + m01 = multiply(i01, i11) + m02 = multiply(i00, i10) + m10 = multiply(i20, i30) + + n00 = summate(m00, num_node_groups = num_node_groups) + n01 = summate(m01, m02, num_node_groups = num_node_groups) + n10 = summate(m10, num_node_groups = num_node_groups) + + m20 = multiply(n00, n10) + m21 = multiply(n01, n10) + + n = summate(m20, m21, num_node_groups = 1, group_size = 1) + + new_n = merge_by_region_node(n) + + assert (new_n.edge_ids == torch.Tensor([[0,0,0,0],[0,1,2,3]])).all() + assert len(new_n.chs) == 1 + assert (new_n.chs[0].edge_ids == torch.Tensor([[0,0],[1,1],[2,0],[3,1]])).all() + assert len(new_n.chs[0].chs) == 2 + assert (new_n.chs[0].chs[0].edge_ids == torch.Tensor([[0,0,1,1,2,2,2,2,3,3,3,3],[0,1,0,1,2,3,4,5,2,3,4,5]])).all() + assert len(new_n.chs[0].chs[0].chs) == 1 + assert (new_n.chs[0].chs[1].edge_ids == torch.Tensor([[0,0,1,1],[0,1,0,1]])).all() + assert len(new_n.chs[0].chs[1].chs) == 1 + assert (new_n.chs[0].chs[0].chs[0].edge_ids == torch.Tensor([[0,0],[1,1],[2,2],[3,3],[0,0],[1,1]])).all() + assert len(new_n.chs[0].chs[0].chs[0].chs) == 2 + assert (new_n.chs[0].chs[1].chs[0].edge_ids == torch.Tensor([[0,0],[1,1]])).all() + assert len(new_n.chs[0].chs[1].chs[0].chs) == 2 + assert new_n.chs[0].chs[0].chs[0].chs[0].chs[0].chs[0] == i00 + assert new_n.chs[0].chs[0].chs[0].chs[0].chs[1].chs[0] == i01 + assert new_n.chs[0].chs[0].chs[0].chs[1].chs[0].chs[0] == i10 + assert new_n.chs[0].chs[0].chs[0].chs[1].chs[1].chs[0] == i11 + assert new_n.chs[0].chs[1].chs[0].chs[0] == i20 + assert new_n.chs[0].chs[1].chs[0].chs[1] == i30 
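Note: the expected `edge_ids` in these tests follow the grouped convention used throughout the refactor: shape `[2, num_edge_groups]`, row 0 holding parent node-group ids and row 1 child node-group ids, with each column standing for a dense `group_size x ch_group_size` block of edge parameters. A standalone sketch with example sizes:

import torch

edge_ids = torch.tensor([[0, 0, 1, 1, 2, 2, 3, 3],
                         [0, 1, 0, 1, 2, 3, 2, 3]])       # as in `sum_nodes_merge_test`
group_size = ch_group_size = 2                            # example sizes only
num_edge_groups = edge_ids.size(1)
num_edges = num_edge_groups * group_size * ch_group_size  # 8 groups -> 32 individual edges
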
if __name__ == "__main__": From 5f965b40671bfbe51ee6a2f5ac7f5ab246555fc8 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 04:14:48 +0800 Subject: [PATCH 009/162] refactor: `pruning` --- src/pyjuice/transformations/prune.py | 17 ++-- tests/transformations/pruning_test.py | 110 ++++++++++++++------------ 2 files changed, 68 insertions(+), 59 deletions(-) diff --git a/src/pyjuice/transformations/prune.py b/src/pyjuice/transformations/prune.py index 2d13c994..ee5cb0d9 100644 --- a/src/pyjuice/transformations/prune.py +++ b/src/pyjuice/transformations/prune.py @@ -78,7 +78,7 @@ def _construct_pruned_circuit(ns: CircuitNodes, ch_outputs: Sequence[CircuitNode edge_filter = selected_edges[score_ranges[ns][0]:score_ranges[ns][1]] copied_edges = [] copied_params = [] - for node_id in range(ns.num_nodes): + for node_id in range(ns.num_node_groups): curr_eids = (edge_ids[0,:] == node_id) * edge_filter if curr_eids.sum().item() == 0: maxid = torch.argmax( @@ -86,19 +86,20 @@ def _construct_pruned_circuit(ns: CircuitNodes, ch_outputs: Sequence[CircuitNode (edge_ids[0,:] == node_id) * 1e-8 ) copied_edges.append(edge_ids[:,maxid].unsqueeze(1)) - copied_params.append(ns._params[maxid].reshape(1)) + copied_params.append(ns._params[maxid].reshape(1, ns.group_size, ns.ch_group_size)) else: copied_edges.append(edge_ids[:,curr_eids]) - copied_params.append(ns._params[curr_eids].reshape(-1)) + copied_params.append(ns._params[curr_eids,:,:]) edge_ids = torch.cat(copied_edges, dim = 1) params = torch.cat(copied_params, dim = 0) new_ns = SumNodes( - num_nodes = ns.num_nodes, + num_node_groups = ns.num_node_groups, chs = ch_outputs, edge_ids = edge_ids, - params = params + params = params, + group_size = ns.group_size ) else: # Keep the node as-is @@ -117,11 +118,11 @@ def _construct_pruned_circuit(ns: CircuitNodes, ch_outputs: Sequence[CircuitNode ns_source = old2new[dup2source[ns]] ns._source_node = ns_source - assert ns.num_nodes == ns_source.num_nodes + assert ns.num_node_groups == ns_source.num_node_groups and ns.group_size == ns_source.group_size if hasattr(ns_source, "edge_ids"): ns.edge_ids = ns_source.edge_ids.clone() - if hasattr(ns_source, "_params"): - ns._params = ns_source._params.clone() + # if hasattr(ns_source, "_params"): + # ns._params = ns_source._params.clone() return new_root_ns \ No newline at end of file diff --git a/tests/transformations/pruning_test.py b/tests/transformations/pruning_test.py index b9b5d7b3..efedef4b 100644 --- a/tests/transformations/pruning_test.py +++ b/tests/transformations/pruning_test.py @@ -8,79 +8,87 @@ def pruning_test(): - num_nodes = 2 + num_node_groups = 2 - i0 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i1 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i2 = inputs(2, num_nodes, dists.Categorical(num_cats = 5)) - i3 = inputs(3, num_nodes, dists.Categorical(num_cats = 5)) + for group_size in [1, 2, 4, 8]: + + with juice.set_group_size(group_size): - m1 = multiply(i0, i1) - n1 = summate(m1, num_nodes = num_nodes) + i0 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i1 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + i2 = inputs(2, num_node_groups, dists.Categorical(num_cats = 5)) + i3 = inputs(3, num_node_groups, dists.Categorical(num_cats = 5)) - m2 = multiply(i2, i3) - n2 = summate(m2, num_nodes = num_nodes) + m1 = multiply(i0, i1) + n1 = summate(m1, num_node_groups = num_node_groups) - m = multiply(n1, n2) - n = summate(m, num_nodes = 1) + m2 = multiply(i2, i3) + n2 = summate(m2, 
num_node_groups = num_node_groups) - n.init_parameters(perturbation = 2.0) + m = multiply(n1, n2) + n = summate(m, num_node_groups = 1) + + n.init_parameters(perturbation = 2.0) - n1._scores = torch.Tensor([0.3, 0.2, 0.6, 0.8]) - n2._scores = torch.Tensor([0.3, 0.9, 0.1, 0.8]) - n._scores = torch.Tensor([0.6, 0.6]) + n1._scores = torch.Tensor([0.3, 0.2, 0.6, 0.8]) + n2._scores = torch.Tensor([0.3, 0.9, 0.1, 0.8]) + n._scores = torch.Tensor([0.6, 0.6]) - new_n = prune_by_score(n, score_threshold = 0.5) + new_n = prune_by_score(n, score_threshold = 0.5) - assert new_n.edge_ids.size(1) == 2 + assert new_n.edge_ids.size(1) == 2 - new_n1 = new_n.chs[0].chs[0] - assert new_n1.edge_ids.size(1) == 3 - assert torch.all(new_n1.edge_ids == torch.tensor([[0,1,1],[0,0,1]])) - assert torch.all(torch.abs(new_n1._params[1:] - n1._params[[2,3]]) < 1e-4) - assert torch.all(torch.abs(new_n1._params[0] - 1.0) < 1e-4) + new_n1 = new_n.chs[0].chs[0] + assert new_n1.edge_ids.size(1) == 3 + assert torch.all(new_n1.edge_ids == torch.tensor([[0,1,1],[0,0,1]])) + assert torch.all(torch.abs(new_n1._params[1:] - n1._params[[2,3]]) < 1e-4) + assert torch.all(torch.abs(new_n1._params[0].sum(dim = 1) - 1.0) < 1e-4) - new_n2 = new_n.chs[0].chs[1] - assert new_n2.edge_ids.size(1) == 2 - assert torch.all(new_n2.edge_ids == torch.tensor([[0,1],[1,1]])) - assert torch.all(torch.abs(new_n2._params - 1.0) < 1e-4) + new_n2 = new_n.chs[0].chs[1] + assert new_n2.edge_ids.size(1) == 2 + assert torch.all(new_n2.edge_ids == torch.tensor([[0,1],[1,1]])) + assert torch.all(torch.abs(new_n2._params.sum(dim = 2) - 1.0) < 1e-4) def pruning_with_param_tying_test(): - num_nodes = 2 + num_node_groups = 2 - i0 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i1 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i2 = inputs(2, num_nodes, dists.Categorical(num_cats = 5)) - i3 = inputs(3, num_nodes, dists.Categorical(num_cats = 5)) + for group_size in [1, 2, 4, 8]: + + with juice.set_group_size(group_size): - m1 = multiply(i0, i1) - n1 = summate(m1, num_nodes = num_nodes) + i0 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i1 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + i2 = inputs(2, num_node_groups, dists.Categorical(num_cats = 5)) + i3 = inputs(3, num_node_groups, dists.Categorical(num_cats = 5)) - m2 = multiply(i2, i3) - n2 = n1.duplicate(m2, tie_params = True) + m1 = multiply(i0, i1) + n1 = summate(m1, num_node_groups = num_node_groups) - m = multiply(n1, n2) - n = summate(m, num_nodes = 1) + m2 = multiply(i2, i3) + n2 = n1.duplicate(m2, tie_params = True) - n.init_parameters(perturbation = 2.0) + m = multiply(n1, n2) + n = summate(m, num_node_groups = 1) + + n.init_parameters(perturbation = 2.0) - n1._scores = torch.Tensor([0.3, 0.2, 0.6, 0.8]) - n._scores = torch.Tensor([0.6, 0.6]) + n1._scores = torch.Tensor([0.3, 0.2, 0.6, 0.8]) + n._scores = torch.Tensor([0.6, 0.6]) - new_n = prune_by_score(n, score_threshold = 0.5) + new_n = prune_by_score(n, score_threshold = 0.5) - new_n1 = new_n.chs[0].chs[0] - assert new_n1.edge_ids.size(1) == 3 - assert torch.all(new_n1.edge_ids == torch.tensor([[0,1,1],[0,0,1]])) - assert torch.all(torch.abs(new_n1._params[1:] - n1._params[[2,3]]) < 1e-4) - assert torch.all(torch.abs(new_n1._params[0] - 1.0) < 1e-4) + new_n1 = new_n.chs[0].chs[0] + assert new_n1.edge_ids.size(1) == 3 + assert torch.all(new_n1.edge_ids == torch.tensor([[0,1,1],[0,0,1]])) + assert torch.all(torch.abs(new_n1._params[1:] - n1._params[[2,3]]) < 1e-4) + assert 
torch.all(torch.abs(new_n1._params[0].sum(dim = 1) - 1.0) < 1e-4) - new_n2 = new_n.chs[0].chs[1] - assert new_n2.edge_ids.size(1) == 3 - assert torch.all(new_n2.edge_ids == torch.tensor([[0,1,1],[0,0,1]])) - assert torch.all(torch.abs(new_n2._params[1:] - n1._params[[2,3]]) < 1e-4) - assert torch.all(torch.abs(new_n2._params[0] - 1.0) < 1e-4) + new_n2 = new_n.chs[0].chs[1] + assert new_n2.edge_ids.size(1) == 3 + assert torch.all(new_n2.edge_ids == torch.tensor([[0,1,1],[0,0,1]])) + assert torch.all(torch.abs(new_n2._source_node._params[1:] - n1._params[[2,3]]) < 1e-4) + assert torch.all(torch.abs(new_n2._source_node._params[0].sum(dim = 1) - 1.0) < 1e-4) def pruning_by_flow_test(): From cc915c840ce5cb03a3a5f664f1c224ca750d2695 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 20:39:12 +0800 Subject: [PATCH 010/162] refactor: forward pass for `InputLayer` --- src/pyjuice/layer/input_layer.py | 259 +++++++++++------- src/pyjuice/layer/layer.py | 10 +- src/pyjuice/layer/prod_layer.py | 2 +- src/pyjuice/layer/sum_layer.py | 2 +- src/pyjuice/model/tensorcircuit.py | 2 + .../nodes/distributions/categorical.py | 10 +- .../nodes/distributions/distributions.py | 12 +- tests/layer/input_layer_test.py | 104 +++++++ tests/model/forward_test.py | 10 +- 9 files changed, 284 insertions(+), 127 deletions(-) create mode 100644 tests/layer/input_layer_test.py diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index edba042e..68e9f407 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -23,11 +23,14 @@ class InputLayer(Layer, nn.Module): def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: nn.Module.__init__(self) - Layer.__init__(self) + Layer.__init__(self, nodes) # Reorder input nodes such that for any tied nodes, its source nodes appear before them self.nodes = self._reorder_nodes(nodes) + # Group size of the nodes in the current layer + self.group_size = self.nodes[0].group_size + ## Parse input `nodes` ## node_vars = [] node_sizes = [] @@ -35,13 +38,13 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: layer_num_nodes = 0 cum_params = 0 cum_param_flows = 0 - cum_source_ns = 0 + cum_source_ngroups = 0 dist_signature = None for ns in self.nodes: if dist_signature is None: dist_signature = ns.dist.get_signature() else: - assert dist_signature == ns.dist.get_signature(), "Nodes of an InputLayer must have the same distribution type." + assert dist_signature == ns.dist.get_signature(), f"Nodes of an InputLayer must have the same distribution type, but got `{dist_signature}` and `{ns.dist.get_signature()}`." 
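Note: the `_params` checks in the updated pruning tests above encode the grouped layout: `ns._params` has shape `[num_edge_groups, group_size, ch_group_size]`, and each parent node (one row of a group) distributes mass 1 over all of its child entries. A standalone sketch for the single-edge-group case the tests hit:

import torch

group_size, ch_group_size = 4, 4
params = torch.rand(1, group_size, ch_group_size)
params = params / params.sum(dim = 2, keepdim = True)               # normalize each parent row
assert torch.all(torch.abs(params[0].sum(dim = 1) - 1.0) < 1e-4)    # what the tests assert
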
node_vars.append(ns.scope.to_list()) node_sizes.append(ns.num_nodes) @@ -58,7 +61,7 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: cum_param_flows += ns.num_nodes * ns.dist.num_param_flows() ns._param_flow_range = (cum_param_flows - ns.num_nodes * ns.dist.num_param_flows(), cum_param_flows) - cum_source_ns += ns.num_nodes + cum_source_ngroups += ns.num_node_groups else: source_ns = ns.get_source_ns() ns._param_range = deepcopy(source_ns._param_range) @@ -67,6 +70,7 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: self.num_params = cum_params self.num_param_flows = cum_param_flows self.num_nodes = layer_num_nodes + self.num_node_groups = self.num_nodes // self.group_size self.dist_signature = dist_signature # Store the triton kernel functions implemented by the target `Distribution` @@ -77,60 +81,76 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: ## Prepair and compile the layer ## num_vars = len(node_vars[0]) - # Start variable index: vids[i,:] are the variables of the ith node - vids = torch.empty([self.num_nodes, num_vars], dtype = torch.long) - # Start parameter index: params[s_pids[i]] is the first parameter of the ith node - s_pids = torch.empty([self.num_nodes], dtype = torch.long) - # Start parameter flow index: param_flows[s_pfids[i]] is the first parameter flow of the ith node - s_pfids = torch.empty([self.num_nodes], dtype = torch.long) - # Start metadata index: metadata[s_mids[i]] is the first metadata of the ith node + # Start variable index: vids[i,:] are the variables of the ith node group + vids = torch.empty([self.num_node_groups, num_vars], dtype = torch.long) + # Start parameter index: params[s_pids[i]] is the first parameter of the 1st node in the ith node group + s_pids = torch.empty([self.num_node_groups], dtype = torch.long) + # Pointer increment of the parameters: params[s_pids[i]+j*inc_pids[i]] is the first parameter + # of the (j+1)th node in the ith node group + inc_pids = torch.empty([self.num_node_groups], dtype = torch.long) + # Start parameter flow index: param_flows[s_pfids[i]] is the first parameter flow of the 1st node in the ith node group + s_pfids = torch.empty([self.num_node_groups], dtype = torch.long) + # Pointer increment of the parameters: param_flows[s_pfids[i]+j*inc_pfids[i]] is the first parameter flow + # of the (j+1)th node in the ith node group + inc_pfids = torch.empty([self.num_node_groups], dtype = torch.long) + # Start metadata index: metadata[s_mids[i]] is the first metadata of the 1th node in the ith node group metadata = [] - s_mids = torch.empty([self.num_nodes], dtype = torch.long) - # source node ids (nodes with their original parameters) - source_nids = torch.empty([cum_source_ns], dtype = torch.long) + s_mids = torch.empty([self.num_node_groups], dtype = torch.long) + # source node group ids (nodes with their original parameters) + source_ngids = torch.empty([cum_source_ngroups], dtype = torch.long) # Parameters of this layer params = torch.empty([self.num_params], dtype = torch.float32) - n_start = 0 - source_n_start = 0 + ng_start = 0 + source_ng_start = 0 + param_start = 0 for ns_id, ns in enumerate(self.nodes): - n_end = n_start + ns.num_nodes + ng_end = ng_start + ns.num_node_groups # `vids` - assert len(node_vars[ns_id]) == num_vars - vids[n_start:n_end,:] = torch.tensor(node_vars[ns_id]).view(1, -1) + assert len(node_vars[ns_id]) == num_vars, f"Input nodes in the same layer should define on the same " \ + f"number of variables, 
but got {len(node_vars[ns_id])} and {num_vars}." + vids[ng_start:ng_end,:] = torch.tensor(node_vars[ns_id]).view(1, -1) # `s_pids` and `s_pfids` if not ns.is_tied(): source_ns = ns else: source_ns = ns.get_source_ns() - pid_offsets = torch.arange(0, ns.num_nodes * ns.dist.num_parameters(), ns.dist.num_parameters()) - s_pids[n_start:n_end] = source_ns._param_range[0] + pid_offsets - pfid_offsets = torch.arange(0, ns.num_nodes * ns.dist.num_param_flows(), ns.dist.num_param_flows()) - s_pfids[n_start:n_end] = source_ns._param_flow_range[0] + pfid_offsets - # `source_nids` + n_params_per_group = self.group_size * ns.dist.num_parameters() + gpid_offsets = torch.arange(0, ns.num_node_groups * n_params_per_group, n_params_per_group) + s_pids[ng_start:ng_end] = source_ns._param_range[0] + gpid_offsets + inc_pids[ng_start:ng_end] = ns.dist.num_parameters() + + n_pflows_per_group = self.group_size * ns.dist.num_param_flows() + gpfid_offsets = torch.arange(0, ns.num_node_groups * n_pflows_per_group, n_pflows_per_group) + s_pfids[ng_start:ng_end] = source_ns._param_flow_range[0] + gpfid_offsets + inc_pfids[ng_start:ng_end] = ns.dist.num_param_flows() + + # `source_ngids` if not ns.is_tied(): - source_n_end = source_n_start + ns.num_nodes - source_nids[source_n_start:source_n_end] = torch.arange(n_start, n_end) - source_n_start = source_n_end + source_ng_end = source_ng_start + ns.num_node_groups + source_ngids[source_ng_start:source_ng_end] = torch.arange(ng_start, ng_end) + source_ng_start = source_ng_end # `metadata` and `s_mids` - s_mids[n_start:n_end] = len(metadata) + s_mids[ng_start:ng_end] = len(metadata) metadata.extend(node_metadata[ns_id]) - n_start = n_end + ng_start = ng_end self.register_buffer("vids", vids) self.register_buffer("s_pids", s_pids) + self.register_buffer("inc_pids", inc_pids) self.register_buffer("s_pfids", s_pfids) + self.register_buffer("inc_pfids", inc_pfids) self.register_buffer("metadata", torch.tensor(metadata).float()) self.register_buffer("s_mids", s_mids) - self.register_buffer("source_nids", source_nids) + self.register_buffer("source_ngids", source_ngids) - self.params = nn.Parameter(params) + self.params = nn.Parameter(params) # Parameters will be set later in `self._init_parameters()` # Due to the custom inplace backward pass implementation, we do not track # gradient of PC parameters by PyTorch. 
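Note: to make the group-level index buffers built above concrete, the first parameter of node `j` inside group `i` lives at `params[s_pids[i] + j * inc_pids[i]]` (and analogously for flows via `s_pfids`/`inc_pfids`). A standalone worked example for `Categorical(num_cats = 2)` with `group_size = 4` and 8 node groups:

import torch

group_size, num_cats, num_groups = 4, 2, 8
s_pids = torch.arange(0, num_groups * group_size * num_cats, group_size * num_cats)
inc_pids = torch.full((num_groups,), num_cats)

i, j = 3, 2                                        # node 2 of group 3
first_param = s_pids[i] + j * inc_pids[i]          # 24 + 2 * 2 = 28
assert first_param.item() == (i * group_size + j) * num_cats
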
self.params.requires_grad = False @@ -186,33 +206,43 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ batch_size = node_mars.size(1) node_offset = self._output_ind_range[0] - if not self.provided("fw_local_ids"): + if not self.provided("fw_local_group_ids"): layer_num_nodes = self._output_ind_range[1] - self._output_ind_range[0] - fw_local_ids = None + fw_local_group_ids = None else: - layer_num_nodes = self.fw_local_ids.size(0) - fw_local_ids = self.fw_local_ids + layer_num_nodes = self.fw_local_group_ids.size(0) + fw_local_group_ids = self.fw_local_group_ids if not self.provided("_mars_kernel"): self._mars_kernel = self._compile_triton_kernel(self._mars_kernel_template, mar_fn = self.fw_mar_fn) - grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) + eval_num_groups = self.num_node_groups if not self.provided("fw_local_group_ids") else self.fw_local_group_ids.size(0) + BLOCK_B = min(batch_size, 1024) + TILE_SIZE_K = min(1024 // BLOCK_B, self.group_size) + BLOCK_M = 1 + + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(eval_num_groups, BLOCK_M)) + self._mars_kernel[grid]( - params_ptr = self.params, + params_ptr = params, node_mars_ptr = node_mars, data_ptr = data, vids_ptr = self.vids, - s_pids_ptr = self.s_pids, + s_pids_ptr = self.s_pids, + inc_pids_ptr = self.inc_pids, metadata_ptr = self.metadata, s_mids_ptr = self.s_mids, - fw_local_ids_ptr = fw_local_ids, - layer_num_nodes = layer_num_nodes, + fw_local_group_ids_ptr = fw_local_group_ids, + layer_num_node_groups = eval_num_groups, batch_size = batch_size, num_vars_per_node = self.num_vars_per_node, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), node_offset = node_offset, - BLOCK_SIZE = 1024, - partial_eval = 1 if fw_local_ids is not None else 0 + group_size = self.group_size, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = self.group_size // TILE_SIZE_K, + BLOCK_B = BLOCK_B, + partial_eval = 1 if fw_local_group_ids is not None else 0 ) # Apply missing mask if required @@ -226,12 +256,12 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ missing_mask_ptr = missing_mask, node_mars_ptr = node_mars, vids_ptr = self.vids, - fw_local_ids_ptr = fw_local_ids, - layer_num_nodes = layer_num_nodes, + fw_local_group_ids_ptr = fw_local_group_ids, + layer_num_node_groups = eval_num_groups, batch_size = batch_size, node_offset = node_offset, BLOCK_SIZE = 1024, - partial_eval = 1 if fw_local_ids is not None else 0, + partial_eval = 1 if fw_local_group_ids is not None else 0, mask_dim = mask_dim ) @@ -261,19 +291,19 @@ def backward(self, data: torch.Tensor, node_flows: torch.Tensor, batch_size = node_flows.size(1) node_offset = self._output_ind_range[0] - if not self.provided("bk_local_ids"): + if not self.provided("bk_local_group_ids"): layer_num_nodes = self._output_ind_range[1] - self._output_ind_range[0] - bk_local_ids = None + bk_local_group_ids = None else: - layer_num_nodes = self.bk_local_ids.size(0) - bk_local_ids = self.bk_local_ids + layer_num_nodes = self.bk_local_group_ids.size(0) + bk_local_group_ids = self.bk_local_group_ids if not self.provided("_flows_kernel"): self._flows_kernel = self._compile_triton_kernel(self._flows_kernel_template, flow_fn = self.bk_flow_fn) grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) self._flows_kernel[grid]( - params_ptr = self.params, + params_ptr = params, param_flows_ptr = self.param_flows, node_flows_ptr = node_flows, node_mars_ptr = node_mars, 
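Note: the launch configuration picked in the refactored `forward` above maps one program to a (batch tile, node group) pair and tiles the group dimension so a tile stays around 1024 elements. A standalone sketch of that arithmetic:

import triton

def forward_launch(batch_size, num_node_groups, group_size):
    BLOCK_B = min(batch_size, 1024)
    TILE_SIZE_K = min(1024 // BLOCK_B, group_size)
    K_NUM_TILES = group_size // TILE_SIZE_K
    grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(num_node_groups, 1))
    return grid, BLOCK_B, TILE_SIZE_K, K_NUM_TILES

# e.g. batch_size = 512, group_size = 128: BLOCK_B = 512, TILE_SIZE_K = 2,
# so each program loops K_NUM_TILES = 64 times over its node group.
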
@@ -283,14 +313,14 @@ def backward(self, data: torch.Tensor, node_flows: torch.Tensor, s_pfids_ptr = self.s_pfids, metadata_ptr = self.metadata, s_mids_ptr = self.s_mids, - bk_local_ids_ptr = bk_local_ids, + bk_local_group_ids_ptr = bk_local_group_ids, layer_num_nodes = layer_num_nodes, batch_size = batch_size, num_vars_per_node = self.num_vars_per_node, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), node_offset = node_offset, BLOCK_SIZE = 1024, - partial_eval = 1 if bk_local_ids is not None else 0 + partial_eval = 1 if bk_local_group_ids is not None else 0 ) else: @@ -352,7 +382,7 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): with torch.no_grad(): if "cuda" in self.device.type: - layer_num_source_nodes = self.source_nids.size(0) + layer_num_source_nodes = self.source_ngids.size(0) if not self.provided("_em_kernel"): self._em_kernel = self._compile_triton_kernel(self._em_kernel_template, em_fn = self.em_fn) @@ -367,7 +397,7 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): s_pfids_ptr = self.s_pfids, metadata_ptr = self.metadata, s_mids_ptr = self.s_mids, - source_nids_ptr = self.source_nids, + source_ngids_ptr = self.source_ngids, constexprs_ptr = constexprs, layer_num_source_nodes = layer_num_source_nodes, BLOCK_SIZE = 1024 @@ -382,49 +412,49 @@ def get_param_specs(self): def enable_partial_evaluation(self, fw_scopes: Optional[Union[Sequence[BitSet],Sequence[int]]] = None, bk_scopes: Optional[Union[Sequence[BitSet],Sequence[int]]] = None, return_ids: bool = False): # Create cache if needed - if not self.provided("scope2localids"): + if not self.provided("scope2localgids"): self._prepare_scope2nids() # Filter forward nodes if fw_scopes is not None: - fw_local_ids = [] + fw_local_group_ids = [] for scope in fw_scopes: if isinstance(scope, int): scope = BitSet.from_array([scope]) - if scope not in self.scope2localids: + if scope not in self.scope2localgids: continue - fw_local_ids.append(self.scope2localids[scope]) + fw_local_group_ids.append(self.scope2localgids[scope]) if return_ids: - return torch.cat(fw_local_ids, dim = 0) + return torch.cat(fw_local_group_ids, dim = 0) else: - self.fw_local_ids = torch.cat(fw_local_ids, dim = 0) + self.fw_local_group_ids = torch.cat(fw_local_group_ids, dim = 0) # Filter backward nodes if bk_scopes is not None: - bk_local_ids = [] + bk_local_group_ids = [] for scope in bk_scopes: if isinstance(scope, int): scope = BitSet.from_array([scope]) - if scope not in self.scope2localids: + if scope not in self.scope2localgids: continue - bk_local_ids.append(self.scope2localids[scope]) + bk_local_group_ids.append(self.scope2localgids[scope]) if return_ids: - return torch.cat(bk_local_ids, dim = 0) + return torch.cat(bk_local_group_ids, dim = 0) else: - self.bk_local_ids = torch.cat(bk_local_ids, dim = 0) + self.bk_local_group_ids = torch.cat(bk_local_group_ids, dim = 0) def disable_partial_evaluation(self, forward: bool = True, backward: bool = True): if forward: - self.fw_local_ids = None + self.fw_local_group_ids = None if backward: - self.bk_local_ids = None + self.bk_local_group_ids = None def update_parameters(self): for idx, ns in enumerate(self.nodes): @@ -435,27 +465,26 @@ def update_parameters(self): ns._params = self.params.data[par_start:par_end].detach().cpu().clone() def _prepare_scope2nids(self): - if not hasattr(self, "scope2localids"): - scope2localids = dict() + if not hasattr(self, "scope2localgids"): + scope2localgids = dict() - local_nid = 0 + local_ngid = 0 for ns in self.nodes: 
scope = ns.scope - s_nid = local_nid - e_nid = local_nid + ns.num_nodes + s_ngid = local_ngid + e_ngid = local_ngid + ns.num_node_groups with torch.no_grad(): - if scope not in scope2localids: - scope2localids[scope] = [torch.zeros([0], dtype = torch.long)] + if scope not in scope2localgids: + scope2localgids[scope] = [torch.zeros([0], dtype = torch.long)] - group_local_ids = torch.arange(s_nid, e_nid) - scope2localids[scope].append(group_local_ids) + scope2localgids[scope].append(torch.arange(s_nid, e_nid)) local_nid += ns.num_nodes - self.scope2localids = { - scope: torch.cat(ids, dim = 0).to(self.params.device) for scope, ids in scope2localids.items() + self.scope2localgids = { + scope: torch.cat(ids, dim = 0).to(self.params.device) for scope, ids in scope2localgids.items() } def _reorder_nodes(self, nodes): @@ -503,45 +532,65 @@ def _init_parameters(self, perturbation): p_start = p_end @staticmethod - def _mars_kernel_template(mar_fn, params_ptr, node_mars_ptr, data_ptr, vids_ptr, s_pids_ptr, metadata_ptr, s_mids_ptr, - fw_local_ids_ptr, partial_eval: tl.constexpr, layer_num_nodes: tl.constexpr, batch_size: tl.constexpr, - num_vars_per_node: tl.constexpr, nv_block_size: tl.constexpr, node_offset: tl.constexpr, BLOCK_SIZE: tl.constexpr): - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE + def _mars_kernel_template(mar_fn, params_ptr, node_mars_ptr, data_ptr, vids_ptr, s_pids_ptr, inc_pids_ptr, metadata_ptr, s_mids_ptr, + fw_local_group_ids_ptr, partial_eval: tl.constexpr, layer_num_node_groups: tl.constexpr, batch_size: tl.constexpr, + num_vars_per_node: tl.constexpr, nv_block_size: tl.constexpr, node_offset: tl.constexpr, group_size: tl.constexpr, + TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, BLOCK_B: tl.constexpr): + bid = tl.program_id(axis = 0) + ngroup_id = tl.program_id(axis = 1) - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < layer_num_nodes * batch_size - - # Raw batch and (local) node id - batch_offsets = (offsets % batch_size) - local_offsets = (offsets // batch_size) + # Batch ids to process + offs_batch = bid * BLOCK_B + tl.arange(0, BLOCK_B) + mask_batch = offs_batch < batch_size if partial_eval > 0: - local_offsets = tl.load(fw_local_ids_ptr + local_offsets, mask = mask, other = 0) + ngroup_id = tl.load(fw_local_group_ids_ptr + ngroup_id) if num_vars_per_node == 1: - # Get all variable ids - vids = tl.load(vids_ptr + local_offsets, mask = mask, other = 0) + # Get variable id + vid = tl.load(vids_ptr + ngroup_id) # Load the corresponding data - data = tl.load(data_ptr + vids * batch_size + batch_offsets, mask = mask, other = 0) + offs_data = vid * batch_size + offs_batch + data = tl.load(data_ptr + offs_data, mask = mask_batch, other = 0) # [BLOCK_B] else: # Get all variable ids - vids_offsets = tl.broadcast_to(local_offsets[:,None], (BLOCK_SIZE, nv_block_size)) * num_vars_per_node + \ - tl.broadcast_to(tl.arange(0, nv_block_size)[None,:], (BLOCK_SIZE, nv_block_size)) - vids_mask = tl.broadcast_to(mask[:,None], (BLOCK_SIZE, nv_block_size)) & \ - tl.broadcast_to((tl.arange(0, nv_block_size) < num_vars_per_node)[None,:], (BLOCK_SIZE, nv_block_size)) - vids = tl.load(vids_ptr + vids_offsets, mask = vids_mask, other = 0) + offs_vs = tl.arange(0, nv_block_size) + mask_vs = offs_vs < num_vars_per_node + offs_vids = ngroup_id * num_vars_per_node + offs_vs + mask_vids = mask_vs + vids = tl.load(vids_ptr + offs_vids, mask = mask_vids, other = 0) # Load the corresponding data - data = tl.load(data_ptr + vids * batch_size + batch_offsets, 
mask = vids_mask, other = 0) + offs_data = vids[:,None] * batch_size + offs_batch[None,:] + data = tl.load(data_ptr + offs_data, mask = (mask_vids[:,None] & mask_batch[None,:]), other = 0) - s_pids = tl.load(s_pids_ptr + local_offsets, mask = mask, other = 0) + # Initialize pointers to `params` + off_params = tl.load(s_pids_ptr + ngroup_id) + inc_params = tl.load(inc_pids_ptr + ngroup_id) + offs_node = tl.arange(0, TILE_SIZE_K) + p_params = params_ptr + off_params + inc_params * offs_node # [TILE_SIZE_K] + + # Initialize pointers to `metadata` + offs_metadata = tl.load(s_mids_ptr + ngroup_id) + p_metadata = metadata_ptr + offs_metadata # [1] + + # Initialize pointers to `node_mars` + p_nmars = node_mars_ptr + \ + (ngroup_id * group_size + offs_node[:,None] + node_offset) * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + # Inner loop to process everything in the node group + mask = mask_batch[None,:] + for i in range(K_NUM_TILES): + + mars = mar_fn(data, p_params, p_metadata, mask, num_vars_per_node) - mars = mar_fn(local_offsets, data, params_ptr, s_pids, metadata_ptr, s_mids_ptr, mask, num_vars_per_node, BLOCK_SIZE) + tl.store(p_nmars, mars, mask = mask) - node_offsets = local_offsets + node_offset - tl.store(node_mars_ptr + node_offsets * batch_size + batch_offsets, mars, mask = mask) + # Increment pointers + p_params += inc_params * TILE_SIZE_K + p_nmars += TILE_SIZE_K * batch_size @staticmethod @triton.jit @@ -653,7 +702,7 @@ def _sample_kernel_template(sample_fn, samples_ptr, params_ptr, nflow_xids_ptr, @staticmethod def _em_kernel_template(em_fn, params_ptr, param_flows_ptr, s_pids_ptr, s_pfids_ptr, metadata_ptr, s_mids_ptr, - source_nids_ptr, constexprs_ptr, layer_num_source_nodes: tl.constexpr, BLOCK_SIZE: tl.constexpr): + source_ngids_ptr, constexprs_ptr, layer_num_source_nodes: tl.constexpr, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) block_start = pid * BLOCK_SIZE @@ -665,7 +714,7 @@ def _em_kernel_template(em_fn, params_ptr, param_flows_ptr, s_pids_ptr, s_pfids_ mask = offsets < layer_num_source_nodes # Get the local node ids - local_offsets = tl.load(source_nids_ptr + offsets, mask = mask, other = 0) + local_offsets = tl.load(source_ngids_ptr + offsets, mask = mask, other = 0) # Get the corresponding start id for `params` and `param_flows` s_pids = tl.load(s_pids_ptr + local_offsets, mask = mask, other = 0) diff --git a/src/pyjuice/layer/layer.py b/src/pyjuice/layer/layer.py index df452e19..2940091c 100644 --- a/src/pyjuice/layer/layer.py +++ b/src/pyjuice/layer/layer.py @@ -1,11 +1,17 @@ from __future__ import annotations import torch -from typing import Union +from typing import Union, Sequence + +from pyjuice.nodes import CircuitNodes class Layer(): - def __init__(self) -> None: + def __init__(self, nodes: Sequence[CircuitNodes]) -> None: + + for i in range(1, len(nodes)): + assert nodes[i].group_size == nodes[0].group_size, "`group_size` of nodes in the same layer must be identical." 
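Note: a dense PyTorch sketch of what the refactored `_mars_kernel_template` earlier in this patch computes, assuming univariate `Categorical` inputs and a flat `vids` vector for brevity:

import torch

def reference_mars(data, params, vids, s_pids, inc_pids, group_size, node_offset, node_mars):
    # Node k of group g reads params[s_pids[g] + k * inc_pids[g] + data[vids[g], b]]
    # and stores its log at node_mars[node_offset + g * group_size + k, b].
    for g in range(vids.size(0)):
        x = data[vids[g]]                                   # [batch_size]
        for k in range(group_size):
            p = params[s_pids[g] + k * inc_pids[g] + x]     # [batch_size]
            node_mars[node_offset + g * group_size + k] = p.log()
    return node_mars
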
+ self.device = torch.device("cpu") def init_layer(self, params: Union[torch.Tensor,None]): diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 46459f20..6e4aabef 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -20,7 +20,7 @@ class ProdLayer(Layer, nn.Module): def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: float = 0.0, max_num_groups: Optional[int] = None, disable_gpu_compilation: bool = False) -> None: - Layer.__init__(self) + Layer.__init__(self, nodes) nn.Module.__init__(self) assert len(nodes) > 0, "No input node." diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 59ab5869..7033b45e 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -27,7 +27,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, max_num_groups: Optional[int] = None, disable_gpu_compilation: bool = False) -> None: - Layer.__init__(self) + Layer.__init__(self, nodes) nn.Module.__init__(self) assert len(nodes) > 0, "No input node." diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index 56dd2950..8284922c 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -554,6 +554,8 @@ def _init_layers(self, init_input_params: Optional[Sequence[torch.Tensor]] = Non tied_param_group_ids = [] tied_param_ends = [] + import pdb; pdb.set_trace() + if verbose: print(f"Compiling {num_layers} layers...") layer_id = 0 diff --git a/src/pyjuice/nodes/distributions/categorical.py b/src/pyjuice/nodes/distributions/categorical.py index a10342ac..4267af1e 100644 --- a/src/pyjuice/nodes/distributions/categorical.py +++ b/src/pyjuice/nodes/distributions/categorical.py @@ -36,12 +36,10 @@ def init_parameters(self, num_nodes: int, perturbation: float = 2.0, **kwargs): return params.reshape(-1) @staticmethod - def fw_mar_fn(local_offsets, data, params_ptr, s_pids, metadata_ptr, s_mids_ptr, mask, num_vars_per_node, BLOCK_SIZE): - # I am not sure why, but the following code will not work... - # probs = tl.load(params_ptr + s_pids + data, mask = mask, other = 0) - # Seems like a bug of triton. - param_idx = s_pids + data - probs = tl.load(params_ptr + param_idx, mask = mask, other = 0) + def fw_mar_fn(data, p_params, p_metadata, mask, num_vars_per_node): + + p_tarpars = p_params[:,None] + data[None,:] + probs = tl.load(p_tarpars, mask = mask, other = 0) log_probs = tl.log(probs) return log_probs diff --git a/src/pyjuice/nodes/distributions/distributions.py b/src/pyjuice/nodes/distributions/distributions.py index c574b659..70d39728 100644 --- a/src/pyjuice/nodes/distributions/distributions.py +++ b/src/pyjuice/nodes/distributions/distributions.py @@ -37,15 +37,11 @@ def fw_mar_fn(*args, **kwargs): """ Forward evaluation for log-probabilities. 
Args: - `local_offsets`: [BLOCK_SIZE] the local indices of the to-be-processed input nodes - `data`: [BLOCK_SIZE, num_vars_per_node] data of the corresponding nodes - `params_ptr`: pointer to the parameter vector - `s_pids`: [BLOCK_SIZE] start parameter index (offset) for all input nodes - `metadata_ptr`: pointer to metadata - `s_mids_ptr`: pointer to the start metadata index (offset) - `mask`: [BLOCK_SIZE] indicate whether each node should be processed + `data`: [BLOCK_M, BLOCK_B] data of the corresponding node groups + `p_params`: [BLOCK_M, TILE_SIZE_K] pointer to the parameters + `p_metadata`: [BLOCK_M] pointer to the metadata + `mask`: [BLOCK_M, BLOCK_B] full mask `num_vars_per_node`: numbers of variables per input node/distribution - `BLOCK_SIZE`: CUDA block size """ raise NotImplementedError() diff --git a/tests/layer/input_layer_test.py b/tests/layer/input_layer_test.py new file mode 100644 index 00000000..8e26d8d4 --- /dev/null +++ b/tests/layer/input_layer_test.py @@ -0,0 +1,104 @@ +import pyjuice as juice +import torch +import numpy as np +import time + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer + +import pytest + + +def input_layer_test(): + + device = torch.device("cuda:0") + + group_size = 4 + + with juice.set_group_size(group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + + layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = 1) + + layer._init_parameters(perturbation = 2.0) + + assert torch.all(layer.vids == torch.tensor([0,0,1,1,2,2,3,3]).reshape(-1, 1)) + npars_per_group = group_size * ni0.dist.num_parameters() + assert torch.all(layer.s_pids == torch.arange(0, npars_per_group * 8, npars_per_group)) + assert torch.all(layer.inc_pids == ni0.dist.num_parameters()) + npflows_per_group = group_size * ni0.dist.num_param_flows() + assert torch.all(layer.s_pfids == torch.arange(0, npflows_per_group * 8, npflows_per_group)) + assert torch.all(layer.inc_pfids == ni0.dist.num_param_flows()) + assert torch.all(layer.metadata == torch.ones([4]) * 2.0) + assert torch.all(layer.s_mids == torch.tensor([0,0,1,1,2,2,3,3])) + assert torch.all(layer.source_ngids == torch.arange(0, 8)) + + layer.to(device) + + data = torch.randint(0, 2, (4, 16)).to(device) + node_mars = torch.zeros([33, 16]).to(device) + + ## Forward tests ## + + layer(data, node_mars) + + for i in range(16): + for j in range(4 * 2 * group_size): + assert torch.abs(node_mars[j+1,i].exp() - layer.params[j*2+data[j//(2*group_size),i]]) < 1e-4 + + ## Forward with mask tests ## + + + + import pdb; pdb.set_trace() + + +def speed_test(): + + device = torch.device("cuda:0") + + group_size = 128 + num_vars = 16*16*3 + num_node_groups = 256 // group_size + + batch_size = 512 + + with juice.set_group_size(group_size): + + nis = [] + for v in range(num_vars): + nis.append(inputs(0, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 64))) + + layer = InputLayer(nis, cum_nodes = 1) + + layer._init_parameters(perturbation = 2.0) + + layer.to(device) + + data = torch.randint(0, 64, (num_vars, batch_size)).to(device) + node_mars = torch.zeros([1 + group_size * num_node_groups * 
num_vars, 16]).to(device) + + ## Forward tests ## + + layer(data, node_mars) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + layer(data, node_mars) + torch.cuda.synchronize() + t1 = time.time() + print((t1 - t0) / 100 * 1000) + + +if __name__ == "__main__": + # input_layer_test() + speed_test() \ No newline at end of file diff --git a/tests/model/forward_test.py b/tests/model/forward_test.py index 8c5dddbb..bcc9ccfc 100644 --- a/tests/model/forward_test.py +++ b/tests/model/forward_test.py @@ -12,10 +12,10 @@ def forward_test(): - ni0 = inputs(0, num_nodes = 2, dist = dists.Categorical(num_cats = 2)) - ni1 = inputs(1, num_nodes = 2, dist = dists.Categorical(num_cats = 2)) - ni2 = inputs(2, num_nodes = 2, dist = dists.Categorical(num_cats = 2)) - ni3 = inputs(3, num_nodes = 2, dist = dists.Categorical(num_cats = 2)) + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) m1 = multiply(ni0, ni1, edge_ids = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype = torch.long)) n1 = summate(m1, edge_ids = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]], dtype = torch.long)) @@ -28,6 +28,8 @@ def forward_test(): pc = TensorCircuit(n) + import pdb; pdb.set_trace() + device = torch.device("cuda:0") pc.to(device) From eddab6ff8dddc1aa294fbdc18a49f059b30943ac Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 21:38:59 +0800 Subject: [PATCH 011/162] fix: forward w/ & w/o mask of `InputLayer` --- src/pyjuice/layer/input_layer.py | 87 ++++++++++++++++++++------------ tests/layer/input_layer_test.py | 77 +++++++++++++++++++++++++--- 2 files changed, 126 insertions(+), 38 deletions(-) diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index 68e9f407..135af960 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -21,7 +21,7 @@ class InputLayer(Layer, nn.Module): - def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: + def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_group_size: bool = True) -> None: nn.Module.__init__(self) Layer.__init__(self, nodes) @@ -30,6 +30,9 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: # Group size of the nodes in the current layer self.group_size = self.nodes[0].group_size + if maximize_group_size: + min_num_groups = min([node.num_node_groups for node in self.nodes]) + self.group_size *= 2 ** (min_num_groups.bit_length() - 1) ## Parse input `nodes` ## node_vars = [] @@ -61,7 +64,7 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: cum_param_flows += ns.num_nodes * ns.dist.num_param_flows() ns._param_flow_range = (cum_param_flows - ns.num_nodes * ns.dist.num_param_flows(), cum_param_flows) - cum_source_ngroups += ns.num_node_groups + cum_source_ngroups += ns.num_nodes // self.group_size else: source_ns = ns.get_source_ns() ns._param_range = deepcopy(source_ns._param_range) @@ -106,7 +109,7 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: source_ng_start = 0 param_start = 0 for ns_id, ns in enumerate(self.nodes): - ng_end = ng_start + ns.num_node_groups + ng_end = ng_start + ns.num_nodes // self.group_size # `vids` assert len(node_vars[ns_id]) == num_vars, f"Input 
nodes in the same layer should define on the same " \ @@ -119,19 +122,21 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0) -> None: else: source_ns = ns.get_source_ns() + num_node_groups = ns.num_nodes // self.group_size + n_params_per_group = self.group_size * ns.dist.num_parameters() - gpid_offsets = torch.arange(0, ns.num_node_groups * n_params_per_group, n_params_per_group) + gpid_offsets = torch.arange(0, num_node_groups * n_params_per_group, n_params_per_group) s_pids[ng_start:ng_end] = source_ns._param_range[0] + gpid_offsets inc_pids[ng_start:ng_end] = ns.dist.num_parameters() n_pflows_per_group = self.group_size * ns.dist.num_param_flows() - gpfid_offsets = torch.arange(0, ns.num_node_groups * n_pflows_per_group, n_pflows_per_group) + gpfid_offsets = torch.arange(0, num_node_groups * n_pflows_per_group, n_pflows_per_group) s_pfids[ng_start:ng_end] = source_ns._param_flow_range[0] + gpfid_offsets inc_pfids[ng_start:ng_end] = ns.dist.num_param_flows() # `source_ngids` if not ns.is_tied(): - source_ng_end = source_ng_start + ns.num_node_groups + source_ng_end = source_ng_start + num_node_groups source_ngids[source_ng_start:source_ng_end] = torch.arange(ng_start, ng_end) source_ng_start = source_ng_end @@ -233,7 +238,6 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ metadata_ptr = self.metadata, s_mids_ptr = self.s_mids, fw_local_group_ids_ptr = fw_local_group_ids, - layer_num_node_groups = eval_num_groups, batch_size = batch_size, num_vars_per_node = self.num_vars_per_node, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), @@ -248,19 +252,21 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ # Apply missing mask if required if missing_mask is not None: assert self.num_vars_per_node == 1, "`missing_mask` only supported for univariate distributions." + assert missing_mask.dtype == torch.bool, "`missing_mask` must be boolean." 
mask_dim = missing_mask.dim() - grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) self._fw_missing_mask_kernel[grid]( missing_mask_ptr = missing_mask, node_mars_ptr = node_mars, vids_ptr = self.vids, fw_local_group_ids_ptr = fw_local_group_ids, - layer_num_node_groups = eval_num_groups, batch_size = batch_size, node_offset = node_offset, - BLOCK_SIZE = 1024, + group_size = self.group_size, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = self.group_size // TILE_SIZE_K, + BLOCK_B = BLOCK_B, partial_eval = 1 if fw_local_group_ids is not None else 0, mask_dim = mask_dim ) @@ -533,7 +539,7 @@ def _init_parameters(self, perturbation): @staticmethod def _mars_kernel_template(mar_fn, params_ptr, node_mars_ptr, data_ptr, vids_ptr, s_pids_ptr, inc_pids_ptr, metadata_ptr, s_mids_ptr, - fw_local_group_ids_ptr, partial_eval: tl.constexpr, layer_num_node_groups: tl.constexpr, batch_size: tl.constexpr, + fw_local_group_ids_ptr, partial_eval: tl.constexpr, batch_size: tl.constexpr, num_vars_per_node: tl.constexpr, nv_block_size: tl.constexpr, node_offset: tl.constexpr, group_size: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, BLOCK_B: tl.constexpr): bid = tl.program_id(axis = 0) @@ -594,37 +600,56 @@ def _mars_kernel_template(mar_fn, params_ptr, node_mars_ptr, data_ptr, vids_ptr, @staticmethod @triton.jit - def _fw_missing_mask_kernel(missing_mask_ptr, node_mars_ptr, vids_ptr, fw_local_ids_ptr, layer_num_nodes: tl.constexpr, - batch_size: tl.constexpr, node_offset: tl.constexpr, BLOCK_SIZE: tl.constexpr, - partial_eval: tl.constexpr, mask_dim: tl.constexpr): - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE + def _fw_missing_mask_kernel(missing_mask_ptr, node_mars_ptr, vids_ptr, fw_local_group_ids_ptr, group_size: tl.constexpr, + batch_size: tl.constexpr, node_offset: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, + BLOCK_B: tl.constexpr, partial_eval: tl.constexpr, mask_dim: tl.constexpr): - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < layer_num_nodes * batch_size + bid = tl.program_id(axis = 0) + ngroup_id = tl.program_id(axis = 1) - # Raw batch and (local) node id - batch_offsets = (offsets % batch_size) - local_offsets = (offsets // batch_size) + # Batch ids to process + offs_batch = bid * BLOCK_B + tl.arange(0, BLOCK_B) + mask_batch = offs_batch < batch_size if partial_eval > 0: - local_offsets = tl.load(fw_local_ids_ptr + local_offsets, mask = mask, other = 0) + ngroup_id = tl.load(fw_local_group_ids_ptr + ngroup_id) - # Get all variable ids - vids = tl.load(vids_ptr + local_offsets, mask = mask, other = 0) + # Get variable id + vid = tl.load(vids_ptr + ngroup_id) # Fetch mask if mask_dim == 1: - missing_mask = tl.load(missing_mask_ptr + vids, mask = mask, other = False) + missing_mask = tl.load(missing_mask_ptr + vid) else: - mask_offsets = vids * batch_size + batch_offsets - missing_mask = tl.load(missing_mask_ptr + mask_offsets, mask = mask, other = False) + offs_mmask = vid * batch_size + offs_batch + missing_mask = tl.load(missing_mask_ptr + offs_mmask, mask = mask_batch, other = False) + + # Initialize pointers to `node_mars` + offs_node = tl.arange(0, TILE_SIZE_K) + p_nmars = node_mars_ptr + \ + (ngroup_id * group_size + offs_node[:,None] + node_offset) * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] # Apply mask - node_offsets = (local_offsets + node_offset) * batch_size + batch_offsets - mars = tl.load(node_mars_ptr + node_offsets, mask = mask, other = 0.0) - mars 
= tl.where(missing_mask, 0.0, mars) - tl.store(node_mars_ptr + node_offsets, mars, mask = mask) + mask = mask_batch[None,:] + if mask_dim == 1: + if missing_mask: + for i in range(K_NUM_TILES): + + # mars = tl.load(p_nmars, mask = mask, other = 0.0) + tl.store(p_nmars, 0.0, mask = mask) + + # Increment pointers + p_nmars += TILE_SIZE_K * batch_size + else: + for i in range(K_NUM_TILES): + + mars = tl.load(p_nmars, mask = mask, other = 0.0) + mars = tl.where(missing_mask[None,:], 0.0, mars) + tl.store(p_nmars, mars, mask = mask) + + # Increment pointers + p_nmars += TILE_SIZE_K * batch_size @staticmethod def _flows_kernel_template(flow_fn, params_ptr, param_flows_ptr, node_flows_ptr, node_mars_ptr, data_ptr, vids_ptr, s_pids_ptr, s_pfids_ptr, diff --git a/tests/layer/input_layer_test.py b/tests/layer/input_layer_test.py index 8e26d8d4..c19b96b6 100644 --- a/tests/layer/input_layer_test.py +++ b/tests/layer/input_layer_test.py @@ -18,6 +18,7 @@ def input_layer_test(): device = torch.device("cuda:0") group_size = 4 + batch_size = 16 with juice.set_group_size(group_size): @@ -26,7 +27,7 @@ def input_layer_test(): ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) - layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = 1) + layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = 1, maximize_group_size = False) layer._init_parameters(perturbation = 2.0) @@ -43,8 +44,8 @@ def input_layer_test(): layer.to(device) - data = torch.randint(0, 2, (4, 16)).to(device) - node_mars = torch.zeros([33, 16]).to(device) + data = torch.randint(0, 2, (4, batch_size)).to(device) + node_mars = torch.zeros([33, batch_size]).to(device) ## Forward tests ## @@ -56,7 +57,31 @@ def input_layer_test(): ## Forward with mask tests ## + missing_mask = torch.tensor([0,1,0,1]).bool().to(device) + layer(data, node_mars, missing_mask = missing_mask) + + for i in range(16): + for j in range(4 * 2 * group_size): + v = j//(2*group_size) + if v == 0 or v == 2: + assert torch.abs(node_mars[j+1,i].exp() - layer.params[j*2+data[v,i]]) < 1e-4 + else: + assert torch.abs(node_mars[j+1,i].exp() - 1.0) < 1e-4 + + missing_mask = torch.randint(0, 2, (4, batch_size)).bool().to(device) + + layer(data, node_mars, missing_mask = missing_mask) + + for i in range(16): + for j in range(4 * 2 * group_size): + v = j//(2*group_size) + if not missing_mask[v,i]: + assert torch.abs(node_mars[j+1,i].exp() - layer.params[j*2+data[v,i]]) < 1e-4 + else: + assert torch.abs(node_mars[j+1,i].exp() - 1.0) < 1e-4 + + ## Backward tests ## import pdb; pdb.set_trace() @@ -65,7 +90,7 @@ def speed_test(): device = torch.device("cuda:0") - group_size = 128 + group_size = 16 num_vars = 16*16*3 num_node_groups = 256 // group_size @@ -96,9 +121,47 @@ def speed_test(): layer(data, node_mars) torch.cuda.synchronize() t1 = time.time() - print((t1 - t0) / 100 * 1000) + forward_ms = (t1 - t0) / 100 * 1000 + + print(f"Forward pass on average takes {forward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 0.048ms.") + print("--------------------------------------------------------------") + + ## Forward with mask tests ## + + missing_mask = torch.randint(0, 2, (num_vars,)).bool().to(device) + + layer(data, node_mars, missing_mask = missing_mask) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + layer(data, node_mars, missing_mask = missing_mask) + torch.cuda.synchronize() + t1 = time.time() + forward_ms = (t1 - t0) / 100 * 1000 + + 
print(f"Forward pass (w/ sample independent mask) on average takes {forward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 0.062ms.") + print("--------------------------------------------------------------") + + missing_mask = torch.randint(0, 2, (num_vars, batch_size)).bool().to(device) + + layer(data, node_mars, missing_mask = missing_mask) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + layer(data, node_mars, missing_mask = missing_mask) + torch.cuda.synchronize() + t1 = time.time() + forward_ms = (t1 - t0) / 100 * 1000 + + print(f"Forward pass (w/ sample dependent mask) on average takes {forward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 0.086ms.") + print("--------------------------------------------------------------") if __name__ == "__main__": - # input_layer_test() - speed_test() \ No newline at end of file + input_layer_test() + # speed_test() \ No newline at end of file From b7b403a9e30cf3eba30c4e464112039e93201409 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 22:14:05 +0800 Subject: [PATCH 012/162] refactor: backward pass of `InputLayer` --- src/pyjuice/layer/input_layer.py | 97 +++++++++++++------ .../nodes/distributions/categorical.py | 11 +-- tests/layer/input_layer_test.py | 19 +++- 3 files changed, 88 insertions(+), 39 deletions(-) diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index 135af960..68431b18 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -33,6 +33,7 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro if maximize_group_size: min_num_groups = min([node.num_node_groups for node in self.nodes]) self.group_size *= 2 ** (min_num_groups.bit_length() - 1) + self.group_size = min(self.group_size, 512) ## Parse input `nodes` ## node_vars = [] @@ -307,25 +308,34 @@ def backward(self, data: torch.Tensor, node_flows: torch.Tensor, if not self.provided("_flows_kernel"): self._flows_kernel = self._compile_triton_kernel(self._flows_kernel_template, flow_fn = self.bk_flow_fn) - grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) + eval_num_groups = self.num_node_groups if not self.provided("bk_local_group_ids") else self.bk_local_group_ids.size(0) + BLOCK_B = min(batch_size, 1024) + TILE_SIZE_K = min(1024 // BLOCK_B, self.group_size) + BLOCK_M = 1 + + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(eval_num_groups, BLOCK_M)) + self._flows_kernel[grid]( params_ptr = params, param_flows_ptr = self.param_flows, - node_flows_ptr = node_flows, - node_mars_ptr = node_mars, + node_flows_ptr = node_flows, data_ptr = data, vids_ptr = self.vids, s_pids_ptr = self.s_pids, + inc_pids_ptr = self.inc_pids, s_pfids_ptr = self.s_pfids, + inc_pfids_ptr = self.inc_pfids, metadata_ptr = self.metadata, s_mids_ptr = self.s_mids, bk_local_group_ids_ptr = bk_local_group_ids, - layer_num_nodes = layer_num_nodes, batch_size = batch_size, num_vars_per_node = self.num_vars_per_node, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), node_offset = node_offset, - BLOCK_SIZE = 1024, + group_size = self.group_size, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = self.group_size // TILE_SIZE_K, + BLOCK_B = BLOCK_B, partial_eval = 1 if bk_local_group_ids is not None else 0 ) @@ -652,48 +662,73 @@ def _fw_missing_mask_kernel(missing_mask_ptr, node_mars_ptr, vids_ptr, fw_local_ p_nmars += TILE_SIZE_K * batch_size @staticmethod - def _flows_kernel_template(flow_fn, params_ptr, 
param_flows_ptr, node_flows_ptr, node_mars_ptr, data_ptr, vids_ptr, s_pids_ptr, s_pfids_ptr, - metadata_ptr, s_mids_ptr, bk_local_ids_ptr, partial_eval: tl.constexpr, layer_num_nodes: tl.constexpr, + def _flows_kernel_template(flow_fn, params_ptr, param_flows_ptr, node_flows_ptr, data_ptr, vids_ptr, s_pids_ptr, inc_pids_ptr, + s_pfids_ptr, inc_pfids_ptr, metadata_ptr, s_mids_ptr, bk_local_group_ids_ptr, partial_eval: tl.constexpr, batch_size: tl.constexpr, num_vars_per_node: tl.constexpr, nv_block_size: tl.constexpr, node_offset: tl.constexpr, - BLOCK_SIZE: tl.constexpr): - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE + group_size: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, BLOCK_B: tl.constexpr): - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < layer_num_nodes * batch_size + bid = tl.program_id(axis = 0) + ngroup_id = tl.program_id(axis = 1) - # Raw batch and (local) node id - batch_offsets = (offsets % batch_size) - local_offsets = (offsets // batch_size) + # Batch ids to process + offs_batch = bid * BLOCK_B + tl.arange(0, BLOCK_B) + mask_batch = offs_batch < batch_size if partial_eval > 0: - local_offsets = tl.load(bk_local_ids_ptr + local_offsets, mask = mask, other = 0) + ngroup_id = tl.load(fw_local_group_ids_ptr + ngroup_id) if num_vars_per_node == 1: - # Get all variable ids - vids = tl.load(vids_ptr + local_offsets, mask = mask, other = 0) + # Get variable id + vid = tl.load(vids_ptr + ngroup_id) # Load the corresponding data - data = tl.load(data_ptr + vids * batch_size + batch_offsets, mask = mask, other = 0) + offs_data = vid * batch_size + offs_batch + data = tl.load(data_ptr + offs_data, mask = mask_batch, other = 0) # [BLOCK_B] else: # Get all variable ids - vids_offsets = tl.broadcast_to(local_offsets[:,None], (BLOCK_SIZE, nv_block_size)) * num_vars_per_node + \ - tl.broadcast_to(tl.arange(0, nv_block_size)[None,:], (BLOCK_SIZE, nv_block_size)) - vids_mask = tl.broadcast_to(mask[:,None], (BLOCK_SIZE, nv_block_size)) & \ - tl.broadcast_to((tl.arange(0, nv_block_size) < num_vars_per_node)[None,:], (BLOCK_SIZE, nv_block_size)) - vids = tl.load(vids_ptr + vids_offsets, mask = vids_mask, other = 0) + offs_vs = tl.arange(0, nv_block_size) + mask_vs = offs_vs < num_vars_per_node + offs_vids = ngroup_id * num_vars_per_node + offs_vs + mask_vids = mask_vs + vids = tl.load(vids_ptr + offs_vids, mask = mask_vids, other = 0) # Load the corresponding data - data = tl.load(data_ptr + vids * batch_size + batch_offsets, mask = vids_mask, other = 0) + offs_data = vids[:,None] * batch_size + offs_batch[None,:] + data = tl.load(data_ptr + offs_data, mask = (mask_vids[:,None] & mask_batch[None,:]), other = 0) - s_pids = tl.load(s_pids_ptr + local_offsets, mask = mask, other = 0) - s_pfids = tl.load(s_pfids_ptr + local_offsets, mask = mask, other = 0) + # Initialize pointers to `params` + off_params = tl.load(s_pids_ptr + ngroup_id) + inc_params = tl.load(inc_pids_ptr + ngroup_id) + offs_node = tl.arange(0, TILE_SIZE_K) + p_params = params_ptr + off_params + inc_params * offs_node # [TILE_SIZE_K] - ns_offsets = (local_offsets + node_offset) * batch_size + batch_offsets - flows = tl.load(node_flows_ptr + ns_offsets, mask = mask, other = 0) + # Initialize pointers to `param_flows` + off_parflows = tl.load(s_pfids_ptr + ngroup_id) + inc_parflows = tl.load(inc_pfids_ptr + ngroup_id) + p_parflows = param_flows_ptr + off_parflows + inc_parflows * offs_node # [TILE_SIZE_K] - flow_fn(local_offsets, ns_offsets, data, flows, node_mars_ptr, 
params_ptr, param_flows_ptr, s_pids, s_pfids, metadata_ptr, - s_mids_ptr, mask, num_vars_per_node, BLOCK_SIZE) + # Initialize pointers to `metadata` + offs_metadata = tl.load(s_mids_ptr + ngroup_id) + p_metadata = metadata_ptr + offs_metadata # [1] + + # Initialize pointers to `node_mars` + p_nflows = node_flows_ptr + \ + (ngroup_id * group_size + offs_node[:,None] + node_offset) * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + # Inner loop to process everything in the node group + mask = mask_batch[None,:] + for i in range(K_NUM_TILES): + + # Read out the flows + flows = tl.load(p_nflows, mask = mask, other = 0) + + flow_fn(flows, data, p_parflows, p_params, p_metadata, mask, num_vars_per_node) + + # Increment pointers + p_params += inc_params * TILE_SIZE_K + p_parflows += inc_parflows * TILE_SIZE_K + p_nflows += TILE_SIZE_K * batch_size @staticmethod def _sample_kernel_template(sample_fn, samples_ptr, params_ptr, nflow_xids_ptr, nflow_yids_ptr, vids_ptr, s_pids_ptr, metadata_ptr, s_mids_ptr, diff --git a/src/pyjuice/nodes/distributions/categorical.py b/src/pyjuice/nodes/distributions/categorical.py index 4267af1e..c9088185 100644 --- a/src/pyjuice/nodes/distributions/categorical.py +++ b/src/pyjuice/nodes/distributions/categorical.py @@ -45,13 +45,10 @@ def fw_mar_fn(data, p_params, p_metadata, mask, num_vars_per_node): return log_probs @staticmethod - def bk_flow_fn(local_offsets, ns_offsets, data, flows, node_mars_ptr, params_ptr, param_flows_ptr, s_pids, s_pfids, metadata_ptr, - s_mids_ptr, mask, num_vars_per_node, BLOCK_SIZE): - # I am not sure why, but the following code will not work... - # tl.atomic_add(param_flows_ptr + s_pfids + data, flows, mask = mask) - # Seems like a bug of triton. - pf_offsets = s_pfids + data - tl.atomic_add(param_flows_ptr + pf_offsets, flows, mask = mask) + def bk_flow_fn(flows, data, p_parflows, p_params, p_metadata, mask, num_vars_per_node): + + p_tarflows = p_parflows[:,None] + data[None,:] + tl.atomic_add(p_tarflows, flows, mask = mask) @staticmethod def sample_fn(samples_ptr, local_offsets, batch_offsets, vids, s_pids, params_ptr, metadata_ptr, s_mids_ptr, mask, batch_size, BLOCK_SIZE, seed): diff --git a/tests/layer/input_layer_test.py b/tests/layer/input_layer_test.py index c19b96b6..d2f81503 100644 --- a/tests/layer/input_layer_test.py +++ b/tests/layer/input_layer_test.py @@ -75,7 +75,7 @@ def input_layer_test(): for i in range(16): for j in range(4 * 2 * group_size): - v = j//(2*group_size) + v = j // (2*group_size) if not missing_mask[v,i]: assert torch.abs(node_mars[j+1,i].exp() - layer.params[j*2+data[v,i]]) < 1e-4 else: @@ -83,6 +83,23 @@ def input_layer_test(): ## Backward tests ## + node_flows = torch.rand([33, batch_size]).to(device) + + layer.init_param_flows(flows_memory = 0.0) + + layer(data, node_mars) + layer.backward(data, node_flows, node_mars) + + param_flows = torch.zeros([group_size * 2 * 4 * 2]).to(device) + + for i in range(16): + for j in range(4 * 2 * group_size): + v = j // (2*group_size) + + param_flows[j*2+data[j//(2*group_size),i]] += node_flows[j+1,i] + + assert torch.all(torch.abs(layer.param_flows - param_flows) < 1e-4) + import pdb; pdb.set_trace() From 8049945ae28d3b4b8f85f29dda4edec0ca26ffea Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 22:33:30 +0800 Subject: [PATCH 013/162] fix: backward pass of `InputLayer` --- src/pyjuice/layer/input_layer.py | 4 +++- tests/layer/input_layer_test.py | 25 +++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git 
a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index 68431b18..ed825a1a 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -723,7 +723,9 @@ def _flows_kernel_template(flow_fn, params_ptr, param_flows_ptr, node_flows_ptr, # Read out the flows flows = tl.load(p_nflows, mask = mask, other = 0) - flow_fn(flows, data, p_parflows, p_params, p_metadata, mask, num_vars_per_node) + # flow_fn(flows, data, p_parflows, p_params, p_metadata, mask, num_vars_per_node) + p_tarflows = p_parflows[:,None] + data[None,:] + tl.atomic_add(p_tarflows, flows, mask = mask) # Increment pointers p_params += inc_params * TILE_SIZE_K diff --git a/tests/layer/input_layer_test.py b/tests/layer/input_layer_test.py index d2f81503..66b5601e 100644 --- a/tests/layer/input_layer_test.py +++ b/tests/layer/input_layer_test.py @@ -178,7 +178,28 @@ def speed_test(): print("Reference computation time on RTX 4090: 0.086ms.") print("--------------------------------------------------------------") + ## Backward tests ## + + node_flows = torch.rand([1 + group_size * num_node_groups * num_vars, batch_size]).to(device) + + layer.init_param_flows(flows_memory = 0.0) + + layer(data, node_mars) + layer.backward(data, node_flows, node_mars) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + layer.backward(data, node_flows, node_mars) + torch.cuda.synchronize() + t1 = time.time() + forward_ms = (t1 - t0) / 100 * 1000 + + print(f"Backward pass on average takes {forward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 0.086ms.") + print("--------------------------------------------------------------") + if __name__ == "__main__": - input_layer_test() - # speed_test() \ No newline at end of file + # input_layer_test() + speed_test() \ No newline at end of file From a2b9ca0af1a98dffdb9ac4176de59ef62f9760fc Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 1 Dec 2023 23:29:19 +0800 Subject: [PATCH 014/162] one working but seemingly slow version.. 
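This revision makes the grouped input-layer kernels pass the correctness tests again: the backward kernel now reads `bk_local_group_ids_ptr` rather than the forward buffer and dispatches to the distribution's `flow_fn` instead of the inlined Categorical-specific atomic add, the unused `tot_num_nodes`/`layer_num_nodes` bookkeeping is dropped, and the speed test is scaled down (group size 4, 28x28 variables) while the remaining slowdown is investigated.

For reference, the quantity the Categorical backward pass accumulates is the same one checked in tests/layer/input_layer_test.py. A minimal sketch of that reference computation, assuming univariate Categorical inputs with `data` laid out as [num_vars, batch_size] (the function name and the explicit loops are illustrative, not part of the library):

    def categorical_flows_reference(data, node_flows, vids, s_pfids, node_offset, param_flows):
        # For every input node j and sample i, add the node flow to the
        # parameter-flow slot of the category observed for j's variable.
        num_nodes = len(vids)
        batch_size = data.shape[1]
        for j in range(num_nodes):
            v = int(vids[j])
            for i in range(batch_size):
                param_flows[s_pfids[j] + data[v, i]] += node_flows[node_offset + j, i]
        return param_flows

The Triton kernel computes exactly this, but tiled over (node group, batch) blocks and with `tl.atomic_add` to resolve collisions between nodes that share parameter-flow slots.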
--- src/pyjuice/layer/input_layer.py | 12 ++---------- tests/layer/input_layer_test.py | 8 ++++---- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index ed825a1a..960afec8 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -208,15 +208,12 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ # Need to flatten data to ensure the memory is aligned following [num_vars, batch_size] data = data.reshape(-1).contiguous() - tot_num_nodes = node_mars.size(0) batch_size = node_mars.size(1) node_offset = self._output_ind_range[0] if not self.provided("fw_local_group_ids"): - layer_num_nodes = self._output_ind_range[1] - self._output_ind_range[0] fw_local_group_ids = None else: - layer_num_nodes = self.fw_local_group_ids.size(0) fw_local_group_ids = self.fw_local_group_ids if not self.provided("_mars_kernel"): @@ -294,15 +291,12 @@ def backward(self, data: torch.Tensor, node_flows: torch.Tensor, # Need to flatten data to ensure the memory is aligned following [num_vars, batch_size] data = data.reshape(-1).contiguous() - tot_num_nodes = node_flows.size(0) batch_size = node_flows.size(1) node_offset = self._output_ind_range[0] if not self.provided("bk_local_group_ids"): - layer_num_nodes = self._output_ind_range[1] - self._output_ind_range[0] bk_local_group_ids = None else: - layer_num_nodes = self.bk_local_group_ids.size(0) bk_local_group_ids = self.bk_local_group_ids if not self.provided("_flows_kernel"): @@ -675,7 +669,7 @@ def _flows_kernel_template(flow_fn, params_ptr, param_flows_ptr, node_flows_ptr, mask_batch = offs_batch < batch_size if partial_eval > 0: - ngroup_id = tl.load(fw_local_group_ids_ptr + ngroup_id) + ngroup_id = tl.load(bk_local_group_ids_ptr + ngroup_id) if num_vars_per_node == 1: # Get variable id @@ -723,9 +717,7 @@ def _flows_kernel_template(flow_fn, params_ptr, param_flows_ptr, node_flows_ptr, # Read out the flows flows = tl.load(p_nflows, mask = mask, other = 0) - # flow_fn(flows, data, p_parflows, p_params, p_metadata, mask, num_vars_per_node) - p_tarflows = p_parflows[:,None] + data[None,:] - tl.atomic_add(p_tarflows, flows, mask = mask) + flow_fn(flows, data, p_parflows, p_params, p_metadata, mask, num_vars_per_node) # Increment pointers p_params += inc_params * TILE_SIZE_K diff --git a/tests/layer/input_layer_test.py b/tests/layer/input_layer_test.py index 66b5601e..b7ea74f2 100644 --- a/tests/layer/input_layer_test.py +++ b/tests/layer/input_layer_test.py @@ -107,8 +107,8 @@ def speed_test(): device = torch.device("cuda:0") - group_size = 16 - num_vars = 16*16*3 + group_size = 4 + num_vars = 28*28 num_node_groups = 256 // group_size batch_size = 512 @@ -119,14 +119,14 @@ def speed_test(): for v in range(num_vars): nis.append(inputs(0, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 64))) - layer = InputLayer(nis, cum_nodes = 1) + layer = InputLayer(nis, cum_nodes = 1, maximize_group_size = False) layer._init_parameters(perturbation = 2.0) layer.to(device) data = torch.randint(0, 64, (num_vars, batch_size)).to(device) - node_mars = torch.zeros([1 + group_size * num_node_groups * num_vars, 16]).to(device) + node_mars = torch.zeros([1 + group_size * num_node_groups * num_vars, batch_size]).to(device) ## Forward tests ## From 007e6c7b64eb79ea303e6264b0bb506d877a40eb Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 2 Dec 2023 21:51:38 +0800 Subject: [PATCH 015/162] switch back to the old input kernels 
since they are faster --- .gitignore | 1 + src/pyjuice/layer/input_layer.py | 376 +++++++----------- .../nodes/distributions/categorical.py | 21 +- .../nodes/distributions/distributions.py | 12 +- tests/layer/input_layer_test.py | 32 +- 5 files changed, 173 insertions(+), 269 deletions(-) diff --git a/.gitignore b/.gitignore index 95b3f888..5b1aa15d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ __pycache__/ *.so temp.npz +out.ncu-rep # Distribution / packaging .Python diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index 960afec8..5275a40b 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -28,13 +28,6 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro # Reorder input nodes such that for any tied nodes, its source nodes appear before them self.nodes = self._reorder_nodes(nodes) - # Group size of the nodes in the current layer - self.group_size = self.nodes[0].group_size - if maximize_group_size: - min_num_groups = min([node.num_node_groups for node in self.nodes]) - self.group_size *= 2 ** (min_num_groups.bit_length() - 1) - self.group_size = min(self.group_size, 512) - ## Parse input `nodes` ## node_vars = [] node_sizes = [] @@ -42,13 +35,13 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro layer_num_nodes = 0 cum_params = 0 cum_param_flows = 0 - cum_source_ngroups = 0 + cum_source_ns = 0 dist_signature = None for ns in self.nodes: if dist_signature is None: dist_signature = ns.dist.get_signature() else: - assert dist_signature == ns.dist.get_signature(), f"Nodes of an InputLayer must have the same distribution type, but got `{dist_signature}` and `{ns.dist.get_signature()}`." + assert dist_signature == ns.dist.get_signature(), "Nodes of an InputLayer must have the same distribution type." 
node_vars.append(ns.scope.to_list()) node_sizes.append(ns.num_nodes) @@ -65,7 +58,7 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro cum_param_flows += ns.num_nodes * ns.dist.num_param_flows() ns._param_flow_range = (cum_param_flows - ns.num_nodes * ns.dist.num_param_flows(), cum_param_flows) - cum_source_ngroups += ns.num_nodes // self.group_size + cum_source_ns += ns.num_nodes else: source_ns = ns.get_source_ns() ns._param_range = deepcopy(source_ns._param_range) @@ -74,7 +67,6 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro self.num_params = cum_params self.num_param_flows = cum_param_flows self.num_nodes = layer_num_nodes - self.num_node_groups = self.num_nodes // self.group_size self.dist_signature = dist_signature # Store the triton kernel functions implemented by the target `Distribution` @@ -85,78 +77,60 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro ## Prepair and compile the layer ## num_vars = len(node_vars[0]) - # Start variable index: vids[i,:] are the variables of the ith node group - vids = torch.empty([self.num_node_groups, num_vars], dtype = torch.long) - # Start parameter index: params[s_pids[i]] is the first parameter of the 1st node in the ith node group - s_pids = torch.empty([self.num_node_groups], dtype = torch.long) - # Pointer increment of the parameters: params[s_pids[i]+j*inc_pids[i]] is the first parameter - # of the (j+1)th node in the ith node group - inc_pids = torch.empty([self.num_node_groups], dtype = torch.long) - # Start parameter flow index: param_flows[s_pfids[i]] is the first parameter flow of the 1st node in the ith node group - s_pfids = torch.empty([self.num_node_groups], dtype = torch.long) - # Pointer increment of the parameters: param_flows[s_pfids[i]+j*inc_pfids[i]] is the first parameter flow - # of the (j+1)th node in the ith node group - inc_pfids = torch.empty([self.num_node_groups], dtype = torch.long) - # Start metadata index: metadata[s_mids[i]] is the first metadata of the 1th node in the ith node group + # Start variable index: vids[i,:] are the variables of the ith node + vids = torch.empty([self.num_nodes, num_vars], dtype = torch.long) + # Start parameter index: params[s_pids[i]] is the first parameter of the ith node + s_pids = torch.empty([self.num_nodes], dtype = torch.long) + # Start parameter flow index: param_flows[s_pfids[i]] is the first parameter flow of the ith node + s_pfids = torch.empty([self.num_nodes], dtype = torch.long) + # Start metadata index: metadata[s_mids[i]] is the first metadata of the ith node metadata = [] - s_mids = torch.empty([self.num_node_groups], dtype = torch.long) - # source node group ids (nodes with their original parameters) - source_ngids = torch.empty([cum_source_ngroups], dtype = torch.long) + s_mids = torch.empty([self.num_nodes], dtype = torch.long) + # source node ids (nodes with their original parameters) + source_nids = torch.empty([cum_source_ns], dtype = torch.long) # Parameters of this layer params = torch.empty([self.num_params], dtype = torch.float32) - ng_start = 0 - source_ng_start = 0 - param_start = 0 + n_start = 0 + source_n_start = 0 for ns_id, ns in enumerate(self.nodes): - ng_end = ng_start + ns.num_nodes // self.group_size + n_end = n_start + ns.num_nodes # `vids` - assert len(node_vars[ns_id]) == num_vars, f"Input nodes in the same layer should define on the same " \ - f"number of variables, but got {len(node_vars[ns_id])} and {num_vars}." 
- vids[ng_start:ng_end,:] = torch.tensor(node_vars[ns_id]).view(1, -1) + assert len(node_vars[ns_id]) == num_vars + vids[n_start:n_end,:] = torch.tensor(node_vars[ns_id]).view(1, -1) # `s_pids` and `s_pfids` if not ns.is_tied(): source_ns = ns else: source_ns = ns.get_source_ns() + pid_offsets = torch.arange(0, ns.num_nodes * ns.dist.num_parameters(), ns.dist.num_parameters()) + s_pids[n_start:n_end] = source_ns._param_range[0] + pid_offsets + pfid_offsets = torch.arange(0, ns.num_nodes * ns.dist.num_param_flows(), ns.dist.num_param_flows()) + s_pfids[n_start:n_end] = source_ns._param_flow_range[0] + pfid_offsets - num_node_groups = ns.num_nodes // self.group_size - - n_params_per_group = self.group_size * ns.dist.num_parameters() - gpid_offsets = torch.arange(0, num_node_groups * n_params_per_group, n_params_per_group) - s_pids[ng_start:ng_end] = source_ns._param_range[0] + gpid_offsets - inc_pids[ng_start:ng_end] = ns.dist.num_parameters() - - n_pflows_per_group = self.group_size * ns.dist.num_param_flows() - gpfid_offsets = torch.arange(0, num_node_groups * n_pflows_per_group, n_pflows_per_group) - s_pfids[ng_start:ng_end] = source_ns._param_flow_range[0] + gpfid_offsets - inc_pfids[ng_start:ng_end] = ns.dist.num_param_flows() - - # `source_ngids` + # `source_nids` if not ns.is_tied(): - source_ng_end = source_ng_start + num_node_groups - source_ngids[source_ng_start:source_ng_end] = torch.arange(ng_start, ng_end) - source_ng_start = source_ng_end + source_n_end = source_n_start + ns.num_nodes + source_nids[source_n_start:source_n_end] = torch.arange(n_start, n_end) + source_n_start = source_n_end # `metadata` and `s_mids` - s_mids[ng_start:ng_end] = len(metadata) + s_mids[n_start:n_end] = len(metadata) metadata.extend(node_metadata[ns_id]) - ng_start = ng_end + n_start = n_end self.register_buffer("vids", vids) self.register_buffer("s_pids", s_pids) - self.register_buffer("inc_pids", inc_pids) self.register_buffer("s_pfids", s_pfids) - self.register_buffer("inc_pfids", inc_pfids) self.register_buffer("metadata", torch.tensor(metadata).float()) self.register_buffer("s_mids", s_mids) - self.register_buffer("source_ngids", source_ngids) + self.register_buffer("source_nids", source_nids) - self.params = nn.Parameter(params) # Parameters will be set later in `self._init_parameters()` + self.params = nn.Parameter(params) # Due to the custom inplace backward pass implementation, we do not track # gradient of PC parameters by PyTorch. 
self.params.requires_grad = False @@ -208,64 +182,56 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ # Need to flatten data to ensure the memory is aligned following [num_vars, batch_size] data = data.reshape(-1).contiguous() + tot_num_nodes = node_mars.size(0) batch_size = node_mars.size(1) node_offset = self._output_ind_range[0] - if not self.provided("fw_local_group_ids"): - fw_local_group_ids = None + if not self.provided("fw_local_ids"): + layer_num_nodes = self._output_ind_range[1] - self._output_ind_range[0] + fw_local_ids = None else: - fw_local_group_ids = self.fw_local_group_ids + layer_num_nodes = self.fw_local_ids.size(0) + fw_local_ids = self.fw_local_ids if not self.provided("_mars_kernel"): self._mars_kernel = self._compile_triton_kernel(self._mars_kernel_template, mar_fn = self.fw_mar_fn) - eval_num_groups = self.num_node_groups if not self.provided("fw_local_group_ids") else self.fw_local_group_ids.size(0) - BLOCK_B = min(batch_size, 1024) - TILE_SIZE_K = min(1024 // BLOCK_B, self.group_size) - BLOCK_M = 1 - - grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(eval_num_groups, BLOCK_M)) - + grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) self._mars_kernel[grid]( - params_ptr = params, + params_ptr = self.params, node_mars_ptr = node_mars, data_ptr = data, vids_ptr = self.vids, - s_pids_ptr = self.s_pids, - inc_pids_ptr = self.inc_pids, + s_pids_ptr = self.s_pids, metadata_ptr = self.metadata, s_mids_ptr = self.s_mids, - fw_local_group_ids_ptr = fw_local_group_ids, + fw_local_ids_ptr = fw_local_ids, + layer_num_nodes = layer_num_nodes, batch_size = batch_size, num_vars_per_node = self.num_vars_per_node, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), node_offset = node_offset, - group_size = self.group_size, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = self.group_size // TILE_SIZE_K, - BLOCK_B = BLOCK_B, - partial_eval = 1 if fw_local_group_ids is not None else 0 + BLOCK_SIZE = 1024, + partial_eval = 1 if fw_local_ids is not None else 0 ) # Apply missing mask if required if missing_mask is not None: assert self.num_vars_per_node == 1, "`missing_mask` only supported for univariate distributions." - assert missing_mask.dtype == torch.bool, "`missing_mask` must be boolean." 
mask_dim = missing_mask.dim() + grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) self._fw_missing_mask_kernel[grid]( missing_mask_ptr = missing_mask, node_mars_ptr = node_mars, vids_ptr = self.vids, - fw_local_group_ids_ptr = fw_local_group_ids, + fw_local_ids_ptr = fw_local_ids, + layer_num_nodes = layer_num_nodes, batch_size = batch_size, node_offset = node_offset, - group_size = self.group_size, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = self.group_size // TILE_SIZE_K, - BLOCK_B = BLOCK_B, - partial_eval = 1 if fw_local_group_ids is not None else 0, + BLOCK_SIZE = 1024, + partial_eval = 1 if fw_local_ids is not None else 0, mask_dim = mask_dim ) @@ -291,46 +257,40 @@ def backward(self, data: torch.Tensor, node_flows: torch.Tensor, # Need to flatten data to ensure the memory is aligned following [num_vars, batch_size] data = data.reshape(-1).contiguous() + tot_num_nodes = node_flows.size(0) batch_size = node_flows.size(1) node_offset = self._output_ind_range[0] - if not self.provided("bk_local_group_ids"): - bk_local_group_ids = None + if not self.provided("bk_local_ids"): + layer_num_nodes = self._output_ind_range[1] - self._output_ind_range[0] + bk_local_ids = None else: - bk_local_group_ids = self.bk_local_group_ids + layer_num_nodes = self.bk_local_ids.size(0) + bk_local_ids = self.bk_local_ids if not self.provided("_flows_kernel"): self._flows_kernel = self._compile_triton_kernel(self._flows_kernel_template, flow_fn = self.bk_flow_fn) - eval_num_groups = self.num_node_groups if not self.provided("bk_local_group_ids") else self.bk_local_group_ids.size(0) - BLOCK_B = min(batch_size, 1024) - TILE_SIZE_K = min(1024 // BLOCK_B, self.group_size) - BLOCK_M = 1 - - grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(eval_num_groups, BLOCK_M)) - + grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) self._flows_kernel[grid]( - params_ptr = params, + params_ptr = self.params, param_flows_ptr = self.param_flows, - node_flows_ptr = node_flows, + node_flows_ptr = node_flows, + node_mars_ptr = node_mars, data_ptr = data, vids_ptr = self.vids, s_pids_ptr = self.s_pids, - inc_pids_ptr = self.inc_pids, s_pfids_ptr = self.s_pfids, - inc_pfids_ptr = self.inc_pfids, metadata_ptr = self.metadata, s_mids_ptr = self.s_mids, - bk_local_group_ids_ptr = bk_local_group_ids, + bk_local_ids_ptr = bk_local_ids, + layer_num_nodes = layer_num_nodes, batch_size = batch_size, num_vars_per_node = self.num_vars_per_node, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), node_offset = node_offset, - group_size = self.group_size, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = self.group_size // TILE_SIZE_K, - BLOCK_B = BLOCK_B, - partial_eval = 1 if bk_local_group_ids is not None else 0 + BLOCK_SIZE = 1024, + partial_eval = 1 if bk_local_ids is not None else 0 ) else: @@ -542,187 +502,123 @@ def _init_parameters(self, perturbation): p_start = p_end @staticmethod - def _mars_kernel_template(mar_fn, params_ptr, node_mars_ptr, data_ptr, vids_ptr, s_pids_ptr, inc_pids_ptr, metadata_ptr, s_mids_ptr, - fw_local_group_ids_ptr, partial_eval: tl.constexpr, batch_size: tl.constexpr, - num_vars_per_node: tl.constexpr, nv_block_size: tl.constexpr, node_offset: tl.constexpr, group_size: tl.constexpr, - TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, BLOCK_B: tl.constexpr): - bid = tl.program_id(axis = 0) - ngroup_id = tl.program_id(axis = 1) + def _mars_kernel_template(mar_fn, params_ptr, node_mars_ptr, data_ptr, vids_ptr, s_pids_ptr, 
metadata_ptr, s_mids_ptr, + fw_local_ids_ptr, partial_eval: tl.constexpr, layer_num_nodes: tl.constexpr, batch_size: tl.constexpr, + num_vars_per_node: tl.constexpr, nv_block_size: tl.constexpr, node_offset: tl.constexpr, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(axis = 0) + block_start = pid * BLOCK_SIZE - # Batch ids to process - offs_batch = bid * BLOCK_B + tl.arange(0, BLOCK_B) - mask_batch = offs_batch < batch_size + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < layer_num_nodes * batch_size + + # Raw batch and (local) node id + batch_offsets = (offsets % batch_size) + local_offsets = (offsets // batch_size) if partial_eval > 0: - ngroup_id = tl.load(fw_local_group_ids_ptr + ngroup_id) + local_offsets = tl.load(fw_local_ids_ptr + local_offsets, mask = mask, other = 0) if num_vars_per_node == 1: - # Get variable id - vid = tl.load(vids_ptr + ngroup_id) + # Get all variable ids + vids = tl.load(vids_ptr + local_offsets, mask = mask, other = 0) # Load the corresponding data - offs_data = vid * batch_size + offs_batch - data = tl.load(data_ptr + offs_data, mask = mask_batch, other = 0) # [BLOCK_B] + data = tl.load(data_ptr + vids * batch_size + batch_offsets, mask = mask, other = 0) else: # Get all variable ids - offs_vs = tl.arange(0, nv_block_size) - mask_vs = offs_vs < num_vars_per_node - offs_vids = ngroup_id * num_vars_per_node + offs_vs - mask_vids = mask_vs - vids = tl.load(vids_ptr + offs_vids, mask = mask_vids, other = 0) + vids_offsets = tl.broadcast_to(local_offsets[:,None], (BLOCK_SIZE, nv_block_size)) * num_vars_per_node + \ + tl.broadcast_to(tl.arange(0, nv_block_size)[None,:], (BLOCK_SIZE, nv_block_size)) + vids_mask = tl.broadcast_to(mask[:,None], (BLOCK_SIZE, nv_block_size)) & \ + tl.broadcast_to((tl.arange(0, nv_block_size) < num_vars_per_node)[None,:], (BLOCK_SIZE, nv_block_size)) + vids = tl.load(vids_ptr + vids_offsets, mask = vids_mask, other = 0) # Load the corresponding data - offs_data = vids[:,None] * batch_size + offs_batch[None,:] - data = tl.load(data_ptr + offs_data, mask = (mask_vids[:,None] & mask_batch[None,:]), other = 0) - - # Initialize pointers to `params` - off_params = tl.load(s_pids_ptr + ngroup_id) - inc_params = tl.load(inc_pids_ptr + ngroup_id) - offs_node = tl.arange(0, TILE_SIZE_K) - p_params = params_ptr + off_params + inc_params * offs_node # [TILE_SIZE_K] - - # Initialize pointers to `metadata` - offs_metadata = tl.load(s_mids_ptr + ngroup_id) - p_metadata = metadata_ptr + offs_metadata # [1] + data = tl.load(data_ptr + vids * batch_size + batch_offsets, mask = vids_mask, other = 0) - # Initialize pointers to `node_mars` - p_nmars = node_mars_ptr + \ - (ngroup_id * group_size + offs_node[:,None] + node_offset) * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - - # Inner loop to process everything in the node group - mask = mask_batch[None,:] - for i in range(K_NUM_TILES): - - mars = mar_fn(data, p_params, p_metadata, mask, num_vars_per_node) + s_pids = tl.load(s_pids_ptr + local_offsets, mask = mask, other = 0) - tl.store(p_nmars, mars, mask = mask) + mars = mar_fn(local_offsets, data, params_ptr, s_pids, metadata_ptr, s_mids_ptr, mask, num_vars_per_node, BLOCK_SIZE) - # Increment pointers - p_params += inc_params * TILE_SIZE_K - p_nmars += TILE_SIZE_K * batch_size + node_offsets = local_offsets + node_offset + tl.store(node_mars_ptr + node_offsets * batch_size + batch_offsets, mars, mask = mask) @staticmethod @triton.jit - def _fw_missing_mask_kernel(missing_mask_ptr, node_mars_ptr, vids_ptr, 
fw_local_group_ids_ptr, group_size: tl.constexpr, - batch_size: tl.constexpr, node_offset: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, - BLOCK_B: tl.constexpr, partial_eval: tl.constexpr, mask_dim: tl.constexpr): + def _fw_missing_mask_kernel(missing_mask_ptr, node_mars_ptr, vids_ptr, fw_local_ids_ptr, layer_num_nodes: tl.constexpr, + batch_size: tl.constexpr, node_offset: tl.constexpr, BLOCK_SIZE: tl.constexpr, + partial_eval: tl.constexpr, mask_dim: tl.constexpr): + pid = tl.program_id(axis = 0) + block_start = pid * BLOCK_SIZE - bid = tl.program_id(axis = 0) - ngroup_id = tl.program_id(axis = 1) + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < layer_num_nodes * batch_size - # Batch ids to process - offs_batch = bid * BLOCK_B + tl.arange(0, BLOCK_B) - mask_batch = offs_batch < batch_size + # Raw batch and (local) node id + batch_offsets = (offsets % batch_size) + local_offsets = (offsets // batch_size) if partial_eval > 0: - ngroup_id = tl.load(fw_local_group_ids_ptr + ngroup_id) + local_offsets = tl.load(fw_local_ids_ptr + local_offsets, mask = mask, other = 0) - # Get variable id - vid = tl.load(vids_ptr + ngroup_id) + # Get all variable ids + vids = tl.load(vids_ptr + local_offsets, mask = mask, other = 0) # Fetch mask if mask_dim == 1: - missing_mask = tl.load(missing_mask_ptr + vid) + missing_mask = tl.load(missing_mask_ptr + vids, mask = mask, other = False) else: - offs_mmask = vid * batch_size + offs_batch - missing_mask = tl.load(missing_mask_ptr + offs_mmask, mask = mask_batch, other = False) - - # Initialize pointers to `node_mars` - offs_node = tl.arange(0, TILE_SIZE_K) - p_nmars = node_mars_ptr + \ - (ngroup_id * group_size + offs_node[:,None] + node_offset) * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + mask_offsets = vids * batch_size + batch_offsets + missing_mask = tl.load(missing_mask_ptr + mask_offsets, mask = mask, other = False) # Apply mask - mask = mask_batch[None,:] - if mask_dim == 1: - if missing_mask: - for i in range(K_NUM_TILES): - - # mars = tl.load(p_nmars, mask = mask, other = 0.0) - tl.store(p_nmars, 0.0, mask = mask) - - # Increment pointers - p_nmars += TILE_SIZE_K * batch_size - else: - for i in range(K_NUM_TILES): - - mars = tl.load(p_nmars, mask = mask, other = 0.0) - mars = tl.where(missing_mask[None,:], 0.0, mars) - tl.store(p_nmars, mars, mask = mask) - - # Increment pointers - p_nmars += TILE_SIZE_K * batch_size + node_offsets = (local_offsets + node_offset) * batch_size + batch_offsets + mars = tl.load(node_mars_ptr + node_offsets, mask = mask, other = 0.0) + mars = tl.where(missing_mask, 0.0, mars) + tl.store(node_mars_ptr + node_offsets, mars, mask = mask) @staticmethod - def _flows_kernel_template(flow_fn, params_ptr, param_flows_ptr, node_flows_ptr, data_ptr, vids_ptr, s_pids_ptr, inc_pids_ptr, - s_pfids_ptr, inc_pfids_ptr, metadata_ptr, s_mids_ptr, bk_local_group_ids_ptr, partial_eval: tl.constexpr, + def _flows_kernel_template(flow_fn, params_ptr, param_flows_ptr, node_flows_ptr, node_mars_ptr, data_ptr, vids_ptr, s_pids_ptr, s_pfids_ptr, + metadata_ptr, s_mids_ptr, bk_local_ids_ptr, partial_eval: tl.constexpr, layer_num_nodes: tl.constexpr, batch_size: tl.constexpr, num_vars_per_node: tl.constexpr, nv_block_size: tl.constexpr, node_offset: tl.constexpr, - group_size: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, BLOCK_B: tl.constexpr): + BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(axis = 0) + block_start = pid * BLOCK_SIZE - bid = tl.program_id(axis 
= 0) - ngroup_id = tl.program_id(axis = 1) + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < layer_num_nodes * batch_size - # Batch ids to process - offs_batch = bid * BLOCK_B + tl.arange(0, BLOCK_B) - mask_batch = offs_batch < batch_size + # Raw batch and (local) node id + batch_offsets = (offsets % batch_size) + local_offsets = (offsets // batch_size) if partial_eval > 0: - ngroup_id = tl.load(bk_local_group_ids_ptr + ngroup_id) + local_offsets = tl.load(bk_local_ids_ptr + local_offsets, mask = mask, other = 0) if num_vars_per_node == 1: - # Get variable id - vid = tl.load(vids_ptr + ngroup_id) + # Get all variable ids + vids = tl.load(vids_ptr + local_offsets, mask = mask, other = 0) # Load the corresponding data - offs_data = vid * batch_size + offs_batch - data = tl.load(data_ptr + offs_data, mask = mask_batch, other = 0) # [BLOCK_B] + data = tl.load(data_ptr + vids * batch_size + batch_offsets, mask = mask, other = 0) else: # Get all variable ids - offs_vs = tl.arange(0, nv_block_size) - mask_vs = offs_vs < num_vars_per_node - offs_vids = ngroup_id * num_vars_per_node + offs_vs - mask_vids = mask_vs - vids = tl.load(vids_ptr + offs_vids, mask = mask_vids, other = 0) + vids_offsets = tl.broadcast_to(local_offsets[:,None], (BLOCK_SIZE, nv_block_size)) * num_vars_per_node + \ + tl.broadcast_to(tl.arange(0, nv_block_size)[None,:], (BLOCK_SIZE, nv_block_size)) + vids_mask = tl.broadcast_to(mask[:,None], (BLOCK_SIZE, nv_block_size)) & \ + tl.broadcast_to((tl.arange(0, nv_block_size) < num_vars_per_node)[None,:], (BLOCK_SIZE, nv_block_size)) + vids = tl.load(vids_ptr + vids_offsets, mask = vids_mask, other = 0) # Load the corresponding data - offs_data = vids[:,None] * batch_size + offs_batch[None,:] - data = tl.load(data_ptr + offs_data, mask = (mask_vids[:,None] & mask_batch[None,:]), other = 0) - - # Initialize pointers to `params` - off_params = tl.load(s_pids_ptr + ngroup_id) - inc_params = tl.load(inc_pids_ptr + ngroup_id) - offs_node = tl.arange(0, TILE_SIZE_K) - p_params = params_ptr + off_params + inc_params * offs_node # [TILE_SIZE_K] - - # Initialize pointers to `param_flows` - off_parflows = tl.load(s_pfids_ptr + ngroup_id) - inc_parflows = tl.load(inc_pfids_ptr + ngroup_id) - p_parflows = param_flows_ptr + off_parflows + inc_parflows * offs_node # [TILE_SIZE_K] - - # Initialize pointers to `metadata` - offs_metadata = tl.load(s_mids_ptr + ngroup_id) - p_metadata = metadata_ptr + offs_metadata # [1] - - # Initialize pointers to `node_mars` - p_nflows = node_flows_ptr + \ - (ngroup_id * group_size + offs_node[:,None] + node_offset) * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - - # Inner loop to process everything in the node group - mask = mask_batch[None,:] - for i in range(K_NUM_TILES): - - # Read out the flows - flows = tl.load(p_nflows, mask = mask, other = 0) - - flow_fn(flows, data, p_parflows, p_params, p_metadata, mask, num_vars_per_node) - - # Increment pointers - p_params += inc_params * TILE_SIZE_K - p_parflows += inc_parflows * TILE_SIZE_K - p_nflows += TILE_SIZE_K * batch_size + data = tl.load(data_ptr + vids * batch_size + batch_offsets, mask = vids_mask, other = 0) + + s_pids = tl.load(s_pids_ptr + local_offsets, mask = mask, other = 0) + s_pfids = tl.load(s_pfids_ptr + local_offsets, mask = mask, other = 0) + + ns_offsets = (local_offsets + node_offset) * batch_size + batch_offsets + flows = tl.load(node_flows_ptr + ns_offsets, mask = mask, other = 0) + + flow_fn(local_offsets, ns_offsets, data, flows, node_mars_ptr, 
params_ptr, param_flows_ptr, s_pids, s_pfids, metadata_ptr, + s_mids_ptr, mask, num_vars_per_node, BLOCK_SIZE) @staticmethod def _sample_kernel_template(sample_fn, samples_ptr, params_ptr, nflow_xids_ptr, nflow_yids_ptr, vids_ptr, s_pids_ptr, metadata_ptr, s_mids_ptr, diff --git a/src/pyjuice/nodes/distributions/categorical.py b/src/pyjuice/nodes/distributions/categorical.py index c9088185..a10342ac 100644 --- a/src/pyjuice/nodes/distributions/categorical.py +++ b/src/pyjuice/nodes/distributions/categorical.py @@ -36,19 +36,24 @@ def init_parameters(self, num_nodes: int, perturbation: float = 2.0, **kwargs): return params.reshape(-1) @staticmethod - def fw_mar_fn(data, p_params, p_metadata, mask, num_vars_per_node): - - p_tarpars = p_params[:,None] + data[None,:] - probs = tl.load(p_tarpars, mask = mask, other = 0) + def fw_mar_fn(local_offsets, data, params_ptr, s_pids, metadata_ptr, s_mids_ptr, mask, num_vars_per_node, BLOCK_SIZE): + # I am not sure why, but the following code will not work... + # probs = tl.load(params_ptr + s_pids + data, mask = mask, other = 0) + # Seems like a bug of triton. + param_idx = s_pids + data + probs = tl.load(params_ptr + param_idx, mask = mask, other = 0) log_probs = tl.log(probs) return log_probs @staticmethod - def bk_flow_fn(flows, data, p_parflows, p_params, p_metadata, mask, num_vars_per_node): - - p_tarflows = p_parflows[:,None] + data[None,:] - tl.atomic_add(p_tarflows, flows, mask = mask) + def bk_flow_fn(local_offsets, ns_offsets, data, flows, node_mars_ptr, params_ptr, param_flows_ptr, s_pids, s_pfids, metadata_ptr, + s_mids_ptr, mask, num_vars_per_node, BLOCK_SIZE): + # I am not sure why, but the following code will not work... + # tl.atomic_add(param_flows_ptr + s_pfids + data, flows, mask = mask) + # Seems like a bug of triton. + pf_offsets = s_pfids + data + tl.atomic_add(param_flows_ptr + pf_offsets, flows, mask = mask) @staticmethod def sample_fn(samples_ptr, local_offsets, batch_offsets, vids, s_pids, params_ptr, metadata_ptr, s_mids_ptr, mask, batch_size, BLOCK_SIZE, seed): diff --git a/src/pyjuice/nodes/distributions/distributions.py b/src/pyjuice/nodes/distributions/distributions.py index 70d39728..c574b659 100644 --- a/src/pyjuice/nodes/distributions/distributions.py +++ b/src/pyjuice/nodes/distributions/distributions.py @@ -37,11 +37,15 @@ def fw_mar_fn(*args, **kwargs): """ Forward evaluation for log-probabilities. 
Args: - `data`: [BLOCK_M, BLOCK_B] data of the corresponding node groups - `p_params`: [BLOCK_M, TILE_SIZE_K] pointer to the parameters - `p_metadata`: [BLOCK_M] pointer to the metadata - `mask`: [BLOCK_M, BLOCK_B] full mask + `local_offsets`: [BLOCK_SIZE] the local indices of the to-be-processed input nodes + `data`: [BLOCK_SIZE, num_vars_per_node] data of the corresponding nodes + `params_ptr`: pointer to the parameter vector + `s_pids`: [BLOCK_SIZE] start parameter index (offset) for all input nodes + `metadata_ptr`: pointer to metadata + `s_mids_ptr`: pointer to the start metadata index (offset) + `mask`: [BLOCK_SIZE] indicate whether each node should be processed `num_vars_per_node`: numbers of variables per input node/distribution + `BLOCK_SIZE`: CUDA block size """ raise NotImplementedError() diff --git a/tests/layer/input_layer_test.py b/tests/layer/input_layer_test.py index b7ea74f2..55446e21 100644 --- a/tests/layer/input_layer_test.py +++ b/tests/layer/input_layer_test.py @@ -31,16 +31,12 @@ def input_layer_test(): layer._init_parameters(perturbation = 2.0) - assert torch.all(layer.vids == torch.tensor([0,0,1,1,2,2,3,3]).reshape(-1, 1)) - npars_per_group = group_size * ni0.dist.num_parameters() - assert torch.all(layer.s_pids == torch.arange(0, npars_per_group * 8, npars_per_group)) - assert torch.all(layer.inc_pids == ni0.dist.num_parameters()) - npflows_per_group = group_size * ni0.dist.num_param_flows() - assert torch.all(layer.s_pfids == torch.arange(0, npflows_per_group * 8, npflows_per_group)) - assert torch.all(layer.inc_pfids == ni0.dist.num_param_flows()) + assert torch.all(layer.vids == torch.tensor([0,1,2,3]).unsqueeze(1).repeat(1, 8).reshape(-1, 1)) + assert torch.all(layer.s_pids == torch.arange(0, 32 * 2, 2)) + assert torch.all(layer.s_pfids == torch.arange(0, 32 * 2, 2)) assert torch.all(layer.metadata == torch.ones([4]) * 2.0) - assert torch.all(layer.s_mids == torch.tensor([0,0,1,1,2,2,3,3])) - assert torch.all(layer.source_ngids == torch.arange(0, 8)) + assert torch.all(layer.s_mids == torch.tensor([0,1,2,3]).unsqueeze(1).repeat(1, 8).reshape(-1)) + assert torch.all(layer.source_nids == torch.arange(0, 32)) layer.to(device) @@ -100,7 +96,9 @@ def input_layer_test(): assert torch.all(torch.abs(layer.param_flows - param_flows) < 1e-4) - import pdb; pdb.set_trace() + ## EM tests ## + + def speed_test(): @@ -134,14 +132,14 @@ def speed_test(): t0 = time.time() torch.cuda.synchronize() - for _ in range(100): + for _ in range(2): layer(data, node_mars) torch.cuda.synchronize() t1 = time.time() forward_ms = (t1 - t0) / 100 * 1000 print(f"Forward pass on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 0.048ms.") + print("Reference computation time on RTX 4090: 0.533ms.") print("--------------------------------------------------------------") ## Forward with mask tests ## @@ -159,7 +157,7 @@ def speed_test(): forward_ms = (t1 - t0) / 100 * 1000 print(f"Forward pass (w/ sample independent mask) on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 0.062ms.") + print("Reference computation time on RTX 4090: 1.434ms.") print("--------------------------------------------------------------") missing_mask = torch.randint(0, 2, (num_vars, batch_size)).bool().to(device) @@ -175,10 +173,10 @@ def speed_test(): forward_ms = (t1 - t0) / 100 * 1000 print(f"Forward pass (w/ sample dependent mask) on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 0.086ms.") + print("Reference 
computation time on RTX 4090: 1.431ms.") print("--------------------------------------------------------------") - ## Backward tests ## + # ## Backward tests ## node_flows = torch.rand([1 + group_size * num_node_groups * num_vars, batch_size]).to(device) @@ -196,10 +194,10 @@ def speed_test(): forward_ms = (t1 - t0) / 100 * 1000 print(f"Backward pass on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 0.086ms.") + print("Reference computation time on RTX 4090: 0.825ms.") print("--------------------------------------------------------------") if __name__ == "__main__": - # input_layer_test() + input_layer_test() speed_test() \ No newline at end of file From 660804e9511f4809a6b616ad7704e649e0aa1c03 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 2 Dec 2023 22:07:21 +0800 Subject: [PATCH 016/162] benchmark and refactor em for `InputLayer` --- src/pyjuice/layer/input_layer.py | 9 +++++---- tests/layer/input_layer_test.py | 33 ++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index 5275a40b..39cd11ba 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -352,7 +352,7 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): with torch.no_grad(): if "cuda" in self.device.type: - layer_num_source_nodes = self.source_ngids.size(0) + layer_num_source_nodes = self.source_nids.size(0) if not self.provided("_em_kernel"): self._em_kernel = self._compile_triton_kernel(self._em_kernel_template, em_fn = self.em_fn) @@ -360,6 +360,7 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): constexprs = torch.tensor([step_size, pseudocount], dtype = torch.float32, device = self.device) grid = lambda meta: (triton.cdiv(layer_num_source_nodes, meta['BLOCK_SIZE']),) + self._em_kernel[grid]( params_ptr = self.params, param_flows_ptr = self.param_flows, @@ -367,7 +368,7 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): s_pfids_ptr = self.s_pfids, metadata_ptr = self.metadata, s_mids_ptr = self.s_mids, - source_ngids_ptr = self.source_ngids, + source_nids_ptr = self.source_nids, constexprs_ptr = constexprs, layer_num_source_nodes = layer_num_source_nodes, BLOCK_SIZE = 1024 @@ -652,7 +653,7 @@ def _sample_kernel_template(sample_fn, samples_ptr, params_ptr, nflow_xids_ptr, @staticmethod def _em_kernel_template(em_fn, params_ptr, param_flows_ptr, s_pids_ptr, s_pfids_ptr, metadata_ptr, s_mids_ptr, - source_ngids_ptr, constexprs_ptr, layer_num_source_nodes: tl.constexpr, BLOCK_SIZE: tl.constexpr): + source_nids_ptr, constexprs_ptr, layer_num_source_nodes: tl.constexpr, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) block_start = pid * BLOCK_SIZE @@ -664,7 +665,7 @@ def _em_kernel_template(em_fn, params_ptr, param_flows_ptr, s_pids_ptr, s_pfids_ mask = offsets < layer_num_source_nodes # Get the local node ids - local_offsets = tl.load(source_ngids_ptr + offsets, mask = mask, other = 0) + local_offsets = tl.load(source_nids_ptr + offsets, mask = mask, other = 0) # Get the corresponding start id for `params` and `param_flows` s_pids = tl.load(s_pids_ptr + local_offsets, mask = mask, other = 0) diff --git a/tests/layer/input_layer_test.py b/tests/layer/input_layer_test.py index 55446e21..6dc188c9 100644 --- a/tests/layer/input_layer_test.py +++ b/tests/layer/input_layer_test.py @@ -98,7 +98,17 @@ def input_layer_test(): ## EM tests ## - + original_params = layer.params.clone() + 
+ step_size = 0.3 + pseudocount = 0.1 + + par_flows = layer.param_flows.clone().reshape(32, 2) + new_params = (1.0 - step_size) * original_params + step_size * ((par_flows + pseudocount / 2) / (par_flows.sum(dim = 1, keepdim = True) + pseudocount)).reshape(-1) + + layer.mini_batch_em(step_size = step_size, pseudocount = pseudocount) + + assert torch.all(torch.abs(new_params - layer.params) < 1e-4) def speed_test(): @@ -176,7 +186,7 @@ def speed_test(): print("Reference computation time on RTX 4090: 1.431ms.") print("--------------------------------------------------------------") - # ## Backward tests ## + ## Backward tests ## node_flows = torch.rand([1 + group_size * num_node_groups * num_vars, batch_size]).to(device) @@ -197,6 +207,25 @@ def speed_test(): print("Reference computation time on RTX 4090: 0.825ms.") print("--------------------------------------------------------------") + ## EM tests ## + + step_size = 0.01 + pseudocount = 0.1 + + layer.mini_batch_em(step_size = step_size, pseudocount = pseudocount) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + layer.mini_batch_em(step_size = step_size, pseudocount = pseudocount) + torch.cuda.synchronize() + t1 = time.time() + forward_ms = (t1 - t0) / 100 * 1000 + + print(f"EM on average takes {forward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 0.825ms.") + print("--------------------------------------------------------------") + if __name__ == "__main__": input_layer_test() From e34270c9b8f537956c774ae166e601397da09127 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 3 Dec 2023 02:00:38 +0800 Subject: [PATCH 017/162] add compilation options for tied input nodes --- src/pyjuice/layer/input_layer.py | 102 ++++++++++++++++++++++++++++--- tests/layer/input_layer_test.py | 62 ++++++++++++++++++- 2 files changed, 153 insertions(+), 11 deletions(-) diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index 39cd11ba..f2eb2824 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -21,7 +21,14 @@ class InputLayer(Layer, nn.Module): - def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_group_size: bool = True) -> None: + def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, max_tied_ns_per_parflow_group: int = 4) -> None: + """ + Compiler flags: + - `max_tied_ns_per_parflow_group`: the maximum number of tied nodes allowed in the backward pass. Setting to a larger value will + lead to reduced memory overhead but might lead to additional computational burden due to conflicts + in gradient accumulation. 
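The practical effect of this flag is to bound how many tied copies write into one parameter-flow buffer. A rough rule of thumb, mirroring the allocation logic below (the helper name here is illustrative and not part of the library):

import math

def num_parflow_buffers(num_tied_copies: int, max_tied_ns_per_parflow_group: int) -> int:
    # the source node itself occupies the first slot of the first buffer
    return math.ceil((1 + num_tied_copies) / max_tied_ns_per_parflow_group)

assert num_parflow_buffers(6, 4) == 2   # default cap of 4: 7 members -> 2 buffers
assert num_parflow_buffers(6, 1) == 7   # cap of 1: every tied copy gets its own buffer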
+ """ + nn.Module.__init__(self) Layer.__init__(self, nodes) @@ -37,7 +44,8 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro cum_param_flows = 0 cum_source_ns = 0 dist_signature = None - for ns in self.nodes: + node2tiednodes = dict() + for node_id, ns in enumerate(self.nodes): if dist_signature is None: dist_signature = ns.dist.get_signature() else: @@ -63,6 +71,22 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro source_ns = ns.get_source_ns() ns._param_range = deepcopy(source_ns._param_range) + if source_ns not in node2tiednodes: + node2tiednodes[source_ns] = [[source_ns], 1, source_ns._param_flow_range] + + dup_count = node2tiednodes[source_ns][1] + if dup_count >= max_tied_ns_per_parflow_group: + cum_param_flows += ns.num_nodes * ns.dist.num_param_flows() + ns._param_flow_range = (cum_param_flows - ns.num_nodes * ns.dist.num_param_flows(), cum_param_flows) + node2tiednodes[source_ns][2] = ns._param_flow_range + + node2tiednodes[source_ns][0].append(ns) + node2tiednodes[source_ns][1] = 1 + else: + ns._param_flow_range = deepcopy(node2tiednodes[source_ns][2]) + + node2tiednodes[source_ns][1] += 1 + self._output_ind_range = (cum_nodes - layer_num_nodes, cum_nodes) self.num_params = cum_params self.num_param_flows = cum_param_flows @@ -102,14 +126,11 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro vids[n_start:n_end,:] = torch.tensor(node_vars[ns_id]).view(1, -1) # `s_pids` and `s_pfids` - if not ns.is_tied(): - source_ns = ns - else: - source_ns = ns.get_source_ns() pid_offsets = torch.arange(0, ns.num_nodes * ns.dist.num_parameters(), ns.dist.num_parameters()) - s_pids[n_start:n_end] = source_ns._param_range[0] + pid_offsets + s_pids[n_start:n_end] = ns._param_range[0] + pid_offsets + pfid_offsets = torch.arange(0, ns.num_nodes * ns.dist.num_param_flows(), ns.dist.num_param_flows()) - s_pfids[n_start:n_end] = source_ns._param_flow_range[0] + pfid_offsets + s_pfids[n_start:n_end] = ns._param_flow_range[0] + pfid_offsets # `source_nids` if not ns.is_tied(): @@ -130,6 +151,20 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro self.register_buffer("s_mids", s_mids) self.register_buffer("source_nids", source_nids) + ## Prepare info buffers for tied nodes ## + self.tied2source_nids = [] + for source_ns, item in node2tiednodes.items(): + if len(item[0]) > 1: # If the length is 1, then everything is already accumulated in the source node's parflow + num_par_flows = source_ns._param_flow_range[1] - source_ns._param_flow_range[0] + pfid_start = source_ns._param_flow_range[0] + ch_nodes = item[0] + + ch_pfids = torch.empty([len(ch_nodes)], dtype = torch.long) + for ch_id, ch_ns in enumerate(ch_nodes): + ch_pfids[ch_id] = ch_ns._param_flow_range[0] + + self.tied2source_nids.append([pfid_start, num_par_flows, ch_pfids]) + self.params = nn.Parameter(params) # Due to the custom inplace backward pass implementation, we do not track # gradient of PC parameters by PyTorch. 
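Given the `tied2source_nids` entries built above, the accumulation that has to happen before every EM step (implemented by the Triton kernel launched in `mini_batch_em` below) amounts to the following PyTorch sketch, where `layer` is an `InputLayer` instance:

with torch.no_grad():
    for pfid_start, num_par_flows, ch_pfids in layer.tied2source_nids:
        acc = torch.zeros(num_par_flows, device = layer.param_flows.device)
        for s in ch_pfids.tolist():
            acc += layer.param_flows[s:s + num_par_flows]
        # write the total back into the source node's flow buffer
        layer.param_flows[pfid_start:pfid_start + num_par_flows] = acc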
@@ -149,6 +184,10 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, maximize_gro def to(self, device): nn.Module.to(self, device = device) + # Take special care to `tied2source_nids` + for i in range(len(self.tied2source_nids)): + self.tied2source_nids[i][2] = self.tied2source_nids[i][2].to(device) + self.device = device def init_param_flows(self, flows_memory: float = 0.0): @@ -352,6 +391,31 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): with torch.no_grad(): if "cuda" in self.device.type: + + # Accumulate parameter flows of tied nodes + for i in range(len(self.tied2source_nids)): + pfid_start, num_par_flows, ch_pfids = self.tied2source_nids[i] + num_coalesced_groups = ch_pfids.size(0) + + if num_coalesced_groups <= 1024: + BLOCK_N = triton.next_power_of_2(num_coalesced_groups) + BLOCK_M = min(1024 // BLOCK_N, num_par_flows) + + grid = (triton.cdiv(num_par_flows, BLOCK_M),) + + self._pflow_accum_kernel[grid]( + param_flows_ptr = self.param_flows, + pfid_start = pfid_start, + ch_pfids_ptr = ch_pfids, + num_coalesced_groups = num_coalesced_groups, + num_par_flows = num_par_flows, + BLOCK_M = BLOCK_M, + BLOCK_N = BLOCK_N, + ) + else: + raise NotImplementedError("Unsupported number of coalesced parameter flows.") + + layer_num_source_nodes = self.source_nids.size(0) if not self.provided("_em_kernel"): @@ -651,6 +715,28 @@ def _sample_kernel_template(sample_fn, samples_ptr, params_ptr, nflow_xids_ptr, sample_fn(samples_ptr, local_offsets, batch_offsets, vids, s_pids, params_ptr, metadata_ptr, s_mids_ptr, mask, batch_size, BLOCK_SIZE, seed) + @staticmethod + @triton.jit + def _pflow_accum_kernel(param_flows_ptr, pfid_start, ch_pfids_ptr, num_coalesced_groups, num_par_flows, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_pflow = pid * BLOCK_M + tl.arange(0, BLOCK_M) + mask_pflow = offs_pflow < num_par_flows + + offs_ch = tl.arange(0, BLOCK_N) + mask_ch = offs_ch < num_coalesced_groups + + # Start id for all ch parflows + ch_pstart = tl.load(ch_pfids_ptr + offs_ch, mask = mask_ch) + + offs_ch_pflow = offs_pflow[:,None] + ch_pstart[None,:] + mask_ch_pflow = mask_pflow[:,None] & mask_ch[None,:] + ch_pflows = tl.load(param_flows_ptr + offs_ch_pflow, mask = mask_ch_pflow, other = 0) + + tar_pflows = tl.sum(ch_pflows, axis = 1) + + tl.store(param_flows_ptr + pfid_start + offs_pflow, tar_pflows, mask = mask_pflow) + @staticmethod def _em_kernel_template(em_fn, params_ptr, param_flows_ptr, s_pids_ptr, s_pfids_ptr, metadata_ptr, s_mids_ptr, source_nids_ptr, constexprs_ptr, layer_num_source_nodes: tl.constexpr, BLOCK_SIZE: tl.constexpr): diff --git a/tests/layer/input_layer_test.py b/tests/layer/input_layer_test.py index 6dc188c9..d7ac02e3 100644 --- a/tests/layer/input_layer_test.py +++ b/tests/layer/input_layer_test.py @@ -27,7 +27,7 @@ def input_layer_test(): ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) - layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = 1, maximize_group_size = False) + layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = 1) layer._init_parameters(perturbation = 2.0) @@ -111,6 +111,61 @@ def input_layer_test(): assert torch.all(torch.abs(new_params - layer.params) < 1e-4) +def tied_bp_test(): + + device = torch.device("cuda:0") + + group_size = 4 + batch_size = 16 + + with juice.set_group_size(group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + 
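As a concrete example of the launch configuration chosen above for `_pflow_accum_kernel`, take the numbers exercised by `tied_bp_test` (one source group tied to one copy, 16 parameter-flow slots per buffer):

num_coalesced_groups = 2                                      # ch_pfids = [16, 48]
num_par_flows = 16
BLOCK_N = triton.next_power_of_2(num_coalesced_groups)        # -> 2
BLOCK_M = min(1024 // BLOCK_N, num_par_flows)                 # -> 16
grid = (triton.cdiv(num_par_flows, BLOCK_M),)                 # -> (1,)
# a single program sums both flow buffers into the source buffer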
ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni3 = ni1.duplicate(3, tie_params = True) + + layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = 1, max_tied_ns_per_parflow_group = 1.0) + + layer._init_parameters(perturbation = 2.0) + + assert torch.all(layer.vids == torch.tensor([0,1,2,3]).unsqueeze(1).repeat(1, 8).reshape(-1, 1)) + s_pids = torch.arange(0, 32 * 2, 2) + s_pids[24:32] = s_pids[8:16] + assert torch.all(layer.s_pids == s_pids) + assert torch.all(layer.s_pfids == torch.arange(0, 32 * 2, 2)) + assert torch.all(layer.metadata == torch.ones([4]) * 2.0) + assert torch.all(layer.s_mids == torch.tensor([0,1,2,3]).unsqueeze(1).repeat(1, 8).reshape(-1)) + assert torch.all(layer.source_nids == torch.arange(0, 24)) + + assert layer.tied2source_nids[0][0] == 16 + assert layer.tied2source_nids[0][1] == 16 + assert torch.all(layer.tied2source_nids[0][2] == torch.tensor([16, 48])) + + layer.to(device) + + data = torch.randint(0, 2, (4, batch_size)).to(device) + node_mars = torch.zeros([33, batch_size]).to(device) + node_flows = torch.rand([33, batch_size]).to(device) + + step_size = 0.3 + pseudocount = 0.1 + + ## EM tests ## + + layer.init_param_flows(flows_memory = 0.0) + + layer(data, node_mars) + layer.backward(data, node_flows, node_mars) + + param_flows = layer.param_flows.detach().clone() + param_flows[16:32] += param_flows[48:64] + + layer.mini_batch_em(step_size = step_size, pseudocount = pseudocount) + + assert torch.all(torch.abs(param_flows - layer.param_flows) < 1e-4) + + def speed_test(): device = torch.device("cuda:0") @@ -127,7 +182,7 @@ def speed_test(): for v in range(num_vars): nis.append(inputs(0, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 64))) - layer = InputLayer(nis, cum_nodes = 1, maximize_group_size = False) + layer = InputLayer(nis, cum_nodes = 1) layer._init_parameters(perturbation = 2.0) @@ -223,10 +278,11 @@ def speed_test(): forward_ms = (t1 - t0) / 100 * 1000 print(f"EM on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 0.825ms.") + print("Reference computation time on RTX 4090: 0.784ms.") print("--------------------------------------------------------------") if __name__ == "__main__": input_layer_test() + tied_bp_test() speed_test() \ No newline at end of file From 0f21414eca97022ca7a8cc6a4eba28b82e8d928e Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 3 Dec 2023 21:40:50 +0800 Subject: [PATCH 018/162] refactor: `ProdLayer` fw and bk --- src/pyjuice/layer/backend/node_partition.py | 52 +-- src/pyjuice/layer/compilation.py | 172 +++++---- src/pyjuice/layer/prod_layer.py | 365 +++++++++++--------- src/pyjuice/layer/sum_layer.py | 6 +- src/pyjuice/model/tensorcircuit.py | 12 +- tests/layer/input_layer_test.py | 4 +- tests/layer/prod_layer_test.py | 171 +++++++++ 7 files changed, 513 insertions(+), 269 deletions(-) create mode 100644 tests/layer/prod_layer_test.py diff --git a/src/pyjuice/layer/backend/node_partition.py b/src/pyjuice/layer/backend/node_partition.py index 97eec040..e98c254e 100644 --- a/src/pyjuice/layer/backend/node_partition.py +++ b/src/pyjuice/layer/backend/node_partition.py @@ -7,7 +7,7 @@ @njit() -def _partition_nodes_dp_simple_compiled(node_n_edges, dp, backtrace, max_num_groups, target_overhead): +def _partition_nodes_dp_simple_compiled(node_n_edges, dp, backtrace, max_num_partitions, target_overhead): num_nodes = node_n_edges.shape[0] # Init @@ -15,8 +15,8 @@ 
def _partition_nodes_dp_simple_compiled(node_n_edges, dp, backtrace, max_num_gro dp[i,1] = node_n_edges[i] * (i + 1) # Main DP - target_n_group = max_num_groups - for n_group in range(2, max_num_groups + 1): + target_n_group = max_num_partitions + for n_group in range(2, max_num_partitions + 1): dp[0,n_group] = node_n_edges[0] backtrace[0,n_group] = 0 for i in range(1, num_nodes): @@ -47,16 +47,16 @@ def _backtrace_fn(partitions, backtrace, target_n_group, num_nodes): i = backtrace[i,target_n_group] -def _partition_nodes_dp_simple(node_n_edges: np.ndarray, max_num_groups: int, target_overhead: Optional[int]): +def _partition_nodes_dp_simple(node_n_edges: np.ndarray, max_num_partitions: int, target_overhead: Optional[int]): - dp = np.zeros([node_n_edges.shape[0], max_num_groups + 1], dtype = np.int64) - backtrace = np.zeros([node_n_edges.shape[0], max_num_groups + 1], dtype = np.int64) + dp = np.zeros([node_n_edges.shape[0], max_num_partitions + 1], dtype = np.int64) + backtrace = np.zeros([node_n_edges.shape[0], max_num_partitions + 1], dtype = np.int64) overhead, target_n_group = _partition_nodes_dp_simple_compiled( np.ascontiguousarray(node_n_edges), np.ascontiguousarray(dp), np.ascontiguousarray(backtrace), - max_num_groups, + max_num_partitions, target_overhead = 0 if target_overhead is None else target_overhead ) @@ -115,7 +115,7 @@ def _coalesce(vals, tol_range = "auto"): @njit() -def _weighted_partition_nodes_dp_simple_compiled(node_n_edges, cum_counts, dp, backtrace, max_num_groups, target_overhead): +def _weighted_partition_nodes_dp_simple_compiled(node_n_edges, cum_counts, dp, backtrace, max_num_partitions, target_overhead): num_nodes = node_n_edges.shape[0] # Init @@ -123,8 +123,8 @@ def _weighted_partition_nodes_dp_simple_compiled(node_n_edges, cum_counts, dp, b dp[i,1] = node_n_edges[i] * cum_counts[i] # Main DP - target_n_group = max_num_groups - for n_group in range(2, max_num_groups + 1): + target_n_group = max_num_partitions + for n_group in range(2, max_num_partitions + 1): dp[0,n_group] = node_n_edges[0] * cum_counts[0] backtrace[0,n_group] = 0 for i in range(1, num_nodes): @@ -148,20 +148,20 @@ def _weighted_partition_nodes_dp_simple_compiled(node_n_edges, cum_counts, dp, b return overhead, target_n_group -def _weighted_partition_nodes_dp_simple(node_n_edges: np.ndarray, counts: np.ndarray, max_num_groups: int, +def _weighted_partition_nodes_dp_simple(node_n_edges: np.ndarray, counts: np.ndarray, max_num_partitions: int, target_overhead: Optional[int]): cum_counts = np.cumsum(counts) - dp = np.zeros([node_n_edges.shape[0], max_num_groups + 1], dtype = np.int64) - backtrace = np.zeros([node_n_edges.shape[0], max_num_groups + 1], dtype = np.int64) + dp = np.zeros([node_n_edges.shape[0], max_num_partitions + 1], dtype = np.int64) + backtrace = np.zeros([node_n_edges.shape[0], max_num_partitions + 1], dtype = np.int64) overhead, target_n_group = _weighted_partition_nodes_dp_simple_compiled( np.ascontiguousarray(node_n_edges), np.ascontiguousarray(cum_counts), np.ascontiguousarray(dp), np.ascontiguousarray(backtrace), - max_num_groups, + max_num_partitions, target_overhead = 0 if target_overhead is None else target_overhead ) @@ -174,22 +174,22 @@ def _weighted_partition_nodes_dp_simple(node_n_edges: np.ndarray, counts: np.nda def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], - max_num_groups: Optional[int] = None, + max_num_partitions: Optional[int] = None, sparsity_tolerance: Optional[float] = None, - algorithm: str = "dp_with_coalesce"): + 
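What the dynamic program above minimizes is padding overhead: every node in a partition is padded to that partition's maximum edge count. A small worked example in plain Python (not a call into the library):

n_chs = [2, 2, 3, 8, 8]                                   # per-node child counts, sorted

one_partition  = len(n_chs) * max(n_chs)                  # 5 * 8 = 40 padded slots
two_partitions = 3 * max(n_chs[:3]) + 2 * max(n_chs[3:])  # 3 * 3 + 2 * 8 = 25 padded slots
# splitting into [2, 2, 3] and [8, 8] saves 15 padded slots; this is the trade-off the
# DP searches over, subject to `max_num_partitions` and `sparsity_tolerance`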
algorithm: str = "dp_with_coalesce", debug = False): if sparsity_tolerance is not None and sparsity_tolerance < 1e-6: sparsity_tolerance = None - max_num_groups = 1 + max_num_partitions = 1 if sparsity_tolerance is not None: assert sparsity_tolerance > 1e-6 and sparsity_tolerance <= 1.0 - if max_num_groups is None: - max_num_groups = max(min(int(math.ceil(node_n_edges.shape[0] * sparsity_tolerance)), 16), 1) - elif max_num_groups is None: - max_num_groups = 1 + if max_num_partitions is None: + max_num_partitions = max(min(int(math.ceil(node_n_edges.shape[0] * sparsity_tolerance)), 16), 1) + elif max_num_partitions is None: + max_num_partitions = 1 else: - assert max_num_groups >= 1, "Should provide at least 1 group." + assert max_num_partitions >= 1, "Should provide at least 1 group." if isinstance(node_n_edges, torch.Tensor): node_n_edges = node_n_edges.detach().cpu().numpy() @@ -197,7 +197,7 @@ def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], total_num_edges = node_n_edges.sum() target_overhead = None if sparsity_tolerance is None else int(math.ceil(total_num_edges / sparsity_tolerance)) - if max_num_groups == 1: + if max_num_partitions == 1: partitions = np.zeros([1], dtype = np.int64) partitions[0] = np.max(node_n_edges) return torch.from_numpy(partitions) @@ -206,11 +206,13 @@ def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], node_n_edges = np.sort(node_n_edges) if algorithm == "dp_simple": - group_sizes, overhead = _partition_nodes_dp_simple(node_n_edges, max_num_groups, target_overhead) + group_sizes, overhead = _partition_nodes_dp_simple(node_n_edges, max_num_partitions, target_overhead) elif algorithm == "dp_with_coalesce": unique_n_edges, counts = _coalesce(node_n_edges, tol_range = "auto") - group_sizes, overhead = _weighted_partition_nodes_dp_simple(unique_n_edges, counts, max_num_groups, target_overhead) + if debug: + import pdb; pdb.set_trace() + group_sizes, overhead = _weighted_partition_nodes_dp_simple(unique_n_edges, counts, max_num_partitions, target_overhead) else: raise ValueError(f"Unknown algorithm {algorithm} for `partition_nodes_by_n_edges`.") diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 969324d8..89cf3e9f 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -968,6 +968,29 @@ def get_prod_layer_stats(nodes: Sequence[SumNodes]): return layer_num_nodes, layer_num_edges, n_chs +def get_prod_layer_stats_new(nodes: Sequence[SumNodes], group_size: int): + layer_num_ngroup = sum(map(lambda ns: ns.num_node_groups, nodes)) + layer_num_edges = 0 + + global_nid_start = group_size # indices `0`` to `group_size - 1`` is reserved for the dummy node + + ng_sid = 0 + n_chgs = torch.zeros([layer_num_ngroup], dtype = torch.long) + for ns_idx, ns in enumerate(nodes): + ng_eid = ng_sid + ns.num_node_groups + + n_chgs[ng_sid:ng_eid] = ns.num_chs + + layer_num_edges += ns.num_nodes * ns.num_chs + + ns._output_ind_range = (global_nid_start, global_nid_start + ns.num_nodes) + global_nid_start += ns.num_nodes + + ng_sid = ng_eid + + return layer_num_ngroup, layer_num_edges, n_chgs + + @torch.no_grad() def prod_layer_forward_compilation(nodes, fw_group_max_chs, n_group_ids, n_id_in_group, num_ns_in_group, use_cuda: bool = False): @@ -998,6 +1021,33 @@ def prod_layer_forward_compilation(nodes, fw_group_max_chs, n_group_ids, n_id_in return nids, cids +@torch.no_grad() +def prod_layer_forward_compilation_new(nodes, fw_partition_max_chs, n_partition_ids, 
n_id_in_partition, num_ngs_in_partition, group_size, use_cuda: bool = False): + + if use_cuda and not torch.cuda.is_available(): + use_cuda = False + + nids = [torch.zeros([partition_size], dtype = torch.long) for partition_size in num_ngs_in_partition] # Node group start id + cids = [torch.zeros([partition_size, max_chs] , dtype = torch.long) for partition_size, max_chs in zip(num_ngs_in_partition, fw_partition_max_chs)] # Child group start id + + for ns_id, ns in enumerate(nodes): + + # `partition_id`: which partition the current node belongs to + # `local_sid`: the start index of the node within the current partition + # `partition_nchs`: maximum number of child nodes in the current partition + partition_id = n_partition_ids[ns_id] + local_sid = n_id_in_partition[ns_id] + local_eid = local_sid + ns.num_node_groups + partition_nchs = fw_partition_max_chs[partition_id] + + n_sid = ns._output_ind_range[0] + nids[partition_id][local_sid:local_eid] = torch.arange(0, ns.num_nodes, group_size) + n_sid + for cs_id, cs in enumerate(ns.chs): + cids[partition_id][local_sid:local_eid,cs_id] = ns.edge_ids[:,cs_id] * group_size + cs._output_ind_range[0] + + return nids, cids + + @torch.no_grad() def flatten_c_ids(nids, cids): @@ -1051,8 +1101,8 @@ def _assign_cid2_group_local_id(flat_u_cids, n_group_ids, n_id_in_group, cid2gro @triton.jit -def _assign_target_ucids_kernel(target_u_cids_ptr, flat_u_cids_ptr, n_group_ids_ptr, n_id_in_group_ptr, - u_cids_group_start_ptr, constexprs_ptr, BLOCK_SIZE: tl.constexpr): +def _assign_target_ucids_kernel(target_u_cids_ptr, flat_u_cids_ptr, n_partition_ids_ptr, n_id_in_partition_ptr, + u_cids_partition_start_ptr, constexprs_ptr, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) block_start = pid * BLOCK_SIZE @@ -1066,12 +1116,12 @@ def _assign_target_ucids_kernel(target_u_cids_ptr, flat_u_cids_ptr, n_group_ids_ # Get `cid` cid = tl.load(flat_u_cids_ptr + offsets, mask = mask, other = 0) - # Get `group_id` and `local_id` - group_id = tl.load(n_group_ids_ptr + offsets, mask = mask, other = 0) - local_id = tl.load(n_id_in_group_ptr + offsets, mask = mask, other = 0) + # Get `partition_id` and `local_id` + partition_id = tl.load(n_partition_ids_ptr + offsets, mask = mask, other = 0) + local_id = tl.load(n_id_in_partition_ptr + offsets, mask = mask, other = 0) # Get the corresponding start id in the target tensors - u_cids_start = tl.load(u_cids_group_start_ptr + group_id, mask = mask, other = 0) + u_cids_start = tl.load(u_cids_partition_start_ptr + partition_id, mask = mask, other = 0) # Assign to `target_u_cids` tl.store(target_u_cids_ptr + u_cids_start + local_id, cid, mask = mask) @@ -1079,8 +1129,8 @@ def _assign_target_ucids_kernel(target_u_cids_ptr, flat_u_cids_ptr, n_group_ids_ @triton.jit def _assign_prod_target_parids_kernel(target_parids_ptr, flat_cid2nid_ptr, flat_cids_ptr, - cid2group_id_ptr, cid2local_id_ptr, parids_group_start_ptr, - flat_par_offsets_ptr, bk_group_max_pars_ptr, constexprs_ptr, + cid2partition_id_ptr, cid2local_id_ptr, parids_partition_start_ptr, + flat_par_offsets_ptr, bk_partition_max_pars_ptr, constexprs_ptr, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) @@ -1099,25 +1149,25 @@ def _assign_prod_target_parids_kernel(target_parids_ptr, flat_cid2nid_ptr, flat_ # Mask out edges that point to the dummy node mask = mask & (cid != 0) - # Get `group_id` and `local_id` using `cid` - group_id = tl.load(cid2group_id_ptr + cid, mask = mask, other = 0) + # Get `partition_id` and `local_id` using `cid` + partition_id = 
tl.load(cid2partition_id_ptr + cid, mask = mask, other = 0) local_id = tl.load(cid2local_id_ptr + cid, mask = mask, other = 0) # Get the corresponding start id in the target tensors - parids_start = tl.load(parids_group_start_ptr + group_id, mask = mask, other = 0) + parids_start = tl.load(parids_partition_start_ptr + partition_id, mask = mask, other = 0) # Get `par_offset` of the edges par_offset = tl.load(flat_par_offsets_ptr + offsets, mask = mask, other = 0) # Assign to `target_parids` - group_max_n_pars = tl.load(bk_group_max_pars_ptr + group_id, mask = mask, other = 0) - parid_offsets = parids_start + local_id * group_max_n_pars + par_offset + partition_max_n_pars = tl.load(bk_partition_max_pars_ptr + partition_id, mask = mask, other = 0) + parid_offsets = parids_start + local_id * partition_max_n_pars + par_offset tl.store(target_parids_ptr + parid_offsets, nid, mask = mask) @torch.no_grad() def prod_layer_backward_compilation(flat_u_cids, flat_cids, flat_cid2nid, - bk_group_max_pars, n_group_ids, n_id_in_group, num_ns_in_group, + bk_partition_max_pars, n_partition_ids, n_id_in_partition, num_ns_in_partition, use_cuda: bool = False): if use_cuda and not torch.cuda.is_available(): @@ -1125,102 +1175,102 @@ def prod_layer_backward_compilation(flat_u_cids, flat_cids, flat_cid2nid, if use_cuda: - # We construct a flattened version of `u_cids` where the vectors of every group is concatenated - # into a single vector. `u_cids_group_start` is used to indicate the start index of every group's - # `u_cids`. That is, `target_u_cids[u_cids_group_start[gid]:u_cids_group_start[gid+1]] == u_cids[gid]` - u_cids_group_start = torch.zeros_like(num_ns_in_group) - u_cids_group_start[1:] = torch.cumsum(num_ns_in_group[:-1], dim = 0) - target_u_cids = torch.zeros([num_ns_in_group.sum()], dtype = torch.long) + # We construct a flattened version of `u_cids` where the vectors of every partition is concatenated + # into a single vector. `u_cids_partition_start` is used to indicate the start index of every partition's + # `u_cids`. That is, `target_u_cids[u_cids_partition_start[gid]:u_cids_partition_start[gid+1]] == u_cids[gid]` + u_cids_partition_start = torch.zeros_like(num_ns_in_partition) + u_cids_partition_start[1:] = torch.cumsum(num_ns_in_partition[:-1], dim = 0) + target_u_cids = torch.zeros([num_ns_in_partition.sum()], dtype = torch.long) - # Similar to `target_u_cids`, we construct a flattened version of `parids` and use `parids_group_start` + # Similar to `target_u_cids`, we construct a flattened version of `parids` and use `parids_partition_start` # for indexing - parids_group_start = torch.zeros_like(num_ns_in_group) - parids_group_start[1:] = torch.cumsum((num_ns_in_group * bk_group_max_pars)[:-1], dim = 0) - target_parids = torch.zeros([(num_ns_in_group * bk_group_max_pars).sum()], dtype = torch.long) + parids_partition_start = torch.zeros_like(num_ns_in_partition) + parids_partition_start[1:] = torch.cumsum((num_ns_in_partition * bk_partition_max_pars)[:-1], dim = 0) + target_parids = torch.zeros([(num_ns_in_partition * bk_partition_max_pars).sum()], dtype = torch.long) - # Precompute the parent offset ids for every edge. That is, the `?` mark in `parids[group_id][local_id,?]` + # Precompute the parent offset ids for every edge. 
That is, the `?` mark in `parids[partition_id][local_id,?]` flat_par_offsets = np.zeros([flat_cids.size(0)], dtype = np.int64) - num_c_nodes = flat_u_cids.max().item() + 1 - cum_c_nodes = np.zeros([num_c_nodes], dtype = np.int64) + num_c_ngroups = flat_u_cids.max().item() + 1 + cum_c_ngroups = np.zeros([num_c_ngroups], dtype = np.int64) - _assign_c_idx_kernel(flat_cids.numpy(), flat_par_offsets, cum_c_nodes) + _assign_c_idx_kernel(flat_cids.numpy(), flat_par_offsets, cum_c_ngroups) flat_par_offsets = torch.from_numpy(flat_par_offsets).cuda() # Direct mapping from `cid` to `group_id` and `local_id` - cid2group_id = np.zeros([num_c_nodes], dtype = np.int64) - cid2local_id = np.zeros([num_c_nodes], dtype = np.int64) + cid2partition_id = np.zeros([num_c_ngroups], dtype = np.int64) + cid2local_id = np.zeros([num_c_ngroups], dtype = np.int64) - _assign_cid2_group_local_id(flat_u_cids.numpy(), n_group_ids.numpy(), n_id_in_group.numpy(), cid2group_id, cid2local_id) - cid2group_id = torch.from_numpy(cid2group_id).cuda() + _assign_cid2_group_local_id(flat_u_cids.numpy(), n_partition_ids.numpy(), n_id_in_partition.numpy(), cid2partition_id, cid2local_id) + cid2partition_id = torch.from_numpy(cid2partition_id).cuda() cid2local_id = torch.from_numpy(cid2local_id).cuda() # The following kernel assigns the indices to `target_u_cids` and `target_parids`. This is equivalent # to the easier-to-read CPU version enabled by setting `use_cuda = False` - num_nodes = flat_u_cids.size(0) + num_ngroups = flat_u_cids.size(0) num_edges = flat_cids.size(0) flat_u_cids = flat_u_cids.cuda() - n_group_ids = n_group_ids.cuda() - n_id_in_group = n_id_in_group.cuda() + n_partition_ids = n_partition_ids.cuda() + n_id_in_partition = n_id_in_partition.cuda() target_u_cids = target_u_cids.cuda() target_parids = target_parids.cuda() flat_cid2nid = flat_cid2nid.cuda() flat_cids = flat_cids.cuda() - u_cids_group_start = u_cids_group_start.cuda() - parids_group_start = parids_group_start.cuda() - bk_group_max_pars = bk_group_max_pars.cuda() + u_cids_partition_start = u_cids_partition_start.cuda() + parids_partition_start = parids_partition_start.cuda() + bk_partition_max_pars = bk_partition_max_pars.cuda() # We store these constants in a tensor and retrieve them in the kernel - constexprs1 = torch.tensor([num_nodes]).long().cuda() + constexprs1 = torch.tensor([num_ngroups]).long().cuda() constexprs2 = torch.tensor([num_edges]).long().cuda() - grid1 = lambda meta: (triton.cdiv(num_nodes, meta["BLOCK_SIZE"]),) + grid1 = lambda meta: (triton.cdiv(num_ngroups, meta["BLOCK_SIZE"]),) _assign_target_ucids_kernel[grid1]( - target_u_cids, flat_u_cids, n_group_ids, n_id_in_group, - u_cids_group_start, constexprs1, BLOCK_SIZE = 2048 + target_u_cids, flat_u_cids, n_partition_ids, n_id_in_partition, + u_cids_partition_start, constexprs1, BLOCK_SIZE = 2048 ) grid2 = lambda meta: (triton.cdiv(num_edges, meta["BLOCK_SIZE"]),) _assign_prod_target_parids_kernel[grid2]( target_parids, flat_cid2nid, flat_cids, - cid2group_id, cid2local_id, parids_group_start, - flat_par_offsets, bk_group_max_pars, constexprs2, BLOCK_SIZE = 2048 + cid2partition_id, cid2local_id, parids_partition_start, + flat_par_offsets, bk_partition_max_pars, constexprs2, BLOCK_SIZE = 2048 ) target_u_cids = target_u_cids.cpu() u_cids = [] - for group_id in range(num_ns_in_group.size(0)): - sid = u_cids_group_start[group_id] - eid = sid + num_ns_in_group[group_id] + for partition_id in range(num_ns_in_partition.size(0)): + sid = u_cids_partition_start[partition_id] + eid = sid + 
num_ns_in_partition[partition_id] u_cids.append(target_u_cids[sid:eid].contiguous()) target_parids = target_parids.cpu() parids = [] - for group_id in range(num_ns_in_group.size(0)): - sid = parids_group_start[group_id] - gsize = num_ns_in_group[group_id] - gnpar = bk_group_max_pars[group_id] - eid = sid + gsize * gnpar - parids.append(target_parids[sid:eid].reshape(gsize, gnpar).contiguous()) + for partition_id in range(num_ns_in_partition.size(0)): + sid = parids_partition_start[partition_id] + psize = num_ns_in_partition[partition_id] + pnpar = bk_partition_max_pars[partition_id] + eid = sid + psize * pnpar + parids.append(target_parids[sid:eid].reshape(psize, pnpar).contiguous()) else: - u_cids = [torch.zeros([group_size], dtype = torch.long) for group_size in num_ns_in_group] # Node id - parids = [torch.zeros([group_size, max_n_pars], dtype = torch.long) for group_size, max_n_pars in zip(num_ns_in_group, bk_group_max_pars)] # Parent id + u_cids = [torch.zeros([partition_size], dtype = torch.long) for partition_size in num_ns_in_partition] # Node group id + parids = [torch.zeros([partition_size, max_n_pars], dtype = torch.long) for partition_size, max_n_pars in zip(num_ns_in_partition, bk_partition_max_pars)] # Parent group id for idx in range(flat_u_cids.size(0)): cid = flat_u_cids[idx] - # `group_id`: which group the current node belongs to - # `local_id`: the index of the node within the current group - group_id = n_group_ids[idx] - local_id = n_id_in_group[idx] + # `partition_id`: which partition the current node group belongs to + # `local_id`: the index of the node group within the current partition + partition_id = n_partition_ids[idx] + local_id = n_id_in_partition[idx] criterion = (flat_cids == cid) npar = criterion.sum() - u_cids[group_id][local_id] = cid - parids[group_id][local_id,:npar] = flat_cid2nid[criterion] + u_cids[partition_id][local_id] = cid + parids[partition_id][local_id,:npar] = flat_cid2nid[criterion] return u_cids, parids diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 6e4aabef..4038e47e 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -15,63 +15,69 @@ from .compilation import next_power_of_2, get_prod_layer_stats, prod_layer_forward_compilation, \ flatten_c_ids, get_prod_layer_parstats, prod_layer_backward_compilation +from .compilation import get_prod_layer_stats_new, prod_layer_forward_compilation_new + class ProdLayer(Layer, nn.Module): - def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: float = 0.0, - max_num_groups: Optional[int] = None, disable_gpu_compilation: bool = False) -> None: + def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: Optional[float] = None, + max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False) -> None: Layer.__init__(self, nodes) nn.Module.__init__(self) assert len(nodes) > 0, "No input node." + for nid in range(1, len(nodes)): + assert nodes[0].group_size == nodes[nid].group_size, f"`group_size` within a `ProdLayer` should be the same, but found {nodes[0].group_size} and {nodes[nid].group_size}." 
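To make the compiled indices concrete, consider the circuit built in the new `tests/layer/prod_layer_test.py` below: four categorical inputs with two node groups each, np0 = multiply(ni0, ni1), np1 = multiply(ni2, ni3), np2 = multiply(ni1, ni2), and group_size = 4. Indices 0..group_size-1 are reserved for the dummy node, so every group start id below is offset by 4; the values match the assertions in that test.

group_size = 4

# forward compilation: group start ids of np0's product groups and their children
nids_np0 = [1 * group_size, 2 * group_size]               # np0's two product groups
cids_np0 = [[1 * group_size, 3 * group_size],             # ni0 group 0, ni1 group 0
            [2 * group_size, 4 * group_size]]             # ni0 group 1, ni1 group 1

# backward compilation: for every input group, the product groups that use it
child_to_parents = {
     4: [ 4],        8: [ 8],                              # ni0 feeds np0 only
    12: [ 4, 20],   16: [ 8, 24],                          # ni1 feeds np0 and np2
    20: [12, 20],   24: [16, 24],                          # ni2 feeds np1 and np2
    28: [12],       32: [16],                              # ni3 feeds np1 only
}
# `u_cids` holds the keys and `parids` the (padded) value lists above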
+ self.nodes = nodes + self.group_size = nodes[0].group_size ## Get layer statistics & prepare for compilation ## - layer_num_nodes, layer_num_edges, n_chs = get_prod_layer_stats(self.nodes) + layer_num_ngroups, layer_num_edges, n_chgs = get_prod_layer_stats_new(self.nodes, self.group_size) - self.num_nodes = layer_num_nodes + self.num_nodes = layer_num_ngroups * self.group_size self.num_edges = layer_num_edges - # Find a good strategy to partition the nodes into groups according to their number of children + # Find a good strategy to partition the nodes into partitions according to their number of children # to minimize total computation cost - fw_group_max_chs = partition_nodes_by_n_edges( - n_chs, sparsity_tolerance = layer_sparsity_tol, max_num_groups = max_num_groups + fw_partition_max_chs = partition_nodes_by_n_edges( + n_chgs, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions ) - # Since the triton kernels require the maximum number children for each group to be a power of 2, + # Since the triton kernels require the maximum number children for each partition to be a power of 2, # we postprocess the group sizes - fw_group_max_chs = torch.unique(next_power_of_2(fw_group_max_chs)) + fw_partition_max_chs = torch.unique(next_power_of_2(fw_partition_max_chs)) - self.num_fw_groups = len(fw_group_max_chs) # Number of groups + self.num_fw_partitions = len(fw_partition_max_chs) # Number of partitions - # fw_n_group_ids: [num_ns] stores the group id for each `ns` in `nodes` - # fw_n_id_in_group: [num_ns] stores the start index of each `ns` in the group - # fw_num_ns_in_group: [num_fw_groups] number of nodes in each group + # fw_n_partition_ids: [num_ns] stores the partition id for each `ns` in `nodes` + # fw_n_id_in_partition: [num_ns] stores the start index of each `ns` in the corresponding partition + # fw_num_ngs_in_partition: [num_fw_partitions] number of node groups in each partition num_ns = len(self.nodes) - fw_n_group_ids = torch.zeros([num_ns], dtype = torch.long) - fw_n_id_in_group = torch.zeros([num_ns], dtype = torch.long) - fw_num_ns_in_group = torch.zeros([self.num_fw_groups], dtype = torch.long) + fw_n_partition_ids = torch.zeros([num_ns], dtype = torch.long) + fw_n_id_in_partition = torch.zeros([num_ns], dtype = torch.long) + fw_num_ngs_in_partition = torch.zeros([self.num_fw_partitions], dtype = torch.long) for ns_id, ns in enumerate(self.nodes): - group_id = (ns.num_chs > fw_group_max_chs).sum().item() + partition_id = (ns.num_chs > fw_partition_max_chs).sum().item() - fw_n_group_ids[ns_id] = group_id - fw_n_id_in_group[ns_id] = fw_num_ns_in_group[group_id] - fw_num_ns_in_group[group_id] += ns.num_nodes + fw_n_partition_ids[ns_id] = partition_id + fw_n_id_in_partition[ns_id] = fw_num_ngs_in_partition[partition_id] + fw_num_ngs_in_partition[partition_id] += ns.num_node_groups ## Initialize forward pass ## - # nids: List[[group_size]] stores node ids - # cids: List[[group_size, group_max_n_chs]] stores indices of child nodes - nids, cids = prod_layer_forward_compilation( - self.nodes, fw_group_max_chs, fw_n_group_ids, fw_n_id_in_group, fw_num_ns_in_group + # nids: List[[partition_size]] stores node ids + # cids: List[[partition_size, partition_max_n_chs]] stores indices of child nodes + nids, cids = prod_layer_forward_compilation_new( + self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, fw_num_ngs_in_partition, self.group_size ) # Store buffers for the forward pass - self.grouped_nids = 
nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in nids]) - self.grouped_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in cids]) + self.partitioned_nids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in nids]) + self.partitioned_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in cids]) ## Initialize backward pass ## @@ -79,58 +85,58 @@ def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: float = 0.0, # flat_cid2id: mapping from every `flat_cids` to its corresponding `nids` flat_cids, flat_cid2nid = flatten_c_ids(nids, cids) - # flat_u_cids: [num_used_ch_nodes] child node ids that have at least one parent - # par_counts: [num_used_ch_nodes] the number of parents for each child node + # flat_u_cids: [num_used_ch_ngroups] child group ids that have at least one parent + # par_counts: [num_used_ch_ngroups] the number of parents for each child node group # Note: the dummy node has been removed from `flat_u_cids` and `par_counts` flat_u_cids, par_counts = get_prod_layer_parstats(flat_cids) # Find a good strategy to partition the child nodes into groups according to their number of parents # to minimize total computation cost - bk_group_max_pars = partition_nodes_by_n_edges( - par_counts, sparsity_tolerance = layer_sparsity_tol, max_num_groups = max_num_groups + bk_partition_max_pars = partition_nodes_by_n_edges( + par_counts, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions, debug = True ) # Since the triton kernels require the maximum number children for each group to be a power of 2, # we postprocess the group sizes - bk_group_max_pars = torch.unique(next_power_of_2(bk_group_max_pars)) + bk_partition_max_pars = torch.unique(next_power_of_2(bk_partition_max_pars)) - self.num_bk_groups = len(bk_group_max_pars) # Number of groups + self.num_bk_partitions = len(bk_partition_max_pars) # Number of partitions - # bk_n_group_ids: [num_ch_nodes] stores the group id for each `ns` in `nodes` - # bk_n_id_in_group: [num_ch_nodes] stores the start index of each `ns` in the group - # bk_num_ns_in_group: [num_bk_groups] number of nodes in each group - num_ch_nodes = flat_u_cids.size(0) - bk_n_group_ids = torch.zeros([num_ch_nodes], dtype = torch.long) - bk_n_id_in_group = torch.zeros([num_ch_nodes], dtype = torch.long) - bk_num_ns_in_group = torch.zeros([self.num_bk_groups], dtype = torch.long) + # bk_n_partition_ids: [num_ch_ngroups] stores the group id for each `ns` in `nodes` + # bk_n_id_in_partition: [num_ch_ngroups] stores the start index of each `ns` in the partition + # bk_num_ns_in_partition: [num_bk_partitions] number of node groups in each partition + num_ch_ngroups = flat_u_cids.size(0) + bk_n_partition_ids = torch.zeros([num_ch_ngroups], dtype = torch.long) + bk_n_id_in_partition = torch.zeros([num_ch_ngroups], dtype = torch.long) + bk_num_ns_in_partition = torch.zeros([self.num_bk_partitions], dtype = torch.long) min_n_pars = 0 - for group_id, max_n_pars in enumerate(bk_group_max_pars): + for partition_id, max_n_pars in enumerate(bk_partition_max_pars): criterion = (par_counts >= min_n_pars) & (par_counts <= max_n_pars) filtered_idxs = torch.where(criterion)[0] - group_size = criterion.sum().item() + partition_size = criterion.sum().item() - bk_n_group_ids[criterion] = group_id - bk_n_id_in_group[criterion] = torch.arange(group_size) - bk_num_ns_in_group[group_id] = group_size + bk_n_partition_ids[criterion] = partition_id + 
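For the circuit used in `tests/layer/prod_layer_test.py`, the per-child parent counts are [1, 1, 2, 2, 2, 2, 1, 1]. With the defaults (`layer_sparsity_tol = None`, `max_num_partitions = None`) the partitioner returns a single backward partition of width 2, so children with only one parent get a padding slot that points at the dummy element group 0; keeping `element_flows[:group_size, :]` at zero (as the tests do) makes those padded slots contribute nothing. A minimal sketch of that layout, assuming children ordered by group start id:

# parids for the single width-2 partition (group start ids; 0 is the dummy group)
parids = [[ 4,  0],     # ni0 group 0: only parent is np0 group 0, second slot padded
          [ 8,  0],
          [ 4, 20],     # ni1 group 0: parents np0 group 0 and np2 group 0
          [ 8, 24],
          [12, 20],     # ni2 group 0: parents np1 group 0 and np2 group 0
          [16, 24],
          [12,  0],     # ni3 group 0: only parent is np1 group 0
          [16,  0]]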
bk_n_id_in_partition[criterion] = torch.arange(partition_size) + bk_num_ns_in_partition[partition_id] = partition_size min_n_pars = max_n_pars + 1 - # u_cids: List[[group_ch_size]] stores child node ids - # parids: List[[group_ch_size, group_max_n_pars]] stores indices of parent nodes + # u_cids: List[[partition_ch_size]] stores child node group ids + # parids: List[[partition_ch_size, partition_max_n_pars]] stores indices of parent node groups u_cids, parids = prod_layer_backward_compilation( flat_u_cids, flat_cids, flat_cid2nid, - bk_group_max_pars, bk_n_group_ids, bk_n_id_in_group, bk_num_ns_in_group, + bk_partition_max_pars, bk_n_partition_ids, bk_n_id_in_partition, bk_num_ns_in_partition, use_cuda = not disable_gpu_compilation and (flat_cids.size(0) > 4000) ) # Store buffers for the backward pass - self.grouped_u_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in u_cids]) - self.grouped_parids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parids]) + self.partitioned_u_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in u_cids]) + self.partitioned_parids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parids]) def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, _for_backward: bool = False) -> None: """ - Computes the forward pass of a product layer: + Computes the forward pass of a product layer. If `group_size == 1`, it is equivalent to the following: ``` element_mars[nids] = node_mars[cids].sum(dim = 1) ``` @@ -140,29 +146,29 @@ def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, _for_back `element_mars`: [max_num_els, B] """ - if not _for_backward and self.provided("fw_group_local_ids"): + if not _for_backward and self.provided("fw_partition_local_ids"): # Partial evaluation (for forward pass) - for group_id in range(self.num_fw_groups): - nids = self.grouped_nids[group_id] - cids = self.grouped_cids[group_id] - local_ids = self.fw_group_local_ids[group_id] + for partition_id in range(self.num_fw_partitions): + nids = self.partitioned_nids[partition_id] + cids = self.partitioned_cids[partition_id] + local_ids = self.fw_partition_local_ids[partition_id] self._forward_backward(element_mars, node_mars, nids, cids, local_ids = local_ids, accum = False) - elif _for_backward and self.provided("bk_fw_group_local_ids"): + elif _for_backward and self.provided("bk_fw_partition_local_ids"): # Partial evaluation (for backward pass) - for group_id in range(self.num_fw_groups): - nids = self.grouped_nids[group_id] - cids = self.grouped_cids[group_id] - local_ids = self.bk_fw_group_local_ids[group_id] + for partition_id in range(self.num_fw_partitions): + nids = self.partitioned_nids[partition_id] + cids = self.partitioned_cids[partition_id] + local_ids = self.bk_fw_partition_local_ids[partition_id] self._forward_backward(element_mars, node_mars, nids, cids, local_ids = local_ids, accum = False) else: # Evaluate the whole layer - for group_id in range(self.num_fw_groups): - nids = self.grouped_nids[group_id] - cids = self.grouped_cids[group_id] + for partition_id in range(self.num_fw_partitions): + nids = self.partitioned_nids[partition_id] + cids = self.partitioned_cids[partition_id] self._forward_backward(element_mars, node_mars, nids, cids, accum = False) @@ -180,22 +186,22 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor) -> Non `element_flows`: [max_num_els, B] """ - if not 
self.provided("bk_group_local_ids"): - # Evaluate the whole layer - for group_id in range(self.num_bk_groups): - u_cids = self.grouped_u_cids[group_id] - parids = self.grouped_parids[group_id] + if self.provided("bk_partition_local_ids"): + # Partial evaluation + for partition_id in range(self.num_bk_partitions): + u_cids = self.partitioned_u_cids[partition_id] + parids = self.partitioned_parids[partition_id] + local_ids = self.bk_partition_local_ids[partition_id] - self._forward_backward(node_flows, element_flows, u_cids, parids, accum = True) + self._forward_backward(node_flows, element_flows, u_cids, parids, local_ids = local_ids, accum = True) else: - # Partial evaluation - for group_id in range(self.num_bk_groups): - u_cids = self.grouped_u_cids[group_id] - parids = self.grouped_parids[group_id] - local_ids = self.bk_group_local_ids[group_id] + # Evaluate the whole layer + for partition_id in range(self.num_bk_partitions): + u_cids = self.partitioned_u_cids[partition_id] + parids = self.partitioned_parids[partition_id] - self._forward_backward(node_flows, element_flows, u_cids, parids, local_ids = local_ids, accum = True) + self._forward_backward(node_flows, element_flows, u_cids, parids, accum = True) return None @@ -204,145 +210,160 @@ def enable_partial_evaluation(self, fw_scopes: Optional[Sequence[BitSet]] = None # For product layers, we need a special forward pass during the backward process of the circuit if bk_scopes is not None: - bk_fw_group_local_ids = [[] for _ in range(self.num_fw_groups)] + bk_fw_partition_local_ids = [[] for _ in range(self.num_fw_groups)] for scope in bk_scopes: if scope not in self.fw_scope2localids: continue for group_id, ids in enumerate(self.fw_scope2localids[scope]): - bk_fw_group_local_ids[group_id].append(self.fw_scope2localids[scope][group_id]) + bk_fw_partition_local_ids[group_id].append(self.fw_scope2localids[scope][group_id]) - self.bk_fw_group_local_ids = [ - torch.cat(ids, dim = 0) if len(ids) > 0 else torch.zeros([0], dtype = torch.long) for ids in bk_fw_group_local_ids + self.bk_fw_partition_local_ids = [ + torch.cat(ids, dim = 0) if len(ids) > 0 else torch.zeros([0], dtype = torch.long) for ids in bk_fw_partition_local_ids ] @staticmethod @triton.jit - def _forward_backward_kernel(node_vals_ptr, element_vals_ptr, nids_ptr, cids_ptr, tot_n_nodes, - tot_n_eles, n_nodes, n_edges: tl.constexpr, batch_size, - n_nodes_per_block_m: tl.constexpr, BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, accum: tl.constexpr): - - # We use BLOCK_M to index over edges, and BLOCK_N to index over batches - pid0 = tl.program_id(axis = 0) - pid1 = tl.program_id(axis = 1) - ne_start = pid0 * BLOCK_M - b_start = pid1 * BLOCK_N - - # Id of edges processed by the current block - ne_offsets = ne_start + tl.arange(0, BLOCK_M) - # Batch ids processed by the current block - b_offsets = b_start + tl.arange(0, BLOCK_N) - b_mask = b_offsets < batch_size - - # Get node ids from `nids` - n_start = ne_start // n_edges - nid_offsets = n_start + tl.arange(0, n_nodes_per_block_m) - nid_mask = nid_offsets < n_nodes - n_ids = tl.load(nids_ptr + nid_offsets, mask = nid_mask, other = 0) - - # Get edge ids from `cids` - cid_offsets = tl.view(ne_offsets, (n_edges, n_nodes_per_block_m)) - cid_mask = tl.broadcast_to(nid_mask[None,:], (n_edges, n_nodes_per_block_m)) - ch_ids = tl.load(cids_ptr + cid_offsets, mask = cid_mask, other = 0) - - # Use `ch_ids` to retrieve the corresponding element mars - ele_offsets = tl.broadcast_to(ch_ids[None,:,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) * 
batch_size + \ - tl.broadcast_to(b_offsets[:,None,None], (BLOCK_N, n_edges, n_nodes_per_block_m)) - ele_mask = tl.broadcast_to(nid_mask[None,None,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) & \ - tl.broadcast_to(b_mask[:,None,None], (BLOCK_N, n_edges, n_nodes_per_block_m)) - ch_logps = tl.load(element_vals_ptr + ele_offsets, mask = ele_mask, other = 0) - - # Take the sum of the child mars - n_logps = tl.sum(ch_logps, axis = 1) - - # Read out the target indices for `node_vals` - nmar_offsets = tl.broadcast_to(n_ids[None,:], (BLOCK_N, n_nodes_per_block_m)) * batch_size + \ - tl.broadcast_to(b_offsets[:,None], (BLOCK_N, n_nodes_per_block_m)) - nmar_mask = tl.broadcast_to(nid_mask[None,:], (BLOCK_N, n_nodes_per_block_m)) & \ - tl.broadcast_to(b_mask[:,None], (BLOCK_N, n_nodes_per_block_m)) - - # Accumulate the `node_vals`` if required - if accum == 1: - node_vals = tl.load(node_vals_ptr + nmar_offsets, mask = nmar_mask, other = 0) - n_logps += node_vals + def _forward_backward_kernel(node_vals_ptr, element_vals_ptr, local_ids_ptr, nids_ptr, cids_ptr, tot_n_nodes, tot_n_eles, n_ngroups, + n_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, + group_size: tl.constexpr, accum: tl.constexpr, partial_eval: tl.constexpr): - tl.store(node_vals_ptr + nmar_offsets, n_logps, mask = nmar_mask) + pid_m = tl.program_id(axis = 0) # ID of size-`BLOCK_M` nodes + pid_b = tl.program_id(axis = 1) # ID of size-`BLOCK_B` batches + + if group_size >= BLOCK_M: + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (group_size // BLOCK_M) + ntile_id = pid_m % (group_size // BLOCK_M) + + # For partial evaluation + if partial_eval == 1: + ngroup_id = tl.load(local_ids_ptr + ngroup_id) + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Get the group start ids for the children + # To make the triton compiler happy, we reload every index `BLOCK_M` times + offs_ne = tl.arange(0, n_edges * BLOCK_M) // BLOCK_M + offs_ne = tl.view(offs_ne, (BLOCK_M, n_edges)) + offs_egstart = tl.load(cids_ptr + ngroup_id * n_edges + offs_ne) # [BLOCK_M, n_edges] + + # Get the edge values from child nodes + group_nids = tl.arange(0, BLOCK_M) + ntile_id * BLOCK_M + offs_evals = offs_egstart + group_nids[:,None] + evals = tl.load(element_vals_ptr + offs_evals[None,:,:] * batch_size + offs_batch[:,None,None], mask = mask_batch[:,None,None]) + + # Take the sum of the child nodes' log-probabilities + nvals = tl.sum(evals, axis = 2) + + # Node ids to `node_vals_ptr` + ngroup_start = tl.load(nids_ptr + ngroup_id) + offs_nvals = (ngroup_start + group_nids[None,:]) * batch_size + offs_batch[:,None] + + # Accumulate the `node_vals` if required + if accum == 1: + node_vals = tl.load(node_vals_ptr + offs_nvals, mask = mask_batch[:,None], other = 0) + nvals += node_vals + + tl.store(node_vals_ptr + offs_nvals, nvals, mask = mask_batch[:,None]) + + else: + + # Node offsets and mask + offs_node = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + mask_node = offs_node < n_ngroups * group_size + + # Inferred group ids + ngroup_ids = offs_node // group_size + + # For partial evaluation + if partial_eval == 1: + ngroup_ids = tl.load(local_ids_ptr + ngroup_ids, mask = mask_node) + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Get the group start ids for the children + offs_ne = tl.arange(0, n_edges * BLOCK_M) // BLOCK_M + offs_ne = tl.view(offs_ne, (BLOCK_M, n_edges)) + 
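A small example of how program ids map to rows in the `group_size >= BLOCK_M` branch above: each node group is covered by `group_size // BLOCK_M` consecutive programs along the first grid axis (the numbers here are illustrative).

group_size, BLOCK_M = 8, 4                       # two tiles of 4 nodes per node group
pid_m = 5
ngroup_id = pid_m // (group_size // BLOCK_M)     # -> 2, the third node group
ntile_id  = pid_m %  (group_size // BLOCK_M)     # -> 1, its second tile of rows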
offs_egstart = tl.load(cids_ptr + ngroup_ids[:,None] * n_edges + offs_ne, mask = mask_node[:,None]) # [BLOCK_M, n_edges] + + # Get the edge values from child nodes + group_nids = (offs_node % group_size) + offs_evals = offs_egstart + group_nids[:,None] + evals = tl.load(element_vals_ptr + offs_evals[None,:,:] * batch_size + offs_batch[:,None,None], mask = (mask_batch[:,None,None] & mask_node[None,:,None])) + + # Take the sum of the child nodes' log-probabilities + nvals = tl.sum(evals, axis = 2) + + # Node ids to `node_vals_ptr` + ngroup_start = tl.load(nids_ptr + ngroup_ids[None,:]) + offs_nvals = (ngroup_start + group_nids[None,:]) * batch_size + offs_batch[:,None] + + # Accumulate the `node_vals` if required + if accum == 1: + node_vals = tl.load(node_vals_ptr + offs_nvals, mask = mask_batch[:,None], other = 0) + nvals += node_vals + + tl.store(node_vals_ptr + offs_nvals, nvals, mask = mask_batch[:,None]) @staticmethod @torch.compile(mode = "reduce-overhead", fullgraph = True) def _forward_backward_pytorch(node_vals, element_vals, nids, cids, accum: bool = False): + nids = nids[:,None] + torch.arange(0, self.group_size, device = node_vals.device)[None,:] + cids = cids[:,None,:] + torch.arange(0, self.group_size, device = node_vals.device)[None,:,None] if accum: - node_vals[nids] += element_vals[cids].sum(dim = 1) + node_vals[nids] += element_vals[cids].sum(dim = 2) else: - node_vals[nids] = element_vals[cids].sum(dim = 1) + node_vals[nids] = element_vals[cids].sum(dim = 2) return None - def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, + def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, - BLOCK_M_HARD_LIMIT = 2**16, BLOCK_SIZE = 2**12, MAX_BLOCK_M = 512, - MAX_BLOCK_N = 64, accum: bool = False) -> None: - """ - This function is equivalent to running: - ``` node_vals[nids] = element_vals[cids].sum(dim = 1) ``` - - Parameters: - `node_vals`: [N, B] - `element_vals`: [M, B] - `nids`: [n] - `cids`: [n, c] - """ - - if local_ids is not None and local_ids.size(0) == 0: - # Nothing need to be evaluated in the current group - return None - elif local_ids is not None: - # Select nodes - nids = nids[local_ids].contiguous() - cids = cids[local_ids,:].contiguous() + accum: bool = False) -> None: + if local_ids is not None: + raise NotImplementedError() tot_n_nodes = node_vals.size(0) tot_n_eles = element_vals.size(0) - n_nodes = nids.size(0) + n_ngroups = nids.size(0) if local_ids is None else local_ids.size(0) n_edges = cids.size(1) batch_size = node_vals.size(1) + assert n_edges & (n_edges - 1) == 0, "`n_edges` must be power of 2." + # Fall back to the `torch.compile` kernel in the case where we cannot store child edges within a single block - if n_edges > BLOCK_M_HARD_LIMIT or not node_vals.is_cuda: - self._forward_backward_pytorch(node_vals, element_vals, nids, cids) + if n_edges > 1024: + self._forward_backward_pytorch(node_vals, element_vals, nids, cids, accum = accum) return None - assert n_edges <= BLOCK_M_HARD_LIMIT, "Number of edges should be smaller than or equal to MAX_BLOCK_M." - assert n_edges & (n_edges - 1) == 0, "`n_edges` must be power of 2." 
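The `torch.compile` fallback above is declared as a `@staticmethod` but still reads `self.group_size`, so as written it would raise a `NameError` when the large-fanout path is taken. A standalone sketch of the intended computation, with the group size passed in explicitly:

def _forward_backward_pytorch_sketch(node_vals, element_vals, nids, cids, group_size, accum = False):
    ar = torch.arange(0, group_size, device = node_vals.device)
    n_ids = nids[:, None] + ar[None, :]                # [n_ngroups, group_size]
    c_ids = cids[:, None, :] + ar[None, :, None]       # [n_ngroups, group_size, n_edges]
    if accum:
        node_vals[n_ids] += element_vals[c_ids].sum(dim = 2)
    else:
        node_vals[n_ids] = element_vals[c_ids].sum(dim = 2)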
- - if n_edges <= MAX_BLOCK_M: - # In this case, we can find a better thread-block balance - MIN_BLOCK_M = min(triton.next_power_of_2(n_edges), MAX_BLOCK_M) - BLOCK_N = min(BLOCK_SIZE // MIN_BLOCK_M, MAX_BLOCK_N, triton.next_power_of_2(batch_size)) - BLOCK_M = min(BLOCK_SIZE // BLOCK_N, MAX_BLOCK_M) - else: - # Try to fit all edges of a node in a single thread-block - BLOCK_M = triton.next_power_of_2(n_edges) - BLOCK_N = max(BLOCK_SIZE // BLOCK_M, 1) + BLOCK_B = min(1024 // n_edges, triton.next_power_of_2(batch_size)) + BLOCK_M = min(max(1024 // (BLOCK_B * n_edges), 1), triton.next_power_of_2(n_ngroups) * self.group_size) - grid = (triton.cdiv(n_nodes * n_edges, BLOCK_M), triton.cdiv(batch_size, BLOCK_N), 1) + grid = (triton.cdiv(n_ngroups * self.group_size, BLOCK_M), triton.cdiv(batch_size, BLOCK_B)) self._forward_backward_kernel[grid]( node_vals_ptr = node_vals, - element_vals_ptr = element_vals, + element_vals_ptr = element_vals, + local_ids_ptr = local_ids, nids_ptr = nids, cids_ptr = cids, tot_n_nodes = tot_n_nodes, tot_n_eles = tot_n_eles, - n_nodes = n_nodes, + n_ngroups = n_ngroups, n_edges = n_edges, batch_size = batch_size, - n_nodes_per_block_m = BLOCK_M // n_edges, BLOCK_M = BLOCK_M, - BLOCK_N = BLOCK_N, - accum = 1 if accum else 0 + BLOCK_B = BLOCK_B, + group_size = self.group_size, + accum = 1 if accum else 0, + partial_eval = 1 if local_ids is not None else 0 ) return None diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 7033b45e..6aea6c07 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -24,7 +24,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, param_ends: Sequence, tied_param_ids: Sequence, tied_param_group_ids: Sequence, tied_param_ends: Sequence, ch_prod_layer_size: int, layer_sparsity_tol: float = 0.0, - max_num_groups: Optional[int] = None, + max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False) -> None: Layer.__init__(self, nodes) @@ -46,7 +46,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # Find a good strategy to partition the nodes into groups according to their number of children # to minimize total computation cost fw_group_max_chs = partition_nodes_by_n_edges( - n_chs, sparsity_tolerance = layer_sparsity_tol, max_num_groups = max_num_groups + n_chs, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions ) # Since the triton kernels require the maximum number children for each group to be a power of 2, @@ -97,7 +97,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # to minimize total computation cost ch_n_pars = ch_n_pars[1:] # Strip away the dummy node. 
We will never use it in the following bk_group_max_pars = partition_nodes_by_n_edges( - ch_n_pars, sparsity_tolerance = layer_sparsity_tol, max_num_groups = max_num_groups + ch_n_pars, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions ) # Since the triton kernels require the maximum number children for each group to be a power of 2, diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index 8284922c..abe08a14 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -49,7 +49,7 @@ def _pc_inputs_hook(grad, pc, i): class TensorCircuit(nn.Module): def __init__(self, root_nodes: CircuitNodes, layer_sparsity_tol: float = 0.5, - max_num_groups: Optional[int] = None, disable_gpu_compilation: bool = False, + max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False, verbose: bool = True) -> None: """ Create a tensorized circuit for the circuit rooted at `root_nodes`. @@ -57,7 +57,7 @@ def __init__(self, root_nodes: CircuitNodes, layer_sparsity_tol: float = 0.5, Parameters: `root_nodes`: root node(s) of the circuit `layer_sparsity_tol`: the minimum allowed sparsity of compiled layers; ranges from 0.0 to 1.0; larger means more strict - `max_num_groups`: how many groups do we want to split a layer into + `max_num_partitions`: how many groups do we want to split a layer into `disable_gpu_compilation`: disable GPU compilation of the layers """ @@ -68,7 +68,7 @@ def __init__(self, root_nodes: CircuitNodes, layer_sparsity_tol: float = 0.5, self._init_pass_tensors() self._init_layers( - layer_sparsity_tol = layer_sparsity_tol, max_num_groups = max_num_groups, + layer_sparsity_tol = layer_sparsity_tol, max_num_partitions = max_num_partitions, disable_gpu_compilation = disable_gpu_compilation, verbose = verbose ) self._init_ad_tensors() @@ -524,7 +524,7 @@ def disable_partial_evaluation(self, forward: bool = True, backward: bool = True def _init_layers(self, init_input_params: Optional[Sequence[torch.Tensor]] = None, init_inner_params: Optional[torch.Tensor] = None, - layer_sparsity_tol: float = 0.0, max_num_groups: Optional[int] = None, + layer_sparsity_tol: float = 0.0, max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False, verbose: bool = True): self.root_nodes._clear_tensor_circuit_hooks() @@ -579,7 +579,7 @@ def _init_layers(self, init_input_params: Optional[Sequence[torch.Tensor]] = Non prod_layer = ProdLayer( nodes = depth2nodes[depth]["prod"], layer_sparsity_tol = layer_sparsity_tol, - max_num_groups = max_num_groups, + max_num_partitions = max_num_partitions, disable_gpu_compilation = disable_gpu_compilation ) @@ -599,7 +599,7 @@ def _init_layers(self, init_input_params: Optional[Sequence[torch.Tensor]] = Non tied_param_ends = tied_param_ends, ch_prod_layer_size = prod_layer.num_nodes + 1, layer_sparsity_tol = layer_sparsity_tol, - max_num_groups = max_num_groups, + max_num_partitions = max_num_partitions, disable_gpu_compilation = disable_gpu_compilation ) diff --git a/tests/layer/input_layer_test.py b/tests/layer/input_layer_test.py index d7ac02e3..0cc0e63c 100644 --- a/tests/layer/input_layer_test.py +++ b/tests/layer/input_layer_test.py @@ -180,7 +180,7 @@ def speed_test(): nis = [] for v in range(num_vars): - nis.append(inputs(0, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 64))) + nis.append(inputs(v, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 64))) layer = InputLayer(nis, cum_nodes = 1) @@ 
-197,7 +197,7 @@ def speed_test(): t0 = time.time() torch.cuda.synchronize() - for _ in range(2): + for _ in range(100): layer(data, node_mars) torch.cuda.synchronize() t1 = time.time() diff --git a/tests/layer/prod_layer_test.py b/tests/layer/prod_layer_test.py new file mode 100644 index 00000000..f81b7177 --- /dev/null +++ b/tests/layer/prod_layer_test.py @@ -0,0 +1,171 @@ +import pyjuice as juice +import torch +import numpy as np +import time +import random + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer, ProdLayer + +import pytest + + +def prod_layer_test(): + + device = torch.device("cuda:0") + + for (group_size, batch_size) in [(1, 16), (8, 512)]: + + with juice.set_group_size(group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + + np0 = multiply(ni0, ni1) + np1 = multiply(ni2, ni3) + np2 = multiply(ni1, ni2) + + input_layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = group_size) + + layer = ProdLayer([np0, np1, np2]) + + assert torch.all(layer.partitioned_nids[0] == torch.arange(group_size, 7*group_size, group_size)) + assert layer.partitioned_cids[0][0,0] == group_size + assert layer.partitioned_cids[0][0,1] == 3 * group_size + assert layer.partitioned_cids[0][1,0] == 2 * group_size + assert layer.partitioned_cids[0][1,1] == 4 * group_size + assert layer.partitioned_cids[0][2,0] == 5 * group_size + assert layer.partitioned_cids[0][2,1] == 7 * group_size + assert layer.partitioned_cids[0][3,0] == 6 * group_size + assert layer.partitioned_cids[0][3,1] == 8 * group_size + assert layer.partitioned_cids[0][4,0] == 3 * group_size + assert layer.partitioned_cids[0][4,1] == 5 * group_size + assert layer.partitioned_cids[0][5,0] == 4 * group_size + assert layer.partitioned_cids[0][5,1] == 6 * group_size + + layer.to(device) + + node_mars = torch.rand([group_size + group_size * 2 * 4, batch_size]).log().to(device) + element_mars = torch.zeros([group_size + 3 * 2 * 2 * group_size, batch_size]).to(device) + + ## Forward tests ## + + layer(node_mars, element_mars) + + for i in range(group_size): + assert torch.all(torch.abs(element_mars[group_size+i,:] - (node_mars[group_size+i,:] + node_mars[3*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(element_mars[2*group_size+i,:] - (node_mars[2*group_size+i,:] + node_mars[4*group_size+i,:])) < 1e-4) + + assert torch.all(torch.abs(element_mars[3*group_size+i,:] - (node_mars[5*group_size+i,:] + node_mars[7*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(element_mars[4*group_size+i,:] - (node_mars[6*group_size+i,:] + node_mars[8*group_size+i,:])) < 1e-4) + + assert torch.all(torch.abs(element_mars[5*group_size+i,:] - (node_mars[3*group_size+i,:] + node_mars[5*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(element_mars[6*group_size+i,:] - (node_mars[4*group_size+i,:] + node_mars[6*group_size+i,:])) < 1e-4) + + ## Backward tests ## + + element_flows = torch.rand([group_size + 3 * 2 * 2 * group_size, batch_size]).to(device) + element_flows[:group_size,:] = 0.0 + node_flows = torch.zeros([group_size + group_size * 2 * 4, batch_size]).to(device) + + layer(node_mars, element_mars) + 
layer.backward(node_flows, element_flows) + + for i in range(group_size): + assert torch.all(torch.abs(node_flows[group_size+i,:] - element_flows[group_size+i,:]) < 1e-4) + assert torch.all(torch.abs(node_flows[2*group_size+i,:] - element_flows[2*group_size+i,:]) < 1e-4) + + assert torch.all(torch.abs(node_flows[3*group_size+i,:] - (element_flows[group_size+i,:] + element_flows[5*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(node_flows[4*group_size+i,:] - (element_flows[2*group_size+i,:] + element_flows[6*group_size+i,:])) < 1e-4) + + assert torch.all(torch.abs(node_flows[5*group_size+i,:] - (element_flows[3*group_size+i,:] + element_flows[5*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(node_flows[6*group_size+i,:] - (element_flows[4*group_size+i,:] + element_flows[6*group_size+i,:])) < 1e-4) + + assert torch.all(torch.abs(node_flows[7*group_size+i,:] - element_flows[3*group_size+i,:]) < 1e-4) + assert torch.all(torch.abs(node_flows[8*group_size+i,:] - element_flows[4*group_size+i,:]) < 1e-4) + + +def speed_test(): + + device = torch.device("cuda:0") + + group_size = 16 + num_vars = 28*28 + num_node_groups = 256 // group_size + num_prod_nodes = 200 + + batch_size = 512 + + with juice.set_group_size(group_size): + + nis = [] + for v in range(num_vars): + nis.append(inputs(v, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 64))) + + nps = [] + for i in range(num_prod_nodes): + v1 = random.randint(0, num_vars - 1) + v2 = random.randint(0, num_vars - 1) + if v1 == v2: + if v1 == num_vars - 1: + v1 -= 2 + v2 = v1 + 1 + + nps.append(multiply(nis[v1], nis[v2])) + + input_layer = InputLayer(nis, cum_nodes = group_size) + + layer = ProdLayer(nps, layer_sparsity_tol = 0.1) + + import pdb; pdb.set_trace() + + layer.to(device) + + node_mars = torch.rand([group_size + group_size * num_node_groups * num_vars, batch_size]).log().to(device) + element_mars = torch.zeros([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).to(device) + + ## Forward tests ## + + layer(node_mars, element_mars) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + layer(node_mars, element_mars) + torch.cuda.synchronize() + t1 = time.time() + forward_ms = (t1 - t0) / 100 * 1000 + + print(f"Forward pass on average takes {forward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 0.330ms.") + print("--------------------------------------------------------------") + + element_flows = torch.rand([group_size + num_prod_nodes * num_node_groups * group_size, batch_size]).to(device) + element_flows[:group_size,:] = 0.0 + node_flows = torch.zeros([group_size + group_size * num_node_groups * num_vars, batch_size]).to(device) + + layer(node_mars, element_mars) + layer.backward(node_flows, element_flows) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + layer.backward(node_flows, element_flows) + torch.cuda.synchronize() + t1 = time.time() + forward_ms = (t1 - t0) / 100 * 1000 + + print(f"Backward pass on average takes {forward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 0.330ms.") + print("--------------------------------------------------------------") + + +if __name__ == "__main__": + # prod_layer_test() + speed_test() From b6f9c9cf5180b29aac9f220fd54ed24eb4ab9f3b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 3 Dec 2023 22:05:33 +0800 Subject: [PATCH 019/162] fix bug in sparse tolerance --- src/pyjuice/layer/backend/node_partition.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff 
--git a/src/pyjuice/layer/backend/node_partition.py b/src/pyjuice/layer/backend/node_partition.py index e98c254e..ebf41332 100644 --- a/src/pyjuice/layer/backend/node_partition.py +++ b/src/pyjuice/layer/backend/node_partition.py @@ -156,6 +156,9 @@ def _weighted_partition_nodes_dp_simple(node_n_edges: np.ndarray, counts: np.nda dp = np.zeros([node_n_edges.shape[0], max_num_partitions + 1], dtype = np.int64) backtrace = np.zeros([node_n_edges.shape[0], max_num_partitions + 1], dtype = np.int64) + # if debug: + # import pdb; pdb.set_trace() + overhead, target_n_group = _weighted_partition_nodes_dp_simple_compiled( np.ascontiguousarray(node_n_edges), np.ascontiguousarray(cum_counts), @@ -176,7 +179,7 @@ def _weighted_partition_nodes_dp_simple(node_n_edges: np.ndarray, counts: np.nda def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], max_num_partitions: Optional[int] = None, sparsity_tolerance: Optional[float] = None, - algorithm: str = "dp_with_coalesce", debug = False): + algorithm: str = "dp_with_coalesce"): if sparsity_tolerance is not None and sparsity_tolerance < 1e-6: sparsity_tolerance = None @@ -195,7 +198,7 @@ def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], node_n_edges = node_n_edges.detach().cpu().numpy() total_num_edges = node_n_edges.sum() - target_overhead = None if sparsity_tolerance is None else int(math.ceil(total_num_edges / sparsity_tolerance)) + target_overhead = None if sparsity_tolerance is None else int(math.ceil(total_num_edges * sparsity_tolerance)) if max_num_partitions == 1: partitions = np.zeros([1], dtype = np.int64) @@ -210,8 +213,6 @@ def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], elif algorithm == "dp_with_coalesce": unique_n_edges, counts = _coalesce(node_n_edges, tol_range = "auto") - if debug: - import pdb; pdb.set_trace() group_sizes, overhead = _weighted_partition_nodes_dp_simple(unique_n_edges, counts, max_num_partitions, target_overhead) else: From a32163a731591a84731c67b39fccf7f3a308c407 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 3 Dec 2023 22:09:06 +0800 Subject: [PATCH 020/162] clean up --- src/pyjuice/layer/compilation.py | 57 ++------------------------------ src/pyjuice/layer/prod_layer.py | 8 ++--- tests/layer/prod_layer_test.py | 7 ++-- 3 files changed, 8 insertions(+), 64 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 89cf3e9f..2d5b0267 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -945,30 +945,7 @@ def sum_layer_backward_compilation(nodes, pids, fw_n_group_ids, fw_n_id_in_group ## Compilation for ProdLayer ## -def get_prod_layer_stats(nodes: Sequence[SumNodes]): - layer_num_nodes = sum(map(lambda ns: ns.num_nodes, nodes)) - layer_num_edges = 0 - - global_nid_start = 1 # idx 0 is reserved for the dummy node - - n_sid = 0 - n_chs = torch.zeros([layer_num_nodes], dtype = torch.long) - for ns_idx, ns in enumerate(nodes): - n_eid = n_sid + ns.num_nodes - - n_chs[n_sid:n_eid] = ns.num_chs - - layer_num_edges += ns.num_nodes * ns.num_chs - - ns._output_ind_range = (global_nid_start, global_nid_start + ns.num_nodes) - global_nid_start += ns.num_nodes - - n_sid = n_eid - - return layer_num_nodes, layer_num_edges, n_chs - - -def get_prod_layer_stats_new(nodes: Sequence[SumNodes], group_size: int): +def get_prod_layer_stats(nodes: Sequence[SumNodes], group_size: int): layer_num_ngroup = sum(map(lambda ns: ns.num_node_groups, nodes)) layer_num_edges = 0 @@ -992,37 
+969,7 @@ def get_prod_layer_stats_new(nodes: Sequence[SumNodes], group_size: int): @torch.no_grad() -def prod_layer_forward_compilation(nodes, fw_group_max_chs, n_group_ids, n_id_in_group, num_ns_in_group, use_cuda: bool = False): - - if use_cuda and not torch.cuda.is_available(): - use_cuda = False - - nids = [torch.zeros([group_size], dtype = torch.long) for group_size in num_ns_in_group] # Node id - cids = [torch.zeros([group_size, max_chs], dtype = torch.long) for group_size, max_chs in zip(num_ns_in_group, fw_group_max_chs)] # Child id - - n_sid = 1 # offset the dummy node - for ns_id, ns in enumerate(nodes): - n_eid = n_sid + ns.num_nodes - - # `group_id`: which group the current node belongs to - # `local_sid`: the start index of the node within the current group - # `group_nchs`: maximum number of child nodes in the current group - group_id = n_group_ids[ns_id] - local_sid = n_id_in_group[ns_id] - local_eid = local_sid + ns.num_nodes - group_nchs = fw_group_max_chs[group_id] - - nids[group_id][local_sid:local_eid] = torch.arange(ns.num_nodes) + n_sid - for cs_id, cs in enumerate(ns.chs): - cids[group_id][local_sid:local_eid,cs_id] = ns.edge_ids[:,cs_id] + cs._output_ind_range[0] - - n_sid = n_eid - - return nids, cids - - -@torch.no_grad() -def prod_layer_forward_compilation_new(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, group_size, use_cuda: bool = False): +def prod_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, group_size, use_cuda: bool = False): if use_cuda and not torch.cuda.is_available(): use_cuda = False diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 4038e47e..2f48c5ae 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -15,8 +15,6 @@ from .compilation import next_power_of_2, get_prod_layer_stats, prod_layer_forward_compilation, \ flatten_c_ids, get_prod_layer_parstats, prod_layer_backward_compilation -from .compilation import get_prod_layer_stats_new, prod_layer_forward_compilation_new - class ProdLayer(Layer, nn.Module): @@ -35,7 +33,7 @@ def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: Optional[floa ## Get layer statistics & prepare for compilation ## - layer_num_ngroups, layer_num_edges, n_chgs = get_prod_layer_stats_new(self.nodes, self.group_size) + layer_num_ngroups, layer_num_edges, n_chgs = get_prod_layer_stats(self.nodes, self.group_size) self.num_nodes = layer_num_ngroups * self.group_size self.num_edges = layer_num_edges @@ -71,7 +69,7 @@ def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: Optional[floa # nids: List[[partition_size]] stores node ids # cids: List[[partition_size, partition_max_n_chs]] stores indices of child nodes - nids, cids = prod_layer_forward_compilation_new( + nids, cids = prod_layer_forward_compilation( self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, fw_num_ngs_in_partition, self.group_size ) @@ -93,7 +91,7 @@ def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: Optional[floa # Find a good strategy to partition the child nodes into groups according to their number of parents # to minimize total computation cost bk_partition_max_pars = partition_nodes_by_n_edges( - par_counts, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions, debug = True + par_counts, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions ) # Since the triton 
kernels require the maximum number children for each group to be a power of 2, diff --git a/tests/layer/prod_layer_test.py b/tests/layer/prod_layer_test.py index f81b7177..50493480 100644 --- a/tests/layer/prod_layer_test.py +++ b/tests/layer/prod_layer_test.py @@ -123,8 +123,6 @@ def speed_test(): layer = ProdLayer(nps, layer_sparsity_tol = 0.1) - import pdb; pdb.set_trace() - layer.to(device) node_mars = torch.rand([group_size + group_size * num_node_groups * num_vars, batch_size]).log().to(device) @@ -162,10 +160,11 @@ def speed_test(): forward_ms = (t1 - t0) / 100 * 1000 print(f"Backward pass on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 0.330ms.") + print("Reference computation time on RTX 4090: 0.533ms.") print("--------------------------------------------------------------") if __name__ == "__main__": - # prod_layer_test() + torch.manual_seed(2390) + prod_layer_test() speed_test() From ac2b5d26255607829fd03231b42a40c4c25a641f Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 4 Dec 2023 01:56:32 +0800 Subject: [PATCH 021/162] fix --- src/pyjuice/layer/prod_layer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 2f48c5ae..c853f761 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -50,8 +50,8 @@ def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: Optional[floa self.num_fw_partitions = len(fw_partition_max_chs) # Number of partitions - # fw_n_partition_ids: [num_ns] stores the partition id for each `ns` in `nodes` - # fw_n_id_in_partition: [num_ns] stores the start index of each `ns` in the corresponding partition + # fw_n_partition_ids: [num_ns] stores the partition id for each `ns` in `nodes` + # fw_n_id_in_partition: [num_ns] stores the start index of each `ns` in the corresponding partition # fw_num_ngs_in_partition: [num_fw_partitions] number of node groups in each partition num_ns = len(self.nodes) fw_n_partition_ids = torch.zeros([num_ns], dtype = torch.long) From 4bc2e4a13c4b2124bb64dd0cac56c6ec6fb006b7 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 4 Dec 2023 22:13:24 +0800 Subject: [PATCH 022/162] refactor: forward pass of `SumLayer` --- src/pyjuice/layer/compilation.py | 134 +++++++------ src/pyjuice/layer/layer.py | 2 + src/pyjuice/layer/prod_layer.py | 135 ++++++++++--- src/pyjuice/layer/sum_layer.py | 320 +++++++++++++++++++++++-------- tests/layer/sum_layer_test.py | 143 ++++++++++++++ 5 files changed, 567 insertions(+), 167 deletions(-) create mode 100644 tests/layer/sum_layer_test.py diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 2d5b0267..a30ee1f4 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -61,24 +61,25 @@ def next_power_of_2(x: torch.Tensor): def get_sum_layer_stats(nodes: Sequence[SumNodes], global_nid_start: int): - layer_num_nodes = sum(map(lambda ns: ns.num_nodes, nodes)) + layer_num_ngroups = sum(map(lambda ns: ns.num_node_groups, nodes)) layer_num_edges = 0 n_sid = 0 - n_chs = torch.zeros([layer_num_nodes], dtype = torch.long) + n_chs = torch.zeros([layer_num_ngroups], dtype = torch.long) for ns_idx, ns in enumerate(nodes): - n_eid = n_sid + ns.num_nodes + n_eid = n_sid + ns.num_node_groups curr_n_chs = torch.bincount(ns.edge_ids[0,:]) - n_chs[n_sid:n_eid] = curr_n_chs + # To maximize flexibility, we point to individual child nodes instead of a node group + n_chs[n_sid:n_eid] = curr_n_chs 
* ns.ch_group_size ns._output_ind_range = (global_nid_start, global_nid_start + ns.num_nodes) global_nid_start += ns.num_nodes - layer_num_edges += ns.edge_ids.size(1) + layer_num_edges += ns.num_edges n_sid = n_eid - return layer_num_nodes, layer_num_edges, n_chs + return layer_num_ngroups, layer_num_edges, n_chs @torch.no_grad() @@ -330,9 +331,9 @@ def _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids): @triton.jit -def _assign_target_ncpids_kernel(target_nids_ptr, nids_group_start_ptr, target_cids_ptr, pcids_group_start_ptr, - target_pids_ptr, edge_ids_ptr, chs_offsets_ptr, n_group_ids_ptr, n_id_in_group_ptr, - cs_ele_id_start_ptr, cs_node_cum_ids_ptr, fw_group_max_chs_ptr, cum_n_chs_ptr, +def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, target_cids_ptr, pcids_partition_start_ptr, + target_pids_ptr, edge_ids_ptr, chs_offsets_ptr, n_partition_ids_ptr, n_id_in_partition_ptr, + cs_ele_id_start_ptr, cs_node_cum_ids_ptr, fw_partition_max_chs_ptr, cum_n_chs_ptr, ns_param_ids_ptr, ch_n_pars_ptr, constexprs_ptr, num_chs: tl.constexpr, num_chs_np2: tl.constexpr, add_params_flag: tl.constexpr, BLOCK_SIZE: tl.constexpr): @@ -342,20 +343,21 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_group_start_ptr, target_c # Retrieve all constexprs global_nid_start = tl.load(constexprs_ptr) ns_pid_start = tl.load(constexprs_ptr + 1) - node_start = tl.load(constexprs_ptr + 2) + ngroup_start = tl.load(constexprs_ptr + 2) num_edges = tl.load(constexprs_ptr + 3) + group_size = tl.load(constexprs_ptr + 4) # Get edge indices to be processed by the current block offsets = block_start + tl.arange(0, BLOCK_SIZE) mask = offsets < num_edges - # Get `nid` and `cid` + # Get `nid` and `cid` (size of `edge_ids` is [2, num_edges]) nid = tl.load(edge_ids_ptr + offsets, mask = mask, other = 0) cid = tl.load(edge_ids_ptr + offsets + num_edges, mask = mask, other = 0) - # Get `group_id` and `local_id` - group_id = tl.load(n_group_ids_ptr + nid + node_start, mask = mask, other = 0) - local_id = tl.load(n_id_in_group_ptr + nid + node_start, mask = mask, other = 0) + # Get `partition_id` and `local_id` + partition_id = tl.load(n_partition_ids_ptr + nid + ngroup_start, mask = mask, other = 0) + local_id = tl.load(n_id_in_partition_ptr + nid + ngroup_start, mask = mask, other = 0) # Get the child ns index every `cid` belongs to and the cum nodes & global sid cs_offsets = tl.arange(0, num_chs_np2) @@ -369,18 +371,19 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_group_start_ptr, target_c cs_ele_ind = tl.load(cs_ele_id_start_ptr + cid_node_id, mask = mask, other = 0) # Get child offsets + # Note: this is the `?` mark in `cids[group_id][local_id,?]` chs_offset = tl.load(chs_offsets_ptr + offsets, mask = mask, other = 0) # Store to `target_nids` - nids_start = tl.load(nids_group_start_ptr + group_id, mask = mask, other = 0) - global_nid = global_nid_start + node_start + nid + nids_start = tl.load(nids_partition_start_ptr + partition_id, mask = mask, other = 0) + global_nid = global_nid_start + (ngroup_start + nid) * group_size tl.store(target_nids_ptr + nids_start + local_id, global_nid, mask = mask) # Store to `target_cids` - group_max_n_chs = tl.load(fw_group_max_chs_ptr + group_id, mask = mask, other = 0) - pcids_start = tl.load(pcids_group_start_ptr + group_id, mask = mask, other = 0) - pcids_offsets = pcids_start + local_id * group_max_n_chs + chs_offset - global_cid = cid + cs_ele_ind - cs_cum_num + partition_max_n_chs = tl.load(fw_partition_max_chs_ptr + partition_id, mask = mask, 
other = 0) + pcids_start = tl.load(pcids_partition_start_ptr + partition_id, mask = mask, other = 0) + pcids_offsets = pcids_start + local_id * partition_max_n_chs + chs_offset + global_cid = cs_ele_ind + cid - cs_cum_num tl.store(target_cids_ptr + pcids_offsets, global_cid, mask = mask) # Cumulate number of parents for every child node @@ -388,7 +391,7 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_group_start_ptr, target_c # Store to `target_pids` ns_local_pid = tl.load(cum_n_chs_ptr + nid, mask = mask, other = 0) - global_pid = chs_offset + ns_pid_start + ns_local_pid + global_pid = ns_pid_start + (ns_local_pid + chs_offset) * group_size tl.store(target_pids_ptr + pcids_offsets, global_pid, mask = mask) # Global parameter indices for all edges @@ -397,7 +400,7 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_group_start_ptr, target_c @torch.no_grad() -def sum_layer_forward_compilation(nodes, fw_group_max_chs, n_group_ids, n_id_in_group, num_ns_in_group, n_chs, +def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ns_in_partition, n_chs, global_nid_start, ch_prod_layer_size, param_ends, num_threads: int = 1, use_cuda: bool = True, legacy: bool = False): @@ -406,41 +409,44 @@ def sum_layer_forward_compilation(nodes, fw_group_max_chs, n_group_ids, n_id_in_ # Also use the legacy code if we compile with CPU if not use_cuda or legacy: + # TODO: restore CPU compilation return sum_layer_forward_compilation_legacy( - nodes, fw_group_max_chs, n_group_ids, n_id_in_group, num_ns_in_group, n_chs, + nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ns_in_partition, n_chs, global_nid_start, ch_prod_layer_size, param_ends, num_threads = num_threads, use_cuda = use_cuda ) - # We construct a flattened version of `nids` where the vectors of every group is concatenated + # We construct a flattened version of `nids` where the vectors of every partition is concatenated # into a single vector. `nids_group_start` is used to indicate the start index of every group's - # `nids`. That is, `target_nids[nids_group_start[gid]:nids_group_start[gid+1]] == nids[gid]` - nids_group_start = torch.zeros_like(num_ns_in_group) - nids_group_start[1:] = torch.cumsum(num_ns_in_group[:-1], dim = 0) - target_nids = torch.zeros([num_ns_in_group.sum()], dtype = torch.long).cuda() + # `nids`. That is, `target_nids[nids_partition_start[i]:nids_partition_start[i+1]] == nids[i]` + nids_partition_start = torch.zeros_like(num_ns_in_partition) + nids_partition_start[1:] = torch.cumsum(num_ns_in_partition[:-1], dim = 0) + target_nids = torch.zeros([num_ns_in_partition.sum()], dtype = torch.long).cuda() # Similarly, we flatten `cids`... 
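# NOTE (editor): a small standalone illustration of the flattened layout described
# above (all sizes made up). `target_nids` holds every partition's node ids back to
# back, and `nids_partition_start` records where each partition begins, so that
# `target_nids[nids_partition_start[i] : nids_partition_start[i] + num_ns_in_partition[i]]`
# recovers `nids[i]`, exactly as the "Restore `nids`" loop at the end of this function does.
import torch

num_ns_in_partition = torch.tensor([3, 5])                    # node groups per partition
nids_partition_start = torch.zeros_like(num_ns_in_partition)
nids_partition_start[1:] = torch.cumsum(num_ns_in_partition[:-1], dim = 0)   # [0, 3]

target_nids = torch.arange(num_ns_in_partition.sum().item())  # stand-in for compiled ids

nids = []
for partition_id in range(num_ns_in_partition.size(0)):
    sid = nids_partition_start[partition_id]
    eid = sid + num_ns_in_partition[partition_id]
    nids.append(target_nids[sid:eid])
# nids[0].numel() == 3, nids[1].numel() == 5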
- pcids_group_start = torch.zeros_like(num_ns_in_group) - pcids_group_start[1:] = torch.cumsum((num_ns_in_group * fw_group_max_chs)[:-1], dim = 0) - target_cids = torch.zeros([(num_ns_in_group * fw_group_max_chs).sum()], dtype = torch.long).cuda() + # Note: we call it `pcids...` because it is shared with `target_pids` + pcids_partition_start = torch.zeros_like(num_ns_in_partition) + pcids_partition_start[1:] = torch.cumsum((num_ns_in_partition * fw_partition_max_chs)[:-1], dim = 0) + target_cids = torch.zeros([(num_ns_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).cuda() # ...and `pids` - target_pids = torch.zeros([(num_ns_in_group * fw_group_max_chs).sum()], dtype = torch.long).cuda() + target_pids = torch.zeros([(num_ns_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).cuda() + # TODO: restore this when working on the backward pass # This tensor is to be filled with number of parents for every child node ch_n_pars = torch.zeros([ch_prod_layer_size], dtype = torch.int32).cuda() # Move necessary tensors to GPU - n_group_ids = n_group_ids.cuda() - n_id_in_group = n_id_in_group.cuda() - fw_group_max_chs = fw_group_max_chs.cuda() + n_partition_ids = n_partition_ids.cuda() + n_id_in_partition = n_id_in_partition.cuda() + fw_partition_max_chs = fw_partition_max_chs.cuda() all_ns_param_ids = dict() - original_param_nids = [] + original_param_nids = [] # `ns` with their original parameters (i.e., not tied) # This is the main loop: iterate over `ns` in the layer global_pid_start = param_ends[-1] - node_start = 0 # The start index of nodes in the current `ns` + ngroup_start = 0 # The start index of the node groups in the current `ns` for ns_idx, ns in enumerate(nodes): if ns.is_tied(): target_ns = ns.get_source_ns() @@ -461,17 +467,23 @@ def sum_layer_forward_compilation(nodes, fw_group_max_chs, n_group_ids, n_id_in_ # Global pid start index for `ns` ns_pid_start = target_ns._param_range[0] - # number of nodes - ns_num_nodes = ns.num_nodes + # number of node groups + ns_num_ngroups = ns.num_node_groups # Edge indices of size [2, ns_num_edges] - edge_ids = ns.edge_ids + # Here child ids of the edges are flattened out, i.e., every edge points to + # an actual "node" instead of a node group + edge_ids = ns.edge_ids.clone() + edge_ids = edge_ids[:,:,None].repeat(1, 1, ns.ch_group_size) + edge_ids[1,:,:] *= ns.ch_group_size + edge_ids[1,:,:] += torch.arange(0, ns.ch_group_size)[None,:] + edge_ids = edge_ids.reshape(2, ns.edge_ids.size(1) * ns.ch_group_size).contiguous() ns_num_edges = edge_ids.size(1) # Precompute the child offset ids for every edge. 
That is, the `?` # mark in `cids[group_id][local_id,?]` chs_offsets = np.zeros([ns_num_edges], dtype = np.int64) - ns_nchs = np.zeros([ns_num_nodes], dtype = np.int64) + ns_nchs = np.zeros([ns_num_ngroups], dtype = np.int64) _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids.numpy()) chs_offsets = torch.from_numpy(chs_offsets) @@ -488,7 +500,7 @@ def sum_layer_forward_compilation(nodes, fw_group_max_chs, n_group_ids, n_id_in_ # Cumulative nchs ns_nchs = torch.from_numpy(ns_nchs) - cum_n_chs = torch.zeros([ns_num_nodes], dtype = torch.long) + cum_n_chs = torch.zeros([ns_num_ngroups], dtype = torch.long) cum_n_chs[1:] = torch.cumsum(ns_nchs[:-1], dim = 0) if add_params_flag: @@ -498,64 +510,68 @@ def sum_layer_forward_compilation(nodes, fw_group_max_chs, n_group_ids, n_id_in_ # The following kernel assigns the corresponding indices to `nids`, `cids`, and `pids` # We first move necessary buffers to GPU - nids_group_start = nids_group_start.cuda() + nids_partition_start = nids_partition_start.cuda() edge_ids = edge_ids.cuda() chs_offsets = chs_offsets.cuda() cs_ele_id_start = cs_ele_id_start.cuda() cs_node_cum_ids = cs_node_cum_ids.cuda() cum_n_chs = cum_n_chs.cuda() - pcids_group_start = pcids_group_start.cuda() + pcids_partition_start = pcids_partition_start.cuda() # We store these constants in a tensor and retrieve them in the kernel # This is to avoid `triton` from compiling separate kernels for every layer configuration # Saves 99.9% compilation time :) - constexprs = torch.tensor([global_nid_start, ns_pid_start, node_start, ns_num_edges]).long().cuda() + constexprs = torch.tensor([global_nid_start, ns_pid_start, ngroup_start, ns_num_edges, ns.group_size]).long().cuda() + + num_chs_np2 = triton.next_power_of_2(ns.num_chs) # Make the grid and launch kernel grid = lambda meta: (triton.cdiv(ns_num_edges, meta["BLOCK_SIZE"]),) - num_chs_np2 = triton.next_power_of_2(ns.num_chs) _assign_target_ncpids_kernel[grid]( - target_nids, nids_group_start, target_cids, pcids_group_start, - target_pids, edge_ids, chs_offsets, n_group_ids, n_id_in_group, - cs_ele_id_start, cs_node_cum_ids, fw_group_max_chs, cum_n_chs, + target_nids, nids_partition_start, target_cids, pcids_partition_start, + target_pids, edge_ids, chs_offsets, n_partition_ids, n_id_in_partition, + cs_ele_id_start, cs_node_cum_ids, fw_partition_max_chs, cum_n_chs, ns_param_ids, ch_n_pars, constexprs, ns.num_chs, num_chs_np2, add_params_flag, BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) ) - node_start += ns_num_nodes + ngroup_start += ns_num_ngroups if add_params_flag: all_ns_param_ids[ns_idx] = ns_param_ids + # TODO: fix broken # Store local -> global parameter id mapping in `ns` for ns_idx, param_ids in all_ns_param_ids.items(): ns = nodes[ns_idx] ns._param_ids = param_ids.cpu() + # TODO: fix broken # Store global -> local parameter id mapping in `ns` for ns_idx in original_param_nids: ns = nodes[ns_idx] ns._param_range = (ns._param_ids.min().item(), ns._param_ids.max().item() + 1) ns._inverse_param_ids = torch.argsort(ns._param_ids) + # TODO: fix broken # Update `param_ends` npars = param_ends[-1] nid = 0 for ns_idx in original_param_nids: ns = nodes[ns_idx] - for i in range(ns.num_nodes): + for i in range(ns.num_node_groups): npars += n_chs[nid+i].item() param_ends.append(npars) - nid += ns.num_nodes + nid += ns.num_node_groups # Restore `nids` target_nids = target_nids.cpu() nids = [] - for group_id in range(num_ns_in_group.size(0)): - sid = nids_group_start[group_id] - eid = sid + num_ns_in_group[group_id] + for partition_id in 
range(num_ns_in_partition.size(0)): + sid = nids_partition_start[partition_id] + eid = sid + num_ns_in_partition[partition_id] nids.append(target_nids[sid:eid].contiguous()) # Restore `cids` and `pids` @@ -563,10 +579,10 @@ def sum_layer_forward_compilation(nodes, fw_group_max_chs, n_group_ids, n_id_in_ target_pids = target_pids.cpu() cids = [] pids = [] - for group_id in range(num_ns_in_group.size(0)): - sid = pcids_group_start[group_id] - gsize = num_ns_in_group[group_id] - gnchs = fw_group_max_chs[group_id] + for partition_id in range(num_ns_in_partition.size(0)): + sid = pcids_partition_start[partition_id] + gsize = num_ns_in_partition[partition_id] + gnchs = fw_partition_max_chs[partition_id] eid = sid + gsize * gnchs cids.append(target_cids[sid:eid].reshape(gsize, gnchs).contiguous()) pids.append(target_pids[sid:eid].reshape(gsize, gnchs).contiguous()) diff --git a/src/pyjuice/layer/layer.py b/src/pyjuice/layer/layer.py index 2940091c..fd77c103 100644 --- a/src/pyjuice/layer/layer.py +++ b/src/pyjuice/layer/layer.py @@ -12,6 +12,8 @@ def __init__(self, nodes: Sequence[CircuitNodes]) -> None: for i in range(1, len(nodes)): assert nodes[i].group_size == nodes[0].group_size, "`group_size` of nodes in the same layer must be identical." + self.group_size = nodes[0].group_size + self.device = torch.device("cpu") def init_layer(self, params: Union[torch.Tensor,None]): diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index c853f761..1d787007 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -6,6 +6,7 @@ import triton.language as tl import warnings import time +from packaging import version from typing import Sequence, Optional from pyjuice.nodes import ProdNodes @@ -222,9 +223,12 @@ def enable_partial_evaluation(self, fw_scopes: Optional[Sequence[BitSet]] = None @staticmethod @triton.jit - def _forward_backward_kernel(node_vals_ptr, element_vals_ptr, local_ids_ptr, nids_ptr, cids_ptr, tot_n_nodes, tot_n_eles, n_ngroups, - n_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, - group_size: tl.constexpr, accum: tl.constexpr, partial_eval: tl.constexpr): + def _forward_backward_kernel_3d(node_vals_ptr, element_vals_ptr, local_ids_ptr, nids_ptr, cids_ptr, tot_n_nodes, tot_n_eles, n_ngroups, + n_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, + group_size: tl.constexpr, accum: tl.constexpr, partial_eval: tl.constexpr): + """ + This kernel implements the function with 3d tensors. However, it only work with `triton==2.0.0`. + """ pid_m = tl.program_id(axis = 0) # ID of size-`BLOCK_M` nodes pid_b = tl.program_id(axis = 1) # ID of size-`BLOCK_B` batches @@ -309,6 +313,60 @@ def _forward_backward_kernel(node_vals_ptr, element_vals_ptr, local_ids_ptr, nid tl.store(node_vals_ptr + offs_nvals, nvals, mask = mask_batch[:,None]) + @staticmethod + @triton.jit + def _forward_backward_kernel_2d(node_vals_ptr, element_vals_ptr, local_ids_ptr, nids_ptr, cids_ptr, tot_n_nodes, tot_n_eles, n_ngroups, + n_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, + group_size: tl.constexpr, accum: tl.constexpr, partial_eval: tl.constexpr): + """ + This kernel implements the function with 2d tensors. It works for all `triton` versions. 
+ """ + + pid_m = tl.program_id(axis = 0) # ID of size-`BLOCK_M` nodes + pid_b = tl.program_id(axis = 1) # ID of size-`BLOCK_B` batches + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (group_size // BLOCK_M) + ntile_id = pid_m % (group_size // BLOCK_M) + + # For partial evaluation + if partial_eval == 1: + ngroup_id = tl.load(local_ids_ptr + ngroup_id) + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B # [BLOCK_B] + mask_batch = offs_batch < batch_size + + # Get the group start ids for the children + offs_edge = tl.arange(0, n_edges) + offs_egstart = tl.load(cids_ptr + ngroup_id * n_edges + offs_edge) # [n_edges] + + # Base ptr for ch values + evals_ptr = element_vals_ptr + \ + (offs_egstart[:,None] + ntile_id * BLOCK_M) * batch_size + \ + offs_batch[None,:] # [n_edges, BLOCK_B] + + # Base ptr for par values + ngroup_start = tl.load(nids_ptr + ngroup_id) + nvals_ptr = node_vals_ptr + \ + (ngroup_start + ntile_id * BLOCK_M) * batch_size + \ + offs_batch + + # Inner loop + for i in range(0, BLOCK_M): + evals = tl.load(evals_ptr, mask = mask_batch[None,:], other = 0) + nvals = tl.sum(evals, axis = 0) + + # Accumulate the `node_vals` if required + if accum == 1: + node_vals = tl.load(nvals_ptr, mask = mask_batch) + nvals += node_vals + + tl.store(nvals_ptr, nvals, mask = mask_batch) + + nvals_ptr += batch_size + evals_ptr += batch_size + @staticmethod @torch.compile(mode = "reduce-overhead", fullgraph = True) def _forward_backward_pytorch(node_vals, element_vals, nids, cids, accum: bool = False): @@ -341,28 +399,55 @@ def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, return None - BLOCK_B = min(1024 // n_edges, triton.next_power_of_2(batch_size)) - BLOCK_M = min(max(1024 // (BLOCK_B * n_edges), 1), triton.next_power_of_2(n_ngroups) * self.group_size) - - grid = (triton.cdiv(n_ngroups * self.group_size, BLOCK_M), triton.cdiv(batch_size, BLOCK_B)) - - self._forward_backward_kernel[grid]( - node_vals_ptr = node_vals, - element_vals_ptr = element_vals, - local_ids_ptr = local_ids, - nids_ptr = nids, - cids_ptr = cids, - tot_n_nodes = tot_n_nodes, - tot_n_eles = tot_n_eles, - n_ngroups = n_ngroups, - n_edges = n_edges, - batch_size = batch_size, - BLOCK_M = BLOCK_M, - BLOCK_B = BLOCK_B, - group_size = self.group_size, - accum = 1 if accum else 0, - partial_eval = 1 if local_ids is not None else 0 - ) + if version.parse(triton.__version__) > version.parse("2.0.0"): + + BLOCK_B = min(1024 // n_edges, triton.next_power_of_2(batch_size)) + BLOCK_M = min(max(1024 // (BLOCK_B * n_edges), 1), self.group_size) + + grid = (triton.cdiv(n_ngroups * self.group_size, BLOCK_M), triton.cdiv(batch_size, BLOCK_B)) + + self._forward_backward_kernel_2d[grid]( + node_vals_ptr = node_vals, + element_vals_ptr = element_vals, + local_ids_ptr = local_ids, + nids_ptr = nids, + cids_ptr = cids, + tot_n_nodes = tot_n_nodes, + tot_n_eles = tot_n_eles, + n_ngroups = n_ngroups, + n_edges = n_edges, + batch_size = batch_size, + BLOCK_M = BLOCK_M, + BLOCK_B = BLOCK_B, + group_size = self.group_size, + accum = 1 if accum else 0, + partial_eval = 1 if local_ids is not None else 0 + ) + + else: + + BLOCK_B = min(1024 // n_edges, triton.next_power_of_2(batch_size)) + BLOCK_M = min(max(1024 // (BLOCK_B * n_edges), 1), triton.next_power_of_2(n_ngroups) * self.group_size) + + grid = (triton.cdiv(n_ngroups * self.group_size, BLOCK_M), triton.cdiv(batch_size, BLOCK_B)) + + self._forward_backward_kernel_3d[grid]( + node_vals_ptr = node_vals, + 
element_vals_ptr = element_vals, + local_ids_ptr = local_ids, + nids_ptr = nids, + cids_ptr = cids, + tot_n_nodes = tot_n_nodes, + tot_n_eles = tot_n_eles, + n_ngroups = n_ngroups, + n_edges = n_edges, + batch_size = batch_size, + BLOCK_M = BLOCK_M, + BLOCK_B = BLOCK_B, + group_size = self.group_size, + accum = 1 if accum else 0, + partial_eval = 1 if local_ids is not None else 0 + ) return None diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 6aea6c07..11d60d46 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -21,6 +21,82 @@ class SumLayer(Layer, nn.Module): def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, + param_ends: Sequence, tied_param_ids: Sequence, + tied_param_group_ids: Sequence, tied_param_ends: Sequence, + ch_prod_layer_size: int, layer_sparsity_tol: Optional[float] = None, + max_num_partitions: Optional[int] = None, + disable_gpu_compilation: bool = False) -> None: + + Layer.__init__(self, nodes) + nn.Module.__init__(self) + + assert len(nodes) > 0, "No input node." + + self.nodes = nodes + self.ch_prod_layer_size = ch_prod_layer_size + + ## Get layer statistics & prepare for compilation ## + + # n_chs: [num_node_groups] stores the number of child nodes of each node + # Note: to allow different nodes to have different `ch_group_size`s, we record the number of + # child **nodes** (instead of # node groups) in `n_chs` + layer_num_ngroups, layer_num_edges, n_chs = get_sum_layer_stats(self.nodes, global_nid_start) + + self.num_nodes = layer_num_ngroups * self.group_size # Total number of nodes + self.num_edges = layer_num_edges # Total number of edges + + # Find a good strategy to partition the node groups according to their number of children + # to minimize total computation cost + fw_partition_max_chs = partition_nodes_by_n_edges( + n_chs, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions + ) + + # Since the triton kernels require the maximum number children for each group to be a power of 2, + # we postprocess the partition sizes + fw_partition_max_chs = torch.unique(next_power_of_2(fw_partition_max_chs)) + + self.num_fw_partitions = len(fw_partition_max_chs) # Number of groups + + # fw_n_partition_ids: [num_ngroups] stores the partition id for each node node + # fw_n_id_in_partition: [num_ngroups] stores the index of the node groups in the partition + # fw_num_ns_in_partition: [num_fw_partitions] number of node groups in each partition + fw_n_partition_ids = torch.zeros([layer_num_ngroups], dtype = torch.long) + fw_n_id_in_partition = torch.zeros([layer_num_ngroups], dtype = torch.long) + fw_num_ns_in_partition = torch.zeros([self.num_fw_partitions], dtype = torch.long) + + min_n_chs = 0 + for partition_id, max_n_chs in enumerate(fw_partition_max_chs): + criterion = (n_chs >= min_n_chs) & (n_chs <= max_n_chs) + partition_size = criterion.sum().item() + + fw_n_partition_ids[criterion] = partition_id + fw_n_id_in_partition[criterion] = torch.arange(partition_size) + fw_num_ns_in_partition[partition_id] = partition_size + + min_n_chs = max_n_chs + 1 + + ## Initialize forward pass ## + + # nids: List[[partition_size]] stores node group ids + # cids: List[[partition_size, partition_max_n_chs]] stores indices of child node groups + # pids: List[[partition_size, partition_max_n_chs]] stores indices of edge parameters (1st parameter of every group) + # ch_n_pars: [ch_prod_layer_size] stores the number of parents for each child node + nids, cids, pids, ch_n_pars, 
param_ends = sum_layer_forward_compilation( + self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, fw_num_ns_in_partition, + n_chs, global_nid_start, ch_prod_layer_size, param_ends = param_ends, + # GPU compilation is slightly slower for small layer due to the kernel jit compilation time + use_cuda = True # not disable_gpu_compilation and (self.num_edges > 1000) + ) + + # Store buffers for the forward pass + self.partitioned_nids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in nids]) + self.partitioned_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in cids]) + self.partitioned_pids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in pids]) + + # Store pre-compiled indices from `cids` and `pids` in the following buffer + self._cached_fw_pcids = dict() + + def __init__old(self, nodes: Sequence[SumNodes], global_nid_start: int, param_ends: Sequence, tied_param_ids: Sequence, tied_param_group_ids: Sequence, tied_param_ends: Sequence, ch_prod_layer_size: int, layer_sparsity_tol: float = 0.0, @@ -146,6 +222,14 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # This is used to implement partial evaluation self.global_nid_range = (global_nid_start, global_nid_start + self.num_nodes) + def to(self, device): + super(SumLayer, self).to(device) + + # Move cached fw pcids to the new device + for k, v in self._cached_fw_pcids.items(): + new_v = [tensor.to(device) for tensor in v] + self._cached_fw_compiled_pcids[k] = new_v + def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor) -> None: """ Computes the forward pass of a sum layer: @@ -162,27 +246,29 @@ def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: t `params`: [num_params, B] or [num_params] """ - if not self.provided("fw_group_local_ids"): + if not self.provided("fw_partition_local_ids"): # Evaluate the whole layer - for group_id in range(self.num_fw_groups): - nids = self.grouped_nids[group_id] - cids = self.grouped_cids[group_id] - pids = self.grouped_pids[group_id] + for partition_id in range(self.num_fw_partitions): + nids = self.partitioned_nids[partition_id] + cids = self.partitioned_cids[partition_id] + pids = self.partitioned_pids[partition_id] self._forward( - node_mars, element_mars, params, nids, cids, pids + node_mars, element_mars, params, nids, cids, pids, partition_id = partition_id ) else: # Partial evaluation - for group_id in range(self.num_fw_groups): - nids = self.grouped_nids[group_id] - cids = self.grouped_cids[group_id] - pids = self.grouped_pids[group_id] - local_ids = self.fw_group_local_ids[group_id] + for partition_id in range(self.num_fw_partitions): + nids = self.partitioned_nids[partition_id] + cids = self.partitioned_cids[partition_id] + pids = self.partitioned_pids[partition_id] + local_ids = self.fw_partition_local_ids[partition_id] self._forward( - node_mars, element_mars, params, nids, cids, pids, local_ids = local_ids + node_mars, element_mars, params, + nids, cids, pids, local_ids = local_ids, + partition_id = partition_id ) return None @@ -240,7 +326,7 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, @staticmethod @triton.jit - def _forward_triton_kernel(node_mars_ptr, element_mars_ptr, params_ptr, + def _forward_triton_kernel_old(node_mars_ptr, element_mars_ptr, params_ptr, nids_ptr, cids_ptr, pids_ptr, tot_n_nodes, tot_n_eles, n_nodes, n_edges: tl.constexpr, batch_size, 
n_nodes_per_block_m: tl.constexpr, @@ -324,91 +410,159 @@ def _forward_pytorch_kernel(node_mars: torch.Tensor, element_mars: torch.Tensor, return None - def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, + @staticmethod + @triton.jit + def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, + pids_start, pids_increment, local_ids, batch_size, partial_eval: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + ngroup_id = tl.load(local_ids + ngroup_id) + + # Initialize pointers to `params` + offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_edge = tl.arange(0, TILE_SIZE_K) + par_start = tl.load(pids_start + ngroup_id * TILE_SIZE_K + offs_edge) + epars_ptr = params + \ + offs_node[:,None] + \ + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize pointers to `element_mars` + edge_start = tl.load(cids_start + ngroup_id * TILE_SIZE_K + offs_edge) + emars_ptr = element_mars + \ + edge_start[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + # Batch increment pointers + pids_inc_ptr = pids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge + cids_inc_ptr = cids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - float("inf") + + for k in range(0, K_NUM_TILES): + epars = tl.load(epars_ptr) + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) + + emars_max = tl.max(emars, axis = 0)[None,:] + emars = tl.exp(emars - emars_max) + epars = epars.to(tl.float16) + emars = emars.to(tl.float16) + nmars = tl.dot(epars, emars).to(tl.float32) + + acc = tl.where(emars_max > acc, + tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, + tl.log(tl.exp(emars_max - acc) * nmars + 1.0) + acc + ) + + # Increment `epars_ptr` + pids_inc = tl.load(pids_inc_ptr) + epars_ptr += pids_inc[None,:] + pids_inc += TILE_SIZE_K + + # Increment `emars_ptr` + cids_inc = tl.load(cids_inc_ptr) + emars_ptr += cids_inc[:,None] * batch_size + cids_inc += TILE_SIZE_K + + # Write back + off_nids = tl.load(nids + ngroup_id) + offs_nmars = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + tl.store(node_mars + offs_nmars, acc, mask = mask_batch[None,:]) + + def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, - pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, - BLOCK_M_HARD_LIMIT = 2**16, BLOCK_SIZE = 2**12, - MAX_BLOCK_M = 2**12, MAX_BLOCK_N = 64) -> None: + pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, + partition_id: int = -1) -> None: """ - This function is equivalent to running: - ``` - ch_mars = element_mars[cids] - maxval = ch_mars.max(dim = 1, keepdim = True).values - node_mars[nids] = (((ch_mars - maxval).exp() * params[pids]).sum( - dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) - ``` + Forward pass of 
sum layers. Parameters: `node_mars`: [N, B] `element_mars`: [M, B] `params`: [E] - `nids`: [n] - `cids`: [n, c] - `pids`: [n, c] + `nids`: [ng] + `cids`: [ng, c] + `pids`: [ng, c] """ - if local_ids is not None and local_ids.size(0) == 0: - # Nothing need to be evaluated in the current group - return None - elif local_ids is not None: - # Select nodes - nids = nids[local_ids].contiguous() - cids = cids[local_ids,:].contiguous() - pids = pids[local_ids,:].contiguous() + assert params.dim() == 1, "Expecting a 1D `params`." - tot_n_nodes = node_mars.size(0) - tot_n_eles = element_mars.size(0) - n_nodes = nids.size(0) - n_edges = cids.size(1) + num_ngroups = nids.size(0) if local_ids is None else local_ids.size(0) + layer_n_nodes = num_ngroups * self.group_size + num_edges = cids.size(1) batch_size = node_mars.size(1) - if params.dim() == 2 and params.size(1) == 1: - params = params.squeeze(1) - - # Fall back to the `torch.compile` kernel in the case where we cannot store child edges within a single block - if n_edges > BLOCK_M_HARD_LIMIT or not node_mars.is_cuda: - self._forward_pytorch_kernel(node_mars, element_mars, params, nids, cids, pids) - - return None - - assert n_edges <= BLOCK_M_HARD_LIMIT, f"Number of edges should be smaller than or equal to {BLOCK_M_HARD_LIMIT}." - assert params.dim() == 1, "Expecting a 1D `params`." - - if n_edges <= MAX_BLOCK_M: - # In this case, we can find a better thread-block balance - MIN_BLOCK_M = min(triton.next_power_of_2(n_edges), MAX_BLOCK_M) - BLOCK_N = min(BLOCK_SIZE // MIN_BLOCK_M, MAX_BLOCK_N, triton.next_power_of_2(batch_size)) - BLOCK_M = min(BLOCK_SIZE // BLOCK_N, MAX_BLOCK_M) + # Heuristic to set `TILE_SIZE_M`, `TILE_SIZE_K`, and `BLOCK_B` + base_size = min(self.group_size, num_edges, batch_size, 128) + if base_size >= 64: + TILE_SIZE_K = base_size + TILE_SIZE_M = 2048 // base_size + BLOCK_B = 2048 // base_size else: - # Try to fit all edges of a node in a single thread-block - BLOCK_M = triton.next_power_of_2(n_edges) - BLOCK_N = max(BLOCK_SIZE // BLOCK_M, 1) - - # import numpy as np - # np.savez("temp.npz", node_mars = node_mars.cpu().numpy(), element_mars = element_mars.cpu().numpy(), params = params.cpu().numpy(), - # nids = nids.cpu().numpy(), cids = cids.cpu().numpy(), pids = pids.cpu().numpy(), tot_n_nodes = tot_n_nodes, tot_n_eles = tot_n_eles, n_nodes = n_nodes, - # n_edges = n_edges, batch_size = batch_size, BLOCK_M = BLOCK_M, BLOCK_N = BLOCK_N) - # import pdb; pdb.set_trace() - - grid = (triton.cdiv(n_nodes * n_edges, BLOCK_M), triton.cdiv(batch_size, BLOCK_N), 1) + remainder = 2048 // (base_size ** 2) + + TILE_SIZE_K = min(2048 // remainder, base_size * remainder, num_edges) + TILE_SIZE_M = min(2048 // TILE_SIZE_K, self.group_size) + BLOCK_B = min(2048 // TILE_SIZE_K, batch_size) + K_NUM_TILES = num_edges // TILE_SIZE_K + + signature = (partition_id, TILE_SIZE_K) + if signature not in self._cached_fw_pcids: + # Pre-compute pointer increments for `cids` and `pids` + + cids = cids.clone().reshape(num_ngroups, K_NUM_TILES, TILE_SIZE_K) + cids_start = cids[:,0,:].contiguous() + cids_increment = torch.cat( + (cids[:,1:,:] - cids[:,:-1,:], cids[:,0:1,:] * 0), + dim = 1 + ).contiguous() + + pids = pids.clone().reshape(num_ngroups, K_NUM_TILES, TILE_SIZE_K) + pids_start = pids[:,0,:].contiguous() + pids_increment = torch.cat( + (pids[:,1:,:] - pids[:,:-1,:], pids[:,0:1,:] * 0), + dim = 1 + ).contiguous() + + self._cached_fw_pcids[signature] = [cids_start, cids_increment, pids_start, pids_increment] + else: + cids_start, cids_increment, 
pids_start, pids_increment = self._cached_fw_pcids[signature] - self._forward_triton_kernel[grid]( - node_mars_ptr = node_mars, - element_mars_ptr = element_mars, - params_ptr = params, - nids_ptr = nids, - cids_ptr = cids, - pids_ptr = pids, - tot_n_nodes = tot_n_nodes, - tot_n_eles = tot_n_eles, - n_nodes = n_nodes, - n_edges = n_edges, - batch_size = batch_size, - n_nodes_per_block_m = BLOCK_M // n_edges, - BLOCK_M = BLOCK_M, - BLOCK_N = BLOCK_N + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + + self._fw_triton_block_sparse_kernel[grid]( + node_mars, + element_mars, + params, + nids, + cids_start, + cids_increment, + pids_start, + pids_increment, + local_ids, + batch_size, + partial_eval = 1 if local_ids is not None else 0, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = self.group_size ) - + return None @staticmethod diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py new file mode 100644 index 00000000..a619b67a --- /dev/null +++ b/tests/layer/sum_layer_test.py @@ -0,0 +1,143 @@ +import pyjuice as juice +import torch +import numpy as np +import time +import random + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer, ProdLayer, SumLayer + +import pytest + + +def sum_layer_test(): + + device = torch.device("cuda:0") + + group_size = 16 + batch_size = 16 + + with juice.set_group_size(group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + + np0 = multiply(ni0, ni1) + np1 = multiply(ni2, ni3) + np2 = multiply(ni1, ni2) + + ns0 = summate(np0, num_node_groups = 2) + ns1 = summate(np1, num_node_groups = 2) + ns2 = summate(np2, num_node_groups = 2) + + input_layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = group_size) + + prod_layer = ProdLayer([np0, np1, np2]) + + layer = SumLayer([ns0, ns1, ns2], global_nid_start = group_size, + param_ends = [1], tied_param_ids = [], + tied_param_group_ids = [], tied_param_ends = [], + ch_prod_layer_size = prod_layer.num_nodes + group_size) + + assert torch.all(layer.partitioned_nids[0] == torch.arange(group_size, 7 * group_size, group_size)) + assert torch.all(layer.partitioned_cids[0][0:2,0] == group_size) + assert torch.all(layer.partitioned_cids[0][2:4,0] == 3 * group_size) + assert torch.all(layer.partitioned_cids[0][4:6,0] == 5 * group_size) + assert torch.all(layer.partitioned_cids[0][0:2,1] == group_size + 1) + assert torch.all(layer.partitioned_cids[0][2:4,1] == 3 * group_size + 1) + assert torch.all(layer.partitioned_cids[0][4:6,1] == 5 * group_size + 1) + assert torch.all(layer.partitioned_pids[0][:,0] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) - group_size + 1) + assert torch.all(layer.partitioned_pids[0][:,1] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) + 1) + + layer.to(device) + + ## Forward tests ## + + element_mars = torch.rand([group_size + 3 * 2 * 2 * group_size, batch_size]).log().to(device) + element_mars[:group_size,:] = -float("inf") + node_mars = 
torch.zeros([group_size + group_size * 2 * 3, batch_size]).to(device) + + params = torch.rand([1 + 3 * 4 * group_size * group_size]).to(device) + + layer(node_mars, element_mars, params) + + for i in range(group_size): + for j in range(6): + cmars = element_mars[layer.partitioned_cids[0][j,:]].exp() + epars = params[layer.partitioned_pids[0][j,:]+i] + assert torch.all(torch.abs(node_mars[(j+1)*group_size+i,:] - (epars[:,None] * cmars).sum(dim = 0).log()) < 1e-3) + + +def speed_test(): + + device = torch.device("cuda:0") + + group_size = 32 + num_vars = 28*28 + num_node_groups = 256 // group_size + num_prod_nodes = 200 + + batch_size = 512 + + with juice.set_group_size(group_size): + + nis = [] + for v in range(num_vars): + nis.append(inputs(v, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 64))) + + nps = [] + for i in range(num_prod_nodes): + v1 = random.randint(0, num_vars - 1) + v2 = random.randint(0, num_vars - 1) + if v1 == v2: + if v1 == num_vars - 1: + v1 -= 2 + v2 = v1 + 1 + + nps.append(multiply(nis[v1], nis[v2])) + + nodes = [summate(np, num_node_groups = num_node_groups) for np in nps] + + input_layer = InputLayer(nis, cum_nodes = group_size) + + prod_layer = ProdLayer(nps, layer_sparsity_tol = 0.1) + + layer = SumLayer(nodes, global_nid_start = group_size, + param_ends = [1], tied_param_ids = [], + tied_param_group_ids = [], tied_param_ends = [], + ch_prod_layer_size = prod_layer.num_nodes + group_size) + + # import pdb; pdb.set_trace() + + layer.to(device) + + node_mars = torch.zeros([group_size + group_size * num_node_groups * num_prod_nodes, batch_size]).to(device) + element_mars = torch.rand([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) + params = torch.rand([layer.partitioned_pids[0].max() + group_size]).to(device) + + ## Forward tests ## + + layer(node_mars, element_mars, params) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + layer(node_mars, element_mars, params) + torch.cuda.synchronize() + t1 = time.time() + forward_ms = (t1 - t0) / 100 * 1000 + + print(f"Forward pass on average takes {forward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 11.255ms.") + print("--------------------------------------------------------------") + + +if __name__ == "__main__": + # sum_layer_test() + speed_test() \ No newline at end of file From de474633f1e467f239c5197fd0b6822dae571582 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 5 Dec 2023 03:02:34 +0800 Subject: [PATCH 023/162] add "sparse" version of `SumLayer` fw kernel --- src/pyjuice/layer/sum_layer.py | 167 +++++++++++++++++++++++++++++++-- 1 file changed, 159 insertions(+), 8 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 11d60d46..22d1b64b 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -410,6 +410,52 @@ def _forward_pytorch_kernel(node_mars: torch.Tensor, element_mars: torch.Tensor, return None + def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, + params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, + pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, + partition_id: int = -1, mode: Optional[str] = None) -> None: + """ + Forward pass of sum layers. 
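+        When `mode` is None, a heuristic picks the kernel: the block-sparse path is chosen if `params` is 1D and
+        `group_size`, `num_edges`, and `batch_size` are all at least 16; otherwise the sparse path is used.
+        Reference semantics of both paths (a dense PyTorch sketch, with `g` a node group index and `m` a node
+        offset within the group; this mirrors the correctness check in `tests/layer/sum_layer_test.py`):
+            ch_mars = element_mars[cids[g]]        # [num_edges, B]
+            ch_pars = params[pids[g] + m]          # [num_edges]
+            node_mars[nids[g] + m] = (ch_pars[:, None] * ch_mars.exp()).sum(dim = 0).log()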
+ + Parameters: + `node_mars`: [N, B] + `element_mars`: [M, B] + `params`: [E] + `nids`: [ng] + `cids`: [ng, c] + `pids`: [ng, c] + """ + + num_edges = cids.size(1) + batch_size = node_mars.size(1) + + if mode is not None: + assert mode in ["block_sparse", "sparse"] + + elif params.dim() == 1 and self.group_size >= 16 and num_edges >= 16 and batch_size >= 16: + # In this case, we should definitely use the block-sparse implementation + mode = "block_sparse" + elif self.group_size * num_edges < 16 and num_edges * batch_size < 16: + # In this case, we should definitely use the sparse implementation + mode = "sparse" + else: + mode = "sparse" + + if mode == "block_sparse": + self._forward_block_sparse( + node_mars, element_mars, params, nids, cids, pids, local_ids, + partition_id = partition_id + ) + + elif mode == "sparse": + self._forward_sparse( + node_mars, element_mars, params, nids, cids, pids, local_ids, + partition_id = partition_id + ) + + else: + raise ValueError(f"Unexpected mode `{mode}`.") + @staticmethod @triton.jit def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, @@ -483,12 +529,12 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s offs_nmars = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] tl.store(node_mars + offs_nmars, acc, mask = mask_batch[None,:]) - def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, - params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, - pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1) -> None: + def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, + params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, + pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, + partition_id: int = -1) -> None: """ - Forward pass of sum layers. + Forward pass of sum layers with the block-sparse processing kernel. 
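+        Each program computes a [TILE_SIZE_M, BLOCK_B] tile of `node_mars` for one node group: it loops K_NUM_TILES
+        times over TILE_SIZE_K-wide chunks of child edges, advancing its `element_mars` and `params` pointers with
+        the pre-computed `cids_increment` / `pids_increment` tables, and accumulates the per-chunk sums in log
+        space for numerical stability.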
Parameters: `node_mars`: [N, B] @@ -505,9 +551,10 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, layer_n_nodes = num_ngroups * self.group_size num_edges = cids.size(1) batch_size = node_mars.size(1) + BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) # Heuristic to set `TILE_SIZE_M`, `TILE_SIZE_K`, and `BLOCK_B` - base_size = min(self.group_size, num_edges, batch_size, 128) + base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 128) if base_size >= 64: TILE_SIZE_K = base_size TILE_SIZE_M = 2048 // base_size @@ -517,10 +564,10 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, TILE_SIZE_K = min(2048 // remainder, base_size * remainder, num_edges) TILE_SIZE_M = min(2048 // TILE_SIZE_K, self.group_size) - BLOCK_B = min(2048 // TILE_SIZE_K, batch_size) + BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) K_NUM_TILES = num_edges // TILE_SIZE_K - signature = (partition_id, TILE_SIZE_K) + signature = ("block_sparse", partition_id, TILE_SIZE_K) if signature not in self._cached_fw_pcids: # Pre-compute pointer increments for `cids` and `pids` @@ -565,6 +612,110 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, return None + @staticmethod + @triton.jit + def _fw_triton_sparse_kernel(node_mars, element_mars, params, nids, cids, pids, + local_ids, batch_size, partial_eval: tl.constexpr, n_edges: tl.constexpr, + BLOCK_B: tl.constexpr, BLOCK_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + + pid_b = tl.program_id(axis = 0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(axis = 1) # ID of size-`BLOCK_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // BLOCK_M) + tile_id = pid_m % (GROUP_SIZE_M // BLOCK_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + ngroup_id = tl.load(local_ids + ngroup_id) + + # Initialize pointers to `params` + offs_edge = tl.arange(0, n_edges) + par_start = tl.load(pids + ngroup_id * n_edges + offs_edge) + epars_ptr = params + tile_id * BLOCK_M + par_start # [n_edges] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize and load edge mars + edge_ids = tl.load(cids + ngroup_id * n_edges + offs_edge) + emars_ptr = element_mars + \ + edge_ids[:,None] * batch_size + \ + offs_batch[None,:] + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [n_edges, BLOCK_B] + + # Compute max and subtract + emars_max = tl.max(emars, axis = 0) + emars = tl.exp(emars - emars_max[None,:]) + + # Initialize pointers to `node_mars` + off_nids = tl.load(nids + ngroup_id) + nmars_ptr = node_mars + \ + (off_nids + tile_id * BLOCK_M) * batch_size + \ + offs_batch + + # Inner loop + for i in range(0, BLOCK_M): + epars = tl.load(epars_ptr) + + nmars = tl.log(tl.sum(emars * epars[:,None], axis = 0)) + emars_max + + tl.store(nmars_ptr, nmars, mask = mask_batch) + + # Increment `epars_ptr` + epars_ptr += 1 + + # Increment `nmars_ptr` + nmars_ptr += batch_size + + def _forward_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, + params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, + pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, + partition_id: int = -1) -> None: + """ + Forward pass of sum layers with the sparse processing kernel. 
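+        Unlike the block-sparse kernel, each program here loads all `n_edges` child values of a node group for a
+        BLOCK_B-wide batch slice at once (hence the `n_edges <= 16384` limit), then iterates over the
+        BLOCK_M (= `group_size`) nodes of the group one by one, reusing the shared per-batch child maximum.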
+ + Parameters: + `node_mars`: [N, B] + `element_mars`: [M, B] + `params`: [E] + `nids`: [ng] + `cids`: [ng, c] + `pids`: [ng, c] + """ + + num_ngroups = nids.size(0) if local_ids is None else local_ids.size(0) + layer_n_nodes = num_ngroups * self.group_size + n_edges = cids.size(1) + batch_size = node_mars.size(1) + BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) + + assert n_edges <= 16384 + + BLOCK_B = max(min(2048 // n_edges, BATCH_SIZE_NP2), 1) + BLOCK_M = self.group_size + + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, BLOCK_M)) + + self._fw_triton_sparse_kernel[grid]( + node_mars = node_mars, + element_mars = element_mars, + params = params, + nids = nids, + cids = cids, + pids = pids, + local_ids = local_ids, + batch_size = batch_size, + partial_eval = 1 if local_ids is not None else 0, + n_edges = n_edges, + BLOCK_B = BLOCK_B, + BLOCK_M = BLOCK_M, + GROUP_SIZE_M = self.group_size + ) + + return None + @staticmethod @triton.jit def _backward_kernel(node_flows_ptr, element_flows_ptr, params_ptr, From 190245d3c96afc81e06edeee97df0bb6c709a257 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 5 Dec 2023 06:19:19 +0800 Subject: [PATCH 024/162] prototying flow kernels --- src/pyjuice/layer/sum_layer.py | 11 ++ tests/layer/sum_layer_test.py | 264 +++++++++++++++++++++++++++++++++ 2 files changed, 275 insertions(+) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 22d1b64b..75fb349f 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -96,6 +96,17 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # Store pre-compiled indices from `cids` and `pids` in the following buffer self._cached_fw_pcids = dict() + ## Initialize backward pass ## + + # import pdb; pdb.set_trace() + + # # Find a good strategy to partition the child nodes into groups according to their number of parents + # # to minimize total computation cost + # ch_n_pars = ch_n_pars[1:] # Strip away the dummy node. 
We will never use it in the following + # bk_group_max_pars = partition_nodes_by_n_edges( + # ch_n_pars, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions + # ) + def __init__old(self, nodes: Sequence[SumNodes], global_nid_start: int, param_ends: Sequence, tied_param_ids: Sequence, tied_param_group_ids: Sequence, tied_param_ends: Sequence, diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index a619b67a..7ef2237d 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -13,6 +13,160 @@ import pytest +import triton +import triton.language as tl + + +@triton.jit +def _bk_triton_block_sparse_kernel(node_flows, element_flows, node_mars, element_mars, params, nids, cids_start, cids_increment, + pids_start, pids_increment, local_ids, batch_size, partial_eval: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + ngroup_id = tl.load(local_ids + ngroup_id) + + # Initialize pointers to `params` + offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_edge = tl.arange(0, TILE_SIZE_K) + par_start = tl.load(pids_start + ngroup_id * TILE_SIZE_K + offs_edge) + epars_ptr = params + \ + offs_node[:,None] + \ + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize pointers to `element_mars` + edge_start = tl.load(cids_start + ngroup_id * TILE_SIZE_K + offs_edge) + emars_ptr = element_mars + \ + edge_start[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + eflows_ptr = element_mars + \ + edge_start[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + # Initialize pointers to `node_flows` + off_nids = tl.load(nids + ngroup_id) + offs_nmfs = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + nmars = tl.load(node_mars + offs_nmfs, mask = mask_batch[None,:]) # [TILE_SIZE_M, BLOCK_B] + nflows = tl.load(node_flows + offs_nmfs, mask = mask_batch[None,:]) + + nmars_max = tl.max(nmars, axis = 0) + nflows_div_mars = nflows / tl.exp(nmars - nmars_max[None,:]) + nflows_div_mars = nflows_div_mars.to(tl.float16) + + # Batch increment pointers + pids_inc_ptr = pids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge + cids_inc_ptr = cids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) + + for k in range(0, K_NUM_TILES): + epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + epars = epars.to(tl.float16) + aaa = tl.dot(tl.trans(epars), nflows_div_mars).to(tl.float32) + bbb = aaa * (emars - nmars_max[None,:]) + + tl.atomic_add(eflows_ptr, bbb, mask = mask_batch[None,:]) + # acc += bbb + + # Increment `epars_ptr` + pids_inc = tl.load(pids_inc_ptr) + epars_ptr += pids_inc[None,:] + pids_inc += TILE_SIZE_K + + # Increment `emars_ptr` + cids_inc = tl.load(cids_inc_ptr) + emars_ptr += cids_inc[:,None] 
* batch_size + eflows_ptr += cids_inc[:,None] * batch_size + cids_inc += TILE_SIZE_K + + +@triton.jit +def _bkp_triton_block_sparse_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, + local_ids, batch_size, n_edges: tl.constexpr, partial_eval: tl.constexpr, + TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, TILE_SIZE_M: tl.constexpr, + GROUP_SIZE_M: tl.constexpr): + + pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + ngroup_id = tl.load(local_ids + ngroup_id) + + # Batch offsets and mask + offs_batch = tl.arange(0, TILE_SIZE_B) + mask_batch = offs_batch < batch_size + + # Initialize pointers to `element_mars` + offs_edge = tl.arange(0, TILE_SIZE_K) + pid_k * TILE_SIZE_K + edge_start = tl.load(cids + ngroup_id * n_edges + offs_edge) + emars_ptr = element_mars + \ + edge_start[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, TILE_SIZE_B] + + # Initialize pointers to `node_flows` and `node_mars` + offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + off_nids = tl.load(nids + ngroup_id) + offs_nmfs = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + nmars_ptr = node_mars + offs_nmfs + nflows_ptr = node_flows + offs_nmfs + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) + + for b in range(0, B_NUM_TILES): + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, BLOCK_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, BLOCK_B] + + nmars_max = tl.max(nmars, axis = 0) + nflows_div_mars = nflows / tl.exp(nmars - nmars_max[None,:]) + nflows_div_mars = nflows_div_mars.to(tl.float16) + + emars = tl.exp(emars - nmars_max[None,:]) + emars = emars.to(tl.float16) + + pflows = tl.dot(nflows_div_mars, tl.trans(emars)).to(tl.float32) + + acc += pflows + + # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` + emars_ptr += TILE_SIZE_B + nmars_ptr += TILE_SIZE_B + nflows_ptr += TILE_SIZE_B + + # Update batch mask + offs_batch += TILE_SIZE_B + mask_batch = offs_batch < batch_size + + par_start = tl.load(pids + ngroup_id * n_edges + offs_edge) + epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + epars = tl.load(params + epars_offsets) + pflows = acc * epars + + tl.store(param_flows + epars_offsets, pflows) + def sum_layer_test(): @@ -137,6 +291,116 @@ def speed_test(): print("Reference computation time on RTX 4090: 11.255ms.") print("--------------------------------------------------------------") + node_flows = torch.rand([group_size + group_size * num_node_groups * num_prod_nodes, batch_size]).to(device) + element_flows = torch.zeros([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) + + # import pdb; pdb.set_trace() + + nids = layer.partitioned_nids[0] + cids_start, cids_increment, pids_start, pids_increment = layer._cached_fw_pcids[("block_sparse", 0, 64)] + + BLOCK_B = 128 + TILE_SIZE_K = 64 + K_NUM_TILES = layer.partitioned_cids[0].size(1) // TILE_SIZE_K + TILE_SIZE_M = 32 + + layer_n_nodes = nids.size(0) * layer.group_size + + grid = (triton.cdiv(batch_size, BLOCK_B), 
triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + + _bk_triton_block_sparse_kernel[grid]( + node_flows, + element_flows, + node_mars, + element_mars, + params, + nids, + cids_start, + cids_increment, + pids_start, + pids_increment, + local_ids = None, + batch_size = batch_size, + partial_eval = 0, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = layer.group_size + ) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + _bk_triton_block_sparse_kernel[grid]( + node_flows, + element_flows, + node_mars, + element_mars, + params, + nids, + cids_start, + cids_increment, + pids_start, + pids_increment, + local_ids = None, + batch_size = batch_size, + partial_eval = 0, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = layer.group_size + ) + torch.cuda.synchronize() + t1 = time.time() + backward_ms = (t1 - t0) / 100 * 1000 + + print(f"bkbk: {backward_ms:.3f}ms.") + + nids = layer.partitioned_nids[0] + cids = layer.partitioned_cids[0] + pids = layer.partitioned_pids[0] + + param_flows = params.clone() * 0.0 + + TILE_SIZE_B = 64 + TILE_SIZE_K = 64 + B_NUM_TILES = triton.cdiv(batch_size, TILE_SIZE_B) + TILE_SIZE_M = 32 + + n_edges = cids.size(1) + + layer_n_nodes = nids.size(0) * layer.group_size + + grid = (triton.cdiv(n_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + + _bkp_triton_block_sparse_kernel[grid]( + node_flows, node_mars, element_mars, params, + param_flows, nids, cids, pids, local_ids = None, + batch_size = batch_size, n_edges = n_edges, partial_eval = 0, + TILE_SIZE_B = TILE_SIZE_B, B_NUM_TILES = B_NUM_TILES, + TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = layer.group_size + ) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + _bkp_triton_block_sparse_kernel[grid]( + node_flows, node_mars, element_mars, params, + param_flows, nids, cids, pids, local_ids = None, + batch_size = batch_size, n_edges = n_edges, partial_eval = 0, + TILE_SIZE_B = TILE_SIZE_B, B_NUM_TILES = B_NUM_TILES, + TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = layer.group_size + ) + torch.cuda.synchronize() + t1 = time.time() + backward_ms = (t1 - t0) / 100 * 1000 + + print(f"bkpbkp: {backward_ms:.3f}ms.") + if __name__ == "__main__": # sum_layer_test() From 83952976044408eb331eabc039bc5e275ccaf6a3 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 7 Dec 2023 21:04:37 +0800 Subject: [PATCH 025/162] add __repr__ for nodes --- src/pyjuice/nodes/input_nodes.py | 3 +++ src/pyjuice/nodes/prod_nodes.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/pyjuice/nodes/input_nodes.py b/src/pyjuice/nodes/input_nodes.py index eb34f474..6ec0eb12 100644 --- a/src/pyjuice/nodes/input_nodes.py +++ b/src/pyjuice/nodes/input_nodes.py @@ -80,3 +80,6 @@ def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, ret_params = True, **kwargs ) + + def __repr__(self): + return f"InputNodes(num_node_groups={self.num_node_groups}, group_size={self.group_size}, dist={type(self.dist)})" diff --git a/src/pyjuice/nodes/prod_nodes.py b/src/pyjuice/nodes/prod_nodes.py index 1f9b5212..88707ec3 100644 --- a/src/pyjuice/nodes/prod_nodes.py +++ b/src/pyjuice/nodes/prod_nodes.py @@ -75,4 +75,7 @@ def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_ recursive = recursive, is_root = is_root, **kwargs - ) \ No newline at end of file + ) + + def 
__repr__(self): + return f"ProdNodes(num_node_groups={self.num_node_groups}, group_size={self.group_size}, num_chs={self.num_chs})" From fd9173602f1575a7dbe303a894bbd99676fe2265 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 7 Dec 2023 23:07:52 +0800 Subject: [PATCH 026/162] sum layer backward compilation --- src/pyjuice/layer/compilation.py | 434 +++++++++++++++++++------------ src/pyjuice/layer/sum_layer.py | 110 ++++++-- src/pyjuice/nodes/sum_nodes.py | 3 + tests/layer/sum_layer_test.py | 67 +++-- 4 files changed, 406 insertions(+), 208 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index a30ee1f4..7d8ae867 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -12,6 +12,7 @@ from numba import njit from copy import deepcopy from typing import Optional, Sequence +from collections import OrderedDict from pyjuice.nodes import CircuitNodes, SumNodes @@ -19,6 +20,45 @@ ## Helper functions ## +class OrderedSet(): + def __init__(self): + self.item_set = set() + self.item_list = list() + + self.index = 0 + + def append(self, item): + if item in self.item_set: + return None + + self.item_set.add(item) + self.item_list.append(item) + + def index(self, item): + if item not in self.item_set: + raise ValueError("Item not found.") + + return self.item_list.index(item) + + def __iter__(self): + self.index = 0 + return self + + def __next__(self): + if self.index < len(self.item_list): + item = self.item_list[self.index] + self.index += 1 + return item + else: + raise StopIteration # To signal the end of iteration + + def __getitem__(self, idx): + if idx >= len(self.item_list): + raise ValueError() + + return self.item_list[idx] + + def flatten_sum_nodes(ns: SumNodes, *args, use_cuda: bool = False): edge_ids = ns.edge_ids if use_cuda: @@ -60,7 +100,7 @@ def next_power_of_2(x: torch.Tensor): ## Compilation for SumLayer ## -def get_sum_layer_stats(nodes: Sequence[SumNodes], global_nid_start: int): +def get_sum_layer_forward_stats(nodes: Sequence[SumNodes], global_nid_start: int): layer_num_ngroups = sum(map(lambda ns: ns.num_node_groups, nodes)) layer_num_edges = 0 @@ -82,6 +122,52 @@ def get_sum_layer_stats(nodes: Sequence[SumNodes], global_nid_start: int): return layer_num_ngroups, layer_num_edges, n_chs +def get_sum_layer_backward_stats(nodes: Sequence[SumNodes]): + ch_gsize2cs = dict() + ch_gsize2num_ngroups = dict() + cs2parns = dict() + + for ns in nodes: + for cs in ns.chs: + ch_gsize = cs.group_size + + if ch_gsize not in ch_gsize2cs: + ch_gsize2cs[ch_gsize] = OrderedSet() + ch_gsize2num_ngroups[ch_gsize] = 0 + + ch_gsize2cs[ch_gsize].append(cs) + ch_gsize2num_ngroups[ch_gsize] += cs.num_node_groups + + if cs not in cs2parns: + cs2parns[cs] = OrderedSet() + + cs2parns[cs].append(ns) + + # Iterate over all child nodes to get the parent (# node groups) counts + ch_gsize2n_pargs = dict() + for ch_gsize, ch_nodes in ch_gsize2cs.items(): + n_sid = 0 + n_pargs = torch.zeros([ch_gsize2num_ngroups[ch_gsize]], dtype = torch.long) + for cs in ch_nodes: + n_eid = n_sid + cs.num_node_groups + + pargcounts = torch.zeros([cs.num_node_groups], dtype = torch.long) + for ns in cs2parns[cs]: + cs_id = ns.chs.index(cs) + edge_sid = sum([c.num_node_groups for c in ns.chs[:cs_id]]) + edge_eid = edge_sid + cs.num_node_groups + + criterion = (ns.edge_ids[1,:] >= edge_sid) & (ns.edge_ids[1,:] < edge_eid) + pargcounts += torch.bincount(ns.edge_ids[1,criterion] - edge_sid, minlength = cs.num_node_groups) + + n_pargs[n_sid:n_eid] = 
pargcounts + + n_sid = n_eid + + ch_gsize2n_pargs[ch_gsize] = n_pargs + + return ch_gsize2cs, ch_gsize2num_ngroups, ch_gsize2n_pargs, cs2parns + @torch.no_grad() def sum_layer_forward_compilation_job(flat_nodes, nids, cids, pids, fw_group_max_chs, n_group_ids, n_id_in_group, global_nid_start, ch_prod_layer_size, job_start, job_end, return_dict = None, @@ -334,8 +420,8 @@ def _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids): def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, target_cids_ptr, pcids_partition_start_ptr, target_pids_ptr, edge_ids_ptr, chs_offsets_ptr, n_partition_ids_ptr, n_id_in_partition_ptr, cs_ele_id_start_ptr, cs_node_cum_ids_ptr, fw_partition_max_chs_ptr, cum_n_chs_ptr, - ns_param_ids_ptr, ch_n_pars_ptr, constexprs_ptr, num_chs: tl.constexpr, - num_chs_np2: tl.constexpr, add_params_flag: tl.constexpr, BLOCK_SIZE: tl.constexpr): + ns_param_ids_ptr, constexprs_ptr, num_chs: tl.constexpr, num_chs_np2: tl.constexpr, + add_params_flag: tl.constexpr, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) block_start = pid * BLOCK_SIZE @@ -371,7 +457,7 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ cs_ele_ind = tl.load(cs_ele_id_start_ptr + cid_node_id, mask = mask, other = 0) # Get child offsets - # Note: this is the `?` mark in `cids[group_id][local_id,?]` + # Note: this is the `?` mark in `cids[partition_id][local_id,?]` chs_offset = tl.load(chs_offsets_ptr + offsets, mask = mask, other = 0) # Store to `target_nids` @@ -386,9 +472,6 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ global_cid = cs_ele_ind + cid - cs_cum_num tl.store(target_cids_ptr + pcids_offsets, global_cid, mask = mask) - # Cumulate number of parents for every child node - tl.atomic_add(ch_n_pars_ptr + global_cid, 1, mask = mask) - # Store to `target_pids` ns_local_pid = tl.load(cum_n_chs_ptr + nid, mask = mask, other = 0) global_pid = ns_pid_start + (ns_local_pid + chs_offset) * group_size @@ -400,7 +483,7 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ @torch.no_grad() -def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ns_in_partition, n_chs, +def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, global_nid_start, ch_prod_layer_size, param_ends, num_threads: int = 1, use_cuda: bool = True, legacy: bool = False): @@ -410,8 +493,9 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, # Also use the legacy code if we compile with CPU if not use_cuda or legacy: # TODO: restore CPU compilation + raise RuntimeError() return sum_layer_forward_compilation_legacy( - nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ns_in_partition, n_chs, + nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, global_nid_start, ch_prod_layer_size, param_ends, num_threads = num_threads, use_cuda = use_cuda ) @@ -419,22 +503,18 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, # We construct a flattened version of `nids` where the vectors of every partition is concatenated # into a single vector. `nids_group_start` is used to indicate the start index of every group's # `nids`. 
That is, `target_nids[nids_partition_start[i]:nids_partition_start[i+1]] == nids[i]` - nids_partition_start = torch.zeros_like(num_ns_in_partition) - nids_partition_start[1:] = torch.cumsum(num_ns_in_partition[:-1], dim = 0) - target_nids = torch.zeros([num_ns_in_partition.sum()], dtype = torch.long).cuda() + nids_partition_start = torch.zeros_like(num_ngs_in_partition) + nids_partition_start[1:] = torch.cumsum(num_ngs_in_partition[:-1], dim = 0) + target_nids = torch.zeros([num_ngs_in_partition.sum()], dtype = torch.long).cuda() # Similarly, we flatten `cids`... # Note: we call it `pcids...` because it is shared with `target_pids` - pcids_partition_start = torch.zeros_like(num_ns_in_partition) - pcids_partition_start[1:] = torch.cumsum((num_ns_in_partition * fw_partition_max_chs)[:-1], dim = 0) - target_cids = torch.zeros([(num_ns_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).cuda() + pcids_partition_start = torch.zeros_like(num_ngs_in_partition) + pcids_partition_start[1:] = torch.cumsum((num_ngs_in_partition * fw_partition_max_chs)[:-1], dim = 0) + target_cids = torch.zeros([(num_ngs_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).cuda() # ...and `pids` - target_pids = torch.zeros([(num_ns_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).cuda() - - # TODO: restore this when working on the backward pass - # This tensor is to be filled with number of parents for every child node - ch_n_pars = torch.zeros([ch_prod_layer_size], dtype = torch.int32).cuda() + target_pids = torch.zeros([(num_ngs_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).cuda() # Move necessary tensors to GPU n_partition_ids = n_partition_ids.cuda() @@ -481,7 +561,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, ns_num_edges = edge_ids.size(1) # Precompute the child offset ids for every edge. 
That is, the `?` - # mark in `cids[group_id][local_id,?]` + # mark in `cids[partition][local_id,?]` chs_offsets = np.zeros([ns_num_edges], dtype = np.int64) ns_nchs = np.zeros([ns_num_ngroups], dtype = np.int64) @@ -532,7 +612,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, target_nids, nids_partition_start, target_cids, pcids_partition_start, target_pids, edge_ids, chs_offsets, n_partition_ids, n_id_in_partition, cs_ele_id_start, cs_node_cum_ids, fw_partition_max_chs, cum_n_chs, - ns_param_ids, ch_n_pars, constexprs, ns.num_chs, num_chs_np2, + ns_param_ids, constexprs, ns.num_chs, num_chs_np2, add_params_flag, BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) ) @@ -569,9 +649,9 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, # Restore `nids` target_nids = target_nids.cpu() nids = [] - for partition_id in range(num_ns_in_partition.size(0)): + for partition_id in range(num_ngs_in_partition.size(0)): sid = nids_partition_start[partition_id] - eid = sid + num_ns_in_partition[partition_id] + eid = sid + num_ngs_in_partition[partition_id] nids.append(target_nids[sid:eid].contiguous()) # Restore `cids` and `pids` @@ -579,18 +659,15 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, target_pids = target_pids.cpu() cids = [] pids = [] - for partition_id in range(num_ns_in_partition.size(0)): + for partition_id in range(num_ngs_in_partition.size(0)): sid = pcids_partition_start[partition_id] - gsize = num_ns_in_partition[partition_id] + gsize = num_ngs_in_partition[partition_id] gnchs = fw_partition_max_chs[partition_id] eid = sid + gsize * gnchs cids.append(target_cids[sid:eid].reshape(gsize, gnchs).contiguous()) pids.append(target_pids[sid:eid].reshape(gsize, gnchs).contiguous()) - # Convert `ch_n_pars` to `torch.long` type - ch_n_pars = ch_n_pars.cpu().long() - - return nids, cids, pids, ch_n_pars, param_ends + return nids, cids, pids, param_ends @torch.no_grad() @@ -714,6 +791,15 @@ def sum_layer_backward_compilation_legacy(nodes, pids, fw_n_group_ids, fw_n_id_i return parids, parpids +@njit +def _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids): + for i in range(edge_ids.shape[1]): + nid = edge_ids[0,i] + idx = ns_nchs[nid] + chs_offsets[i] = idx + ns_nchs[nid] = idx + 1 + + @triton.jit def _assign_global_eleids_kernel(global_ele_ids_ptr, cs_ele_id_start_ptr, cs_node_cum_ids_ptr, edge_ids_ptr, constexprs_ptr, num_chs: tl.constexpr, num_chs_np2: tl.constexpr, BLOCK_SIZE: tl.constexpr): @@ -748,78 +834,78 @@ def _assign_global_eleids_kernel(global_ele_ids_ptr, cs_ele_id_start_ptr, cs_nod @njit -def _assign_parid_kernel(par_offsets, par_counts, global_ele_ids): - for i in range(par_offsets.shape[0]): - global_cid = global_ele_ids[i] - idx = par_counts[global_cid] - par_offsets[i] = idx - par_counts[global_cid] = idx + 1 +def _assign_parid_kernel(pars_offsets, cs_npars, edge_ids, edge_sid): + for i in range(edge_ids.shape[1]): + cid = edge_ids[1,i] + idx = cs_npars[cid] + pars_offsets[edge_sid+i] = idx + cs_npars[cid] = idx + 1 @triton.jit -def _assign_target_parids_kernel(target_parids_ptr, target_parpids_ptr, parids_group_start_ptr, flat_pids_ptr, pids_group_start_ptr, - edge_ids_ptr, global_ele_ids_ptr, chs_offsets_ptr, par_offsets_ptr, - fw_n_group_ids_ptr, fw_n_id_in_group_ptr, bk_n_group_ids_ptr, bk_n_id_in_group_ptr, - fw_group_max_chs_ptr, bk_group_max_pars_ptr, constexprs_ptr, BLOCK_SIZE: tl.constexpr): - +def _assign_target_chpapids_kernel(target_chids_ptr, chids_partition_start_ptr, 
target_parids_ptr, target_parpids_ptr, + parids_partition_start_ptr, edge_ids_ptr, pars_offsets_ptr, n_partition_ids_ptr, + n_id_in_partition_ptr, num_ngs_in_partition_ptr, partition_max_pars_ptr, cum_n_chs_ptr, + chs_offsets_ptr, constexprs_ptr, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(axis = 0) block_start = pid * BLOCK_SIZE # Retrieve all constexprs - global_nid_start = tl.load(constexprs_ptr) - node_start = tl.load(constexprs_ptr + 1) - num_edges = tl.load(constexprs_ptr + 2) + ns_global_node_start = tl.load(constexprs_ptr) + cs_global_ele_start = tl.load(constexprs_ptr + 1) + ns_group_size = tl.load(constexprs_ptr + 2) + cs_group_size = tl.load(constexprs_ptr + 3) + ns_pid_start = tl.load(constexprs_ptr + 4) + num_edges = tl.load(constexprs_ptr + 5) + cs_ngroup_start = tl.load(constexprs_ptr + 6) # Get edge indices to be processed by the current block offsets = block_start + tl.arange(0, BLOCK_SIZE) mask = offsets < num_edges - # Get `nid` and `global_cid` + # Get `cid` and `nid` (size of `edge_ids` is [2, num_edges]) + cid = tl.load(edge_ids_ptr + offsets + num_edges, mask = mask, other = 0) nid = tl.load(edge_ids_ptr + offsets, mask = mask, other = 0) - global_cid = tl.load(global_ele_ids_ptr + offsets, mask = mask, other = 0) - # Get `fw_group_id` and `fw_local_id` (for indexing `pids`) - fw_group_id = tl.load(fw_n_group_ids_ptr + nid + node_start, mask = mask, other = 0) - fw_local_id = tl.load(fw_n_id_in_group_ptr + nid + node_start, mask = mask, other = 0) + # Get `partition_id` and `local_id` + partition_id = tl.load(n_partition_ids_ptr + cid + cs_ngroup_start, mask = mask, other = 0) + local_id = tl.load(n_id_in_partition_ptr + cid + cs_ngroup_start, mask = mask, other = 0) - # Get `bk_group_id` and `bk_local_id` (for indexing `parids` and `parpids`) - bk_group_id = tl.load(bk_n_group_ids_ptr + global_cid, mask = mask, other = 0) - bk_local_id = tl.load(bk_n_id_in_group_ptr + global_cid, mask = mask, other = 0) + # Get parent offsets + # Note: this is the `?` mark in `parids[partition_id][local_id,?]` + pars_offset = tl.load(pars_offsets_ptr + offsets, mask = mask, other = 0) - # Get child offsets (for indexing `pids`) and parent offsets (for indexing `parids` and `parpids`) - chs_offset = tl.load(chs_offsets_ptr + offsets, mask = mask, other = 0) - par_offset = tl.load(par_offsets_ptr + offsets, mask = mask, other = 0) - - # Store to `target_parids` - group_max_n_pars = tl.load(bk_group_max_pars_ptr + bk_group_id, mask = mask, other = 0) - parids_start = tl.load(parids_group_start_ptr + bk_group_id, mask = mask, other = 0) - parids_offsets = parids_start + bk_local_id * group_max_n_pars + par_offset - global_nid = global_nid_start + node_start + nid - tl.store(target_parids_ptr + parids_offsets, global_nid, mask = mask) + # Store to `target_chids` + chids_start = tl.load(chids_partition_start_ptr + partition_id, mask = mask, other = 0) + global_chid = cs_global_ele_start + cid * cs_group_size + tl.store(target_chids_ptr + chids_start + local_id, global_chid, mask = mask) - # Get the parameter ids of the edges... 
- group_max_n_chs = tl.load(fw_group_max_chs_ptr + fw_group_id, mask = mask, other = 0) - pids_start = tl.load(pids_group_start_ptr + fw_group_id, mask = mask, other = 0) - pids_offsets = pids_start + fw_local_id * group_max_n_chs + chs_offset - pid = tl.load(flat_pids_ptr + pids_offsets, mask = mask, other = 0) + # Store to `target_parids` + partition_max_n_pargs = tl.load(partition_max_pars_ptr + partition_id, mask = mask, other = 0) + parids_start = tl.load(parids_partition_start_ptr + partition_id, mask = mask, other = 0) + parids_offsets = parids_start + local_id * partition_max_n_pargs + pars_offset + global_parid = ns_global_node_start + nid * ns_group_size + tl.store(target_parids_ptr + parids_offsets, global_parid, mask = mask) - # ...and store them to `target_parpids` - tl.store(target_parpids_ptr + parids_offsets, pid, mask = mask) + # Store to `target_parpids` + ns_local_pid = tl.load(cum_n_chs_ptr + nid, mask = mask, other = 0) + chs_offset = tl.load(chs_offsets_ptr + offsets, mask = mask, other = 0) + global_pid = ns_pid_start + (ns_local_pid + chs_offset) * ns_group_size * cs_group_size + tl.store(target_parpids_ptr + parids_offsets, global_pid, mask = mask) @torch.no_grad() -def sum_layer_backward_compilation(nodes, pids, fw_n_group_ids, fw_n_id_in_group, - num_bk_groups, bk_n_group_ids, bk_n_id_in_group, - fw_group_max_chs, bk_group_max_pars, - fw_num_ns_in_group, bk_num_ns_in_group, - ch_prod_layer_size, global_nid_start, use_cuda: bool = False, - legacy: bool = False): +def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_partition, num_ngs_in_partition, partition_max_pars, + use_cuda: bool = False, legacy: bool = False): if use_cuda and not torch.cuda.is_available(): use_cuda = False # Also use the legacy code if we compile with CPU if not use_cuda or legacy: + # TODO: restore CPU compilation + raise ValueError() return sum_layer_backward_compilation_legacy( nodes, pids, fw_n_group_ids, fw_n_id_in_group, num_bk_groups, bk_n_group_ids, bk_n_id_in_group, @@ -827,135 +913,143 @@ def sum_layer_backward_compilation(nodes, pids, fw_n_group_ids, fw_n_id_in_group ch_prod_layer_size, global_nid_start, use_cuda = use_cuda ) - # We construct a flattened version of `parids` where the vectors of every group is concatenated - # into a single vector. `parids_group_start` is used to indicate the start index of every group's - # `parids`. That is, `target_parids[parids_group_start[gid]:parids_group_start[gid+1]] == parids[gid]` - parids_group_start = torch.zeros_like(bk_num_ns_in_group) - parids_group_start[1:] = torch.cumsum((bk_num_ns_in_group * bk_group_max_pars)[:-1], dim = 0) - target_parids = torch.zeros([(bk_num_ns_in_group * bk_group_max_pars).sum()], dtype = torch.long).cuda() + # We construct a flattened version of `chids` where the vectors of every partition is concatenated + # into a single vector. `chids_partition_start` is used to indicate the start index of every partition's + # `chids`. That is, `target_chids[chids_partition_start[i]:chids_partition_start[i+1]] == chids[i]` + chids_partition_start = torch.zeros_like(num_ngs_in_partition) + chids_partition_start[1:] = torch.cumsum(num_ngs_in_partition[:-1], dim = 0) + target_chids = torch.zeros([num_ngs_in_partition.sum()], dtype = torch.long).cuda() - # Do the same to `parpids` - target_parpids = torch.zeros([(bk_num_ns_in_group * bk_group_max_pars).sum()], dtype = torch.long).cuda() - - parids_group_start = parids_group_start.cuda() + # Similarly, we flatten `parids`... 
+ # Note: we call it `pcids...` because it is shared with `target_pids` + parids_partition_start = torch.zeros_like(num_ngs_in_partition) + parids_partition_start[1:] = torch.cumsum((num_ngs_in_partition * partition_max_pars)[:-1], dim = 0) + target_parids = torch.zeros([(num_ngs_in_partition * partition_max_pars).sum()], dtype = torch.long).cuda() - # We also re-create `flat_pids` to be used to fill `parpids` - pids_group_start = torch.zeros_like(fw_num_ns_in_group) - pids_group_start[1:] = torch.cumsum((fw_num_ns_in_group * fw_group_max_chs)[:-1], dim = 0) - flat_pids = torch.zeros([(fw_num_ns_in_group * fw_group_max_chs).sum()], dtype = torch.long) - sid = 0 - for group_id, (gsize, gnchs) in enumerate(zip(fw_num_ns_in_group, fw_group_max_chs)): - eid = sid + (gsize * gnchs) - flat_pids[sid:eid] = pids[group_id].reshape(gsize * gnchs) - sid = eid + # ...and `parpids` + target_parpids = torch.zeros([(num_ngs_in_partition * partition_max_pars).sum()], dtype = torch.long).cuda() - flat_pids = flat_pids.cuda() - pids_group_start = pids_group_start.cuda() + # Move tensors to GPU + n_partition_ids = n_partition_ids.cuda() + n_id_in_partition = n_id_in_partition.cuda() + num_ngs_in_partition = num_ngs_in_partition.cuda() + partition_max_pars = partition_max_pars.cuda() - # This vector maintains the "current" count of parents that have been processed for every child node - par_counts = torch.zeros([ch_prod_layer_size], dtype = torch.long) + # This is the main loop: iterate over `cs` in the layer + cs_ngroup_start = 0 # The start index of nodes in the current `cs` + ns2cum_n_chs = dict() + ns2chs_offsets = dict() + for cs in nodes: - # Move tensors to GPU - fw_n_group_ids = fw_n_group_ids.cuda() - fw_n_id_in_group = fw_n_id_in_group.cuda() - bk_n_group_ids = bk_n_group_ids.cuda() - bk_n_id_in_group = bk_n_id_in_group.cuda() - fw_group_max_chs = fw_group_max_chs.cuda() - bk_group_max_pars = bk_group_max_pars.cuda() + # Collect all edge ids that point to `cs` in every parent `ns` + par_edge_ids = [] + for ns in cs2parns[cs]: + cs_id = ns.chs.index(cs) + edge_sid = sum([c.num_node_groups for c in ns.chs[:cs_id]]) + edge_eid = edge_sid + cs.num_node_groups - # This is the main loop: iterate over `ns` in the layer - node_start = 0 # The start index of nodes in the current `ns` - for ns in nodes: - node_end = node_start + ns.num_nodes + criterion = (ns.edge_ids[1,:] >= edge_sid) & (ns.edge_ids[1,:] < edge_eid) + extracted_edge_ids = ns.edge_ids[:,criterion].clone() + extracted_edge_ids[1,:] -= edge_sid - # number of nodes - ns_num_nodes = ns.num_nodes + par_edge_ids.append(extracted_edge_ids) - # Edge indices of size [2, ns_num_edges] - edge_ids = ns.edge_ids.cuda() - ns_num_edges = edge_ids.size(1) + # Recreate `chs_offsets` and `cum_n_chs` to get compute the parameter ids + if not ns in ns2cum_n_chs: + chs_offsets = np.zeros([ns.edge_ids.size(1)], dtype = np.int64) + ns_nchs = np.zeros([ns.num_node_groups], dtype = np.int64) - # Construct helper indices for child nodes - # `cs_ele_id_start` contains the global start indices of the child nodes - # `cs_node_cum_ids` contains the local cumulative number of child nodes - cs_ele_id_start = torch.zeros([ns.num_chs], dtype = torch.long) - cs_node_cum_ids = torch.zeros([ns.num_chs], dtype = torch.long) - for i, cs in enumerate(ns.chs): - cs_ele_id_start[i] = cs._output_ind_range[0] - if i < ns.num_chs - 1: - cs_node_cum_ids[i+1] = cs_node_cum_ids[i] + cs.num_nodes + _assign_chid_kernel(chs_offsets, ns_nchs, ns.edge_ids.numpy()) + chs_offsets = 
torch.from_numpy(chs_offsets) - cs_ele_id_start = cs_ele_id_start.cuda() - cs_node_cum_ids = cs_node_cum_ids.cuda() + ns_nchs = torch.from_numpy(ns_nchs) + cum_n_chs = torch.zeros([ns.num_node_groups], dtype = torch.long) + cum_n_chs[1:] = torch.cumsum(ns_nchs[:-1], dim = 0) - # Get the global element ids for the child node of all edges - global_ele_ids = torch.zeros([ns_num_edges], dtype = torch.long).cuda() + ns2cum_n_chs[ns] = cum_n_chs + ns2chs_offsets[ns] = chs_offsets[criterion] - # We store these constants in a tensor and retrieve them in the kernel - constexprs = torch.tensor([ns_num_edges]).long().cuda() + cs_num_ngroups = cs.num_node_groups + cs_num_edges = sum([edge_ids.size(1) for edge_ids in par_edge_ids]) + + # Precompute the parent offset ids for every. That is, the `?` + # mark in `parids[partition_id][local_id,?]` + pars_offsets = np.zeros([cs_num_edges], dtype = np.int64) + cs_npars = np.zeros([cs_num_ngroups], dtype = np.int64) - # Make the grid and launch kernel - grid = lambda meta: (triton.cdiv(ns_num_edges, meta["BLOCK_SIZE"]),) + edge_sid = 0 + for edge_ids in par_edge_ids: + edge_eid = edge_sid + edge_ids.size(1) - num_chs_np2 = triton.next_power_of_2(ns.num_chs) - _assign_global_eleids_kernel[grid]( - global_ele_ids, cs_ele_id_start, cs_node_cum_ids, edge_ids, - constexprs, ns.num_chs, num_chs_np2, BLOCK_SIZE = 2048 - ) + _assign_parid_kernel(pars_offsets, cs_npars, edge_ids.numpy(), edge_sid) - # [Recomputed] the child offset ids for every edge. That is, the `?` - # mark in `pids[fw_group_id][fw_local_id,?]` - chs_offsets = np.zeros([ns_num_edges], dtype = np.int64) - ns_nchs = np.zeros([ns_num_nodes], dtype = np.int64) - edge_ids_np = ns.edge_ids.numpy() + edge_sid = edge_eid - _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids_np) - chs_offsets = torch.from_numpy(chs_offsets).cuda() + pars_offsets = torch.from_numpy(pars_offsets) - # Compute the parent offset ids for every edge. 
That is, the `?` - # mark in `parids[bk_group_id][bk_local_id,?]` - par_offsets = np.zeros([ns_num_edges], dtype = np.int64) - par_counts_np = par_counts.numpy() - global_ele_ids_np = global_ele_ids.cpu().numpy() + # Move necessary buffers to GPU + chids_partition_start = chids_partition_start.cuda() + parids_partition_start = parids_partition_start.cuda() + pars_offsets = pars_offsets.cuda() - _assign_parid_kernel(par_offsets, par_counts_np, global_ele_ids_np) + for ns, edge_ids in zip(cs2parns[cs], par_edge_ids): - par_counts = torch.from_numpy(par_counts_np) - par_offsets = torch.from_numpy(par_offsets).cuda() + ns_num_edges = edge_ids.size(1) + edge_ids = edge_ids.cuda() - # The following kernel assigns the corresponding indices to `pids` and `psids` + if ns.is_tied(): + ns_pid_start = ns.get_source_ns()._param_range[0] + else: + ns_pid_start = ns._param_range[0] + + # Get `cum_n_chs` and `chs_offsets`, which are used to get the parameter indices + cum_n_chs = ns2cum_n_chs[ns].cuda() + chs_offsets = ns2chs_offsets[ns].cuda() + + # We store these constants in a tensor and retrieve them in the kernel + # This is to avoid `triton` from compiling separate kernels for every layer configuration + # Saves 99.9% compilation time :) + cs_global_ele_start = cs._output_ind_range[0] + ns_global_node_start = ns._output_ind_range[0] + ns_group_size = ns.group_size + cs_group_size = cs.group_size + + constexprs = torch.tensor([ns_global_node_start, cs_global_ele_start, ns_group_size, cs_group_size, + ns_pid_start, ns_num_edges, cs_ngroup_start]).long().cuda() - # We store these constants in a tensor and retrieve them in the kernel - # This is to avoid `triton` from compiling separate kernels for every layer configuration - # Saves 99.9% compilation time :) - constexprs = torch.tensor([global_nid_start, node_start, ns_num_edges]).long().cuda() + # Make the grid and launch kernel + grid = lambda meta: (triton.cdiv(ns_num_edges, meta["BLOCK_SIZE"]),) - # Make the grid and launch kernel - grid = lambda meta: (triton.cdiv(ns_num_edges, meta["BLOCK_SIZE"]),) + _assign_target_chpapids_kernel[grid]( + target_chids, chids_partition_start, target_parids, target_parpids, parids_partition_start, + edge_ids, pars_offsets, n_partition_ids, n_id_in_partition, num_ngs_in_partition, + partition_max_pars, cum_n_chs, chs_offsets, constexprs, BLOCK_SIZE = 1024 + ) - _assign_target_parids_kernel[grid]( - target_parids, target_parpids, parids_group_start, flat_pids, pids_group_start, - edge_ids, global_ele_ids, chs_offsets, par_offsets, - fw_n_group_ids, fw_n_id_in_group, bk_n_group_ids, bk_n_id_in_group, - fw_group_max_chs, bk_group_max_pars, constexprs, BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) - ) + cs_ngroup_start += cs.num_node_groups - node_start = node_end + # Restore `chids` + target_chids = target_chids.cpu() + chids = [] + for partition_id in range(num_ngs_in_partition.size(0)): + sid = chids_partition_start[partition_id] + eid = sid + num_ngs_in_partition[partition_id] + chids.append(target_chids[sid:eid].contiguous()) # Restore `parids` and `parpids` target_parids = target_parids.cpu() target_parpids = target_parpids.cpu() parids = [] parpids = [] - for group_id in range(bk_num_ns_in_group.size(0)): - sid = parids_group_start[group_id] - gsize = bk_num_ns_in_group[group_id] - gnchs = bk_group_max_pars[group_id] + for partition_id in range(num_ngs_in_partition.size(0)): + sid = parids_partition_start[partition_id] + gsize = num_ngs_in_partition[partition_id] + gnchs = partition_max_pars[partition_id] eid = 
sid + gsize * gnchs parids.append(target_parids[sid:eid].reshape(gsize, gnchs).contiguous()) parpids.append(target_parpids[sid:eid].reshape(gsize, gnchs).contiguous()) - return parids, parpids + return chids, parids, parpids ## Compilation for ProdLayer ## diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 75fb349f..b8dc6ab0 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -14,7 +14,8 @@ from .layer import Layer from .backend.node_partition import partition_nodes_by_n_edges from .backend.index_set import batched_index_set, index_cum -from .compilation import get_sum_layer_stats, sum_layer_forward_compilation, \ +from .compilation import get_sum_layer_forward_stats, sum_layer_forward_compilation, \ + get_sum_layer_backward_stats, \ sum_layer_backward_compilation, next_power_of_2 @@ -40,7 +41,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # n_chs: [num_node_groups] stores the number of child nodes of each node # Note: to allow different nodes to have different `ch_group_size`s, we record the number of # child **nodes** (instead of # node groups) in `n_chs` - layer_num_ngroups, layer_num_edges, n_chs = get_sum_layer_stats(self.nodes, global_nid_start) + layer_num_ngroups, layer_num_edges, n_chs = get_sum_layer_forward_stats(self.nodes, global_nid_start) self.num_nodes = layer_num_ngroups * self.group_size # Total number of nodes self.num_edges = layer_num_edges # Total number of edges @@ -59,10 +60,10 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # fw_n_partition_ids: [num_ngroups] stores the partition id for each node node # fw_n_id_in_partition: [num_ngroups] stores the index of the node groups in the partition - # fw_num_ns_in_partition: [num_fw_partitions] number of node groups in each partition + # fw_num_ngs_in_partition: [num_fw_partitions] number of node groups in each partition fw_n_partition_ids = torch.zeros([layer_num_ngroups], dtype = torch.long) fw_n_id_in_partition = torch.zeros([layer_num_ngroups], dtype = torch.long) - fw_num_ns_in_partition = torch.zeros([self.num_fw_partitions], dtype = torch.long) + fw_num_ngs_in_partition = torch.zeros([self.num_fw_partitions], dtype = torch.long) min_n_chs = 0 for partition_id, max_n_chs in enumerate(fw_partition_max_chs): @@ -71,7 +72,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, fw_n_partition_ids[criterion] = partition_id fw_n_id_in_partition[criterion] = torch.arange(partition_size) - fw_num_ns_in_partition[partition_id] = partition_size + fw_num_ngs_in_partition[partition_id] = partition_size min_n_chs = max_n_chs + 1 @@ -80,9 +81,8 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # nids: List[[partition_size]] stores node group ids # cids: List[[partition_size, partition_max_n_chs]] stores indices of child node groups # pids: List[[partition_size, partition_max_n_chs]] stores indices of edge parameters (1st parameter of every group) - # ch_n_pars: [ch_prod_layer_size] stores the number of parents for each child node - nids, cids, pids, ch_n_pars, param_ends = sum_layer_forward_compilation( - self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, fw_num_ns_in_partition, + nids, cids, pids, param_ends = sum_layer_forward_compilation( + self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, fw_num_ngs_in_partition, n_chs, global_nid_start, ch_prod_layer_size, param_ends = param_ends, # GPU compilation is slightly slower for 
small layer due to the kernel jit compilation time use_cuda = True # not disable_gpu_compilation and (self.num_edges > 1000) @@ -98,14 +98,74 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, ## Initialize backward pass ## - # import pdb; pdb.set_trace() + # A sum layer could have children of different group sizes + # We separate and partition them into different backward kernels + ch_gsize2cs, ch_gsize2num_ngroups, ch_gsize2n_pargs, cs2parns = get_sum_layer_backward_stats(nodes) + + # For every possible child group size, we first compute the best partition strategy. + # We then move on to do the actual compilation + chids = [] + parids = [] + parpids = [] + cs_group_sizes = [] + for ch_gsize in ch_gsize2n_pargs: + + num_ngroups = ch_gsize2num_ngroups[ch_gsize] + n_pargs = ch_gsize2n_pargs[ch_gsize] + + # Find a good strategy to partition the node groups according to their number of children + # to minimize total computation cost + bk_partition_max_pars = partition_nodes_by_n_edges( + n_pargs, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions + ) + + # Since the triton kernels require the maximum number children for each group to be a power of 2, + # we postprocess the partition sizes + bk_partition_max_pars = torch.unique(next_power_of_2(bk_partition_max_pars)) + num_bk_partitions = bk_partition_max_pars.size(0) + + # bk_n_partition_ids: [num_ngroups] stores the partition id for each node group + # bk_n_id_in_partition: [num_ngroups] stores the index of the node groups in the partition + # bk_num_ngs_in_partition: [num_bk_partitions] number of node groups in each partition + bk_n_partition_ids = torch.zeros([num_ngroups], dtype = torch.long) + bk_n_id_in_partition = torch.zeros([num_ngroups], dtype = torch.long) + bk_num_ngs_in_partition = torch.zeros([num_bk_partitions], dtype = torch.long) + + min_n_pars = 0 + for partition_id, max_n_pars in enumerate(bk_partition_max_pars): + criterion = (n_pargs >= min_n_pars) & (n_pargs <= max_n_pars) + partition_size = criterion.sum().item() + + bk_n_partition_ids[criterion] = partition_id + bk_n_id_in_partition[criterion] = torch.arange(partition_size) + bk_num_ngs_in_partition[partition_id] = partition_size + + min_n_pars = max_n_pars + 1 + + # chids: List[[partition_num_chs]] stores child group ids + # parids: List[[partition_num_chs, partition_max_n_pargs]] stores parent node groups' ids for each child node + # parpids: List[[partition_num_chs, partition_max_n_pargs]] param id for the edges to parent (correspond to `parids`) + curr_chids, curr_parids, curr_parpids = sum_layer_backward_compilation( + nodes = ch_gsize2cs[ch_gsize], + cs2parns = cs2parns, + n_partition_ids = bk_n_partition_ids, + n_id_in_partition = bk_n_id_in_partition, + num_ngs_in_partition = bk_num_ngs_in_partition, + partition_max_pars = bk_partition_max_pars, + # GPU compilation is slightly slower for small layer due to the kernel jit compilation time + use_cuda = not disable_gpu_compilation and (self.num_edges > 1000) + ) + + chids.extend(curr_chids) + parids.extend(curr_parids) + parpids.extend(curr_parpids) + cs_group_sizes.extend([ch_gsize] * num_bk_partitions) - # # Find a good strategy to partition the child nodes into groups according to their number of parents - # # to minimize total computation cost - # ch_n_pars = ch_n_pars[1:] # Strip away the dummy node. 
We will never use it in the following - # bk_group_max_pars = partition_nodes_by_n_edges( - # ch_n_pars, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions - # ) + # Store buffers for the forward pass + self.partitioned_chids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in chids]) + self.partitioned_parids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parids]) + self.partitioned_parpids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parpids]) + self.cs_group_sizes = cs_group_sizes def __init__old(self, nodes: Sequence[SumNodes], global_nid_start: int, param_ends: Sequence, tied_param_ids: Sequence, @@ -470,7 +530,7 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, @staticmethod @triton.jit def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, - pids_start, pids_increment, local_ids, batch_size, partial_eval: tl.constexpr, + pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): @@ -485,20 +545,28 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s if partial_eval == 1: ngroup_id = tl.load(local_ids + ngroup_id) - # Initialize pointers to `params` + # Node offsets offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_node = tl.max_contiguous(offs_node, TILE_SIZE_M) + + # Edge offsets offs_edge = tl.arange(0, TILE_SIZE_K) - par_start = tl.load(pids_start + ngroup_id * TILE_SIZE_K + offs_edge) + + # Initialize pointers to `params` + offs_estart = ngroup_id * TILE_SIZE_K + offs_edge + offs_estart = tl.max_contiguous(offs_estart, TILE_SIZE_K) + par_start = tl.load(pids_start + offs_estart) epars_ptr = params + \ offs_node[:,None] + \ par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] # Batch offsets and mask offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + offs_batch = tl.max_contiguous(offs_batch, BLOCK_B) mask_batch = offs_batch < batch_size # Initialize pointers to `element_mars` - edge_start = tl.load(cids_start + ngroup_id * TILE_SIZE_K + offs_edge) + edge_start = tl.load(cids_start + offs_estart) emars_ptr = element_mars + \ edge_start[:,None] * batch_size + \ offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] @@ -528,12 +596,12 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s # Increment `epars_ptr` pids_inc = tl.load(pids_inc_ptr) epars_ptr += pids_inc[None,:] - pids_inc += TILE_SIZE_K + pids_inc_ptr += TILE_SIZE_K # Increment `emars_ptr` cids_inc = tl.load(cids_inc_ptr) emars_ptr += cids_inc[:,None] * batch_size - cids_inc += TILE_SIZE_K + cids_inc_ptr += TILE_SIZE_K # Write back off_nids = tl.load(nids + ngroup_id) diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index 2a1f6226..83a934ff 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -188,3 +188,6 @@ def _construct_edges(self, edge_ids: Optional[Union[Tensor,Sequence[Tensor]]]): assert par_ns.size(0) == self.num_node_groups and par_ns.max() == self.num_node_groups - 1, "Some node has no edge." 
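        # `edge_ids` has one column per (parent group, child group) edge: row 0 stores the parent node-group
        # index, row 1 the child node-group index (counted over the concatenation of all child inputs)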
self.edge_ids = edge_ids + + def __repr__(self): + return f"SumNodes(num_node_groups={self.num_node_groups}, group_size={self.group_size}, num_chs={self.num_chs}, num_edges={self.num_edges})" diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 7ef2237d..6f880c4d 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -87,22 +87,22 @@ def _bk_triton_block_sparse_kernel(node_flows, element_flows, node_mars, element # Increment `epars_ptr` pids_inc = tl.load(pids_inc_ptr) epars_ptr += pids_inc[None,:] - pids_inc += TILE_SIZE_K + pids_inc_ptr += TILE_SIZE_K # Increment `emars_ptr` cids_inc = tl.load(cids_inc_ptr) emars_ptr += cids_inc[:,None] * batch_size eflows_ptr += cids_inc[:,None] * batch_size - cids_inc += TILE_SIZE_K + cids_inc_ptr += TILE_SIZE_K @triton.jit def _bkp_triton_block_sparse_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, - local_ids, batch_size, n_edges: tl.constexpr, partial_eval: tl.constexpr, + local_ids, batch_size: tl.constexpr, n_edges: tl.constexpr, partial_eval: tl.constexpr, TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): - pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` batches + pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes # Get inferred node group id from `pid_m` @@ -135,9 +135,9 @@ def _bkp_triton_block_sparse_kernel(node_flows, node_mars, element_mars, params, acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) for b in range(0, B_NUM_TILES): - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, BLOCK_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, BLOCK_B] + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, TILE_SIZE_B] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] nmars_max = tl.max(nmars, axis = 0) nflows_div_mars = nflows / tl.exp(nmars - nmars_max[None,:]) @@ -209,6 +209,28 @@ def sum_layer_test(): assert torch.all(layer.partitioned_pids[0][:,0] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) - group_size + 1) assert torch.all(layer.partitioned_pids[0][:,1] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) + 1) + assert torch.all(layer.partitioned_chids[0] == torch.arange(group_size, 7 * group_size, group_size)) + assert torch.all(layer.partitioned_parids[0][0:2,0] == group_size) + assert torch.all(layer.partitioned_parids[0][0:2,1] == 2 * group_size) + assert torch.all(layer.partitioned_parids[0][2:4,0] == 3 * group_size) + assert torch.all(layer.partitioned_parids[0][2:4,1] == 4 * group_size) + assert torch.all(layer.partitioned_parids[0][4:6,0] == 5 * group_size) + assert torch.all(layer.partitioned_parids[0][4:6,1] == 6 * group_size) + assert torch.all(layer.partitioned_parpids[0][0,0] == 1) + assert torch.all(layer.partitioned_parpids[0][1,0] == 1 + group_size**2) + assert torch.all(layer.partitioned_parpids[0][0,1] == 1 + 2 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][1,1] == 1 + 3 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][2,0] == 1 + 4 * group_size**2) + assert 
torch.all(layer.partitioned_parpids[0][3,0] == 1 + 5 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][2,1] == 1 + 6 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][3,1] == 1 + 7 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][4,0] == 1 + 8 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][5,0] == 1 + 9 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][4,1] == 1 + 10 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][5,1] == 1 + 11 * group_size**2) + + import pdb; pdb.set_trace() + layer.to(device) ## Forward tests ## @@ -275,6 +297,8 @@ def speed_test(): element_mars = torch.rand([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) params = torch.rand([layer.partitioned_pids[0].max() + group_size]).to(device) + # import pdb; pdb.set_trace() + ## Forward tests ## layer(node_mars, element_mars, params) @@ -291,6 +315,8 @@ def speed_test(): print("Reference computation time on RTX 4090: 11.255ms.") print("--------------------------------------------------------------") + # exit() + node_flows = torch.rand([group_size + group_size * num_node_groups * num_prod_nodes, batch_size]).to(device) element_flows = torch.zeros([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) @@ -362,7 +388,7 @@ def speed_test(): cids = layer.partitioned_cids[0] pids = layer.partitioned_pids[0] - param_flows = params.clone() * 0.0 + param_flows = torch.zeros(params.size()).to(device) TILE_SIZE_B = 64 TILE_SIZE_K = 64 @@ -375,6 +401,8 @@ def speed_test(): grid = (triton.cdiv(n_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + # print("aaa") + _bkp_triton_block_sparse_kernel[grid]( node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, local_ids = None, @@ -385,23 +413,28 @@ def speed_test(): ) t0 = time.time() + # print("bbb") torch.cuda.synchronize() + # print("ccc") for _ in range(100): _bkp_triton_block_sparse_kernel[grid]( - node_flows, node_mars, element_mars, params, - param_flows, nids, cids, pids, local_ids = None, - batch_size = batch_size, n_edges = n_edges, partial_eval = 0, - TILE_SIZE_B = TILE_SIZE_B, B_NUM_TILES = B_NUM_TILES, - TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = layer.group_size - ) + node_flows, node_mars, element_mars, params, + param_flows, nids, cids, pids, local_ids = None, + batch_size = batch_size, n_edges = n_edges, partial_eval = 0, + TILE_SIZE_B = TILE_SIZE_B, B_NUM_TILES = B_NUM_TILES, + TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = layer.group_size + ) + # print("ddd") torch.cuda.synchronize() t1 = time.time() backward_ms = (t1 - t0) / 100 * 1000 + # print("eee") + print(f"bkpbkp: {backward_ms:.3f}ms.") if __name__ == "__main__": - # sum_layer_test() - speed_test() \ No newline at end of file + sum_layer_test() + # speed_test() \ No newline at end of file From ee6c3223302a2148676b2db7386ec4d23ed38fff Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 7 Dec 2023 23:10:29 +0800 Subject: [PATCH 027/162] delete old init function --- src/pyjuice/layer/sum_layer.py | 126 --------------------------------- 1 file changed, 126 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index b8dc6ab0..fe2c46ff 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -167,132 +167,6 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, self.partitioned_parpids = 
nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parpids]) self.cs_group_sizes = cs_group_sizes - def __init__old(self, nodes: Sequence[SumNodes], global_nid_start: int, - param_ends: Sequence, tied_param_ids: Sequence, - tied_param_group_ids: Sequence, tied_param_ends: Sequence, - ch_prod_layer_size: int, layer_sparsity_tol: float = 0.0, - max_num_partitions: Optional[int] = None, - disable_gpu_compilation: bool = False) -> None: - - Layer.__init__(self, nodes) - nn.Module.__init__(self) - - assert len(nodes) > 0, "No input node." - - self.nodes = nodes - self.ch_prod_layer_size = ch_prod_layer_size - - ## Get layer statistics & prepare for compilation ## - - # n_chs: [num_nodes] stores the number of child nodes of each node - layer_num_nodes, layer_num_edges, n_chs = get_sum_layer_stats(self.nodes, global_nid_start) - - self.num_nodes = layer_num_nodes # Total number of nodes - self.num_edges = layer_num_edges # Total number of edges - - # Find a good strategy to partition the nodes into groups according to their number of children - # to minimize total computation cost - fw_group_max_chs = partition_nodes_by_n_edges( - n_chs, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions - ) - - # Since the triton kernels require the maximum number children for each group to be a power of 2, - # we postprocess the group sizes - fw_group_max_chs = torch.unique(next_power_of_2(fw_group_max_chs)) - - self.num_fw_groups = len(fw_group_max_chs) # Number of groups - - # fw_n_group_ids: [num_nodes] stores the group id for each node - # fw_n_id_in_group: [num_nodes] stores the index of the nodes in the group - # fw_num_ns_in_group: [num_fw_groups] number of nodes in each group - fw_n_group_ids = torch.zeros([self.num_nodes], dtype = torch.long) - fw_n_id_in_group = torch.zeros([self.num_nodes], dtype = torch.long) - fw_num_ns_in_group = torch.zeros([self.num_fw_groups], dtype = torch.long) - - min_n_chs = 0 - for group_id, max_n_chs in enumerate(fw_group_max_chs): - criterion = (n_chs >= min_n_chs) & (n_chs <= max_n_chs) - group_size = criterion.sum().item() - - fw_n_group_ids[criterion] = group_id - fw_n_id_in_group[criterion] = torch.arange(group_size) - fw_num_ns_in_group[group_id] = group_size - - min_n_chs = max_n_chs + 1 - - ## Initialize forward pass ## - - # nids: List[[group_size]] stores node ids - # cids: List[[group_size, group_max_n_chs]] stores indices of child nodes - # pids: List[[group_size, group_max_n_chs]] stores indices of edge parameters - # ch_n_pars: [ch_prod_layer_size] stores the number of parents for each child node - nids, cids, pids, ch_n_pars, param_ends = sum_layer_forward_compilation( - self.nodes, fw_group_max_chs, fw_n_group_ids, fw_n_id_in_group, fw_num_ns_in_group, - n_chs, global_nid_start, ch_prod_layer_size, param_ends = param_ends, - # GPU compilation is slightly slower for small layer due to the kernel jit compilation time - use_cuda = not disable_gpu_compilation and (self.num_edges > 1000) - ) - - # Store buffers for the forward pass - self.grouped_nids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in nids]) - self.grouped_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in cids]) - self.grouped_pids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in pids]) - - ## Initialize backward pass ## - - # Find a good strategy to partition the child nodes into groups according to their number of parents - # to minimize total 
computation cost - ch_n_pars = ch_n_pars[1:] # Strip away the dummy node. We will never use it in the following - bk_group_max_pars = partition_nodes_by_n_edges( - ch_n_pars, sparsity_tolerance = layer_sparsity_tol, max_num_partitions = max_num_partitions - ) - - # Since the triton kernels require the maximum number children for each group to be a power of 2, - # we postprocess the group sizes - bk_group_max_pars = torch.unique(next_power_of_2(bk_group_max_pars)) - - self.num_bk_groups = len(bk_group_max_pars) # Number of groups - - # bk_n_group_ids: [ch_prod_layer_size] stores the group id for each (child) node - # bk_n_id_in_group: [ch_prod_layer_size] stores the index of the (child) nodes in the group - # bk_num_ns_in_group: [num_bk_groups] number of nodes in each group - # chids: List[[group_size]] stores child ids - bk_n_group_ids = torch.zeros([self.ch_prod_layer_size], dtype = torch.long) - bk_n_id_in_group = torch.zeros([self.ch_prod_layer_size], dtype = torch.long) - bk_num_ns_in_group = torch.zeros([self.num_bk_groups], dtype = torch.long) - chids = [] - - min_n_pars = 0 - for group_id, max_n_pars in enumerate(bk_group_max_pars): - criterion = (ch_n_pars >= min_n_pars) & (ch_n_pars <= max_n_pars) - filtered_idxs = torch.where(criterion)[0] + 1 # plus one to offset the dummy node since it is removed from `ch_n_pars` - group_size = criterion.sum().item() - - bk_n_group_ids[filtered_idxs] = group_id - bk_n_id_in_group[filtered_idxs] = torch.arange(group_size) - bk_num_ns_in_group[group_id] = group_size - chids.append(filtered_idxs) - - min_n_pars = max_n_pars + 1 - - # parids: List[[group_ch_size, group_max_n_pars]] stores parameter ids for each child node - # parpids: List[[group_ch_size, group_max_n_pars]] param id for the edges to parent (correspond to `parids`) - parids, parpids = sum_layer_backward_compilation( - self.nodes, pids, fw_n_group_ids, fw_n_id_in_group, self.num_bk_groups, bk_n_group_ids, bk_n_id_in_group, - fw_group_max_chs, bk_group_max_pars, fw_num_ns_in_group, bk_num_ns_in_group, ch_prod_layer_size, global_nid_start, - # GPU compilation is slightly slower for small layer due to the kernel jit compilation time - use_cuda = not disable_gpu_compilation and (self.num_edges > 1000) - ) - - # Store buffers for the backward pass - self.grouped_chids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in chids]) - self.grouped_parids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parids]) - self.grouped_parpids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parpids]) - - # Store the range of global node indices belonging to this layer - # This is used to implement partial evaluation - self.global_nid_range = (global_nid_start, global_nid_start + self.num_nodes) - def to(self, device): super(SumLayer, self).to(device) From c3a32ccc2f761ce02277b5d91060f8ee940870a1 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 8 Dec 2023 21:43:38 +0800 Subject: [PATCH 028/162] sum layer backward kernel for eles --- src/pyjuice/layer/sum_layer.py | 582 +++++++++++++++++++-------------- tests/layer/sum_layer_test.py | 57 +++- 2 files changed, 384 insertions(+), 255 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index fe2c46ff..9dfd5995 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -167,6 +167,11 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, self.partitioned_parpids = 
nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parpids]) self.cs_group_sizes = cs_group_sizes + self.num_bk_partitions = len(chids) + + # Store pre-compiled indices from `parids` and `parpids` in the following buffer + self._cached_bk_parids = dict() + def to(self, device): super(SumLayer, self).to(device) @@ -241,112 +246,69 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, `params`: [num_params, B] or [num_params] """ + ## Compute flows w.r.t. elements (i.e., product nodes) ## if not self.provided("bk_group_local_ids"): # Evaluate the whole layer - for group_id in range(self.num_bk_groups): - chids = self.grouped_chids[group_id] - parids = self.grouped_parids[group_id] - parpids = self.grouped_parpids[group_id] + for partition_id in range(self.num_bk_partitions): + chids = self.partitioned_chids[partition_id] + parids = self.partitioned_parids[partition_id] + parpids = self.partitioned_parpids[partition_id] + cs_group_size = self.cs_group_sizes[partition_id] self._backward( node_flows, element_flows, params, node_mars, - element_mars, param_flows, chids, parids, parpids + element_mars, param_flows, chids, parids, parpids, + cs_group_size ) else: # Partial evaluation - for group_id in range(self.num_bk_groups): - chids = self.grouped_chids[group_id] - parids = self.grouped_parids[group_id] - parpids = self.grouped_parpids[group_id] - local_ids = self.bk_group_local_ids[group_id] + for partition_id in range(self.num_bk_partitions): + chids = self.grouped_chids[partition_id] + parids = self.grouped_parids[partition_id] + parpids = self.grouped_parpids[partition_id] + cs_group_size = self.cs_group_sizes[partition_id] + local_ids = self.bk_group_local_ids[partition_id] self._backward( node_flows, element_flows, params, node_mars, element_mars, param_flows, chids, parids, parpids, - local_ids = local_ids + cs_group_size, local_ids = local_ids + ) + + ## Compute flows w.r.t. 
sum parameters ## + if param_flows is not None: + for partition_id in range(self.num_fw_partitions): + nids = self.partitioned_nids[partition_id] + cids = self.partitioned_cids[partition_id] + pids = self.partitioned_pids[partition_id] + + self._backward( + node_flows, element_flows, params, node_mars, + element_mars, param_flows, nids, cids, pids, + partition_id = partition_id ) return None - - @staticmethod - @triton.jit - def _forward_triton_kernel_old(node_mars_ptr, element_mars_ptr, params_ptr, - nids_ptr, cids_ptr, pids_ptr, tot_n_nodes, - tot_n_eles, n_nodes, n_edges: tl.constexpr, - batch_size, n_nodes_per_block_m: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): - - # We use BLOCK_M to index over edges, and BLOCK_N to index over batches - pid0 = tl.program_id(axis = 0) - pid1 = tl.program_id(axis = 1) - ne_start = pid0 * BLOCK_M - b_start = pid1 * BLOCK_N - - # Id of edges processed by the current block - ne_offsets = ne_start + tl.arange(0, BLOCK_M) - # Batch ids processed by the current block - b_offsets = b_start + tl.arange(0, BLOCK_N) - b_mask = b_offsets < batch_size - - # Get node ids from `nids` - n_start = ne_start // n_edges - nid_offsets = n_start + tl.arange(0, n_nodes_per_block_m) - nid_mask = nid_offsets < n_nodes - n_ids = tl.load(nids_ptr + nid_offsets, mask = nid_mask, other = 0) - - # Get edge ids from `cids` - cid_offsets = tl.view(ne_offsets, (n_edges, n_nodes_per_block_m)) - cid_mask = tl.broadcast_to(nid_mask[None,:], (n_edges, n_nodes_per_block_m)) - ch_ids = tl.load(cids_ptr + cid_offsets, mask = cid_mask, other = 0) - - # Use `ch_ids` to retrieve the corresponding element mars - ele_offsets = tl.broadcast_to(ch_ids[None,:,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) * batch_size + \ - tl.broadcast_to(b_offsets[:,None,None], (BLOCK_N, n_edges, n_nodes_per_block_m)) - ele_mask = tl.broadcast_to(nid_mask[None,None,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) & \ - tl.broadcast_to(b_mask[:,None,None], (BLOCK_N, n_edges, n_nodes_per_block_m)) - ch_logps = tl.load(element_mars_ptr + ele_offsets, mask = ele_mask, other = 0) # `element_mars[cids]` - - # Take the max of the child mars - ch_max_logp = tl.max(ch_logps, axis = 1) # `maxval` - - # Subtract the max from child mars - ch_logps_sub_max = ch_logps - tl.broadcast_to(ch_max_logp[:,None,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) - - # Take exp - ch_ps_sub_max = tl.exp(ch_logps_sub_max) - - # Get param ids from `pids` - # Here we reuse `cid_offsets` and `cid_mask` thank to their similar structure - par_ids = tl.load(pids_ptr + cid_offsets, mask = cid_mask, other = 0) - - # Use `par_ids` to retrieve the corresponding parameters - par_mask = tl.broadcast_to(nid_mask[None,:], (n_edges, n_nodes_per_block_m)) - ch_pars = tl.load(params_ptr + par_ids, mask = par_mask, other = 0) # `params[pids]` - ch_pars = tl.broadcast_to(ch_pars[None,:,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) - - # Sum node marginals (unnormalized) - n_ps = tl.sum(ch_ps_sub_max * ch_pars, axis = 1) - - # Take log and subtract max vals - n_logps = tl.log(tl.maximum(n_ps, 1e-10)) + ch_max_logp - - # Read out the target indices for `node_mars` - nmar_offsets = tl.broadcast_to(n_ids[None,:], (BLOCK_N, n_nodes_per_block_m)) * batch_size + \ - tl.broadcast_to(b_offsets[:,None], (BLOCK_N, n_nodes_per_block_m)) - nmar_mask = tl.broadcast_to(nid_mask[None,:], (BLOCK_N, n_nodes_per_block_m)) & \ - tl.broadcast_to(b_mask[:,None], (BLOCK_N, n_nodes_per_block_m)) - - # Reshape seems to be necessary for certain combinations of (BLOCK_N, 
n_nodes_per_block_m) - nmar_offsets = tl.view(nmar_offsets, (BLOCK_N * n_nodes_per_block_m,)) - nmar_mask = tl.view(nmar_mask, (BLOCK_N * n_nodes_per_block_m,)) - n_logps = tl.view(n_logps, (BLOCK_N * n_nodes_per_block_m,)) - tl.store(node_mars_ptr + nmar_offsets, n_logps, mask = nmar_mask) @staticmethod @torch.compile(mode = "reduce-overhead", fullgraph = True) - def _forward_pytorch_kernel(node_mars: torch.Tensor, element_mars: torch.Tensor, - params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor): + def _forward_pytorch_kernel(node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, + nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, + local_ids: torch.Tensor): + + if local_ids is not None: + nids = nids[local_ids] + cids = cids[local_ids] + pids = pids[local_ids] + + num_ngroups = nids.size(0) + num_edges = cids.size(1) + nids = (nids[:,None].repeat(1, self.group_size) + \ + torch.arange(0, self.group_size, device = nids.device)[None,:]).reshape(num_ngroups * self.group_size) + cids = cids[:,None,:].repeat(1, self.group_size, 1).reshape(num_ngroups * self.group_size, num_edges) + pids = (pids[:,None,:].repeat(1, self.group_size, 1) + \ + torch.arange(0, self.group_size, device = cids.device)[None,:,None]).reshape(num_ngroups * self.group_size, num_edges) ch_mars = element_mars[cids] maxval = ch_mars.max(dim = 1, keepdim = True).values @@ -397,6 +359,11 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, node_mars, element_mars, params, nids, cids, pids, local_ids, partition_id = partition_id ) + + elif mode == "pytorch": + self._forward_pytorch_kernel( + node_mars, element_mars, params, nids, cids, pids, local_ids + ) else: raise ValueError(f"Unexpected mode `{mode}`.") @@ -524,14 +491,14 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten if signature not in self._cached_fw_pcids: # Pre-compute pointer increments for `cids` and `pids` - cids = cids.clone().reshape(num_ngroups, K_NUM_TILES, TILE_SIZE_K) + cids = cids.clone().reshape(cids.size(0), K_NUM_TILES, TILE_SIZE_K) cids_start = cids[:,0,:].contiguous() cids_increment = torch.cat( (cids[:,1:,:] - cids[:,:-1,:], cids[:,0:1,:] * 0), dim = 1 ).contiguous() - pids = pids.clone().reshape(num_ngroups, K_NUM_TILES, TILE_SIZE_K) + pids = pids.clone().reshape(pids.size(0), K_NUM_TILES, TILE_SIZE_K) pids_start = pids[:,0,:].contiguous() pids_increment = torch.cat( (pids[:,1:,:] - pids[:,:-1,:], pids[:,0:1,:] * 0), @@ -669,191 +636,304 @@ def _forward_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, return None - @staticmethod - @triton.jit - def _backward_kernel(node_flows_ptr, element_flows_ptr, params_ptr, - node_mars_ptr, element_mars_ptr, param_flows_ptr, - chids_ptr, parids_ptr, parpids_ptr, tot_n_nodes, - tot_n_eles, n_nodes, n_edges: tl.constexpr, batch_size, - n_nodes_per_block_m: tl.constexpr, - accumulate_param_flows: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): - # We use BLOCK_M to index over edges, and BLOCK_N to index over batches - pid0 = tl.program_id(axis = 0) - pid1 = tl.program_id(axis = 1) - ne_start = pid0 * BLOCK_M - b_start = pid1 * BLOCK_N - - # Id of edges processed by the current block - ne_offsets = ne_start + tl.arange(0, BLOCK_M) - # Batch ids processed by the current block - b_offsets = b_start + tl.arange(0, BLOCK_N) - b_mask = b_offsets < batch_size - - # Node mask for future reuse - n_start = ne_start // n_edges - n_offsets = n_start + tl.arange(0, 
n_nodes_per_block_m) - n_mask = n_offsets < n_nodes - - # Reusable ids for index tensors - par_offsets = tl.view(ne_offsets, (n_edges, n_nodes_per_block_m)) - par_mask = tl.broadcast_to(n_mask[None,:], (n_edges, n_nodes_per_block_m)) - bpar_mask = tl.broadcast_to(n_mask[None,None,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) & \ - tl.broadcast_to(b_mask[:,None,None], (BLOCK_N, n_edges, n_nodes_per_block_m)) - - # Get node ids from `parids` and retrieve the corresponding node flows and node mars - node_ids = tl.load(parids_ptr + par_offsets, mask = par_mask, other = 0) - node_offsets = tl.broadcast_to(node_ids[None,:,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) * batch_size + \ - tl.broadcast_to(b_offsets[:,None,None], (BLOCK_N, n_edges, n_nodes_per_block_m)) - nflows = tl.load(node_flows_ptr + node_offsets, mask = bpar_mask, other = 0) # node_flows[parids] - nmars = tl.load(node_mars_ptr + node_offsets, mask = bpar_mask, other = 0) # node_mars[parids] - - # Get param ids from `parpids` and retrieve the corresponding node params - eparam_ids = tl.load(parpids_ptr + par_offsets, mask = par_mask, other = 0) - eparams = tl.load(params_ptr + eparam_ids, mask = par_mask, other = 0) - eparams = tl.broadcast_to(eparams[None,:,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) # params[parpids] - - # Compute edge flows (partially) - cum_flow = nflows * eparams - - # Get element ids from `cids` and retrieve the corresponding element mars - ele_ids = tl.load(chids_ptr + n_offsets, mask = n_mask, other = 0) - ele_offsets = tl.broadcast_to(ele_ids[None,:], (BLOCK_N, n_nodes_per_block_m)) * batch_size + \ - tl.broadcast_to(b_offsets[:,None], (BLOCK_N, n_nodes_per_block_m)) - ele_mask = tl.broadcast_to(n_mask[None,:], (BLOCK_N, n_nodes_per_block_m)) & \ - tl.broadcast_to(b_mask[:,None], (BLOCK_N, n_nodes_per_block_m)) - emars = tl.load(element_mars_ptr + ele_offsets, mask = ele_mask, other = 0) # element_mars[chids] - emars = tl.broadcast_to(emars[:,None,:], (BLOCK_N, n_edges, n_nodes_per_block_m)) # element_mars[chids].unsqueeze(1) - - # Compute edge flows - emars_log_diff = emars - nmars - emars_diff = tl.exp(emars_log_diff) - eflows = cum_flow * emars_diff - - # Store to `element_flows[chids]` - cum_eflows = tl.sum(eflows, axis = 1) # [BLOCK_N, n_nodes_per_block_m] - tl.store(element_flows_ptr + ele_offsets, cum_eflows, mask = ele_mask) - - # Compute parameter flows - if accumulate_param_flows: - parflows = tl.sum(eflows, axis = 0) # [n_edges, n_nodes_per_block_m] - # Here the `eparam_ids > 0` term masks out dummy edges - parflow_mask = (eparam_ids > 0) & tl.broadcast_to(n_mask[None,:], (n_edges, n_nodes_per_block_m)) - tl.atomic_add(param_flows_ptr + eparam_ids, parflows, mask = parflow_mask) - - @staticmethod - @torch.compile(mode = "reduce-overhead", fullgraph = True) - def _backward_pytorch_kernel(node_flows: torch.Tensor, element_flows: torch.Tensor, - params: torch.Tensor, node_mars: torch.Tensor, - element_mars: torch.Tensor, param_flows: Optional[torch.Tensor], - chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor): - - element_flows[chids] = (node_flows[parids] * params[parpids].unsqueeze(-1) * \ - (element_mars[chids].unsqueeze(1) - node_mars[parids]).exp()).sum(dim = 1) - - return None - def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, - element_mars: torch.Tensor, param_flows: torch.Tensor, + element_mars: torch.Tensor, param_flows: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, 
- local_ids: Optional[torch.Tensor] = None, - BLOCK_M_HARD_LIMIT = 2**16, BLOCK_SIZE = 2**12, MAX_BLOCK_M = 2**11, - MAX_BLOCK_N = 64) -> None: + cs_group_size: int, local_ids: Optional[torch.Tensor] = None, + partition_id: int = -1, mode: Optional[str] = None) -> None: """ - This function is equivalent to running: - ``` - element_flows[chids] = (node_flows[parids] * params[parpids] * \ - (element_mars[chids].unsqueeze(1) - node_mars[parids]).exp()).sum(dim = 1) - - param_flows[seq_parpids] += (node_flows[parids] * params[parpids] * \ - (element_mars[chids].unsqueeze(1) - node_mars[parids]).exp()).sum(dim = 2)[seq_ids0, seq_ids1] - ``` + Back pass of sum layers. Parameters: - `node_flows`: [N, B] - `element_flows`: [M, B] - `params`: [E] - `node_mars`: [N, B] - `element_mars`: [M, B] - `param_flows`: [E] - `chids`: [n] - `parids`: [n, p] - `parpids`: [n, p] + `node_flows`: [N, B] + `element_flows: [M, B] + `params`: [E] + `node_mars`: [N, B] + `element_mars`: [M, B] + `param_flows`: [E] + `chids`: [ng] + `parids`: [ng, c] + `parpids`: [ng, c] """ - if local_ids is not None and local_ids.size(0) == 0: - # Nothing need to be evaluated in the current group - return None - elif local_ids is not None: - # Select nodes - chids = chids[local_ids].contiguous() - parids = parids[local_ids,:].contiguous() - parpids = parpids[local_ids,:].contiguous() - - tot_n_nodes = node_mars.size(0) - tot_n_eles = element_mars.size(0) - n_nodes = chids.size(0) - n_edges = parids.size(1) - batch_size = node_mars.size(1) + num_edges = parids.size(1) * self.group_size + batch_size = node_flows.size(1) - if params.dim() == 2 and params.size(1) == 1: - params = params.squeeze(1) + if mode is not None: + assert mode in ["block_sparse", "sparse"] - # If child nodes in the current group have no parent, we set the corresponding element flows to 0 - if n_edges == 0: - element_flows[chids] = 0.0 + elif params.dim() == 1 and self.group_size >= 16 and num_edges >= 16 and batch_size >= 16: + # In this case, we should definitely use the block-sparse implementation + mode = "block_sparse" - return None + if mode == "block_sparse": + self._backward_block_sparse( + node_flows, element_flows, params, node_mars, element_mars, param_flows, + nids, cids, pids, chids, parids, parpids, cs_group_size, local_ids, + partition_id = partition_id + ) - # Fall back to the `torch.compile` kernel in the case where we cannot store child edges within a single block - if n_edges > BLOCK_M_HARD_LIMIT or not node_mars.is_cuda: - assert param_flows is None - self._backward_pytorch_kernel( + elif mode == "pytorch": + self._backward_pytorch( node_flows, element_flows, params, node_mars, - element_mars, param_flows, chids, parids, parpids + element_mars, param_flows, chids, parids, parpids, + cs_group_size + ) + + def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, + params: torch.Tensor, node_mars: torch.Tensor, + element_mars: torch.Tensor, param_flows: torch.Tensor, + nids: Optional[torch.Tensor], cids: Optional[torch.Tensor], pids: Optional[torch.Tensor], + chids: Optional[torch.Tensor], parids: Optional[torch.Tensor], parpids: Optional[torch.Tensor], + cs_group_size: int, local_ids: Optional[torch.Tensor] = None, + partition_id: int = -1) -> None: + """ + Back pass of sum layers with block-sparse processing kernel. 
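+        Flows w.r.t. the child elements are computed when `chids`/`parids`/`parpids` are given, and
+        flows w.r.t. the sum parameters are computed when `param_flows` and `nids`/`cids`/`pids` are given.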
+ + Parameters: + `node_flows`: [N, B] + `element_flows: [M, B] + `params`: [E] + `node_mars`: [N, B] + `element_mars`: [M, B] + `param_flows`: [E] + `chids`: [ng] + `parids`: [ng, c] + `parpids`: [ng, c] + """ + + # Flows w.r.t. input elements (product nodes) + if chids is not None: + self._backward_block_sparse_ele_flows( + node_flows, element_flows, params, node_mars, element_mars, + chids, parids, parpids, cs_group_size, local_ids, partition_id + ) + + # Flows w.r.t. parameters + if param_flows is not None and nids is not None: + self._backward_block_sparse_par_flows( + node_flows, element_flows, params, node_mars, element_mars, + nids, cids, pids, partition_id ) - return None + return None + + @staticmethod + @triton.jit + def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, element_mars, params, + chids, parids_start, parids_increment, parpids_start, parpids_increment, + local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + elegroup_id = tl.load(local_ids + elegroup_id) + + # Initialize pointers to `params` + offs_ele = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_edge = tl.arange(0, TILE_SIZE_K) + offs_edge_gid = offs_edge // GROUP_SIZE_K + offs_edge_nid = (offs_edge % GROUP_SIZE_K) + par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + epars_ptr = params + \ + offs_ele[:,None] + \ + (par_start + offs_edge_nid * GROUP_SIZE_K)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize pointers to `node_mars` + edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + nmars_ptr = node_mars + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + nflows_ptr = node_flows + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + # Initialize pointers to `element_mars` + off_eleids = tl.load(chids + elegroup_id) + offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + tmp_emars = tl.load(element_mars + offs_elemfs, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + emars_max = tl.max(tmp_emars, axis = 0) # [BLOCK_B] + + # Batch increment pointers + parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) + + for k in range(0, K_NUM_TILES): + epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + # Set a hard upper bound of 1e20 to avoid overflow + # However, this should not happen unless we have extremely small parameters 
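+            # Overall, the loop computes
+            #   eflow[m, b] = sum_k epars[m, k] * nflows[k, b] * exp(emars[m, b] - nmars[k, b]).
+            # Factoring out `emars_max` keeps the accumulated bfloat16 dot products in a manageable
+            # numeric range; the remaining exp(emars - emars_max) factor is applied once after the loop.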
+ nflows_div_mars = nflows * tl.minimum(tl.exp(emars_max[None,:] - nmars), 1.0e20) + + epars = epars.to(tl.bfloat16) + nflows_div_mars = nflows_div_mars.to(tl.bfloat16) + eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) + + acc += eflows + + # Increment `epars_ptr` + parpids_inc = tl.load(parpids_inc_ptr) + epars_ptr += parpids_inc[None,:] + parpids_inc_ptr += ptr_inc_step + + # Increment `nmars_ptr` + parids_inc = tl.load(parids_inc_ptr) + nmars_ptr += parids_inc[:,None] * batch_size + nflows_ptr += parids_inc[:,None] * batch_size + parids_inc += ptr_inc_step + + # Initialize pointers to `element_mars` + off_eleids = tl.load(chids + elegroup_id) + offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + emars = tl.load(element_mars + offs_elemfs, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + eflows = acc * tl.exp(emars - emars_max[None,:]) + tl.store(element_flows + offs_elemfs, eflows, mask = mask_batch[None,:]) + + def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: torch.Tensor, + params: torch.Tensor, node_mars: torch.Tensor, + element_mars: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, + parpids: torch.Tensor, cs_group_size: int, local_ids: Optional[torch.Tensor] = None, + partition_id: int = -1) -> None: - assert n_edges <= BLOCK_M_HARD_LIMIT, f"Number of edges should be smaller than or equal to {BLOCK_M_HARD_LIMIT}." assert params.dim() == 1, "Expecting a 1D `params`." - if n_edges <= MAX_BLOCK_M: - # In this case, we can find a better thread-block balance - MIN_BLOCK_M = min(triton.next_power_of_2(n_edges), MAX_BLOCK_M) - BLOCK_N = min(BLOCK_SIZE // MIN_BLOCK_M, MAX_BLOCK_N, triton.next_power_of_2(batch_size)) - BLOCK_M = min(BLOCK_SIZE // BLOCK_N, MAX_BLOCK_M) + num_ngroups = chids.size(0) if local_ids is None else local_ids.size(0) + layer_n_nodes = num_ngroups * cs_group_size + num_edges = parids.size(1) * self.group_size + batch_size = node_flows.size(1) + BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) + + # Heuristic to set `TILE_SIZE_M`, `TILE_SIZE_K`, and `BLOCK_B` + base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 128) + if base_size >= 64: + TILE_SIZE_K = base_size + TILE_SIZE_M = 2048 // base_size + BLOCK_B = 2048 // base_size + else: + remainder = 2048 // (base_size ** 2) + + TILE_SIZE_K = min(2048 // remainder, base_size * remainder, num_edges) + TILE_SIZE_M = min(2048 // TILE_SIZE_K, cs_group_size) + BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) + K_NUM_TILES = num_edges // TILE_SIZE_K + + signature = ("block_sparse", partition_id, TILE_SIZE_K) + if signature not in self._cached_bk_parids: + # Pre-compute pointer increments for `parids` and `parpids` + + if TILE_SIZE_K <= self.group_size: + ptr_inc_step = 1 + + num_rep = self.group_size // TILE_SIZE_K + parids = (parids[:,:,None].repeat(1, 1, num_rep) + \ + torch.arange(0, self.group_size, TILE_SIZE_K, device = parids.device)[None,None,:]).reshape( + parids.size(0), K_NUM_TILES, 1) + parpids = (parpids[:,:,None].repeat(1, 1, num_rep) + \ + torch.arange(0, self.group_size * cs_group_size, TILE_SIZE_K * cs_group_size, device = parpids.device)[None,None,:]).reshape( + parpids.size(0), K_NUM_TILES, 1) + + else: + ptr_inc_step = TILE_SIZE_K // self.group_size + + parids = parids.reshape(parids.size(0), K_NUM_TILES, ptr_inc_step) + parpids = parpids.reshape(parpids.size(0), K_NUM_TILES, ptr_inc_step) + + parids_start = parids[:,0,:].contiguous() + parids_increment = torch.cat( + (parids[:,1:,:] - parids[:,:-1,:], 
parids[:,0:1,:] * 0), + dim = 1 + ).contiguous() + + parpids_start = parpids[:,0,:].contiguous() + parpids_increment = torch.cat( + (parpids[:,1:,:] - parpids[:,:-1], parpids[:,0:1,:] * 0), + dim = 1 + ).contiguous() + + self._cached_bk_parids[signature] = [parids_start, parids_increment, parpids_start, parpids_increment, ptr_inc_step] else: - # Try to fit all edges of a node in a single thread-block - BLOCK_M = triton.next_power_of_2(n_edges) - BLOCK_N = max(BLOCK_SIZE // BLOCK_M, 1) - - grid = (triton.cdiv(n_nodes * n_edges, BLOCK_M), triton.cdiv(batch_size, BLOCK_N), 1) - - self._backward_kernel[grid]( - node_flows_ptr = node_flows, - element_flows_ptr = element_flows, - params_ptr = params, - node_mars_ptr = node_mars, - element_mars_ptr = element_mars, - param_flows_ptr = param_flows, - chids_ptr = chids, - parids_ptr = parids, - parpids_ptr = parpids, - tot_n_nodes = tot_n_nodes, - tot_n_eles = tot_n_eles, - n_nodes = n_nodes, - n_edges = n_edges, - batch_size = batch_size, - n_nodes_per_block_m = BLOCK_M // n_edges, - accumulate_param_flows = (param_flows is not None), - BLOCK_M = BLOCK_M, - BLOCK_N = BLOCK_N + parids_start, parids_increment, parpids_start, parpids_increment, ptr_inc_step = self._cached_bk_parids[signature] + + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + + self._bk_triton_block_sparse_ele_kernel[grid]( + node_flows = node_flows, + element_flows = element_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + chids = chids, + parids_start = parids_start, + parids_increment = parids_increment, + parpids_start = parpids_start, + parpids_increment = parpids_increment, + local_ids = local_ids, + batch_size = batch_size, + partial_eval = 1 if local_ids is not None else 0, + ptr_inc_step = ptr_inc_step, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = cs_group_size, + GROUP_SIZE_K = self.group_size ) return None + def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, element_flows: torch.Tensor, + params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, + nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, + partition_id: int = -1) -> None: + + pass + + # @torch.compile(mode = "reduce-overhead", fullgraph = True) + def _backward_pytorch(self, node_flows: torch.Tensor, element_flows: torch.Tensor, + params: torch.Tensor, node_mars: torch.Tensor, + element_mars: torch.Tensor, param_flows: Optional[torch.Tensor], + chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, + cs_group_size: int): + + if param_flows is not None: + raise ValueError("PyTorch kernel does not support computing parameter flows.") + + num_ngroups = chids.size(0) + num_egroups = parids.size(1) + parids = (parids[:,:,None].repeat(1, 1, self.group_size) + torch.arange(0, self.group_size, device = parids.device)).reshape(num_ngroups, num_egroups * self.group_size) + parpids = (parpids[:,:,None] + torch.arange(0, self.group_size * cs_group_size, cs_group_size, device = parids.device)).reshape( + num_ngroups, num_egroups * self.group_size) + + chids = (chids[:,None].repeat(1, cs_group_size) + torch.arange(0, cs_group_size, device = chids.device)).reshape(num_ngroups * cs_group_size) + parids = parids[:,None,:].repeat(1, cs_group_size, 1).reshape(num_ngroups * cs_group_size, num_egroups * self.group_size) + parpids = (parpids[:,None,:].repeat(1, cs_group_size, 1) + torch.arange(0, cs_group_size, device = 
parpids.device)[None,:,None]).reshape( + num_ngroups * cs_group_size, num_egroups * self.group_size + ) + + element_flows[chids] = (node_flows[parids] * params[parpids].unsqueeze(-1) * \ + (element_mars[chids].unsqueeze(1) - node_mars[parids]).exp()).sum(dim = 1) + + return None + def _prepare_scope2nids(self, prod_scope_eleids: Sequence[Tuple[BitSet, torch.Tensor]]): if not (hasattr(self, "fw_scope2localids") and hasattr(self, "bk_scope2localids")): fw_scope2localids = dict() diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 6f880c4d..0bb03734 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -229,8 +229,6 @@ def sum_layer_test(): assert torch.all(layer.partitioned_parpids[0][4,1] == 1 + 10 * group_size**2) assert torch.all(layer.partitioned_parpids[0][5,1] == 1 + 11 * group_size**2) - import pdb; pdb.set_trace() - layer.to(device) ## Forward tests ## @@ -249,6 +247,39 @@ def sum_layer_test(): epars = params[layer.partitioned_pids[0][j,:]+i] assert torch.all(torch.abs(node_mars[(j+1)*group_size+i,:] - (epars[:,None] * cmars).sum(dim = 0).log()) < 1e-3) + ## Backward tests ## + + node_flows = torch.rand([group_size + group_size * 2 * 3, batch_size]).to(device) + element_flows = torch.zeros([group_size + 3 * 2 * 2 * group_size, batch_size]).to(device) + + param_flows = torch.zeros([1 + 3 * 4 * group_size * group_size]).to(device) + + layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) + + chids = layer.partitioned_chids[0] + parids = layer.partitioned_parids[0] + parpids = layer.partitioned_parpids[0] + + num_ngroups = chids.size(0) + num_egroups = parids.size(1) + parids = (parids[:,:,None].repeat(1, 1, group_size) + torch.arange(0, group_size, device = parids.device)).reshape(num_ngroups, num_egroups * group_size) + parpids = (parpids[:,:,None] + torch.arange(0, group_size * group_size, group_size, device = parids.device)).reshape( + num_ngroups, num_egroups * group_size) + + for i in range(group_size): + for j in range(6): + nmars = node_mars[parids[j,:]].exp() + nflows = node_flows[parids[j,:]] + emars = element_mars[(j+1)*group_size+i,:].exp() + epars = params[parpids[j,:]+i] + eflows = (nflows * epars[:,None] * emars[None,:] / nmars).sum(dim = 0) + + import pdb; pdb.set_trace() + + assert torch.all(torch.abs(eflows - element_flows[(j+1)*group_size+i,:]) < 1e-3) + + import pdb; pdb.set_trace() + def speed_test(): @@ -319,6 +350,23 @@ def speed_test(): node_flows = torch.rand([group_size + group_size * num_node_groups * num_prod_nodes, batch_size]).to(device) element_flows = torch.zeros([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) + param_flows = torch.zeros([layer.partitioned_pids[0].max() + group_size]).to(device) + + layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) + + t0 = time.time() + torch.cuda.synchronize() + for _ in range(100): + layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) + torch.cuda.synchronize() + t1 = time.time() + backward_ms = (t1 - t0) / 100 * 1000 + + print(f"Backward pass on average takes {forward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 11.255ms.") + print("--------------------------------------------------------------") + + exit() # import pdb; pdb.set_trace() @@ -436,5 +484,6 @@ def speed_test(): if __name__ == "__main__": - sum_layer_test() - # speed_test() \ No newline at end of file + torch.manual_seed(3890) + # 
sum_layer_test() + speed_test() \ No newline at end of file From 12d1bf9614f69ffbece32386fb6af4a39fcc7fc3 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 8 Dec 2023 23:55:47 +0800 Subject: [PATCH 029/162] full backward --- src/pyjuice/layer/sum_layer.py | 165 ++++++++++++++++-- tests/layer/sum_layer_test.py | 299 ++------------------------------- 2 files changed, 165 insertions(+), 299 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 9dfd5995..6d51076e 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -257,8 +257,9 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, self._backward( node_flows, element_flows, params, node_mars, - element_mars, param_flows, chids, parids, parpids, - cs_group_size + element_mars, param_flows, + chids = chids, parids = parids, parpids = parpids, + cs_group_size = cs_group_size ) else: @@ -272,8 +273,9 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, self._backward( node_flows, element_flows, params, node_mars, - element_mars, param_flows, chids, parids, parpids, - cs_group_size, local_ids = local_ids + element_mars, param_flows, + chids = chids, parids = parids, parpids = parpids, + cs_group_size = cs_group_size, local_ids = local_ids ) ## Compute flows w.r.t. sum parameters ## @@ -285,8 +287,8 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, self._backward( node_flows, element_flows, params, node_mars, - element_mars, param_flows, nids, cids, pids, - partition_id = partition_id + element_mars, param_flows, nids = nids, + cids = cids, pids = pids, partition_id = partition_id ) return None @@ -639,8 +641,10 @@ def _forward_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, - chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, - cs_group_size: int, local_ids: Optional[torch.Tensor] = None, + nids: Optional[torch.Tensor] = None, cids: Optional[torch.Tensor] = None, + pids: Optional[torch.Tensor] = None, chids: Optional[torch.Tensor] = None, + parids: Optional[torch.Tensor] = None, parpids: Optional[torch.Tensor] = None, + cs_group_size: int = 0, local_ids: Optional[torch.Tensor] = None, partition_id: int = -1, mode: Optional[str] = None) -> None: """ Back pass of sum layers. @@ -657,7 +661,10 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, `parpids`: [ng, c] """ - num_edges = parids.size(1) * self.group_size + if cids is not None: + num_edges = cids.size(1) * self.group_size + else: + num_edges = parids.size(1) * self.group_size batch_size = node_flows.size(1) if mode is not None: @@ -707,14 +714,16 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. if chids is not None: self._backward_block_sparse_ele_flows( node_flows, element_flows, params, node_mars, element_mars, - chids, parids, parpids, cs_group_size, local_ids, partition_id + chids = chids, parids = parids, parpids = parpids, + cs_group_size = cs_group_size, local_ids = local_ids, + partition_id = partition_id ) # Flows w.r.t. 
parameters if param_flows is not None and nids is not None: self._backward_block_sparse_par_flows( - node_flows, element_flows, params, node_mars, element_mars, - nids, cids, pids, partition_id + node_flows, params, node_mars, element_mars, param_flows, + nids = nids, cids = cids, pids = pids ) return None @@ -900,14 +909,134 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo return None - def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, element_flows: torch.Tensor, - params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, - nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, - partition_id: int = -1) -> None: + @staticmethod + @triton.jit + def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, + batch_size: tl.constexpr, num_edges: tl.constexpr, TILE_SIZE_B: tl.constexpr, + B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + + pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Batch offsets and mask + offs_batch = tl.arange(0, TILE_SIZE_B) + mask_batch = offs_batch < batch_size + + # Initialize pointers to `element_mars` + offs_edge = tl.arange(0, TILE_SIZE_K) + pid_k * TILE_SIZE_K + edge_start = tl.load(cids + ngroup_id * num_edges + offs_edge) + emars_ptr = element_mars + \ + edge_start[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, TILE_SIZE_B] + + # Initialize pointers to `node_flows` and `node_mars` + offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + off_nids = tl.load(nids + ngroup_id) + nmars_ptr = node_mars + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + nflows_ptr = node_flows + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) + + for b in range(0, B_NUM_TILES): + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, TILE_SIZE_B] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] + + emars_max = tl.max(emars, axis = 0) + nflows_div_mars = nflows * tl.exp(emars_max[None,:] - nmars) + nflows_div_mars = nflows_div_mars.to(tl.bfloat16) + + emars = tl.exp(emars - emars_max[None,:]) + emars = emars.to(tl.bfloat16) + + pflows = tl.dot(nflows_div_mars, tl.trans(emars)).to(tl.float32) + + acc += pflows + + # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` + emars_ptr += TILE_SIZE_B + nmars_ptr += TILE_SIZE_B + nflows_ptr += TILE_SIZE_B + + # Update batch mask + offs_batch += TILE_SIZE_B + mask_batch = offs_batch < batch_size + + par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) + epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + epars = tl.load(params + epars_offsets) + pflows = acc * epars - pass + tl.store(param_flows + epars_offsets, pflows) - # @torch.compile(mode = "reduce-overhead", fullgraph = True) + def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, + element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, + cids: torch.Tensor, pids: torch.Tensor, ) 
-> None: + """ + Backward pass of sum layers w.r.t. sum parameters with the block-sparse processing kernel. + + Parameters: + `node_flows`: [N, B] + `element_flows`: [M, B] + `params`: [E] + `node_mars`: [N, B] + `element_mars`: [M, B] + `param_flows`: [E] + `nids`: [ng] + `cids`: [ng, c] + `pids`: [ng, c] + """ + + assert params.dim() == 1, "Expecting a 1D `params`." + + num_ngroups = nids.size(0) + layer_n_nodes = num_ngroups * self.group_size + num_edges = cids.size(1) + batch_size = node_mars.size(1) + BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) + + # Heuristic to set `TILE_SIZE_M`, `TILE_SIZE_K`, and `BLOCK_B` + base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 128) + if base_size >= 64: + TILE_SIZE_B = base_size + TILE_SIZE_M = 2048 // base_size + TILE_SIZE_K = 2048 // base_size + else: + remainder = 2048 // (base_size ** 2) + + TILE_SIZE_B = min(2048 // remainder, base_size * remainder, BATCH_SIZE_NP2) + TILE_SIZE_M = min(2048 // TILE_SIZE_B, self.group_size) + TILE_SIZE_K = min(2048 // TILE_SIZE_B, num_edges) + B_NUM_TILES = batch_size // TILE_SIZE_B + + grid = (triton.cdiv(num_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + + self._bk_triton_block_sparse_par_kernel[grid]( + node_flows = node_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + param_flows = param_flows, + nids = nids, + cids = cids, + pids = pids, + batch_size = batch_size, + num_edges = num_edges, + TILE_SIZE_B = TILE_SIZE_B, + B_NUM_TILES = B_NUM_TILES, + TILE_SIZE_K = TILE_SIZE_K, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = self.group_size + ) + + @torch.compile(mode = "reduce-overhead", fullgraph = True) def _backward_pytorch(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: Optional[torch.Tensor], diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 0bb03734..f683d249 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -17,157 +17,6 @@ import triton.language as tl -@triton.jit -def _bk_triton_block_sparse_kernel(node_flows, element_flows, node_mars, element_mars, params, nids, cids_start, cids_increment, - pids_start, pids_increment, local_ids, batch_size, partial_eval: tl.constexpr, - BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): - - pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches - pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes - - # Get inferred node group id from `pid_m` - ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) - tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) - - # Get the real node group id in the case of partial evaluation - if partial_eval == 1: - ngroup_id = tl.load(local_ids + ngroup_id) - - # Initialize pointers to `params` - offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M - offs_edge = tl.arange(0, TILE_SIZE_K) - par_start = tl.load(pids_start + ngroup_id * TILE_SIZE_K + offs_edge) - epars_ptr = params + \ - offs_node[:,None] + \ - par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - - # Batch offsets and mask - offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B - mask_batch = offs_batch < batch_size - - # Initialize pointers to `element_mars` - edge_start = tl.load(cids_start + ngroup_id * TILE_SIZE_K + offs_edge) - emars_ptr = element_mars + \ - edge_start[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - - eflows_ptr = 
element_mars + \ - edge_start[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - - # Initialize pointers to `node_flows` - off_nids = tl.load(nids + ngroup_id) - offs_nmfs = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] - nmars = tl.load(node_mars + offs_nmfs, mask = mask_batch[None,:]) # [TILE_SIZE_M, BLOCK_B] - nflows = tl.load(node_flows + offs_nmfs, mask = mask_batch[None,:]) - - nmars_max = tl.max(nmars, axis = 0) - nflows_div_mars = nflows / tl.exp(nmars - nmars_max[None,:]) - nflows_div_mars = nflows_div_mars.to(tl.float16) - - # Batch increment pointers - pids_inc_ptr = pids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge - cids_inc_ptr = cids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge - - # Inner loop - acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) - - for k in range(0, K_NUM_TILES): - epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - epars = epars.to(tl.float16) - aaa = tl.dot(tl.trans(epars), nflows_div_mars).to(tl.float32) - bbb = aaa * (emars - nmars_max[None,:]) - - tl.atomic_add(eflows_ptr, bbb, mask = mask_batch[None,:]) - # acc += bbb - - # Increment `epars_ptr` - pids_inc = tl.load(pids_inc_ptr) - epars_ptr += pids_inc[None,:] - pids_inc_ptr += TILE_SIZE_K - - # Increment `emars_ptr` - cids_inc = tl.load(cids_inc_ptr) - emars_ptr += cids_inc[:,None] * batch_size - eflows_ptr += cids_inc[:,None] * batch_size - cids_inc_ptr += TILE_SIZE_K - - -@triton.jit -def _bkp_triton_block_sparse_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, - local_ids, batch_size: tl.constexpr, n_edges: tl.constexpr, partial_eval: tl.constexpr, - TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: tl.constexpr): - - pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges - pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes - - # Get inferred node group id from `pid_m` - ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) - tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) - - # Get the real node group id in the case of partial evaluation - if partial_eval == 1: - ngroup_id = tl.load(local_ids + ngroup_id) - - # Batch offsets and mask - offs_batch = tl.arange(0, TILE_SIZE_B) - mask_batch = offs_batch < batch_size - - # Initialize pointers to `element_mars` - offs_edge = tl.arange(0, TILE_SIZE_K) + pid_k * TILE_SIZE_K - edge_start = tl.load(cids + ngroup_id * n_edges + offs_edge) - emars_ptr = element_mars + \ - edge_start[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, TILE_SIZE_B] - - # Initialize pointers to `node_flows` and `node_mars` - offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M - off_nids = tl.load(nids + ngroup_id) - offs_nmfs = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] - nmars_ptr = node_mars + offs_nmfs - nflows_ptr = node_flows + offs_nmfs - - # Inner loop - acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) - - for b in range(0, B_NUM_TILES): - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, TILE_SIZE_B] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] - - nmars_max = tl.max(nmars, axis = 0) - nflows_div_mars = nflows / tl.exp(nmars - nmars_max[None,:]) - nflows_div_mars = 
nflows_div_mars.to(tl.float16) - - emars = tl.exp(emars - nmars_max[None,:]) - emars = emars.to(tl.float16) - - pflows = tl.dot(nflows_div_mars, tl.trans(emars)).to(tl.float32) - - acc += pflows - - # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` - emars_ptr += TILE_SIZE_B - nmars_ptr += TILE_SIZE_B - nflows_ptr += TILE_SIZE_B - - # Update batch mask - offs_batch += TILE_SIZE_B - mask_batch = offs_batch < batch_size - - par_start = tl.load(pids + ngroup_id * n_edges + offs_edge) - epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - - epars = tl.load(params + epars_offsets) - pflows = acc * epars - - tl.store(param_flows + epars_offsets, pflows) - - def sum_layer_test(): device = torch.device("cuda:0") @@ -274,11 +123,21 @@ def sum_layer_test(): epars = params[parpids[j,:]+i] eflows = (nflows * epars[:,None] * emars[None,:] / nmars).sum(dim = 0) - import pdb; pdb.set_trace() + assert torch.all(torch.abs(eflows - element_flows[(j+1)*group_size+i,:]) < 1e-2) + + my_pflows = torch.zeros_like(param_flows) + + for i in range(group_size): + for j in range(6): + emars = element_mars[layer.partitioned_cids[0][j,:]].exp() + epars = params[layer.partitioned_pids[0][j,:]+i] + nmars = node_mars[(j+1)*group_size+i,:].exp() + nflows = node_flows[(j+1)*group_size+i,:] + pflows = epars * (nflows[None,:] * emars / nmars[None,:]).sum(dim = 1) + + my_pflows[layer.partitioned_pids[0][j,:]+i] = pflows - assert torch.all(torch.abs(eflows - element_flows[(j+1)*group_size+i,:]) < 1e-3) - - import pdb; pdb.set_trace() + assert torch.all(torch.abs(my_pflows - param_flows) < 2e-3) def speed_test(): @@ -320,16 +179,12 @@ def speed_test(): tied_param_group_ids = [], tied_param_ends = [], ch_prod_layer_size = prod_layer.num_nodes + group_size) - # import pdb; pdb.set_trace() - layer.to(device) node_mars = torch.zeros([group_size + group_size * num_node_groups * num_prod_nodes, batch_size]).to(device) element_mars = torch.rand([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) params = torch.rand([layer.partitioned_pids[0].max() + group_size]).to(device) - # import pdb; pdb.set_trace() - ## Forward tests ## layer(node_mars, element_mars, params) @@ -343,11 +198,9 @@ def speed_test(): forward_ms = (t1 - t0) / 100 * 1000 print(f"Forward pass on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 11.255ms.") + print("Reference computation time on RTX 4090: 0.871ms.") print("--------------------------------------------------------------") - # exit() - node_flows = torch.rand([group_size + group_size * num_node_groups * num_prod_nodes, batch_size]).to(device) element_flows = torch.zeros([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) param_flows = torch.zeros([layer.partitioned_pids[0].max() + group_size]).to(device) @@ -362,128 +215,12 @@ def speed_test(): t1 = time.time() backward_ms = (t1 - t0) / 100 * 1000 - print(f"Backward pass on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 11.255ms.") + print(f"Backward pass on average takes {backward_ms:.3f}ms.") + print("Reference computation time on RTX 4090: 1.200ms.") print("--------------------------------------------------------------") - exit() - - # import pdb; pdb.set_trace() - - nids = layer.partitioned_nids[0] - cids_start, cids_increment, pids_start, pids_increment = layer._cached_fw_pcids[("block_sparse", 0, 64)] - - BLOCK_B = 128 - TILE_SIZE_K = 64 - K_NUM_TILES = 
layer.partitioned_cids[0].size(1) // TILE_SIZE_K - TILE_SIZE_M = 32 - - layer_n_nodes = nids.size(0) * layer.group_size - - grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - - _bk_triton_block_sparse_kernel[grid]( - node_flows, - element_flows, - node_mars, - element_mars, - params, - nids, - cids_start, - cids_increment, - pids_start, - pids_increment, - local_ids = None, - batch_size = batch_size, - partial_eval = 0, - BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = K_NUM_TILES, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = layer.group_size - ) - - t0 = time.time() - torch.cuda.synchronize() - for _ in range(100): - _bk_triton_block_sparse_kernel[grid]( - node_flows, - element_flows, - node_mars, - element_mars, - params, - nids, - cids_start, - cids_increment, - pids_start, - pids_increment, - local_ids = None, - batch_size = batch_size, - partial_eval = 0, - BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = K_NUM_TILES, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = layer.group_size - ) - torch.cuda.synchronize() - t1 = time.time() - backward_ms = (t1 - t0) / 100 * 1000 - - print(f"bkbk: {backward_ms:.3f}ms.") - - nids = layer.partitioned_nids[0] - cids = layer.partitioned_cids[0] - pids = layer.partitioned_pids[0] - - param_flows = torch.zeros(params.size()).to(device) - - TILE_SIZE_B = 64 - TILE_SIZE_K = 64 - B_NUM_TILES = triton.cdiv(batch_size, TILE_SIZE_B) - TILE_SIZE_M = 32 - - n_edges = cids.size(1) - - layer_n_nodes = nids.size(0) * layer.group_size - - grid = (triton.cdiv(n_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - - # print("aaa") - - _bkp_triton_block_sparse_kernel[grid]( - node_flows, node_mars, element_mars, params, - param_flows, nids, cids, pids, local_ids = None, - batch_size = batch_size, n_edges = n_edges, partial_eval = 0, - TILE_SIZE_B = TILE_SIZE_B, B_NUM_TILES = B_NUM_TILES, - TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = layer.group_size - ) - - t0 = time.time() - # print("bbb") - torch.cuda.synchronize() - # print("ccc") - for _ in range(100): - _bkp_triton_block_sparse_kernel[grid]( - node_flows, node_mars, element_mars, params, - param_flows, nids, cids, pids, local_ids = None, - batch_size = batch_size, n_edges = n_edges, partial_eval = 0, - TILE_SIZE_B = TILE_SIZE_B, B_NUM_TILES = B_NUM_TILES, - TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = layer.group_size - ) - # print("ddd") - torch.cuda.synchronize() - t1 = time.time() - backward_ms = (t1 - t0) / 100 * 1000 - - # print("eee") - - print(f"bkpbkp: {backward_ms:.3f}ms.") - if __name__ == "__main__": torch.manual_seed(3890) - # sum_layer_test() + sum_layer_test() speed_test() \ No newline at end of file From ba7c1a1760b3625fbb7c8d64f70a3d2cdf3400f5 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 10 Dec 2023 20:07:21 +0800 Subject: [PATCH 030/162] fix boundary condition for `max_num_partitions` --- src/pyjuice/layer/backend/node_partition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyjuice/layer/backend/node_partition.py b/src/pyjuice/layer/backend/node_partition.py index ebf41332..902fbeba 100644 --- a/src/pyjuice/layer/backend/node_partition.py +++ b/src/pyjuice/layer/backend/node_partition.py @@ -188,7 +188,7 @@ def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], if sparsity_tolerance is not None: assert sparsity_tolerance > 1e-6 and sparsity_tolerance <= 1.0 if max_num_partitions is None: - max_num_partitions = 
max(min(int(math.ceil(node_n_edges.shape[0] * sparsity_tolerance)), 16), 1) + max_num_partitions = max(min(torch.unique(node_n_edges).shape[0], 16), 1) elif max_num_partitions is None: max_num_partitions = 1 else: From 7fef00ecf454fbc90f87e8e2c34bb92a903505d1 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 10 Dec 2023 20:29:16 +0800 Subject: [PATCH 031/162] test matmul small matrices --- tests/layer/matmul_kernel_test.py | 110 ++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 tests/layer/matmul_kernel_test.py diff --git a/tests/layer/matmul_kernel_test.py b/tests/layer/matmul_kernel_test.py new file mode 100644 index 00000000..f169e1f4 --- /dev/null +++ b/tests/layer/matmul_kernel_test.py @@ -0,0 +1,110 @@ +import pyjuice as juice +import torch +import numpy as np +import time +import random + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer, ProdLayer, SumLayer + +import pytest + + +import triton +import triton.language as tl + + +@triton.jit +def kernel1(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a) # .to(tl.bfloat16) + + offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] + bb = tl.load(b + offs_b) # .to(tl.bfloat16) + + cc = tl.dot(aa, bb, allow_tf32 = True) # .to(tl.float32) + + offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +@triton.jit +def kernel2(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a) # .to(tl.bfloat16) + + offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] + bb = tl.load(b + offs_b) # .to(tl.bfloat16) + + cc = tl.sum(aa[:,:,None] * bb[None,:,:], axis = 1) # .to(tl.float32) + + # cc = tl.dot(aa, bb) + + offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +@triton.jit +def kernel3(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a) + + offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] + bb = tl.load(b + offs_b) + + aa = tl.view(tl.broadcast_to(aa[:,None,:], (M, 16 // M, N)), (16, N)) + cc = tl.dot(aa, bb) + cc = tl.max(tl.view(cc, (M, 16 // M, N)), axis = 1) + + offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +if __name__ == "__main__": + import time + + M = 16 + N = 16 + K = 16 + + a = torch.rand([M, N]).cuda() + b = torch.rand([N, K]).cuda() + c = torch.rand([M, K]).cuda() + + grid = (1000,) + + kernel1[grid](a, b, c, M, N, K) + + # torch.cuda.synchronize() + # t0 = time.time() + # for _ in range(100): + # kernel1[grid](a, b, c, M, N, K) + # torch.cuda.synchronize() + # t1 = time.time() + + # print((t1 - t0) / 100 * 1000) + + kernel2[grid](a, b, c, M, N, K) + + # torch.cuda.synchronize() + # t0 = time.time() + # for _ in range(100): + # kernel2[grid](a, b, c, M, N, K) + # torch.cuda.synchronize() + # t1 = time.time() + + # print((t1 - t0) / 100 * 1000) + + # cc = torch.matmul(a, b) + + # print((c - cc).abs().max()) \ No newline at end of file From 894b8d68a84307aac6efda60468721b34e2de5c4 Mon Sep 17 
00:00:00 2001 From: Anji Liu Date: Sun, 10 Dec 2023 20:29:32 +0800 Subject: [PATCH 032/162] cpu version of prod layer compilation --- src/pyjuice/layer/prod_layer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 1d787007..6a61c035 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -19,8 +19,9 @@ class ProdLayer(Layer, nn.Module): - def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: Optional[float] = None, - max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False) -> None: + def __init__(self, nodes: Sequence[ProdNodes], global_nid_start: Optional[int] = None, + layer_sparsity_tol: Optional[float] = None, max_num_partitions: Optional[int] = None, + disable_gpu_compilation: bool = False, force_gpu_compilation: bool = False) -> None: Layer.__init__(self, nodes) nn.Module.__init__(self) @@ -32,9 +33,12 @@ def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: Optional[floa self.nodes = nodes self.group_size = nodes[0].group_size + if global_nid_start is None: + global_nid_start = self.group_size + ## Get layer statistics & prepare for compilation ## - layer_num_ngroups, layer_num_edges, n_chgs = get_prod_layer_stats(self.nodes, self.group_size) + layer_num_ngroups, layer_num_edges, n_chgs = get_prod_layer_stats(self.nodes, self.group_size, global_nid_start = global_nid_start) self.num_nodes = layer_num_ngroups * self.group_size self.num_edges = layer_num_edges @@ -87,7 +91,7 @@ def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: Optional[floa # flat_u_cids: [num_used_ch_ngroups] child group ids that have at least one parent # par_counts: [num_used_ch_ngroups] the number of parents for each child node group # Note: the dummy node has been removed from `flat_u_cids` and `par_counts` - flat_u_cids, par_counts = get_prod_layer_parstats(flat_cids) + flat_u_cids, par_counts = get_prod_layer_parstats(flat_cids, global_nid_start = global_nid_start) # Find a good strategy to partition the child nodes into groups according to their number of parents # to minimize total computation cost @@ -126,7 +130,7 @@ def __init__(self, nodes: Sequence[ProdNodes], layer_sparsity_tol: Optional[floa u_cids, parids = prod_layer_backward_compilation( flat_u_cids, flat_cids, flat_cid2nid, bk_partition_max_pars, bk_n_partition_ids, bk_n_id_in_partition, bk_num_ns_in_partition, - use_cuda = not disable_gpu_compilation and (flat_cids.size(0) > 4000) + use_cuda = force_gpu_compilation or (not disable_gpu_compilation and (flat_cids.size(0) > 4000)) ) # Store buffers for the backward pass From 6b79d01354d9130235d374c16b62fa7d68a66886 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 10 Dec 2023 20:29:55 +0800 Subject: [PATCH 033/162] cpu version of fw sum layer compilation --- src/pyjuice/layer/compilation.py | 330 +++++++------------------- src/pyjuice/layer/sum_layer.py | 17 +- tests/layer/layer_compilation_test.py | 92 +++++++ tests/layer/sum_layer_test.py | 5 +- 4 files changed, 194 insertions(+), 250 deletions(-) create mode 100644 tests/layer/layer_compilation_test.py diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 7d8ae867..f786ad0d 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -168,243 +168,89 @@ def get_sum_layer_backward_stats(nodes: Sequence[SumNodes]): return ch_gsize2cs, ch_gsize2num_ngroups, ch_gsize2n_pargs, cs2parns 
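# Illustrative sketch (standalone, not part of the patch): the CPU compilation path added in
# this hunk expands group-level `edge_ids` into per-node edge ids by repeating every edge
# `ch_group_size` times and offsetting the child index within its group. The tensor values
# below are invented purely to show the transformation.
import torch

edge_ids = torch.tensor([[0, 0, 1],
                         [0, 2, 1]])          # [2, num_edges]: (parent group id, child group id)
ch_group_size = 2

expanded = edge_ids[:, :, None].repeat(1, 1, ch_group_size)
expanded[1] = expanded[1] * ch_group_size + torch.arange(ch_group_size)[None, :]
expanded = expanded.reshape(2, -1)
# expanded[0] -> tensor([0, 0, 0, 0, 1, 1]); expanded[1] -> tensor([0, 1, 4, 5, 2, 3])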
-@torch.no_grad() -def sum_layer_forward_compilation_job(flat_nodes, nids, cids, pids, fw_group_max_chs, n_group_ids, n_id_in_group, - global_nid_start, ch_prod_layer_size, job_start, job_end, return_dict = None, - idx = 0, use_cuda: bool = False): - """ - Note: Only process jobs in [job_start, job_end). - """ - all_ns_param_ids = dict() - - node_start = 0 - for ns_idx, flat_ns in enumerate(flat_nodes): - # Outer iteration over `ns` in this layer - ns_num_nodes = flat_ns[0] - if node_start + ns_num_nodes < job_start: - node_start += ns_num_nodes - continue # Move on to the next ns - elif node_start >= job_end: - break # All jobs completed - - edge_ids = flat_ns[1] # Edge indices of this `ns` - ns_num_edges = edge_ids.size(1) - - add_params_flag = flat_ns[4] - if add_params_flag: - ns_param_ids = torch.zeros([edge_ids.size(1)], dtype = torch.long, device = edge_ids.device) - - # Pre-compute cid flags for future reuse - num_chs = len(flat_ns[3]) - cid_starts = torch.zeros([num_chs], dtype = torch.long) - cid_ends = torch.zeros([num_chs], dtype = torch.long) - cid_start = 0 - for cnode_id, flat_cs in enumerate(flat_ns[3]): - cs_num_nodes = flat_cs[0] - cid_end = cid_start + cs_num_nodes - cid_starts[cnode_id] = cid_start - cid_ends[cnode_id] = cid_end - - cid_start = cid_end - - if use_cuda: - cid_starts = cid_starts.cuda() - cid_ends = cid_ends.cuda() - - # Shape: [num_chs, num_edges] - cs_criterion = (edge_ids[1,:].unsqueeze(0) >= cid_starts[:,None]) & \ - (edge_ids[1,:].unsqueeze(0) < cid_ends[:,None]) - # Loop over the nodes assigned to the current thread - nid_start = 0 if node_start >= job_start else job_start - node_start - nid_end = ns_num_nodes if node_start + ns_num_nodes <= job_end else job_end - node_start - ns_pid_start = flat_ns[2][0] # Start param id - ns_local_pid = (edge_ids[0,:] < nid_start).sum().item() - for nid in range(nid_start, nid_end): - # Global node idx - global_nid = global_nid_start + node_start + nid - - # `group_id`: which group the current node belongs to - # `local_id`: the index of the node within the current group - # `group_nchs`: maximum number of child nodes in the current group - group_id = n_group_ids[node_start + nid] - local_id = n_id_in_group[node_start + nid] - group_nchs = fw_group_max_chs[group_id] - - ns_criterion = (edge_ids[0,:] == nid) - - # assign node id - nids[group_id][local_id] = global_nid - - ch_start = 0 - cid_start = 0 - for cnode_id, flat_cs in enumerate(flat_ns[3]): - cs_num_nodes = flat_cs[0] - cs_out_ind_range = flat_cs[1] - cid_end = cid_start + cs_num_nodes - - criterion = cs_criterion[cnode_id,:] & ns_criterion +def sum_layer_forward_compilation_cpu(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, + num_ngs_in_partition, n_chs, global_nid_start, param_ends): - # assign child ids - ch_ids = edge_ids[1,criterion] + (cs_out_ind_range[0] - cid_start) - cids[group_id][local_id,ch_start:ch_start+ch_ids.size(0)] = ch_ids + nids = [torch.zeros([num_ngs_in_partition[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] + cids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] + pids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] - # mapping from the current params to global params - if add_params_flag: - curr_ids = torch.where(criterion)[0] - curr_param_ids = torch.arange(curr_ids.size(0), device = edge_ids.device) + (ns_pid_start + ns_local_pid + 
ch_start) - ns_param_ids[curr_ids] = curr_param_ids - - ch_start += ch_ids.size(0) - cid_start = cid_end - - # assign parameter ids - parids = torch.arange(ch_start, device = edge_ids.device) + (ns_pid_start + ns_local_pid) - pids[group_id][local_id,:ch_start] = parids - - ns_local_pid += ch_start - - node_start += ns_num_nodes - ns_pid_start += ns_num_edges - - if add_params_flag: - all_ns_param_ids[ns_idx] = ns_param_ids - - if return_dict is not None: - return_dict[idx] = all_ns_param_ids - else: - return all_ns_param_ids - - -@torch.no_grad() -def sum_layer_forward_compilation_legacy(nodes, fw_group_max_chs, n_group_ids, n_id_in_group, num_ns_in_group, n_chs, - global_nid_start, ch_prod_layer_size, param_ends, - num_threads: int = 1, use_cuda: bool = False): - - if use_cuda and not torch.cuda.is_available(): - use_cuda = False - - total_num_jobs = sum(map(lambda ns: ns.num_nodes, nodes)) - - # Construct flattened_nodes + original_param_nids = [] # `ns` with their original parameters (i.e., not tied) + + # This is the main loop: iterate over `ns` in the layer global_pid_start = param_ends[-1] - flat_nodes = [] - add_ns_params_flag = [] - for ns in nodes: + ngroup_start = 0 # The start index of the node groups in the current `ns` + ngid_in_partition = torch.zeros([len(num_ngs_in_partition)], dtype = torch.long) + for ns_idx, ns in enumerate(nodes): if ns.is_tied(): - source_ns = ns.get_source_ns() - if not hasattr(source_ns, "_param_range") or source_ns._param_range is None: - global_pid_end = global_pid_start + source_ns.num_edges - source_ns._param_range = (global_pid_start, global_pid_end) - global_pid_start = global_pid_end - - add_params_flag = True - else: - add_params_flag = False + target_ns = ns.get_source_ns() else: - if not hasattr(ns, "_param_range") or ns._param_range is None: - global_pid_end = global_pid_start + ns.num_edges - ns._param_range = (global_pid_start, global_pid_end) - global_pid_start = global_pid_end + target_ns = ns - add_params_flag = True - else: - add_params_flag = False + # If the parameters have not been instantiated, do it :) + if not hasattr(target_ns, "_param_range") or target_ns._param_range is None: + global_pid_end = global_pid_start + target_ns.num_edges + target_ns._param_range = (global_pid_start, global_pid_end) + global_pid_start = global_pid_end - add_ns_params_flag.append(add_params_flag) - flat_nodes.append(flatten_sum_nodes(ns, add_params_flag, use_cuda = use_cuda)) + add_params_flag = True + original_param_nids.append(ns_idx) + else: + add_params_flag = False - # Allocate target buffers - nids = [torch.zeros([group_size], dtype = torch.long) for group_size in num_ns_in_group] # Node id - cids = [torch.zeros([group_size, max_chs], dtype = torch.long) for group_size, max_chs in zip(num_ns_in_group, fw_group_max_chs)] # Child id - pids = [torch.zeros([group_size, max_chs], dtype = torch.long) for group_size, max_chs in zip(num_ns_in_group, fw_group_max_chs)] # Parameter id + # Global pid start index for `ns` + ns_pid_start = target_ns._param_range[0] - if use_cuda: - nids = [tensor.cuda() for tensor in nids] - cids = [tensor.cuda() for tensor in cids] - pids = [tensor.cuda() for tensor in pids] - - if num_threads == 1: - curr_ns_param_ids = sum_layer_forward_compilation_job( - flat_nodes, nids, cids, pids, fw_group_max_chs, n_group_ids, n_id_in_group, - global_nid_start, ch_prod_layer_size, 0, total_num_jobs, use_cuda = use_cuda - ) - all_ns_param_ids = [curr_ns_param_ids] + # number of node groups + ns_num_ngroups = ns.num_node_groups - 
else: - job_indices = get_chunk_ids(total_num_jobs, num_threads) - - threads = [] - return_dict = dict() - for idx, (job_start, job_end) in enumerate(job_indices): - th = threading.Thread( - target = sum_layer_forward_compilation_job, - args = (flat_nodes, nids, cids, pids, fw_group_max_chs, n_group_ids, n_id_in_group, - global_nid_start, ch_prod_layer_size, job_start, job_end, return_dict, idx, - use_cuda) - ) - th.start() - threads.append(th) + # Edge indices of size [2, ns_num_edges] + # Here child ids of the edges are flattened out, i.e., every edge points to + # an actual "node" instead of a node group + edge_ids = ns.edge_ids.clone() + edge_ids = edge_ids[:,:,None].repeat(1, 1, ns.ch_group_size) + edge_ids[1,:,:] *= ns.ch_group_size + edge_ids[1,:,:] += torch.arange(0, ns.ch_group_size)[None,:] + edge_ids = edge_ids.reshape(2, ns.edge_ids.size(1) * ns.ch_group_size).contiguous() + ns_num_edges = edge_ids.size(1) - for th in threads: - th.join() + # Get number of child nodes for all nodes + ns_nchs = torch.bincount(edge_ids[0,:], minlength = ns_num_ngroups) - all_ns_param_ids = [] - for idx in range(num_threads): - curr_ns_param_ids = return_dict[idx] - all_ns_param_ids.append(curr_ns_param_ids) + cs_node_cum_nodes = torch.zeros([ns.num_chs], dtype = torch.long) + cs_node_cum_nodes[0] = ns.chs[0].num_nodes + for i in range(1, ns.num_chs): + cs_node_cum_nodes[i] = cs_node_cum_nodes[i-1] + ns.chs[i].num_nodes - # Compute the number of (sum) parents for each (prod) input node - ch_n_pars = torch.zeros([ch_prod_layer_size], dtype = torch.long) # Number of parents for each child node - for ns in nodes: - ch_start = 0 - for cs in ns.chs: - ch_end = ch_start + cs.num_nodes - criterion = (ns.edge_ids[1,:] >= ch_start) & (ns.edge_ids[1,:] < ch_end) - - cs_s_oind = cs._output_ind_range[0] - cs_e_oind = cs._output_ind_range[1] - c_ns_counts = torch.bincount(ns.edge_ids[1,criterion] - ch_start, minlength = cs.num_nodes) - ch_n_pars[cs_s_oind:cs_e_oind] = c_ns_counts + # Iterate over node groups + cum_n_chs = 0 + for ng_id in range(ns_num_ngroups): + partition_id = (ns_nchs[ng_id] > fw_partition_max_chs).sum() + local_id = ngid_in_partition[partition_id] - ch_start = ch_end + global_nid = ns._output_ind_range[0] + ng_id * ns.group_size - # Store local -> global parameter id mapping in `ns` - for ns_param_ids in all_ns_param_ids: - for ns_idx, param_ids in ns_param_ids.items(): - if use_cuda: - param_ids = param_ids.cpu() - ns = nodes[ns_idx] - if not hasattr(ns, "_param_ids") or ns._param_ids is None: - ns._param_ids = param_ids - else: - mask = (param_ids > 0) - ns._param_ids[mask] = param_ids[mask] + # Assign `nids` + nids[partition_id][local_id] = global_nid - # Store global -> local parameter id mapping in `ns` - for ns, add_params_flag in zip(nodes, add_ns_params_flag): - if add_params_flag: - ns._param_range = (ns._param_ids.min().item(), ns._param_ids.max().item() + 1) - ns._inverse_param_ids = torch.argsort(ns._param_ids) + # Assign `cids` + criterion = (edge_ids[0,:] == ng_id) + local_cids = edge_ids[1,criterion] + cids_gid = (local_cids[:,None] >= cs_node_cum_nodes[None,:]).sum(dim = 1) + for ch_id in range(local_cids.size(0)): + local_base = cs_node_cum_nodes[cids_gid[ch_id]-1] if cids_gid[ch_id] >= 1 else 0 + global_cid = ns.chs[cids_gid[ch_id]]._output_ind_range[0] + local_cids[ch_id] - local_base + cids[partition_id][local_id, ch_id] = global_cid - # Update `param_ends` - npars = param_ends[-1] - nid = 0 - for ns, add_params_flag in zip(nodes, add_ns_params_flag): - if 
add_params_flag: - for i in range(ns.num_nodes): - npars += n_chs[nid+i].item() - param_ends.append(npars) - - nid += ns.num_nodes + # Assign `pids` + global_pids = ns_pid_start + cum_n_chs + torch.arange(0, ns.group_size * criterion.sum(), ns.group_size) + pids[partition_id][local_id, 0:global_pids.size(0)] = global_pids + cum_n_chs += ns.group_size * criterion.sum() - if use_cuda: - # Move buffers back to CPU - nids = [tensor.cpu() for tensor in nids] - cids = [tensor.cpu() for tensor in cids] - pids = [tensor.cpu() for tensor in pids] + ngid_in_partition[partition_id] = local_id + 1 - return nids, cids, pids, ch_n_pars, param_ends + return nids, cids, pids, param_ends @njit @@ -484,20 +330,16 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ @torch.no_grad() def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, - global_nid_start, ch_prod_layer_size, param_ends, - num_threads: int = 1, use_cuda: bool = True, legacy: bool = False): + global_nid_start, param_ends, use_cuda: bool = True, legacy: bool = False): if use_cuda and not torch.cuda.is_available(): use_cuda = False # Also use the legacy code if we compile with CPU if not use_cuda or legacy: - # TODO: restore CPU compilation - raise RuntimeError() - return sum_layer_forward_compilation_legacy( - nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, - global_nid_start, ch_prod_layer_size, param_ends, num_threads = num_threads, - use_cuda = use_cuda + return sum_layer_forward_compilation_cpu( + nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, + num_ngs_in_partition, n_chs, global_nid_start, param_ends ) # We construct a flattened version of `nids` where the vectors of every partition is concatenated @@ -1055,11 +897,9 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par ## Compilation for ProdLayer ## -def get_prod_layer_stats(nodes: Sequence[SumNodes], group_size: int): +def get_prod_layer_stats(nodes: Sequence[SumNodes], group_size: int, global_nid_start: int): layer_num_ngroup = sum(map(lambda ns: ns.num_node_groups, nodes)) layer_num_edges = 0 - - global_nid_start = group_size # indices `0`` to `group_size - 1`` is reserved for the dummy node ng_sid = 0 n_chgs = torch.zeros([layer_num_ngroup], dtype = torch.long) @@ -1079,13 +919,19 @@ def get_prod_layer_stats(nodes: Sequence[SumNodes], group_size: int): @torch.no_grad() -def prod_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, group_size, use_cuda: bool = False): +def prod_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, + group_size, use_cuda: bool = False): if use_cuda and not torch.cuda.is_available(): use_cuda = False - nids = [torch.zeros([partition_size], dtype = torch.long) for partition_size in num_ngs_in_partition] # Node group start id - cids = [torch.zeros([partition_size, max_chs] , dtype = torch.long) for partition_size, max_chs in zip(num_ngs_in_partition, fw_partition_max_chs)] # Child group start id + if use_cuda: + device = torch.device("cuda:0") + else: + device = torch.device("cpu") + + nids = [torch.zeros([partition_size], dtype = torch.long, device = device) for partition_size in num_ngs_in_partition] # Node group start id + cids = [torch.zeros([partition_size, max_chs] , dtype = torch.long, device = device) for partition_size, max_chs in 
zip(num_ngs_in_partition, fw_partition_max_chs)] # Child group start id for ns_id, ns in enumerate(nodes): @@ -1098,15 +944,19 @@ def prod_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, partition_nchs = fw_partition_max_chs[partition_id] n_sid = ns._output_ind_range[0] - nids[partition_id][local_sid:local_eid] = torch.arange(0, ns.num_nodes, group_size) + n_sid + nids[partition_id][local_sid:local_eid] = torch.arange(0, ns.num_nodes, group_size, device = device) + n_sid for cs_id, cs in enumerate(ns.chs): - cids[partition_id][local_sid:local_eid,cs_id] = ns.edge_ids[:,cs_id] * group_size + cs._output_ind_range[0] + cids[partition_id][local_sid:local_eid,cs_id] = ns.edge_ids[:,cs_id].to(device) * group_size + cs._output_ind_range[0] + + if use_cuda: + nids = [tensor.cpu() for tensor in nids] + cids = [tensor.cpu() for tensor in cids] return nids, cids @torch.no_grad() -def flatten_c_ids(nids, cids): +def flatten_c_ids(nids: torch.Tensor, cids: torch.Tensor): num_cid_slots = sum(map(lambda x: x.size(0) * x.size(1), cids)) flat_cids = torch.zeros([num_cid_slots], dtype = torch.long) @@ -1128,14 +978,16 @@ def flatten_c_ids(nids, cids): @torch.no_grad() -def get_prod_layer_parstats(flat_cids): +def get_prod_layer_parstats(flat_cids: torch.Tensor, global_nid_start: int): u_cids, par_counts = torch.unique(flat_cids, sorted = True, return_counts = True) - if u_cids[0] == 0: - # Strip away the dummy node - u_cids = u_cids[1:] - par_counts = par_counts[1:] + c_sid = torch.arange(0, u_cids.size(0))[u_cids == global_nid_start].min() + + if c_sid > 0: + # Strip away dummy nodes + u_cids = u_cids[c_sid:] + par_counts = par_counts[c_sid:] return u_cids, par_counts diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 6d51076e..92c374eb 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -22,11 +22,11 @@ class SumLayer(Layer, nn.Module): def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, - param_ends: Sequence, tied_param_ids: Sequence, - tied_param_group_ids: Sequence, tied_param_ends: Sequence, - ch_prod_layer_size: int, layer_sparsity_tol: Optional[float] = None, + param_ends: Sequence, + layer_sparsity_tol: Optional[float] = None, max_num_partitions: Optional[int] = None, - disable_gpu_compilation: bool = False) -> None: + disable_gpu_compilation: bool = False, + force_gpu_compilation: bool = False) -> None: Layer.__init__(self, nodes) nn.Module.__init__(self) @@ -34,7 +34,6 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, assert len(nodes) > 0, "No input node." 
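# Minimal sketch of the parent-count statistic computed by `get_prod_layer_parstats` above:
# `flat_cids` holds the child id referenced by every edge, `torch.unique(..., return_counts=True)`
# yields each referenced child together with its number of parents, and entries before
# `global_nid_start` (reserved dummy slots) are stripped. The concrete values are invented.
import torch

global_nid_start = 8
flat_cids = torch.tensor([0, 8, 8, 16, 24, 16, 8])   # 0 is a dummy/padding slot

u_cids, par_counts = torch.unique(flat_cids, sorted=True, return_counts=True)
c_sid = torch.arange(0, u_cids.size(0))[u_cids == global_nid_start].min()
u_cids, par_counts = u_cids[c_sid:], par_counts[c_sid:]
# u_cids -> tensor([ 8, 16, 24]); par_counts -> tensor([3, 2, 1])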
self.nodes = nodes - self.ch_prod_layer_size = ch_prod_layer_size ## Get layer statistics & prepare for compilation ## @@ -82,10 +81,10 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # cids: List[[partition_size, partition_max_n_chs]] stores indices of child node groups # pids: List[[partition_size, partition_max_n_chs]] stores indices of edge parameters (1st parameter of every group) nids, cids, pids, param_ends = sum_layer_forward_compilation( - self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, fw_num_ngs_in_partition, - n_chs, global_nid_start, ch_prod_layer_size, param_ends = param_ends, + self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, + fw_num_ngs_in_partition, n_chs, global_nid_start, param_ends = param_ends, # GPU compilation is slightly slower for small layer due to the kernel jit compilation time - use_cuda = True # not disable_gpu_compilation and (self.num_edges > 1000) + use_cuda = force_gpu_compilation or (not disable_gpu_compilation and (self.num_edges > 1000)) ) # Store buffers for the forward pass @@ -96,6 +95,8 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # Store pre-compiled indices from `cids` and `pids` in the following buffer self._cached_fw_pcids = dict() + return None # debug + ## Initialize backward pass ## # A sum layer could have children of different group sizes diff --git a/tests/layer/layer_compilation_test.py b/tests/layer/layer_compilation_test.py new file mode 100644 index 00000000..52b43680 --- /dev/null +++ b/tests/layer/layer_compilation_test.py @@ -0,0 +1,92 @@ +import pyjuice as juice +import torch +import numpy as np +import time +import random + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer, ProdLayer, SumLayer + +import pytest + + +def prod_layer_compilation_test(): + + for group_size in [1, 8, 16]: + + with juice.set_group_size(group_size): + + ni0 = inputs(0, num_node_groups = 3, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 7, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 6, dist = dists.Categorical(num_cats = 2)) + ni3 = inputs(3, num_node_groups = 12, dist = dists.Categorical(num_cats = 2)) + ni4 = inputs(4, num_node_groups = 4, dist = dists.Categorical(num_cats = 2)) + + np0 = multiply(ni0, ni1, edge_ids = torch.tensor([[0, 1, 2, 2, 1, 0, 1], [0, 1, 2, 3, 4, 5, 6]]).permute(1, 0)) + np1 = multiply(ni2, ni3, edge_ids = torch.tensor([[0, 1, 2, 3, 4, 5, 5, 4, 1, 2, 3, 0], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]).permute(1, 0)) + np2 = multiply(ni1, ni2, edge_ids = torch.tensor([[2, 3, 1, 4, 0, 6, 5], [0, 0, 1, 2, 3, 4, 5]]).permute(1, 0)) + np3 = multiply(ni1, ni3, ni4, edge_ids = torch.tensor([[3, 6, 5, 1, 0, 5, 3, 4, 2, 2, 3, 1], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [2, 3, 1, 2, 0, 2, 3, 0, 1, 2, 1, 3]]).permute(1, 0)) + np4 = multiply(ni3, edge_ids = torch.tensor([[0, 1, 2, 3]]).permute(1, 0)) + np5 = multiply(ni0, ni1, ni2, ni4, edge_ids = torch.tensor([[0, 1, 2, 2, 1, 2, 0], [0, 1, 2, 3, 4, 5, 6], [0, 1, 1, 2, 3, 4, 5], [1, 3, 2, 0, 1, 2, 2]]).permute(1, 0)) + + input_layer = InputLayer([ni0, ni1, ni2, ni3, ni4], cum_nodes = group_size) + + prod_layer_cpu = ProdLayer([np0, np1, np2, np3, np4, np5], layer_sparsity_tol = 0.1, disable_gpu_compilation = True) + prod_layer_gpu = ProdLayer([np0, np1, np2, np3, np4, 
np5], layer_sparsity_tol = 0.1, force_gpu_compilation = True) + + for i in range(3): + assert torch.all(prod_layer_cpu.partitioned_nids[i] == prod_layer_gpu.partitioned_nids[i]) + assert torch.all(prod_layer_cpu.partitioned_cids[i] == prod_layer_gpu.partitioned_cids[i]) + + for i in range(2): + assert torch.all(prod_layer_cpu.partitioned_u_cids[i] == prod_layer_gpu.partitioned_u_cids[i]) + assert torch.all(prod_layer_cpu.partitioned_parids[i] == prod_layer_gpu.partitioned_parids[i]) + + +def sum_layer_compilation_test(): + + for group_size in [8, 16]: + + with juice.set_group_size(group_size): + + ni0 = inputs(0, num_node_groups = 3, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 7, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 6, dist = dists.Categorical(num_cats = 2)) + ni3 = inputs(3, num_node_groups = 12, dist = dists.Categorical(num_cats = 2)) + ni4 = inputs(4, num_node_groups = 4, dist = dists.Categorical(num_cats = 2)) + + np0 = multiply(ni0, ni1, edge_ids = torch.tensor([[0, 1, 2, 2, 1, 0, 1], [0, 1, 2, 3, 4, 5, 6]]).permute(1, 0)) + np1 = multiply(ni2, ni3, edge_ids = torch.tensor([[0, 1, 2, 3, 4, 5, 5, 4, 1, 2, 3, 0], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]).permute(1, 0)) + np2 = multiply(ni1, ni2, edge_ids = torch.tensor([[2, 3, 1, 4, 0, 6, 5], [0, 0, 1, 2, 3, 4, 5]]).permute(1, 0)) + np3 = multiply(ni1, ni3, ni4, edge_ids = torch.tensor([[3, 6, 5, 1, 0, 5, 3, 4, 2, 2, 3, 1], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [2, 3, 1, 2, 0, 2, 3, 0, 1, 2, 1, 3]]).permute(1, 0)) + np4 = multiply(ni3, edge_ids = torch.tensor([[0, 1, 2, 3]]).permute(1, 0)) + np5 = multiply(ni0, ni1, ni2, ni4, edge_ids = torch.tensor([[0, 1, 2, 2, 1, 2, 0], [0, 1, 2, 3, 4, 5, 6], [0, 1, 1, 2, 3, 4, 5], [1, 3, 2, 0, 1, 2, 2]]).permute(1, 0)) + np6 = multiply(ni0, ni1, edge_ids = torch.tensor([[2, 2, 1, 0, 1, 2, 0], [0, 5, 6, 3, 4, 2, 1]]).permute(1, 0)) + + ns0 = summate(np0, edge_ids = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 2, 4, 2, 1, 5, 6, 2, 1]])) + ns1 = summate(np0, np6, edge_ids = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5], [0, 2, 4, 2, 1, 5, 6, 2, 1, 10, 3, 8, 9]])) + + input_layer = InputLayer([ni0, ni1, ni2, ni3, ni4], cum_nodes = group_size) + prod_layer = ProdLayer([np0, np1, np2, np3, np4, np5, np6], layer_sparsity_tol = 0.1, force_gpu_compilation = True) + + sum_layer_cpu = SumLayer([ns0, ns1], global_nid_start = input_layer.num_nodes + group_size, + param_ends = [1], layer_sparsity_tol = 0.1, disable_gpu_compilation = True) + sum_layer_gpu = SumLayer([ns0, ns1], global_nid_start = input_layer.num_nodes + group_size, + param_ends = [1], layer_sparsity_tol = 0.1, force_gpu_compilation = True) + + for i in range(3): + assert torch.all(sum_layer_cpu.partitioned_nids[i] == sum_layer_gpu.partitioned_nids[i]) + assert torch.all(sum_layer_cpu.partitioned_cids[i] == sum_layer_gpu.partitioned_cids[i]) + assert torch.all(sum_layer_cpu.partitioned_pids[i] == sum_layer_gpu.partitioned_pids[i]) + + import pdb; pdb.set_trace() + + + +if __name__ == "__main__": + # prod_layer_compilation_test() + sum_layer_compilation_test() diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index f683d249..87ef0602 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -13,9 +13,6 @@ import pytest -import triton -import triton.language as tl - def sum_layer_test(): @@ -219,6 +216,8 @@ def speed_test(): print("Reference computation time on RTX 4090: 1.200ms.") 
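# Side note (sketch only): the wall-clock timings printed in this test come from time.time()
# wrapped around torch.cuda.synchronize() loops. An equivalent measurement with CUDA events,
# for a hypothetical callable `fn`, avoids host-timer jitter and reports milliseconds directly.
import torch

def bench_ms(fn, iters=100):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    fn()                          # warm-up (also triggers any JIT compilation)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters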
print("--------------------------------------------------------------") + import pdb; pdb.set_trace() + if __name__ == "__main__": torch.manual_seed(3890) From 2285a052d634e69b4988e3967e278f1aec6f2789 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 10 Dec 2023 20:33:47 +0800 Subject: [PATCH 034/162] rm legacy mode --- src/pyjuice/layer/compilation.py | 168 +------------------------------ src/pyjuice/layer/sum_layer.py | 2 - 2 files changed, 5 insertions(+), 165 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index f786ad0d..66d328fe 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -330,13 +330,12 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ @torch.no_grad() def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, - global_nid_start, param_ends, use_cuda: bool = True, legacy: bool = False): + global_nid_start, param_ends, use_cuda: bool = True): if use_cuda and not torch.cuda.is_available(): use_cuda = False - # Also use the legacy code if we compile with CPU - if not use_cuda or legacy: + if not use_cuda: return sum_layer_forward_compilation_cpu( nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, global_nid_start, param_ends @@ -512,127 +511,6 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, return nids, cids, pids, param_ends -@torch.no_grad() -def sum_layer_backward_compilation_legacy(nodes, pids, fw_n_group_ids, fw_n_id_in_group, - num_bk_groups, bk_n_group_ids, bk_n_id_in_group, - bk_group_max_pars, bk_num_ns_in_group, - ch_prod_layer_size, global_nid_start, use_cuda: bool = False): - - if use_cuda and not torch.cuda.is_available(): - use_cuda = False - - # Since we will be iterating over parent nodes, we want to create a flattened scratch space for the - # buffers. In the following, `flat_parids` and `flat_parpids` are the flattened version of - # `parids` and `parpids`, respectively. Also, we create `ch2flatidx` which points to the start - # location of the scratch space (`flat_parids` and `flat_parpids``) for every child node. 
- group2flatidx = torch.zeros([num_bk_groups], dtype = torch.long) - flatidx = 0 - for group_id in range(num_bk_groups): - group_size = bk_num_ns_in_group[group_id] - max_n_par = bk_group_max_pars[group_id] - - group2flatidx[group_id] = flatidx - - flatidx += group_size * max_n_par - num_slots = flatidx - - # parids: indices of parent nodes for each child node - # parpids: parameter indices for these edges - flat_parids = torch.zeros([num_slots], dtype = torch.long) - flat_parpids = torch.zeros([num_slots], dtype = torch.long) - - # The indexing vector pointing to the start position in the scratch space - ch2flatidx = group2flatidx[bk_n_group_ids] + bk_n_id_in_group * bk_group_max_pars[bk_n_group_ids] - - # This vector maintains the count of parents that have been processed for every child node - par_counts = torch.zeros([ch_prod_layer_size], dtype = torch.long) - - if use_cuda: - # Move buffers to GPU - flat_parids = flat_parids.cuda() - flat_parpids = flat_parpids.cuda() - ch2flatidx = ch2flatidx.cuda() - par_counts = par_counts.cuda() - - fw_n_group_ids = fw_n_group_ids.cuda() - fw_n_id_in_group = fw_n_id_in_group.cuda() - - node_start = 0 - for ns in nodes: - node_end = node_start + ns.num_nodes - if use_cuda: - edge_ids = ns.edge_ids.cuda() - else: - edge_ids = ns.edge_ids - - # Pre-compute cid flags for future reuse - cid_starts = torch.zeros([ns.num_chs], dtype = torch.long) - cid_ends = torch.zeros([ns.num_chs], dtype = torch.long) - cid_start = 0 - for cnode_id, cs in enumerate(ns.chs): - cid_end = cid_start + cs.num_nodes - cid_starts[cnode_id] = cid_start - cid_ends[cnode_id] = cid_end - cid_start = cid_end - - if use_cuda: - cid_starts = cid_starts.cuda() - cid_ends = cid_ends.cuda() - - # Shape: [ns.num_chs, num_edges] - cs_criterion = (edge_ids[1,:].unsqueeze(0) >= cid_starts[:,None]) & \ - (edge_ids[1,:].unsqueeze(0) < cid_ends[:,None]) - - for nid in range(ns.num_nodes): - # `group_id`: which group the current node belongs to - # `local_id`: the index of the node within the current group - group_id = fw_n_group_ids[node_start + nid] - local_id = fw_n_id_in_group[node_start + nid] - curr_pids = pids[group_id][local_id,:] - if use_cuda: - curr_pids = curr_pids.cuda() - - ns_criterion = (edge_ids[0,:] == nid) - - cid_start = 0 - pid_start = 0 - for cnode_id, cs in enumerate(ns.chs): - cid_end = cid_start + cs.num_nodes - criterion = cs_criterion[cnode_id,:] & ns_criterion - pid_end = pid_start + criterion.sum().item() - - ch_ids = edge_ids[1,criterion] + (cs._output_ind_range[0] - cid_start) - flat_cids = ch2flatidx[ch_ids] + par_counts[ch_ids] # start position specified by `ch2flatidx` + offset specified by `par_counts` - flat_parids[flat_cids] = global_nid_start + node_start + nid - flat_parpids[flat_cids] = curr_pids[pid_start:pid_end] - - par_counts[ch_ids] += 1 - cid_start = cid_end - pid_start = pid_end - - node_start = node_end - - if use_cuda: - flat_parids = flat_parids.cpu() - flat_parpids = flat_parpids.cpu() - - # Restore the original `parids` and `parpids` - parids = [] - parpids = [] - flatid_start = 0 - for group_id in range(num_bk_groups): - group_size = bk_num_ns_in_group[group_id] - max_n_par = bk_group_max_pars[group_id] - flatid_end = flatid_start + group_size * max_n_par - - parids.append(flat_parids[flatid_start:flatid_end].reshape(group_size, max_n_par).contiguous()) - parpids.append(flat_parpids[flatid_start:flatid_end].reshape(group_size, max_n_par).contiguous()) - - flatid_start = flatid_end - - return parids, parpids - - @njit def 
_assign_chid_kernel(chs_offsets, ns_nchs, edge_ids): for i in range(edge_ids.shape[1]): @@ -642,39 +520,6 @@ def _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids): ns_nchs[nid] = idx + 1 -@triton.jit -def _assign_global_eleids_kernel(global_ele_ids_ptr, cs_ele_id_start_ptr, cs_node_cum_ids_ptr, edge_ids_ptr, - constexprs_ptr, num_chs: tl.constexpr, num_chs_np2: tl.constexpr, BLOCK_SIZE: tl.constexpr): - - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE - - # Retrieve all constexprs - num_edges = tl.load(constexprs_ptr) - - # Get edge indices to be processed by the current block - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < num_edges - - # Get `cid` - cid = tl.load(edge_ids_ptr + offsets + num_edges, mask = mask, other = 0) - - # Get the child ns index every `cid` belongs to and the cum nodes & global sid - cs_offsets = tl.arange(0, num_chs_np2) - cs_node_cum_ids = tl.load(cs_node_cum_ids_ptr + cs_offsets, mask = (cs_offsets < num_chs), other = 0) - - cid_node_id = tl.sum(tl.broadcast_to(cid[:,None], (BLOCK_SIZE, num_chs_np2)) >= \ - tl.broadcast_to(cs_node_cum_ids[None,:], (BLOCK_SIZE, num_chs_np2)), axis = 1) - \ - (1 + num_chs_np2 - num_chs) - - cs_cum_num = tl.load(cs_node_cum_ids_ptr + cid_node_id, mask = mask, other = 0) - cs_ele_ind = tl.load(cs_ele_id_start_ptr + cid_node_id, mask = mask, other = 0) - - # Compute global cids and store them - global_cid = cid + cs_ele_ind - cs_cum_num - tl.store(global_ele_ids_ptr + offsets, global_cid, mask = mask) - - @njit def _assign_parid_kernel(pars_offsets, cs_npars, edge_ids, edge_sid): for i in range(edge_ids.shape[1]): @@ -739,16 +584,13 @@ def _assign_target_chpapids_kernel(target_chids_ptr, chids_partition_start_ptr, @torch.no_grad() def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_partition, num_ngs_in_partition, partition_max_pars, - use_cuda: bool = False, legacy: bool = False): + use_cuda: bool = False): if use_cuda and not torch.cuda.is_available(): use_cuda = False - # Also use the legacy code if we compile with CPU - if not use_cuda or legacy: - # TODO: restore CPU compilation - raise ValueError() - return sum_layer_backward_compilation_legacy( + if not use_cuda: + return sum_layer_backward_compilation_cpu( nodes, pids, fw_n_group_ids, fw_n_id_in_group, num_bk_groups, bk_n_group_ids, bk_n_id_in_group, bk_group_max_pars, bk_num_ns_in_group, diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 92c374eb..623ec3e9 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -95,8 +95,6 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # Store pre-compiled indices from `cids` and `pids` in the following buffer self._cached_fw_pcids = dict() - return None # debug - ## Initialize backward pass ## # A sum layer could have children of different group sizes From 114c94034b894033f053f77f7cf8b8ef736b88b5 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 10 Dec 2023 21:56:22 +0800 Subject: [PATCH 035/162] dynamic buffers for param_flows --- src/pyjuice/layer/compilation.py | 201 ++++++++++++++++++++++--------- src/pyjuice/layer/sum_layer.py | 18 ++- src/pyjuice/nodes/nodes.py | 5 +- 3 files changed, 160 insertions(+), 64 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 66d328fe..59483289 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -169,38 +169,78 @@ def get_sum_layer_backward_stats(nodes: Sequence[SumNodes]): 
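# Standalone sketch of the parameter-flow grouping introduced in this hunk: tied copies of a
# source node reuse one parameter-flow range until `max_tied_ns_per_parflow_group` copies share
# it, after which a fresh range is allocated, so at most that many copies accumulate into the
# same flow buffer segment. Plain ints and a dict stand in for the real node objects.
max_tied_ns_per_parflow_group = 4
node2tiednodes = {}   # source -> [tied copies, #copies on current range, current flow range]
next_pfid = 0

def alloc_range(num_edges):
    global next_pfid
    rng = (next_pfid, next_pfid + num_edges)
    next_pfid += num_edges
    return rng

def flow_range_for(source, copy, num_edges):
    if source not in node2tiednodes:
        node2tiednodes[source] = [[source], 1, alloc_range(num_edges)]
    entry = node2tiednodes[source]
    if entry[1] >= max_tied_ns_per_parflow_group:   # current range is full: start a new one
        entry[2] = alloc_range(num_edges)
        entry[0].append(copy)
        entry[1] = 1
    else:
        entry[1] += 1
    return entry[2]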
return ch_gsize2cs, ch_gsize2num_ngroups, ch_gsize2n_pargs, cs2parns -def sum_layer_forward_compilation_cpu(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, - num_ngs_in_partition, n_chs, global_nid_start, param_ends): +def sum_layer_forward_compilation_cpu(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, + global_nid_start: int, global_pid_start: int, global_pfid_start: int, node2tiednodes: dict, + max_tied_ns_per_parflow_group: int = 4): nids = [torch.zeros([num_ngs_in_partition[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] cids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] pids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] + all_ns_param_ids = dict() original_param_nids = [] # `ns` with their original parameters (i.e., not tied) # This is the main loop: iterate over `ns` in the layer - global_pid_start = param_ends[-1] ngroup_start = 0 # The start index of the node groups in the current `ns` ngid_in_partition = torch.zeros([len(num_ngs_in_partition)], dtype = torch.long) for ns_idx, ns in enumerate(nodes): - if ns.is_tied(): - target_ns = ns.get_source_ns() - else: - target_ns = ns + + if not ns.is_tied(): + if not ns.provided("_param_range"): + global_pid_end = global_pid_start + ns.num_edges + ns._param_range = (global_pid_start, global_pid_end) + global_pid_start = global_pid_end + + global_pfid_end = global_pfid_start + ns.num_edges + ns._param_flow_range = (global_pfid_start, global_pfid_end) + global_pfid_start = global_pfid_end - # If the parameters have not been instantiated, do it :) - if not hasattr(target_ns, "_param_range") or target_ns._param_range is None: - global_pid_end = global_pid_start + target_ns.num_edges - target_ns._param_range = (global_pid_start, global_pid_end) - global_pid_start = global_pid_end + add_params_flag = True + else: + add_params_flag = False - add_params_flag = True original_param_nids.append(ns_idx) + + # Global pid start index for `ns` + ns_pid_start = ns._param_range[0] else: - add_params_flag = False + source_ns = ns.get_source_ns() + + # Initialize parameters + if not source_ns.provided("_param_range"): + global_pid_end = global_pid_start + ns.num_edges + ns._param_range = (global_pid_start, global_pid_end) + global_pid_start = global_pid_end - # Global pid start index for `ns` - ns_pid_start = target_ns._param_range[0] + global_pfid_end = global_pfid_start + ns.num_edges + ns._param_flow_range = (global_pfid_start, global_pfid_end) + global_pfid_start = global_pfid_end + + add_params_flag = True + else: + ns._param_range = deepcopy(source_ns._param_range) + + add_params_flag = False + + if source_ns not in node2tiednodes: + node2tiednodes[source_ns] = [[source_ns], 1, source_ns._param_flow_range] + + dup_count = node2tiednodes[source_ns][1] + if dup_count >= max_tied_ns_per_parflow_group: + global_pfid_end = global_pfid_start + ns.num_edges + ns._param_flow_range = (global_pfid_start, global_pfid_end) + global_pfid_start = global_pfid_end + node2tiednodes[source_ns][2] = ns._param_flow_range + + node2tiednodes[source_ns][0].append(ns) + node2tiednodes[source_ns][1] = 1 + else: + ns._param_flow_range = deepcopy(node2tiednodes[source_ns][2]) + + node2tiednodes[source_ns][1] += 1 + + # Global pid start index for `ns` + ns_pid_start = source_ns._param_range[0] # number of node groups ns_num_ngroups 
= ns.num_node_groups @@ -223,8 +263,14 @@ def sum_layer_forward_compilation_cpu(nodes, fw_partition_max_chs, n_partition_i for i in range(1, ns.num_chs): cs_node_cum_nodes[i] = cs_node_cum_nodes[i-1] + ns.chs[i].num_nodes + if add_params_flag: + ns_param_ids = torch.zeros([ns_num_edges], dtype = torch.long).cuda() + else: + ns_param_ids = None + # Iterate over node groups cum_n_chs = 0 + all_ns_param_ids = dict() for ng_id in range(ns_num_ngroups): partition_id = (ns_nchs[ng_id] > fw_partition_max_chs).sum() local_id = ngid_in_partition[partition_id] @@ -248,9 +294,24 @@ def sum_layer_forward_compilation_cpu(nodes, fw_partition_max_chs, n_partition_i pids[partition_id][local_id, 0:global_pids.size(0)] = global_pids cum_n_chs += ns.group_size * criterion.sum() + if add_params_flag: + ns_param_ids[criterion] = global_pids + ngid_in_partition[partition_id] = local_id + 1 - return nids, cids, pids, param_ends + all_ns_param_ids[ns_idx] = ns_param_ids + + # Store global -> local parameter id mapping in `ns` + for ns_idx, param_ids in all_ns_param_ids.items(): + ns = nodes[ns_idx] + ns._param_ids = param_ids.cpu()[0::ns.ch_group_size] # Every edge specify the start id of [ch_group_size, group_size] parameters + + # Store local -> global parameter id mapping in `ns` + for ns_idx in original_param_nids: + ns = nodes[ns_idx] + ns._inverse_param_ids = torch.argsort(ns._param_ids) + + return nids, cids, pids, global_pid_end, global_pfid_end @njit @@ -330,7 +391,8 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ @torch.no_grad() def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, - global_nid_start, param_ends, use_cuda: bool = True): + global_nid_start: int, global_pid_start: int, global_pfid_start: int, node2tiednodes: dict, + max_tied_ns_per_parflow_group: int = 4, use_cuda: bool = True): if use_cuda and not torch.cuda.is_available(): use_cuda = False @@ -338,7 +400,8 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, if not use_cuda: return sum_layer_forward_compilation_cpu( nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, - num_ngs_in_partition, n_chs, global_nid_start, param_ends + num_ngs_in_partition, n_chs, global_nid_start, global_pid_start, + global_pfid_start, node2tiednodes, max_tied_ns_per_parflow_group ) # We construct a flattened version of `nids` where the vectors of every partition is concatenated @@ -366,27 +429,65 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, original_param_nids = [] # `ns` with their original parameters (i.e., not tied) # This is the main loop: iterate over `ns` in the layer - global_pid_start = param_ends[-1] ngroup_start = 0 # The start index of the node groups in the current `ns` for ns_idx, ns in enumerate(nodes): - if ns.is_tied(): - target_ns = ns.get_source_ns() - else: - target_ns = ns - # If the parameters have not been instantiated, do it :) - if not hasattr(target_ns, "_param_range") or target_ns._param_range is None: - global_pid_end = global_pid_start + target_ns.num_edges - target_ns._param_range = (global_pid_start, global_pid_end) - global_pid_start = global_pid_end + if not ns.is_tied(): + if not ns.provided("_param_range"): + global_pid_end = global_pid_start + ns.num_edges + ns._param_range = (global_pid_start, global_pid_end) + global_pid_start = global_pid_end + + global_pfid_end = global_pfid_start + ns.num_edges + ns._param_flow_range = 
(global_pfid_start, global_pfid_end) + global_pfid_start = global_pfid_end + + add_params_flag = True + else: + add_params_flag = False - add_params_flag = True original_param_nids.append(ns_idx) + + # Global pid start index for `ns` + ns_pid_start = ns._param_range[0] else: - add_params_flag = False + source_ns = ns.get_source_ns() + + # Initialize parameters + if not source_ns.provided("_param_range"): + global_pid_end = global_pid_start + ns.num_edges + ns._param_range = (global_pid_start, global_pid_end) + global_pid_start = global_pid_end + + global_pfid_end = global_pfid_start + ns.num_edges + ns._param_flow_range = (global_pfid_start, global_pfid_end) + global_pfid_start = global_pfid_end + + add_params_flag = True + else: + ns._param_range = deepcopy(source_ns._param_range) + + add_params_flag = False + + if source_ns not in node2tiednodes: + node2tiednodes[source_ns] = [[source_ns], 1, source_ns._param_flow_range] + + dup_count = node2tiednodes[source_ns][1] + if dup_count >= max_tied_ns_per_parflow_group: + global_pfid_end = global_pfid_start + ns.num_edges + ns._param_flow_range = (global_pfid_start, global_pfid_end) + global_pfid_start = global_pfid_end + node2tiednodes[source_ns][2] = ns._param_flow_range + + node2tiednodes[source_ns][0].append(ns) + node2tiednodes[source_ns][1] = 1 + else: + ns._param_flow_range = deepcopy(node2tiednodes[source_ns][2]) + + node2tiednodes[source_ns][1] += 1 - # Global pid start index for `ns` - ns_pid_start = target_ns._param_range[0] + # Global pid start index for `ns` + ns_pid_start = source_ns._param_range[0] # number of node groups ns_num_ngroups = ns.num_node_groups @@ -462,31 +563,16 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, if add_params_flag: all_ns_param_ids[ns_idx] = ns_param_ids - # TODO: fix broken - # Store local -> global parameter id mapping in `ns` + # Store global -> local parameter id mapping in `ns` for ns_idx, param_ids in all_ns_param_ids.items(): ns = nodes[ns_idx] - ns._param_ids = param_ids.cpu() + ns._param_ids = param_ids.cpu()[0::ns.ch_group_size] # Every edge specify the start id of [ch_group_size, group_size] parameters - # TODO: fix broken - # Store global -> local parameter id mapping in `ns` + # Store local -> global parameter id mapping in `ns` for ns_idx in original_param_nids: ns = nodes[ns_idx] - ns._param_range = (ns._param_ids.min().item(), ns._param_ids.max().item() + 1) ns._inverse_param_ids = torch.argsort(ns._param_ids) - # TODO: fix broken - # Update `param_ends` - npars = param_ends[-1] - nid = 0 - for ns_idx in original_param_nids: - ns = nodes[ns_idx] - for i in range(ns.num_node_groups): - npars += n_chs[nid+i].item() - param_ends.append(npars) - - nid += ns.num_node_groups - # Restore `nids` target_nids = target_nids.cpu() nids = [] @@ -508,7 +594,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, cids.append(target_cids[sid:eid].reshape(gsize, gnchs).contiguous()) pids.append(target_pids[sid:eid].reshape(gsize, gnchs).contiguous()) - return nids, cids, pids, param_ends + return nids, cids, pids, global_pid_end, global_pfid_end @njit @@ -589,13 +675,10 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par if use_cuda and not torch.cuda.is_available(): use_cuda = False - if not use_cuda: - return sum_layer_backward_compilation_cpu( - nodes, pids, fw_n_group_ids, fw_n_id_in_group, - num_bk_groups, bk_n_group_ids, bk_n_id_in_group, - bk_group_max_pars, bk_num_ns_in_group, - ch_prod_layer_size, 
global_nid_start, use_cuda = use_cuda - ) + if use_cuda: + device = torch.device("cuda:0") + else: + device = torch.device("cpu") # We construct a flattened version of `chids` where the vectors of every partition is concatenated # into a single vector. `chids_partition_start` is used to indicate the start index of every partition's diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 623ec3e9..40a16328 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -22,7 +22,7 @@ class SumLayer(Layer, nn.Module): def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, - param_ends: Sequence, + global_pid_start: int, global_pfid_start: int, node2tiednodes: dict(), layer_sparsity_tol: Optional[float] = None, max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False, @@ -35,6 +35,10 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, self.nodes = nodes + layer_nid_start = global_nid_start + layer_pid_start = global_pid_start + layer_pfid_start = global_pfid_start + ## Get layer statistics & prepare for compilation ## # n_chs: [num_node_groups] stores the number of child nodes of each node @@ -80,9 +84,9 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # nids: List[[partition_size]] stores node group ids # cids: List[[partition_size, partition_max_n_chs]] stores indices of child node groups # pids: List[[partition_size, partition_max_n_chs]] stores indices of edge parameters (1st parameter of every group) - nids, cids, pids, param_ends = sum_layer_forward_compilation( + nids, cids, pids, param_ends, layer_pid_end, layer_pfid_end = sum_layer_forward_compilation( self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, - fw_num_ngs_in_partition, n_chs, global_nid_start, param_ends = param_ends, + fw_num_ngs_in_partition, n_chs, global_nid_start, global_pid_end, global_pfid_start, node2tiednodes, # GPU compilation is slightly slower for small layer due to the kernel jit compilation time use_cuda = force_gpu_compilation or (not disable_gpu_compilation and (self.num_edges > 1000)) ) @@ -95,6 +99,11 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # Store pre-compiled indices from `cids` and `pids` in the following buffer self._cached_fw_pcids = dict() + # Layer info + self._layer_nid_range = (layer_nid_start, layer_nid_start + self.num_nodes) + self._layer_pid_range = (layer_pid_start, layer_pid_end) + self._layer_pfid_range = (layer_pfid_start, layer_pfid_end) + ## Initialize backward pass ## # A sum layer could have children of different group sizes @@ -971,7 +980,8 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] epars = tl.load(params + epars_offsets) - pflows = acc * epars + pflows = tl.load(param_flows + epars_offsets) + pflows += acc * epars tl.store(param_flows + epars_offsets, pflows) diff --git a/src/pyjuice/nodes/nodes.py b/src/pyjuice/nodes/nodes.py index 3024ef07..f87b0160 100644 --- a/src/pyjuice/nodes/nodes.py +++ b/src/pyjuice/nodes/nodes.py @@ -165,4 +165,7 @@ def _clear_tensor_circuit_hooks(self, recursive: bool = True): self._inverse_param_ids = None def __iter__(self): - return node_iterator(self) \ No newline at end of file + return node_iterator(self) + + def provided(self, var_name): + return hasattr(self, var_name) and getattr(self, var_name) is not None From 
fc5d2149a0fc0bc100290048db9d86a63a25fde5 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 10 Dec 2023 21:57:17 +0800 Subject: [PATCH 036/162] .cuda() -> .to(device) --- src/pyjuice/layer/compilation.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 59483289..e7109c24 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -685,22 +685,22 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par # `chids`. That is, `target_chids[chids_partition_start[i]:chids_partition_start[i+1]] == chids[i]` chids_partition_start = torch.zeros_like(num_ngs_in_partition) chids_partition_start[1:] = torch.cumsum(num_ngs_in_partition[:-1], dim = 0) - target_chids = torch.zeros([num_ngs_in_partition.sum()], dtype = torch.long).cuda() + target_chids = torch.zeros([num_ngs_in_partition.sum()], dtype = torch.long).to(device) # Similarly, we flatten `parids`... # Note: we call it `pcids...` because it is shared with `target_pids` parids_partition_start = torch.zeros_like(num_ngs_in_partition) parids_partition_start[1:] = torch.cumsum((num_ngs_in_partition * partition_max_pars)[:-1], dim = 0) - target_parids = torch.zeros([(num_ngs_in_partition * partition_max_pars).sum()], dtype = torch.long).cuda() + target_parids = torch.zeros([(num_ngs_in_partition * partition_max_pars).sum()], dtype = torch.long).to(device) # ...and `parpids` - target_parpids = torch.zeros([(num_ngs_in_partition * partition_max_pars).sum()], dtype = torch.long).cuda() + target_parpids = torch.zeros([(num_ngs_in_partition * partition_max_pars).sum()], dtype = torch.long).to(device) # Move tensors to GPU - n_partition_ids = n_partition_ids.cuda() - n_id_in_partition = n_id_in_partition.cuda() - num_ngs_in_partition = num_ngs_in_partition.cuda() - partition_max_pars = partition_max_pars.cuda() + n_partition_ids = n_partition_ids.to(device) + n_id_in_partition = n_id_in_partition.to(device) + num_ngs_in_partition = num_ngs_in_partition.to(device) + partition_max_pars = partition_max_pars.to(device) # This is the main loop: iterate over `cs` in the layer cs_ngroup_start = 0 # The start index of nodes in the current `cs` @@ -755,14 +755,14 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par pars_offsets = torch.from_numpy(pars_offsets) # Move necessary buffers to GPU - chids_partition_start = chids_partition_start.cuda() - parids_partition_start = parids_partition_start.cuda() - pars_offsets = pars_offsets.cuda() + chids_partition_start = chids_partition_start.to(device) + parids_partition_start = parids_partition_start.to(device) + pars_offsets = pars_offsets.to(device) for ns, edge_ids in zip(cs2parns[cs], par_edge_ids): ns_num_edges = edge_ids.size(1) - edge_ids = edge_ids.cuda() + edge_ids = edge_ids.to(device) if ns.is_tied(): ns_pid_start = ns.get_source_ns()._param_range[0] @@ -770,8 +770,8 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par ns_pid_start = ns._param_range[0] # Get `cum_n_chs` and `chs_offsets`, which are used to get the parameter indices - cum_n_chs = ns2cum_n_chs[ns].cuda() - chs_offsets = ns2chs_offsets[ns].cuda() + cum_n_chs = ns2cum_n_chs[ns].to(device) + chs_offsets = ns2chs_offsets[ns].to(device) # We store these constants in a tensor and retrieve them in the kernel # This is to avoid `triton` from compiling separate kernels for every layer configuration @@ -782,7 +782,7 @@ 
def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par cs_group_size = cs.group_size constexprs = torch.tensor([ns_global_node_start, cs_global_ele_start, ns_group_size, cs_group_size, - ns_pid_start, ns_num_edges, cs_ngroup_start]).long().cuda() + ns_pid_start, ns_num_edges, cs_ngroup_start]).long().to(device) # Make the grid and launch kernel grid = lambda meta: (triton.cdiv(ns_num_edges, meta["BLOCK_SIZE"]),) From daa0d08b82d91e29a67650fadf9fc814ffdc77af Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 10 Dec 2023 23:20:09 +0800 Subject: [PATCH 037/162] fix node partition function --- src/pyjuice/layer/backend/node_partition.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/pyjuice/layer/backend/node_partition.py b/src/pyjuice/layer/backend/node_partition.py index 902fbeba..3f424dcf 100644 --- a/src/pyjuice/layer/backend/node_partition.py +++ b/src/pyjuice/layer/backend/node_partition.py @@ -122,6 +122,9 @@ def _weighted_partition_nodes_dp_simple_compiled(node_n_edges, cum_counts, dp, b for i in range(num_nodes): dp[i,1] = node_n_edges[i] * cum_counts[i] + if dp[-1,1] <= target_overhead: + return dp[-1, 1], 1 + # Main DP target_n_group = max_num_partitions for n_group in range(2, max_num_partitions + 1): @@ -139,7 +142,7 @@ def _weighted_partition_nodes_dp_simple_compiled(node_n_edges, cum_counts, dp, b dp[i,n_group] = min_overhead backtrace[i,n_group] = best_idx - if dp[-1,n_group] < target_overhead: + if dp[-1,n_group] <= target_overhead: target_n_group = n_group break @@ -156,9 +159,6 @@ def _weighted_partition_nodes_dp_simple(node_n_edges: np.ndarray, counts: np.nda dp = np.zeros([node_n_edges.shape[0], max_num_partitions + 1], dtype = np.int64) backtrace = np.zeros([node_n_edges.shape[0], max_num_partitions + 1], dtype = np.int64) - # if debug: - # import pdb; pdb.set_trace() - overhead, target_n_group = _weighted_partition_nodes_dp_simple_compiled( np.ascontiguousarray(node_n_edges), np.ascontiguousarray(cum_counts), @@ -197,8 +197,8 @@ def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], if isinstance(node_n_edges, torch.Tensor): node_n_edges = node_n_edges.detach().cpu().numpy() - total_num_edges = node_n_edges.sum() - target_overhead = None if sparsity_tolerance is None else int(math.ceil(total_num_edges * sparsity_tolerance)) + max_num_edges = node_n_edges.max() + target_overhead = None if sparsity_tolerance is None else int(math.ceil(node_n_edges.shape[0] * max_num_edges * sparsity_tolerance)) if max_num_partitions == 1: partitions = np.zeros([1], dtype = np.int64) @@ -208,6 +208,10 @@ def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], # Sort in non-descending order node_n_edges = np.sort(node_n_edges) + if node_n_edges[0] == 0: + num_zeros = (node_n_edges == 0).sum() + node_n_edges = node_n_edges[num_zeros:] + if algorithm == "dp_simple": group_sizes, overhead = _partition_nodes_dp_simple(node_n_edges, max_num_partitions, target_overhead) From aac0141e05f952302e60b527e3917f9ce2391962 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 11 Dec 2023 01:10:49 +0800 Subject: [PATCH 038/162] fix sum layer backward compilation --- src/pyjuice/layer/compilation.py | 479 ++++++++++++-------------- src/pyjuice/layer/sum_layer.py | 10 +- tests/layer/layer_compilation_test.py | 42 ++- 3 files changed, 263 insertions(+), 268 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index e7109c24..44750a65 100644 --- 
a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -58,6 +58,9 @@ def __getitem__(self, idx): return self.item_list[idx] + def __contains__(self, item): + return item in self.item_set + def flatten_sum_nodes(ns: SumNodes, *args, use_cuda: bool = False): edge_ids = ns.edge_ids @@ -135,8 +138,9 @@ def get_sum_layer_backward_stats(nodes: Sequence[SumNodes]): ch_gsize2cs[ch_gsize] = OrderedSet() ch_gsize2num_ngroups[ch_gsize] = 0 - ch_gsize2cs[ch_gsize].append(cs) - ch_gsize2num_ngroups[ch_gsize] += cs.num_node_groups + if cs not in ch_gsize2cs[ch_gsize]: + ch_gsize2cs[ch_gsize].append(cs) + ch_gsize2num_ngroups[ch_gsize] += cs.num_node_groups if cs not in cs2parns: cs2parns[cs] = OrderedSet() @@ -169,151 +173,6 @@ def get_sum_layer_backward_stats(nodes: Sequence[SumNodes]): return ch_gsize2cs, ch_gsize2num_ngroups, ch_gsize2n_pargs, cs2parns -def sum_layer_forward_compilation_cpu(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, - global_nid_start: int, global_pid_start: int, global_pfid_start: int, node2tiednodes: dict, - max_tied_ns_per_parflow_group: int = 4): - - nids = [torch.zeros([num_ngs_in_partition[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] - cids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] - pids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] - - all_ns_param_ids = dict() - original_param_nids = [] # `ns` with their original parameters (i.e., not tied) - - # This is the main loop: iterate over `ns` in the layer - ngroup_start = 0 # The start index of the node groups in the current `ns` - ngid_in_partition = torch.zeros([len(num_ngs_in_partition)], dtype = torch.long) - for ns_idx, ns in enumerate(nodes): - - if not ns.is_tied(): - if not ns.provided("_param_range"): - global_pid_end = global_pid_start + ns.num_edges - ns._param_range = (global_pid_start, global_pid_end) - global_pid_start = global_pid_end - - global_pfid_end = global_pfid_start + ns.num_edges - ns._param_flow_range = (global_pfid_start, global_pfid_end) - global_pfid_start = global_pfid_end - - add_params_flag = True - else: - add_params_flag = False - - original_param_nids.append(ns_idx) - - # Global pid start index for `ns` - ns_pid_start = ns._param_range[0] - else: - source_ns = ns.get_source_ns() - - # Initialize parameters - if not source_ns.provided("_param_range"): - global_pid_end = global_pid_start + ns.num_edges - ns._param_range = (global_pid_start, global_pid_end) - global_pid_start = global_pid_end - - global_pfid_end = global_pfid_start + ns.num_edges - ns._param_flow_range = (global_pfid_start, global_pfid_end) - global_pfid_start = global_pfid_end - - add_params_flag = True - else: - ns._param_range = deepcopy(source_ns._param_range) - - add_params_flag = False - - if source_ns not in node2tiednodes: - node2tiednodes[source_ns] = [[source_ns], 1, source_ns._param_flow_range] - - dup_count = node2tiednodes[source_ns][1] - if dup_count >= max_tied_ns_per_parflow_group: - global_pfid_end = global_pfid_start + ns.num_edges - ns._param_flow_range = (global_pfid_start, global_pfid_end) - global_pfid_start = global_pfid_end - node2tiednodes[source_ns][2] = ns._param_flow_range - - node2tiednodes[source_ns][0].append(ns) - node2tiednodes[source_ns][1] = 1 - else: - ns._param_flow_range = deepcopy(node2tiednodes[source_ns][2]) - - 
node2tiednodes[source_ns][1] += 1 - - # Global pid start index for `ns` - ns_pid_start = source_ns._param_range[0] - - # number of node groups - ns_num_ngroups = ns.num_node_groups - - # Edge indices of size [2, ns_num_edges] - # Here child ids of the edges are flattened out, i.e., every edge points to - # an actual "node" instead of a node group - edge_ids = ns.edge_ids.clone() - edge_ids = edge_ids[:,:,None].repeat(1, 1, ns.ch_group_size) - edge_ids[1,:,:] *= ns.ch_group_size - edge_ids[1,:,:] += torch.arange(0, ns.ch_group_size)[None,:] - edge_ids = edge_ids.reshape(2, ns.edge_ids.size(1) * ns.ch_group_size).contiguous() - ns_num_edges = edge_ids.size(1) - - # Get number of child nodes for all nodes - ns_nchs = torch.bincount(edge_ids[0,:], minlength = ns_num_ngroups) - - cs_node_cum_nodes = torch.zeros([ns.num_chs], dtype = torch.long) - cs_node_cum_nodes[0] = ns.chs[0].num_nodes - for i in range(1, ns.num_chs): - cs_node_cum_nodes[i] = cs_node_cum_nodes[i-1] + ns.chs[i].num_nodes - - if add_params_flag: - ns_param_ids = torch.zeros([ns_num_edges], dtype = torch.long).cuda() - else: - ns_param_ids = None - - # Iterate over node groups - cum_n_chs = 0 - all_ns_param_ids = dict() - for ng_id in range(ns_num_ngroups): - partition_id = (ns_nchs[ng_id] > fw_partition_max_chs).sum() - local_id = ngid_in_partition[partition_id] - - global_nid = ns._output_ind_range[0] + ng_id * ns.group_size - - # Assign `nids` - nids[partition_id][local_id] = global_nid - - # Assign `cids` - criterion = (edge_ids[0,:] == ng_id) - local_cids = edge_ids[1,criterion] - cids_gid = (local_cids[:,None] >= cs_node_cum_nodes[None,:]).sum(dim = 1) - for ch_id in range(local_cids.size(0)): - local_base = cs_node_cum_nodes[cids_gid[ch_id]-1] if cids_gid[ch_id] >= 1 else 0 - global_cid = ns.chs[cids_gid[ch_id]]._output_ind_range[0] + local_cids[ch_id] - local_base - cids[partition_id][local_id, ch_id] = global_cid - - # Assign `pids` - global_pids = ns_pid_start + cum_n_chs + torch.arange(0, ns.group_size * criterion.sum(), ns.group_size) - pids[partition_id][local_id, 0:global_pids.size(0)] = global_pids - cum_n_chs += ns.group_size * criterion.sum() - - if add_params_flag: - ns_param_ids[criterion] = global_pids - - ngid_in_partition[partition_id] = local_id + 1 - - all_ns_param_ids[ns_idx] = ns_param_ids - - # Store global -> local parameter id mapping in `ns` - for ns_idx, param_ids in all_ns_param_ids.items(): - ns = nodes[ns_idx] - ns._param_ids = param_ids.cpu()[0::ns.ch_group_size] # Every edge specify the start id of [ch_group_size, group_size] parameters - - # Store local -> global parameter id mapping in `ns` - for ns_idx in original_param_nids: - ns = nodes[ns_idx] - ns._inverse_param_ids = torch.argsort(ns._param_ids) - - return nids, cids, pids, global_pid_end, global_pfid_end - - @njit def _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids): for i in range(edge_ids.shape[1]): @@ -397,33 +256,39 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, if use_cuda and not torch.cuda.is_available(): use_cuda = False - if not use_cuda: - return sum_layer_forward_compilation_cpu( - nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, - num_ngs_in_partition, n_chs, global_nid_start, global_pid_start, - global_pfid_start, node2tiednodes, max_tied_ns_per_parflow_group - ) - - # We construct a flattened version of `nids` where the vectors of every partition is concatenated - # into a single vector. 
`nids_group_start` is used to indicate the start index of every group's - # `nids`. That is, `target_nids[nids_partition_start[i]:nids_partition_start[i+1]] == nids[i]` - nids_partition_start = torch.zeros_like(num_ngs_in_partition) - nids_partition_start[1:] = torch.cumsum(num_ngs_in_partition[:-1], dim = 0) - target_nids = torch.zeros([num_ngs_in_partition.sum()], dtype = torch.long).cuda() + if use_cuda: + device = torch.device("cuda:0") + else: + device = torch.device("cpu") - # Similarly, we flatten `cids`... - # Note: we call it `pcids...` because it is shared with `target_pids` - pcids_partition_start = torch.zeros_like(num_ngs_in_partition) - pcids_partition_start[1:] = torch.cumsum((num_ngs_in_partition * fw_partition_max_chs)[:-1], dim = 0) - target_cids = torch.zeros([(num_ngs_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).cuda() + if use_cuda: + # We construct a flattened version of `nids` where the vectors of every partition is concatenated + # into a single vector. `nids_group_start` is used to indicate the start index of every group's + # `nids`. That is, `target_nids[nids_partition_start[i]:nids_partition_start[i+1]] == nids[i]` + nids_partition_start = torch.zeros_like(num_ngs_in_partition) + nids_partition_start[1:] = torch.cumsum(num_ngs_in_partition[:-1], dim = 0) + target_nids = torch.zeros([num_ngs_in_partition.sum()], dtype = torch.long).to(device) + + # Similarly, we flatten `cids`... + # Note: we call it `pcids...` because it is shared with `target_pids` + pcids_partition_start = torch.zeros_like(num_ngs_in_partition) + pcids_partition_start[1:] = torch.cumsum((num_ngs_in_partition * fw_partition_max_chs)[:-1], dim = 0) + target_cids = torch.zeros([(num_ngs_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).to(device) + + # ...and `pids` + target_pids = torch.zeros([(num_ngs_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).to(device) + + # Move necessary tensors to GPU + n_partition_ids = n_partition_ids.to(device) + n_id_in_partition = n_id_in_partition.to(device) + fw_partition_max_chs = fw_partition_max_chs.to(device) - # ...and `pids` - target_pids = torch.zeros([(num_ngs_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).cuda() + else: + nids = [torch.zeros([num_ngs_in_partition[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] + cids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] + pids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] - # Move necessary tensors to GPU - n_partition_ids = n_partition_ids.cuda() - n_id_in_partition = n_id_in_partition.cuda() - fw_partition_max_chs = fw_partition_max_chs.cuda() + ngid_in_partition = torch.zeros([len(num_ngs_in_partition)], dtype = torch.long) all_ns_param_ids = dict() original_param_nids = [] # `ns` with their original parameters (i.e., not tied) @@ -502,63 +367,112 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, edge_ids = edge_ids.reshape(2, ns.edge_ids.size(1) * ns.ch_group_size).contiguous() ns_num_edges = edge_ids.size(1) - # Precompute the child offset ids for every edge. 
That is, the `?` - # mark in `cids[partition][local_id,?]` - chs_offsets = np.zeros([ns_num_edges], dtype = np.int64) - ns_nchs = np.zeros([ns_num_ngroups], dtype = np.int64) - - _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids.numpy()) - chs_offsets = torch.from_numpy(chs_offsets) - - # Construct helper indices for child nodes - # `cs_ele_id_start` contains the global start indices of the child nodes - # `cs_node_cum_ids` contains the local cumulative number of child nodes - cs_ele_id_start = torch.zeros([ns.num_chs], dtype = torch.long) - cs_node_cum_ids = torch.zeros([ns.num_chs], dtype = torch.long) - for i, cs in enumerate(ns.chs): - cs_ele_id_start[i] = cs._output_ind_range[0] - if i < ns.num_chs - 1: - cs_node_cum_ids[i+1] = cs_node_cum_ids[i] + cs.num_nodes - - # Cumulative nchs - ns_nchs = torch.from_numpy(ns_nchs) - cum_n_chs = torch.zeros([ns_num_ngroups], dtype = torch.long) - cum_n_chs[1:] = torch.cumsum(ns_nchs[:-1], dim = 0) + if use_cuda: + ## GPU mode ## + + # Precompute the child offset ids for every edge. That is, the `?` + # mark in `cids[partition][local_id,?]` + chs_offsets = np.zeros([ns_num_edges], dtype = np.int64) + ns_nchs = np.zeros([ns_num_ngroups], dtype = np.int64) + + _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids.numpy()) + chs_offsets = torch.from_numpy(chs_offsets) + + # Construct helper indices for child nodes + # `cs_ele_id_start` contains the global start indices of the child nodes + # `cs_node_cum_ids` contains the local cumulative number of child nodes + cs_ele_id_start = torch.zeros([ns.num_chs], dtype = torch.long) + cs_node_cum_ids = torch.zeros([ns.num_chs], dtype = torch.long) + for i, cs in enumerate(ns.chs): + cs_ele_id_start[i] = cs._output_ind_range[0] + if i < ns.num_chs - 1: + cs_node_cum_ids[i+1] = cs_node_cum_ids[i] + cs.num_nodes + + # Cumulative nchs + ns_nchs = torch.from_numpy(ns_nchs) + cum_n_chs = torch.zeros([ns_num_ngroups], dtype = torch.long) + cum_n_chs[1:] = torch.cumsum(ns_nchs[:-1], dim = 0) + + if add_params_flag: + ns_param_ids = torch.zeros([ns_num_edges], dtype = torch.long).to(device) + else: + ns_param_ids = None + + # The following kernel assigns the corresponding indices to `nids`, `cids`, and `pids` + # We first move necessary buffers to GPU + nids_partition_start = nids_partition_start.to(device) + edge_ids = edge_ids.to(device) + chs_offsets = chs_offsets.to(device) + cs_ele_id_start = cs_ele_id_start.to(device) + cs_node_cum_ids = cs_node_cum_ids.to(device) + cum_n_chs = cum_n_chs.to(device) + pcids_partition_start = pcids_partition_start.to(device) + + # We store these constants in a tensor and retrieve them in the kernel + # This is to avoid `triton` from compiling separate kernels for every layer configuration + # Saves 99.9% compilation time :) + constexprs = torch.tensor([global_nid_start, ns_pid_start, ngroup_start, ns_num_edges, ns.group_size]).long().to(device) + + num_chs_np2 = triton.next_power_of_2(ns.num_chs) + + # Make the grid and launch kernel + grid = lambda meta: (triton.cdiv(ns_num_edges, meta["BLOCK_SIZE"]),) + + _assign_target_ncpids_kernel[grid]( + target_nids, nids_partition_start, target_cids, pcids_partition_start, + target_pids, edge_ids, chs_offsets, n_partition_ids, n_id_in_partition, + cs_ele_id_start, cs_node_cum_ids, fw_partition_max_chs, cum_n_chs, + ns_param_ids, constexprs, ns.num_chs, num_chs_np2, + add_params_flag, BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) + ) + + ngroup_start += ns_num_ngroups - if add_params_flag: - ns_param_ids = torch.zeros([ns_num_edges], dtype = 
torch.long).cuda() else: - ns_param_ids = None + ## CPU mode ## - # The following kernel assigns the corresponding indices to `nids`, `cids`, and `pids` - # We first move necessary buffers to GPU - nids_partition_start = nids_partition_start.cuda() - edge_ids = edge_ids.cuda() - chs_offsets = chs_offsets.cuda() - cs_ele_id_start = cs_ele_id_start.cuda() - cs_node_cum_ids = cs_node_cum_ids.cuda() - cum_n_chs = cum_n_chs.cuda() - pcids_partition_start = pcids_partition_start.cuda() + # Get number of child nodes for all nodes + ns_nchs = torch.bincount(edge_ids[0,:], minlength = ns_num_ngroups) - # We store these constants in a tensor and retrieve them in the kernel - # This is to avoid `triton` from compiling separate kernels for every layer configuration - # Saves 99.9% compilation time :) - constexprs = torch.tensor([global_nid_start, ns_pid_start, ngroup_start, ns_num_edges, ns.group_size]).long().cuda() + cs_node_cum_nodes = torch.zeros([ns.num_chs], dtype = torch.long) + cs_node_cum_nodes[0] = ns.chs[0].num_nodes + for i in range(1, ns.num_chs): + cs_node_cum_nodes[i] = cs_node_cum_nodes[i-1] + ns.chs[i].num_nodes - num_chs_np2 = triton.next_power_of_2(ns.num_chs) + if add_params_flag: + ns_param_ids = torch.zeros([ns_num_edges], dtype = torch.long) + else: + ns_param_ids = None - # Make the grid and launch kernel - grid = lambda meta: (triton.cdiv(ns_num_edges, meta["BLOCK_SIZE"]),) + # Iterate over node groups + cum_n_chs = 0 + for ng_id in range(ns_num_ngroups): + partition_id = (ns_nchs[ng_id] > fw_partition_max_chs).sum() + local_id = ngid_in_partition[partition_id] - _assign_target_ncpids_kernel[grid]( - target_nids, nids_partition_start, target_cids, pcids_partition_start, - target_pids, edge_ids, chs_offsets, n_partition_ids, n_id_in_partition, - cs_ele_id_start, cs_node_cum_ids, fw_partition_max_chs, cum_n_chs, - ns_param_ids, constexprs, ns.num_chs, num_chs_np2, - add_params_flag, BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) - ) + global_nid = ns._output_ind_range[0] + ng_id * ns.group_size - ngroup_start += ns_num_ngroups + # Assign `nids` + nids[partition_id][local_id] = global_nid + + # Assign `cids` + criterion = (edge_ids[0,:] == ng_id) + local_cids = edge_ids[1,criterion] + cids_gid = (local_cids[:,None] >= cs_node_cum_nodes[None,:]).sum(dim = 1) + for ch_id in range(local_cids.size(0)): + local_base = cs_node_cum_nodes[cids_gid[ch_id]-1] if cids_gid[ch_id] >= 1 else 0 + global_cid = ns.chs[cids_gid[ch_id]]._output_ind_range[0] + local_cids[ch_id] - local_base + cids[partition_id][local_id, ch_id] = global_cid + + # Assign `pids` + global_pids = ns_pid_start + cum_n_chs + torch.arange(0, ns.group_size * criterion.sum(), ns.group_size) + pids[partition_id][local_id, 0:global_pids.size(0)] = global_pids + cum_n_chs += ns.group_size * criterion.sum() + + if add_params_flag: + ns_param_ids[criterion] = global_pids + + ngid_in_partition[partition_id] = local_id + 1 if add_params_flag: all_ns_param_ids[ns_idx] = ns_param_ids @@ -573,28 +487,29 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, ns = nodes[ns_idx] ns._inverse_param_ids = torch.argsort(ns._param_ids) - # Restore `nids` - target_nids = target_nids.cpu() - nids = [] - for partition_id in range(num_ngs_in_partition.size(0)): - sid = nids_partition_start[partition_id] - eid = sid + num_ngs_in_partition[partition_id] - nids.append(target_nids[sid:eid].contiguous()) - - # Restore `cids` and `pids` - target_cids = target_cids.cpu() - target_pids = target_pids.cpu() - cids = [] - pids = [] 
- for partition_id in range(num_ngs_in_partition.size(0)): - sid = pcids_partition_start[partition_id] - gsize = num_ngs_in_partition[partition_id] - gnchs = fw_partition_max_chs[partition_id] - eid = sid + gsize * gnchs - cids.append(target_cids[sid:eid].reshape(gsize, gnchs).contiguous()) - pids.append(target_pids[sid:eid].reshape(gsize, gnchs).contiguous()) - - return nids, cids, pids, global_pid_end, global_pfid_end + if use_cuda: + # Restore `nids` + target_nids = target_nids.cpu() + nids = [] + for partition_id in range(num_ngs_in_partition.size(0)): + sid = nids_partition_start[partition_id] + eid = sid + num_ngs_in_partition[partition_id] + nids.append(target_nids[sid:eid].contiguous()) + + # Restore `cids` and `pids` + target_cids = target_cids.cpu() + target_pids = target_pids.cpu() + cids = [] + pids = [] + for partition_id in range(num_ngs_in_partition.size(0)): + sid = pcids_partition_start[partition_id] + gsize = num_ngs_in_partition[partition_id] + gnchs = fw_partition_max_chs[partition_id] + eid = sid + gsize * gnchs + cids.append(target_cids[sid:eid].reshape(gsize, gnchs).contiguous()) + pids.append(target_pids[sid:eid].reshape(gsize, gnchs).contiguous()) + + return nids, cids, pids, global_pid_start, global_pfid_start @njit @@ -632,6 +547,7 @@ def _assign_target_chpapids_kernel(target_chids_ptr, chids_partition_start_ptr, ns_pid_start = tl.load(constexprs_ptr + 4) num_edges = tl.load(constexprs_ptr + 5) cs_ngroup_start = tl.load(constexprs_ptr + 6) + pars_offsets_start = tl.load(constexprs_ptr + 7) # Get edge indices to be processed by the current block offsets = block_start + tl.arange(0, BLOCK_SIZE) @@ -647,7 +563,7 @@ def _assign_target_chpapids_kernel(target_chids_ptr, chids_partition_start_ptr, # Get parent offsets # Note: this is the `?` mark in `parids[partition_id][local_id,?]` - pars_offset = tl.load(pars_offsets_ptr + offsets, mask = mask, other = 0) + pars_offset = tl.load(pars_offsets_ptr + pars_offsets_start + offsets, mask = mask, other = 0) # Store to `target_chids` chids_start = tl.load(chids_partition_start_ptr + partition_id, mask = mask, other = 0) @@ -668,6 +584,43 @@ def _assign_target_chpapids_kernel(target_chids_ptr, chids_partition_start_ptr, tl.store(target_parpids_ptr + parids_offsets, global_pid, mask = mask) +def _assign_target_chpapids_cpu(target_chids, chids_partition_start, target_parids, target_parpids, parids_partition_start, + edge_ids, pars_offsets, n_partition_ids, n_id_in_partition, num_ngs_in_partition, + partition_max_pars, cum_n_chs, chs_offsets, ns_global_node_start, cs_global_ele_start, + ns_group_size, cs_group_size, ns_pid_start, ns_num_edges, cs_ngroup_start, pars_offsets_start): + + for edge_id in range(ns_num_edges): + # Get `cid` and `nid` (size of `edge_ids` is [2, num_edges]) + cid = edge_ids[1, edge_id] + nid = edge_ids[0, edge_id] + + # Get `partition_id` and `local_id` + partition_id = n_partition_ids[cid + cs_ngroup_start] + local_id = n_id_in_partition[cid + cs_ngroup_start] + + # Get parent offsets + # Note: this is the `?` mark in `parids[partition_id][local_id,?]` + pars_offset = pars_offsets[pars_offsets_start + edge_id] + + # Store to `target_chids` + chids_start = chids_partition_start[partition_id] + global_chid = cs_global_ele_start + cid * cs_group_size + target_chids[chids_start + local_id] = global_chid + + # Store to `target_parids` + partition_max_n_pargs = partition_max_pars[partition_id] + parids_start = parids_partition_start[partition_id] + parids_offsets = parids_start + local_id * 
partition_max_n_pargs + pars_offset + global_parid = ns_global_node_start + nid * ns_group_size + target_parids[parids_offsets] = global_parid + + # Store to `target_parpids` + ns_local_pid = cum_n_chs[nid] + chs_offset = chs_offsets[edge_id] + global_pid = ns_pid_start + (ns_local_pid + chs_offset) * ns_group_size * cs_group_size + target_parpids[parids_offsets] = global_pid + + @torch.no_grad() def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_partition, num_ngs_in_partition, partition_max_pars, use_cuda: bool = False): @@ -710,6 +663,7 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par # Collect all edge ids that point to `cs` in every parent `ns` par_edge_ids = [] + local_ns2chs_offsets = dict() for ns in cs2parns[cs]: cs_id = ns.chs.index(cs) edge_sid = sum([c.num_node_groups for c in ns.chs[:cs_id]]) @@ -734,7 +688,9 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par cum_n_chs[1:] = torch.cumsum(ns_nchs[:-1], dim = 0) ns2cum_n_chs[ns] = cum_n_chs - ns2chs_offsets[ns] = chs_offsets[criterion] + ns2chs_offsets[ns] = chs_offsets + + local_ns2chs_offsets[ns] = ns2chs_offsets[ns][criterion] cs_num_ngroups = cs.num_node_groups cs_num_edges = sum([edge_ids.size(1) for edge_ids in par_edge_ids]) @@ -759,6 +715,7 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par parids_partition_start = parids_partition_start.to(device) pars_offsets = pars_offsets.to(device) + pars_offsets_start = 0 for ns, edge_ids in zip(cs2parns[cs], par_edge_ids): ns_num_edges = edge_ids.size(1) @@ -771,7 +728,7 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par # Get `cum_n_chs` and `chs_offsets`, which are used to get the parameter indices cum_n_chs = ns2cum_n_chs[ns].to(device) - chs_offsets = ns2chs_offsets[ns].to(device) + chs_offsets = local_ns2chs_offsets[ns].to(device) # We store these constants in a tensor and retrieve them in the kernel # This is to avoid `triton` from compiling separate kernels for every layer configuration @@ -781,19 +738,31 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par ns_group_size = ns.group_size cs_group_size = cs.group_size - constexprs = torch.tensor([ns_global_node_start, cs_global_ele_start, ns_group_size, cs_group_size, - ns_pid_start, ns_num_edges, cs_ngroup_start]).long().to(device) + if use_cuda: + constexprs = torch.tensor([ns_global_node_start, cs_global_ele_start, ns_group_size, cs_group_size, + ns_pid_start, ns_num_edges, cs_ngroup_start, pars_offsets_start]).long().to(device) + + # Make the grid and launch kernel + grid = lambda meta: (triton.cdiv(ns_num_edges, meta["BLOCK_SIZE"]),) + + _assign_target_chpapids_kernel[grid]( + target_chids, chids_partition_start, target_parids, target_parpids, parids_partition_start, + edge_ids, pars_offsets, n_partition_ids, n_id_in_partition, num_ngs_in_partition, + partition_max_pars, cum_n_chs, chs_offsets, constexprs, BLOCK_SIZE = 1024 + ) + else: + _assign_target_chpapids_cpu( + target_chids, chids_partition_start, target_parids, target_parpids, parids_partition_start, + edge_ids, pars_offsets, n_partition_ids, n_id_in_partition, num_ngs_in_partition, + partition_max_pars, cum_n_chs, chs_offsets, ns_global_node_start, cs_global_ele_start, + ns_group_size, cs_group_size, ns_pid_start, ns_num_edges, cs_ngroup_start, pars_offsets_start + ) - # Make the grid and launch kernel - grid = lambda meta: (triton.cdiv(ns_num_edges, meta["BLOCK_SIZE"]),) + 
pars_offsets_start += ns_num_edges - _assign_target_chpapids_kernel[grid]( - target_chids, chids_partition_start, target_parids, target_parpids, parids_partition_start, - edge_ids, pars_offsets, n_partition_ids, n_id_in_partition, num_ngs_in_partition, - partition_max_pars, cum_n_chs, chs_offsets, constexprs, BLOCK_SIZE = 1024 - ) + cs_ngroup_start += cs.num_node_groups - cs_ngroup_start += cs.num_node_groups + # import pdb; pdb.set_trace() # Restore `chids` target_chids = target_chids.cpu() diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 40a16328..1aa08182 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -68,7 +68,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, fw_n_id_in_partition = torch.zeros([layer_num_ngroups], dtype = torch.long) fw_num_ngs_in_partition = torch.zeros([self.num_fw_partitions], dtype = torch.long) - min_n_chs = 0 + min_n_chs = 1 for partition_id, max_n_chs in enumerate(fw_partition_max_chs): criterion = (n_chs >= min_n_chs) & (n_chs <= max_n_chs) partition_size = criterion.sum().item() @@ -84,9 +84,9 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # nids: List[[partition_size]] stores node group ids # cids: List[[partition_size, partition_max_n_chs]] stores indices of child node groups # pids: List[[partition_size, partition_max_n_chs]] stores indices of edge parameters (1st parameter of every group) - nids, cids, pids, param_ends, layer_pid_end, layer_pfid_end = sum_layer_forward_compilation( + nids, cids, pids, layer_pid_end, layer_pfid_end = sum_layer_forward_compilation( self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, - fw_num_ngs_in_partition, n_chs, global_nid_start, global_pid_end, global_pfid_start, node2tiednodes, + fw_num_ngs_in_partition, n_chs, global_nid_start, global_pid_start, global_pfid_start, node2tiednodes, # GPU compilation is slightly slower for small layer due to the kernel jit compilation time use_cuda = force_gpu_compilation or (not disable_gpu_compilation and (self.num_edges > 1000)) ) @@ -139,7 +139,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, bk_n_id_in_partition = torch.zeros([num_ngroups], dtype = torch.long) bk_num_ngs_in_partition = torch.zeros([num_bk_partitions], dtype = torch.long) - min_n_pars = 0 + min_n_pars = 1 for partition_id, max_n_pars in enumerate(bk_partition_max_pars): criterion = (n_pargs >= min_n_pars) & (n_pargs <= max_n_pars) partition_size = criterion.sum().item() @@ -161,7 +161,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, num_ngs_in_partition = bk_num_ngs_in_partition, partition_max_pars = bk_partition_max_pars, # GPU compilation is slightly slower for small layer due to the kernel jit compilation time - use_cuda = not disable_gpu_compilation and (self.num_edges > 1000) + use_cuda = force_gpu_compilation or (not disable_gpu_compilation and (self.num_edges > 1000)) ) chids.extend(curr_chids) diff --git a/tests/layer/layer_compilation_test.py b/tests/layer/layer_compilation_test.py index 52b43680..96e35785 100644 --- a/tests/layer/layer_compilation_test.py +++ b/tests/layer/layer_compilation_test.py @@ -49,7 +49,7 @@ def prod_layer_compilation_test(): def sum_layer_compilation_test(): - for group_size in [8, 16]: + for group_size in [1, 8, 16]: with juice.set_group_size(group_size): @@ -69,22 +69,48 @@ def sum_layer_compilation_test(): ns0 = summate(np0, edge_ids = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 2, 4, 
2, 1, 5, 6, 2, 1]])) ns1 = summate(np0, np6, edge_ids = torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5], [0, 2, 4, 2, 1, 5, 6, 2, 1, 10, 3, 8, 9]])) + ns2 = summate(np5, edge_ids = torch.tensor([[0, 0, 1, 1, 2, 3], [6, 4, 2, 1, 3, 5]])) input_layer = InputLayer([ni0, ni1, ni2, ni3, ni4], cum_nodes = group_size) prod_layer = ProdLayer([np0, np1, np2, np3, np4, np5, np6], layer_sparsity_tol = 0.1, force_gpu_compilation = True) - sum_layer_cpu = SumLayer([ns0, ns1], global_nid_start = input_layer.num_nodes + group_size, - param_ends = [1], layer_sparsity_tol = 0.1, disable_gpu_compilation = True) - sum_layer_gpu = SumLayer([ns0, ns1], global_nid_start = input_layer.num_nodes + group_size, - param_ends = [1], layer_sparsity_tol = 0.1, force_gpu_compilation = True) + sum_layer_cpu = SumLayer([ns0, ns1, ns2], global_nid_start = input_layer.num_nodes + group_size, + global_pid_start = 1, global_pfid_start = 0, node2tiednodes = dict(), + layer_sparsity_tol = 0.1, disable_gpu_compilation = True) + sum_layer_gpu = SumLayer([ns0, ns1, ns2], global_nid_start = input_layer.num_nodes + group_size, + global_pid_start = 1, global_pfid_start = 0, node2tiednodes = dict(), + layer_sparsity_tol = 0.1, force_gpu_compilation = True) - for i in range(3): + # import pdb; pdb.set_trace() + + for i in range(len(sum_layer_cpu.partitioned_nids)): assert torch.all(sum_layer_cpu.partitioned_nids[i] == sum_layer_gpu.partitioned_nids[i]) assert torch.all(sum_layer_cpu.partitioned_cids[i] == sum_layer_gpu.partitioned_cids[i]) assert torch.all(sum_layer_cpu.partitioned_pids[i] == sum_layer_gpu.partitioned_pids[i]) - import pdb; pdb.set_trace() - + for i in range(len(sum_layer_cpu.partitioned_chids)): + assert torch.all(sum_layer_cpu.partitioned_chids[i] == sum_layer_gpu.partitioned_chids[i]) + assert torch.all(sum_layer_cpu.partitioned_parids[i] == sum_layer_gpu.partitioned_parids[i]) + assert torch.all(sum_layer_cpu.partitioned_parpids[i] == sum_layer_gpu.partitioned_parpids[i]) + + ncpids = set() + for i in range(len(sum_layer_cpu.partitioned_nids)): + for j in range(sum_layer_gpu.partitioned_cids[i].size(0)): + nid = sum_layer_gpu.partitioned_nids[i][j].item() + for k in range(sum_layer_gpu.partitioned_cids[i].size(1)): + cid = sum_layer_gpu.partitioned_cids[i][j,k].item() + pid = sum_layer_gpu.partitioned_pids[i][j,k].item() + if cid != 0: + ncpids.add((nid, cid, pid)) + + for i in range(len(sum_layer_cpu.partitioned_chids)): + for j in range(sum_layer_gpu.partitioned_parids[i].size(0)): + chid = sum_layer_gpu.partitioned_chids[i][j].item() + for k in range(sum_layer_gpu.partitioned_parids[i].size(1)): + parid = sum_layer_gpu.partitioned_parids[i][j,k].item() + pid = sum_layer_gpu.partitioned_parpids[i][j,k].item() + if parid != 0: + assert (parid, chid, pid) in ncpids, f"({parid}, {chid}, {pid})" if __name__ == "__main__": From 9f999378369e1e942fcd582ab7595b64aab7b697 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 11 Dec 2023 01:44:32 +0800 Subject: [PATCH 039/162] expose compilation option `max_tied_ns_per_parflow_group` --- src/pyjuice/layer/sum_layer.py | 37 +++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 1aa08182..8ff27ad6 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -25,6 +25,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, global_pid_start: int, global_pfid_start: int, node2tiednodes: dict(), layer_sparsity_tol: 
Optional[float] = None, max_num_partitions: Optional[int] = None, + max_tied_ns_per_parflow_group: int = 8, disable_gpu_compilation: bool = False, force_gpu_compilation: bool = False) -> None: @@ -84,9 +85,11 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, # nids: List[[partition_size]] stores node group ids # cids: List[[partition_size, partition_max_n_chs]] stores indices of child node groups # pids: List[[partition_size, partition_max_n_chs]] stores indices of edge parameters (1st parameter of every group) - nids, cids, pids, layer_pid_end, layer_pfid_end = sum_layer_forward_compilation( + # pfids: List[[partition_size, partition_max_n_chs]] stores indices of edge parameter flows (1st parameter flow of every group) + nids, cids, pids, pfids, layer_pid_end, layer_pfid_end = sum_layer_forward_compilation( self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, fw_num_ngs_in_partition, n_chs, global_nid_start, global_pid_start, global_pfid_start, node2tiednodes, + max_tied_ns_per_parflow_group = max_tied_ns_per_parflow_group, # GPU compilation is slightly slower for small layer due to the kernel jit compilation time use_cuda = force_gpu_compilation or (not disable_gpu_compilation and (self.num_edges > 1000)) ) @@ -95,6 +98,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, self.partitioned_nids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in nids]) self.partitioned_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in cids]) self.partitioned_pids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in pids]) + self.partitioned_pfids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in pfids]) # Store pre-compiled indices from `cids` and `pids` in the following buffer self._cached_fw_pcids = dict() @@ -209,7 +213,7 @@ def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: t for partition_id in range(self.num_fw_partitions): nids = self.partitioned_nids[partition_id] cids = self.partitioned_cids[partition_id] - pids = self.partitioned_pids[partition_id] + pfids = self.partitioned_pfids[partition_id] self._forward( node_mars, element_mars, params, nids, cids, pids, partition_id = partition_id @@ -292,11 +296,12 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, nids = self.partitioned_nids[partition_id] cids = self.partitioned_cids[partition_id] pids = self.partitioned_pids[partition_id] + pfids = self.partitioned_pfids[partition_id] self._backward( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids = nids, - cids = cids, pids = pids, partition_id = partition_id + cids = cids, pids = pids, pfids = pfids, partition_id = partition_id ) return None @@ -650,8 +655,9 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: Optional[torch.Tensor] = None, cids: Optional[torch.Tensor] = None, - pids: Optional[torch.Tensor] = None, chids: Optional[torch.Tensor] = None, - parids: Optional[torch.Tensor] = None, parpids: Optional[torch.Tensor] = None, + pids: Optional[torch.Tensor] = None, pfids: Optional[torch.Tensor] = None, + chids: Optional[torch.Tensor] = None, parids: Optional[torch.Tensor] = None, + parpids: Optional[torch.Tensor] = None, cs_group_size: int = 0, local_ids: Optional[torch.Tensor] = None, partition_id: int = 
-1, mode: Optional[str] = None) -> None: """ @@ -685,7 +691,7 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, if mode == "block_sparse": self._backward_block_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, - nids, cids, pids, chids, parids, parpids, cs_group_size, local_ids, + nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, partition_id = partition_id ) @@ -699,7 +705,7 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, - nids: Optional[torch.Tensor], cids: Optional[torch.Tensor], pids: Optional[torch.Tensor], + nids: Optional[torch.Tensor], cids: Optional[torch.Tensor], pids: Optional[torch.Tensor], pfids: Optional[torch.Tensor], chids: Optional[torch.Tensor], parids: Optional[torch.Tensor], parpids: Optional[torch.Tensor], cs_group_size: int, local_ids: Optional[torch.Tensor] = None, partition_id: int = -1) -> None: @@ -731,7 +737,7 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. if param_flows is not None and nids is not None: self._backward_block_sparse_par_flows( node_flows, params, node_mars, element_mars, param_flows, - nids = nids, cids = cids, pids = pids + nids = nids, cids = cids, pids = pids, pfids = pfids ) return None @@ -919,7 +925,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo @staticmethod @triton.jit - def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, + def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, batch_size: tl.constexpr, num_edges: tl.constexpr, TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): @@ -978,16 +984,18 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - epars = tl.load(params + epars_offsets) - pflows = tl.load(param_flows + epars_offsets) - pflows += acc * epars - tl.store(param_flows + epars_offsets, pflows) + parflow_start = tl.load(pfids + ngroup_id * num_edges + offs_edge) + epars_offsets = offs_node[:,None] + parflow_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + curr_pflows = acc * epars + + tl.atomic_add(param_flows + epars_offsets, curr_pflows) # TODO: reimplement with the lock mechanism def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, - cids: torch.Tensor, pids: torch.Tensor, ) -> None: + cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor) -> None: """ Backward pass of sum layers w.r.t. sum parameters with the block-sparse processing kernel. 
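
The change to `_bk_triton_block_sparse_par_kernel` above (dropping the load/add/store round trip in favor of `tl.atomic_add`, and indexing flows through the new `pfids` buffer) matters because several program instances can accumulate into the same entries of `param_flows` at once: different batch tiles of the same node, and tied nodes that share a parameter-flow slot. The snippet below is only a minimal sketch of that race-free accumulation pattern under assumed names (`_accumulate_pflows_kernel` and its arguments are made up); it is not the layer kernel itself.

# Minimal sketch of atomic parameter-flow accumulation (hypothetical names, not the patch's kernel)
import torch
import triton
import triton.language as tl

@triton.jit
def _accumulate_pflows_kernel(pflow_ptr, contrib_ptr, slot_ptr, num_items, BLOCK: tl.constexpr):
    pid = tl.program_id(axis = 0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < num_items

    # Every item carries a contribution and the parameter-flow slot it belongs to
    contrib = tl.load(contrib_ptr + offs, mask = mask, other = 0.0)
    slots = tl.load(slot_ptr + offs, mask = mask, other = 0)

    # A tl.load / tl.store round trip here could drop updates when two blocks
    # hit the same slot; tl.atomic_add makes the accumulation race-free
    tl.atomic_add(pflow_ptr + slots, contrib, mask = mask)

if torch.cuda.is_available():
    n, k = 1024, 16
    pflows = torch.zeros(k, device = "cuda")
    contrib = torch.rand(n, device = "cuda")
    slots = torch.randint(0, k, (n,), device = "cuda")
    _accumulate_pflows_kernel[(triton.cdiv(n, 256),)](pflows, contrib, slots, n, BLOCK = 256)
    assert torch.allclose(pflows, torch.zeros(k, device = "cuda").scatter_add_(0, slots, contrib), atol = 1e-4)

Atomics serialize conflicting writes, which is presumably why the kernel keeps the "reimplement with the lock mechanism" TODO and why `max_tied_ns_per_parflow_group` caps how many tied nodes funnel into a single parameter-flow buffer.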
@@ -1036,6 +1044,7 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor nids = nids, cids = cids, pids = pids, + pfids = pfids, batch_size = batch_size, num_edges = num_edges, TILE_SIZE_B = TILE_SIZE_B, From c6a4cec3009a39c0ced1e1c7a0b966167a6b67b8 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 11 Dec 2023 02:18:07 +0800 Subject: [PATCH 040/162] kernel for parflow fusing --- src/pyjuice/layer/backend/parflow_fusing.py | 96 +++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 src/pyjuice/layer/backend/parflow_fusing.py diff --git a/src/pyjuice/layer/backend/parflow_fusing.py b/src/pyjuice/layer/backend/parflow_fusing.py new file mode 100644 index 00000000..53198b75 --- /dev/null +++ b/src/pyjuice/layer/backend/parflow_fusing.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import torch +import torch.nn as nn +import triton +import triton.language as tl + + +def compile_cum_par_flows_fn(node2tiednodes, MAX_NGROUPS = 2048, BLOCK_SIZE = 2048): + + ngroup2kernel_specs = [] + for source_ns, item in node2tiednodes.items(): + if len(item[0]) > 1: # If the length is 1, then everything is already accumulated in the source node's parflow + num_par_flows = source_ns._param_flow_range[1] - source_ns._param_flow_range[0] + pfid_start = source_ns._param_flow_range[0] + ch_nodes = item[0] + + assert len(ch_nodes) <= MAX_NGROUPS, f"We only support fusing at most {MAX_NGROUPS} groups for parameter flow accumulation. " \ + "Consider setting a greater `max_tied_ns_per_parflow_group` when compiling sum layers." + + ngroup = triton.next_power_of_2(len(ch_nodes)) + + ch_pfids = [] + for ch_ns in ch_nodes: + ch_pfids.append(ch_ns._param_flow_range[0]) + + if ngroup not in ngroup2kernel_specs: + ngroup2kernel_specs[ngroup] = [] + + ngroup2kernel_specs[ngroup].append([pfid_start, num_par_flows, ch_pfids]) + + kernels_args = [] + for ngroup, kernel_specs in ngroup2kernel_specs.items(): + + BLOCK_G = ngroup + BLOCK_M = BLOCK_SIZE // BLOCK_G + + target_pfids = [] + block_sizes = [] + ch_pfids = [] + for kernel_spec in kernel_specs: + pfid_start, num_par_flows, ch_pfids = kernel_spec + for blk_start in range(0, num_par_flows, BLOCK_M): + blk_end = min(blk_start + BLOCK_M, num_par_flows) + blk_size = blk_end - blk_start + + ch_pfid = [chid_start + blk_start for chid_start in ch_pfids] + ch_pfid.extend([0] * (BLOCK_G - len(ch_pfid))) + + target_pfids.append(pfid_start + blk_start) + block_sizes.append(blk_size) + ch_pfids.append() + + target_pfids = torch.tensor(target_pfids).contiguous() + block_sizes = torch.tensor(block_sizes).contiguous() + ch_pfids = torch.tensor(ch_pfids).contiguous() + + kernels_args.append([target_pfids, block_sizes, ch_pfids, BLOCK_G, BLOCK_M]) + + return kernels_args + + +@triton.jit +def cum_par_flows_kernel(param_flows, target_pfids, block_sizes, ch_pfids, BLOCK_G: tl.constexpr, BLOCK_M: tl.constexpr): + + pid = tl.program_id(axis = 0) + + offs_g = tl.arange(0, BLOCK_G) + pid * BLOCK_G + offs_chblk = tl.load(ch_pfids + offs_chblk) + mask_chblk = offs_chblk >= 0 + + block_size = tl.load(block_sizes + pid) + offs_m = tl.arange(0, BLOCK_M)[None,:] + mask_m = offs_m < block_size + + offs_chs = offs_chblk[:,None] + tl.arange(0, BLOCK_M)[None,:] + ch_pflows = tl.load(param_flows + offs_chs, mask = mask_chblk[:,None] & mask_m[None,:], other = 0) + + tar_pflows = tl.sum(ch_pflows, axis = 0) + + tar_pfid = tl.load(target_pfids + pid) + tl.store(param_flows + tar_pfid + offs_m, tar_pflows, mask = mask_m) + + +def 
compute_cum_par_flows(param_flows, kernels_args): + + for kernel_args in kernels_args: + + target_pfids, block_sizes, ch_pfids, BLOCK_G, BLOCK_M = kernel_args + + grid = (target_pfids.size(0),) + + cum_par_flows_kernel[grid](param_flows, target_pfids, block_sizes, ch_pfids, BLOCK_G, BLOCK_M) + + return None + \ No newline at end of file From 5cd99713364477a30bc5c63ba235ceedbd02230c Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 11 Dec 2023 02:18:57 +0800 Subject: [PATCH 041/162] compile `pfids` separately --- src/pyjuice/layer/compilation.py | 50 ++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 44750a65..62b749a2 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -184,10 +184,10 @@ def _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids): @triton.jit def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, target_cids_ptr, pcids_partition_start_ptr, - target_pids_ptr, edge_ids_ptr, chs_offsets_ptr, n_partition_ids_ptr, n_id_in_partition_ptr, - cs_ele_id_start_ptr, cs_node_cum_ids_ptr, fw_partition_max_chs_ptr, cum_n_chs_ptr, - ns_param_ids_ptr, constexprs_ptr, num_chs: tl.constexpr, num_chs_np2: tl.constexpr, - add_params_flag: tl.constexpr, BLOCK_SIZE: tl.constexpr): + target_pids_ptr, target_pfids_ptr, edge_ids_ptr, chs_offsets_ptr, n_partition_ids_ptr, + n_id_in_partition_ptr, cs_ele_id_start_ptr, cs_node_cum_ids_ptr, fw_partition_max_chs_ptr, + cum_n_chs_ptr, ns_param_ids_ptr, constexprs_ptr, num_chs: tl.constexpr, + num_chs_np2: tl.constexpr, add_params_flag: tl.constexpr, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) block_start = pid * BLOCK_SIZE @@ -195,9 +195,10 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ # Retrieve all constexprs global_nid_start = tl.load(constexprs_ptr) ns_pid_start = tl.load(constexprs_ptr + 1) - ngroup_start = tl.load(constexprs_ptr + 2) - num_edges = tl.load(constexprs_ptr + 3) - group_size = tl.load(constexprs_ptr + 4) + ns_pfid_start = tl.load(constexprs_ptr + 2) + ngroup_start = tl.load(constexprs_ptr + 3) + num_edges = tl.load(constexprs_ptr + 4) + group_size = tl.load(constexprs_ptr + 5) # Get edge indices to be processed by the current block offsets = block_start + tl.arange(0, BLOCK_SIZE) @@ -243,6 +244,10 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ global_pid = ns_pid_start + (ns_local_pid + chs_offset) * group_size tl.store(target_pids_ptr + pcids_offsets, global_pid, mask = mask) + # Store to `target_pfids` + global_pfid = ns_pfid_start + (ns_local_pid + chs_offset) * group_size + tl.store(target_pfids_ptr + pcids_offsets, global_pfid, mask = mask) + # Global parameter indices for all edges if add_params_flag: tl.store(ns_param_ids_ptr + offsets, global_pid, mask = mask) @@ -278,6 +283,9 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, # ...and `pids` target_pids = torch.zeros([(num_ngs_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).to(device) + # ...and `pfids` + target_pfids = torch.zeros([(num_ngs_in_partition * fw_partition_max_chs).sum()], dtype = torch.long).to(device) + # Move necessary tensors to GPU n_partition_ids = n_partition_ids.to(device) n_id_in_partition = n_id_in_partition.to(device) @@ -287,6 +295,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, nids = 
[torch.zeros([num_ngs_in_partition[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] cids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] pids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] + pfids = [torch.zeros([num_ngs_in_partition[i], fw_partition_max_chs[i]], dtype = torch.long) for i in range(len(num_ngs_in_partition))] ngid_in_partition = torch.zeros([len(num_ngs_in_partition)], dtype = torch.long) @@ -309,12 +318,15 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, add_params_flag = True else: + assert ns.provided("_param_flow_range") + add_params_flag = False original_param_nids.append(ns_idx) - # Global pid start index for `ns` + # Global pid and pfid start index for `ns` ns_pid_start = ns._param_range[0] + ns_pfid_start = ns._param_range[0] else: source_ns = ns.get_source_ns() @@ -351,8 +363,9 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, node2tiednodes[source_ns][1] += 1 - # Global pid start index for `ns` + # Global pid and pfid start index for `ns` ns_pid_start = source_ns._param_range[0] + ns_pfid_start = ns._param_flow_range[0] # number of node groups ns_num_ngroups = ns.num_node_groups @@ -411,7 +424,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, # We store these constants in a tensor and retrieve them in the kernel # This is to avoid `triton` from compiling separate kernels for every layer configuration # Saves 99.9% compilation time :) - constexprs = torch.tensor([global_nid_start, ns_pid_start, ngroup_start, ns_num_edges, ns.group_size]).long().to(device) + constexprs = torch.tensor([global_nid_start, ns_pid_start, ns_pfid_start, ngroup_start, ns_num_edges, ns.group_size]).long().to(device) num_chs_np2 = triton.next_power_of_2(ns.num_chs) @@ -420,9 +433,9 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, _assign_target_ncpids_kernel[grid]( target_nids, nids_partition_start, target_cids, pcids_partition_start, - target_pids, edge_ids, chs_offsets, n_partition_ids, n_id_in_partition, - cs_ele_id_start, cs_node_cum_ids, fw_partition_max_chs, cum_n_chs, - ns_param_ids, constexprs, ns.num_chs, num_chs_np2, + target_pids, target_pfids_ptr, edge_ids, chs_offsets, n_partition_ids, + n_id_in_partition, cs_ele_id_start, cs_node_cum_ids, fw_partition_max_chs, + cum_n_chs, ns_param_ids, constexprs, ns.num_chs, num_chs_np2, add_params_flag, BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) ) @@ -467,6 +480,11 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, # Assign `pids` global_pids = ns_pid_start + cum_n_chs + torch.arange(0, ns.group_size * criterion.sum(), ns.group_size) pids[partition_id][local_id, 0:global_pids.size(0)] = global_pids + + # Assign `pfids` + global_pfids = ns_pfid_start + cum_n_chs + torch.arange(0, ns.group_size * criterion.sum(), ns.group_size) + pfids[partition_id][local_id, 0:global_pfids.size(0)] = global_pfids + cum_n_chs += ns.group_size * criterion.sum() if add_params_flag: @@ -501,6 +519,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, target_pids = target_pids.cpu() cids = [] pids = [] + pfids = [] for partition_id in range(num_ngs_in_partition.size(0)): sid = pcids_partition_start[partition_id] gsize = num_ngs_in_partition[partition_id] @@ -508,8 +527,9 @@ def 
sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, eid = sid + gsize * gnchs cids.append(target_cids[sid:eid].reshape(gsize, gnchs).contiguous()) pids.append(target_pids[sid:eid].reshape(gsize, gnchs).contiguous()) + pfids.append(target_pfids[sid:eid].reshape(gsize, gnchs).contiguous()) - return nids, cids, pids, global_pid_start, global_pfid_start + return nids, cids, pids, pfids, global_pid_start, global_pfid_start @njit @@ -762,8 +782,6 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par cs_ngroup_start += cs.num_node_groups - # import pdb; pdb.set_trace() - # Restore `chids` target_chids = target_chids.cpu() chids = [] From 027cf0ffdf2a3924d04b2674f275213c964631f5 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 11 Dec 2023 02:19:10 +0800 Subject: [PATCH 042/162] clean code --- tests/layer/layer_compilation_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/layer/layer_compilation_test.py b/tests/layer/layer_compilation_test.py index 96e35785..5370216c 100644 --- a/tests/layer/layer_compilation_test.py +++ b/tests/layer/layer_compilation_test.py @@ -81,8 +81,6 @@ def sum_layer_compilation_test(): global_pid_start = 1, global_pfid_start = 0, node2tiednodes = dict(), layer_sparsity_tol = 0.1, force_gpu_compilation = True) - # import pdb; pdb.set_trace() - for i in range(len(sum_layer_cpu.partitioned_nids)): assert torch.all(sum_layer_cpu.partitioned_nids[i] == sum_layer_gpu.partitioned_nids[i]) assert torch.all(sum_layer_cpu.partitioned_cids[i] == sum_layer_gpu.partitioned_cids[i]) From 990c043e3310aed87ed677230a55f53fa5565c80 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 11 Dec 2023 02:26:30 +0800 Subject: [PATCH 043/162] fix runtests --- src/pyjuice/layer/compilation.py | 7 ++++--- src/pyjuice/layer/sum_layer.py | 2 +- tests/layer/sum_layer_test.py | 10 ++-------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 62b749a2..edb0737f 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -433,7 +433,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, _assign_target_ncpids_kernel[grid]( target_nids, nids_partition_start, target_cids, pcids_partition_start, - target_pids, target_pfids_ptr, edge_ids, chs_offsets, n_partition_ids, + target_pids, target_pfids, edge_ids, chs_offsets, n_partition_ids, n_id_in_partition, cs_ele_id_start, cs_node_cum_ids, fw_partition_max_chs, cum_n_chs, ns_param_ids, constexprs, ns.num_chs, num_chs_np2, add_params_flag, BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) @@ -894,10 +894,11 @@ def get_prod_layer_parstats(flat_cids: torch.Tensor, global_nid_start: int): u_cids, par_counts = torch.unique(flat_cids, sorted = True, return_counts = True) - c_sid = torch.arange(0, u_cids.size(0))[u_cids == global_nid_start].min() + c_sids = torch.arange(0, u_cids.size(0))[u_cids == global_nid_start] - if c_sid > 0: + if c_sids.numel() > 0: # Strip away dummy nodes + c_sid = c_sids.min() + 1 u_cids = u_cids[c_sid:] par_counts = par_counts[c_sid:] diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 8ff27ad6..c8a90f2a 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -213,7 +213,7 @@ def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: t for partition_id in range(self.num_fw_partitions): nids = self.partitioned_nids[partition_id] cids = 
self.partitioned_cids[partition_id] - pfids = self.partitioned_pfids[partition_id] + pids = self.partitioned_pids[partition_id] self._forward( node_mars, element_mars, params, nids, cids, pids, partition_id = partition_id diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 87ef0602..b84fc8ea 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -41,9 +41,7 @@ def sum_layer_test(): prod_layer = ProdLayer([np0, np1, np2]) layer = SumLayer([ns0, ns1, ns2], global_nid_start = group_size, - param_ends = [1], tied_param_ids = [], - tied_param_group_ids = [], tied_param_ends = [], - ch_prod_layer_size = prod_layer.num_nodes + group_size) + global_pid_start = 1, global_pfid_start = 0, node2tiednodes = dict(), ) assert torch.all(layer.partitioned_nids[0] == torch.arange(group_size, 7 * group_size, group_size)) assert torch.all(layer.partitioned_cids[0][0:2,0] == group_size) @@ -172,9 +170,7 @@ def speed_test(): prod_layer = ProdLayer(nps, layer_sparsity_tol = 0.1) layer = SumLayer(nodes, global_nid_start = group_size, - param_ends = [1], tied_param_ids = [], - tied_param_group_ids = [], tied_param_ends = [], - ch_prod_layer_size = prod_layer.num_nodes + group_size) + global_pid_start = 1, global_pfid_start = 0, node2tiednodes = dict(), ) layer.to(device) @@ -216,8 +212,6 @@ def speed_test(): print("Reference computation time on RTX 4090: 1.200ms.") print("--------------------------------------------------------------") - import pdb; pdb.set_trace() - if __name__ == "__main__": torch.manual_seed(3890) From 66138c3957b6c99a3183b507a4214d6f2d60961e Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 11 Dec 2023 20:50:02 +0800 Subject: [PATCH 044/162] em parameter update kernels --- src/pyjuice/layer/backend/par_update.py | 213 ++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 src/pyjuice/layer/backend/par_update.py diff --git a/src/pyjuice/layer/backend/par_update.py b/src/pyjuice/layer/backend/par_update.py new file mode 100644 index 00000000..0eba2109 --- /dev/null +++ b/src/pyjuice/layer/backend/par_update.py @@ -0,0 +1,213 @@ +from __future__ import annotations + +import torch +import torch.nn as nn +import triton +import triton.language as tl +from numba import njit + +from pyjuice.nodes import CircuitNodes + + +@njit +def _record_par_blks(par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, + num_edges_per_ng, ns_num_node_groups, ns_group_size, cs_group_size, pid, + global_nid, par_start, pflow_start, BLOCK_SIZE): + for local_ngid in range(ns_num_node_groups): + num_edges = num_edges_per_ng[local_ngid] + num_chs = num_edges * cs_group_size + + for sid in range(0, num_chs, BLOCK_SIZE): + eid = min(sid + BLOCK_SIZE, num_chs) + blk_size = eid - sid + + for gid in range(ns_group_size): + psid = par_start + sid * ns_group_size + gid + pfsid = pflow_start + sid * ns_group_size + gid + global_ind = global_nid + gid + + par_start_ids[pid] = par_start + sid * ns_group_size + gid + pflow_start_ids[pid] = pflow_start + sid * ns_group_size + gid + blk_sizes[pid] = blk_size + blk_intervals[pid] = ns_group_size + global_nids[pid] = global_nid + gid + + pid += 1 + + global_nid += ns_group_size + + return global_nid, pid + + +@torch.no_grad() +def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_inc_interval: int = 10000, use_numba: bool = True): + + assert BLOCK_SIZE & (BLOCK_SIZE - 1) == 0, "`BLOCK_SIZE` must be power of 2." 
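# The arrays filled below (`par_start_ids`, `pflow_start_ids`, `blk_sizes`,
# `blk_intervals`, `global_nids`) describe every sum node's parameters as strided
# blocks: a block starts at `par_start`, holds `blk_size` entries spaced
# `blk_interval` (= the node-group size) apart, and contributes its flows to node
# `global_nid`. The Triton kernels further down consume this metadata to run a
# damped mini-batch EM step. Below is a standalone NumPy sketch of that update
# rule with toy, caller-supplied arrays; the helper name and the per-block child
# counts `nchs` are illustrative, not definitions from pyjuice:

import numpy as np

def em_par_update_reference(params, param_flows, par_start_ids, pflow_start_ids,
                            blk_sizes, blk_intervals, global_nids, nchs,
                            step_size, pseudocount = 0.0):
    # (illustrative helper, not defined in pyjuice)
    # Accumulate each node's total parameter flow over all of its blocks
    tot_flows = np.zeros(int(global_nids.max()) + 1)
    for pf, sz, it, nid in zip(pflow_start_ids, blk_sizes, blk_intervals, global_nids):
        tot_flows[nid] += param_flows[pf : pf + sz * it : it].sum()

    # Damped EM update, block by block:
    #   new_p = (flow + pseudocount / n_ch) / (tot_flow + pseudocount)
    #   p     = (1 - step_size) * p + step_size * new_p
    for ps, pf, sz, it, nid, nch in zip(par_start_ids, pflow_start_ids, blk_sizes,
                                        blk_intervals, global_nids, nchs):
        old_p = params[ps : ps + sz * it : it]
        flows = param_flows[pf : pf + sz * it : it]
        new_p = (flows + pseudocount / nch) / (tot_flows[nid] + pseudocount)
        params[ps : ps + sz * it : it] = (1.0 - step_size) * old_p + step_size * new_p
    return params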
+ + par_start_ids = np.zeros([buffer_inc_interval], dtype = np.int64) + pflow_start_ids = np.zeros([buffer_inc_interval], dtype = np.int64) + blk_sizes = np.zeros([buffer_inc_interval], dtype = np.int64) + blk_intervals = np.zeros([buffer_inc_interval], dtype = np.int64) + global_nids = np.zeros([buffer_inc_interval], dtype = np.int64) + pid = 0 + + global_nid = 0 + for ns in root_ns: + if not ns.is_sum() or ns.is_tied(): + continue + + par_start = ns._param_range[0] + pflow_start = ns._param_flow_range[0] + tot_n_pars = ns._param_range[1] - ns._param_range[0] + + num_edges_per_ng = torch.bincount(ns.edge_ids[0,:], minlength = ns.num_node_groups).contiguous().numpy() + + # Enlarge the buffer if needed + est_num_slots = triton.cdiv(ns.edges.size(1) * ns.group_size * ns.ch_group_size, BLOCK_SIZE) + ns.num_nodes + if pid + est_num_slots > par_start_ids.shape[0]: + curr_size = par_start_ids.shape[0] + inc_shape = triton.cdiv(pid + est_num_slots - curr_size, buffer_inc_interval) * buffer_inc_interval + + par_start_ids = np.ascontiguousarray(par_start_ids.resize(curr_size + inc_shape)) + pflow_start_ids = np.ascontiguousarray(pflow_start_ids.resize(curr_size + inc_shape)) + blk_sizes = np.ascontiguousarray(blk_sizes.resize(curr_size + inc_shape)) + blk_intervals = np.ascontiguousarray(blk_intervals.resize(curr_size + inc_shape)) + global_nids = np.ascontiguousarray(global_nids.resize(curr_size + inc_shape)) + + if use_numba: + + ns_num_node_groups = ns.num_node_groups + ns_group_size = ns.group_size + cs_group_size = ns.ch_group_size + + global_nid, pid = _record_par_blks( + par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, + num_edges_per_ng, ns_num_node_groups, ns_group_size, cs_group_size, pid, + global_nid, par_start, pflow_start, BLOCK_SIZE + ) + + else: + ns_gid_range = torch.arange(0, ns.group_size) + + for local_ngid in range(ns.num_node_groups): + num_edges = num_edges_per_ng[local_ngid] + num_chs = num_edges * ns.ch_group_size + + for sid in range(0, num_chs, BLOCK_SIZE): + eid = min(sid + BLOCK_SIZE, num_chs) + blk_size = eid - sid + + curr_psids = par_start + sid * ns.group_size + ns_gid_range + curr_pfsids = pflow_start + sid * ns.group_size + ns_gid_range + curr_global_nids = global_nid + ns_gid_range + + par_start_ids[pid:pid+ns.group_size] = curr_psids + pflow_start_ids[pid:pid+ns.group_size] = curr_pfsids + blk_sizes[pid:pid+ns.group_size] = blk_size + blk_intervals[pid:pid+ns.group_size] = ns.group_size + global_nids[pid:pid+ns.group_size] = curr_global_nids + + pid += ns.group_size + + global_nid += ns.group_size + + par_start_ids = torch.from_numpy(par_start_ids[:pid]).contiguous() + pflow_start_ids = torch.from_numpy(pflow_start_ids[:pid]).contiguous() + blk_sizes = torch.from_numpy(blk_sizes[:pid]).contiguous() + blk_intervals = torch.from_numpy(blk_intervals[:pid]).contiguous() + global_nids = torch.from_numpy(global_nids[:pid]).contiguous() + + cum_pflows = torch.zeros([global_nids[-1] + 1], dtype = torch.float32) + + metadata = {"tot_num_nodes": global_nids[-1] + 1, "BLOCK_SIZE": BLOCK_SIZE} + + return par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, cum_pflows, metadata + + +@triton.jit +def cum_pflow_kernel(cum_pflows, param_flows, pflow_start_ids, blk_sizes, blk_intervals, + global_nids, num_blocks, BLOCK_ID: tl.constexpr, BLOCK_SIZE: tl.constexpr): + + pid = tl.program_id(axis = 0) + + offs_m = pid * BLOCK_ID + tl.arange(0, BLOCK_ID) + mask_m = offs_m < num_blocks + + offs_blk = tl.arange(0, BLOCK_SIZE) + + pflow_start = 
tl.load(pflow_start_ids + offs_m, mask = mask_m, other = 0) + blk_size = tl.load(blk_sizes + offs_m, mask = mask_m, other = 0) + blk_interval = tl.load(blk_intervals + offs_m, mask = mask_m, other = 0) + global_nid = tl.load(global_nids + offs_m, mask = mask_m, other = 0) + + offs_pflow = pflow_start[:,None] + offs_blk[None,:] * blk_interval[:,None] + mask_pflow = mask_m[:,None] & (offs_blk[None,:] < blk_size[:,None]) + pflows = tl.load(param_flows + offs_pflow, mask = mask_pflow, other = 0) + nflows = tl.sum(pflows, axis = 1) + + tl.atomic_add(cum_pflows + global_nid, nflows, mask = mask_m) + + +def par_update_kernel(params, param_flows, cum_pflows, nchs, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, + global_nids, constexprs, num_blocks, BLOCK_ID: tl.constexpr, BLOCK_SIZE: tl.constexpr): + + pid = tl.program_id(axis = 0) + + # Retrieve the constants + step_size = tl.load(constexprs) + pseudocount = tl.load(constexprs + 1) + + offs_m = pid * BLOCK_ID + tl.arange(0, BLOCK_ID) + mask_m = offs_m < num_blocks + + offs_blk = tl.arange(0, BLOCK_SIZE) + + par_start = tl.load(par_start_ids + offs_m, mask = mask_m, other = 0) + pflow_start = tl.load(pflow_start_ids + offs_m, mask = mask_m, other = 0) + blk_size = tl.load(blk_sizes + offs_m, mask = mask_m, other = 0) + blk_interval = tl.load(blk_intervals + offs_m, mask = mask_m, other = 0) + global_nid = tl.load(global_nids + offs_m, mask = mask_m, other = 0) + + offs_pflow = pflow_start[:,None] + offs_blk[None,:] * blk_interval[:,None] + mask_pflow = mask_m[:,None] & (offs_blk[None,:] < blk_size[:,None]) + pflows = tl.load(param_flows + offs_pflow, mask = mask_pflow, other = 0) + + nflows = tl.load(cum_pflows + global_nid, mask = mask_m, other = 1) + nch = tl.load(nchs + global_nid, mask = mask_m, other = 1) + + new_param = (pflows + pseudocount / nch[:,None]) / (nflows[:,None] + pseudocount) + + offs_par = par_start[:,None] + offs_blk[None,:] * blk_interval[:,None] + old_param = tl.load(params + offs_par, mask = mask_pflow, other = 0) + + updated_param = (1.0 - step_size) * old_param + step_size * new_param + tl.store(params + offs_par, updated_param, mask = mask_pflow) + + +def em_par_update(params, param_flows, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, + global_nids, metadata, step_size: float, pseudocount: float = 0.0, cum_pflows = None): + + tot_num_nodes = metadata["tot_num_nodes"] + BLOCK_SIZE = metadata["BLOCK_SIZE"] + + if cum_pflows is None: + cum_pflows = torch.zeros([tot_num_nodes], dtype = torch.float32, device = params.device) + else: + cum_pflows[:] = 0.0 + + num_blocks = par_start_ids.size(0) + BLOCK_ID = 2048 // BLOCK_SIZE + + grid = (triton.cdiv(num_blocks, BLOCK_ID),) + + cum_pflow_kernel[grid]( + cum_pflows, param_flows, pflow_start_ids, blk_sizes, blk_intervals, + global_nids, num_blocks, BLOCK_ID, BLOCK_SIZE + ) + + constexprs = torch.tensor([step_size, pseudocount]).to(params.device) + + par_update_kernel[grid]( + params, param_flows, cum_pflows, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, + global_nids, constexprs, num_blocks, BLOCK_ID, BLOCK_SIZE + ) From ce071eaa4a07014e2520222ef47f177d38bb0746 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 11 Dec 2023 20:50:16 +0800 Subject: [PATCH 045/162] remove outdated tests --- tests/layer/sum_block_sparse_test.py | 406 --------------------------- 1 file changed, 406 deletions(-) delete mode 100644 tests/layer/sum_block_sparse_test.py diff --git a/tests/layer/sum_block_sparse_test.py b/tests/layer/sum_block_sparse_test.py deleted file 
mode 100644 index 0185bb09..00000000 --- a/tests/layer/sum_block_sparse_test.py +++ /dev/null @@ -1,406 +0,0 @@ -import triton -import triton.language as tl -import torch -import numpy as np -import time - - -@triton.jit -def _forward_triton_kernel(node_mars_ptr, element_mars_ptr, params_ptr, - nids_ptr, cids_ptr, pids_ptr, tot_n_nodes, - tot_n_eles, n_nodes, n_edges: tl.constexpr, - batch_size, n_nodes_per_block_m: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): - - # We use BLOCK_M to index over edges, and BLOCK_N to index over batches - pid0 = tl.program_id(axis = 0) - pid1 = tl.program_id(axis = 1) - ne_start = pid0 * BLOCK_M - b_start = pid1 * BLOCK_N - - # Id of edges processed by the current block (0.081ms) - ne_offsets = ne_start + tl.arange(0, BLOCK_M) - # Batch ids processed by the current block - b_offsets = b_start + tl.arange(0, BLOCK_N) - - # Get node ids from `nids` - n_start = ne_start // n_edges - nid_offsets = n_start + tl.arange(0, n_nodes_per_block_m) - n_ids = tl.load(nids_ptr + nid_offsets) - - # Get edge ids from `cids` - cid_offsets = tl.view(ne_offsets, (n_edges, n_nodes_per_block_m)) - ch_ids = tl.load(cids_ptr + cid_offsets) - # Use `ch_ids` to retrieve the corresponding element mars - ele_offsets = ch_ids[None,:,:] * batch_size + b_offsets[:,None,None] - ch_logps = tl.load(element_mars_ptr + ele_offsets) # `element_mars[cids]` - - # Get param ids from `pids` - # Here we reuse `cid_offsets` and `cid_mask` thank to their similar structure - par_ids = tl.load(pids_ptr + cid_offsets) - - # Use `par_ids` to retrieve the corresponding parameters - ch_pars = tl.load(params_ptr + par_ids) # `params[pids]` - - # Take the max of the child mars - ch_max_logp = tl.max(ch_logps, axis = 1) # `maxval` - # Subtract the max from child mars - ch_logps_sub_max = ch_logps - ch_max_logp[:,None,:] - # Take exp - ch_ps_sub_max = tl.exp(ch_logps_sub_max) - - # Sum node marginals (unnormalized) - n_ps = tl.sum(ch_ps_sub_max * ch_pars[None,:,:], axis = 1) - - # Take log and subtract max vals - n_logps = tl.log(tl.maximum(n_ps, 1e-10)) + ch_max_logp - - # Read out the target indices for `node_mars` - nmar_offsets = n_ids[None,:] * batch_size + b_offsets[:,None] - - # Reshape seems to be necessary for certain combinations of (BLOCK_N, n_nodes_per_block_m) - nmar_offsets = tl.view(nmar_offsets, (BLOCK_N * n_nodes_per_block_m,)) - n_logps = tl.view(n_logps, (BLOCK_N * n_nodes_per_block_m,)) - tl.store(node_mars_ptr + nmar_offsets, n_logps) - - -@triton.jit -def block_sparse_kernel(ddd, node_mars, element_mars, params, nids, cids_start, cids_increment, pids_start, pids_increment, - tot_n_nodes, tot_n_eles, layer_n_nodes, layer_n_edge_groups, batch_size, - BLOCK_M: tl.constexpr, GROUP_SIZE: tl.constexpr): - - pid_m = tl.program_id(0) - pid_b = tl.program_id(1) # batch id - - # initialize pointers to `element_mars` - node_start = tl.multiple_of(pid_m * layer_n_edge_groups * GROUP_SIZE, 8) # compiler hint - offs_node = tl.arange(0, BLOCK_M) + node_start - mask_node = offs_node < layer_n_nodes - offs_edge = tl.arange(0, GROUP_SIZE) - edge_start = tl.load(cids_start + offs_node, mask = mask_node, other = 0) - emars_ptr = element_mars + pid_b * tot_n_eles + edge_start[:,None] + offs_edge[None,:] - emars_ptr = tl.view(emars_ptr, (BLOCK_M, GROUP_SIZE)) - - # initialize pointers to `params` - param_start = tl.load(pids_start + offs_node, mask = mask_node, other = 0) - params_ptr = params + param_start[:,None] + offs_edge[None,:] - # params_ptr = params + offs_edge[:,None] + 
param_start[None,:] - params_ptr = tl.view(params_ptr, (BLOCK_M, GROUP_SIZE)) - - # Inner loop - acc = tl.zeros((BLOCK_M,), dtype = tl.float32) - float("inf") - - cids_inc_ptr = cids_increment + offs_node - pids_inc_ptr = pids_increment + offs_node - for k in range(0, layer_n_edge_groups): - emars = tl.load(emars_ptr, mask = mask_node[:,None]) - epars = tl.load(params_ptr, mask = mask_node[:,None]) - emars_max = tl.max(emars, axis = 1) - emars = tl.exp(emars - emars_max[:,None]) - - # nmars = tl.dot(emars, params) - nmars = tl.sum(emars * epars, axis = 1) - - acc = tl.where(emars_max > acc, - tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, - tl.log(tl.exp(emars_max - acc) * nmars + 1.0) + acc - ) - - cids_inc = tl.load(cids_inc_ptr, mask = mask_node) - pids_inc = tl.load(pids_inc_ptr, mask = mask_node) - emars_ptr += cids_inc - params_ptr += pids_inc - cids_inc_ptr += 1 - pids_inc_ptr += 1 - - # Write back - ns = tl.load(nids + offs_node, mask = mask_node) - tl.store(node_mars + ns + pid_b * tot_n_nodes, tl.ravel(acc), mask = mask_node) - - -def main_baseline(): - data = np.load("temp.npz") - - device = torch.device("cuda:0") - - node_mars = torch.from_numpy(data["node_mars"]).to(device) - node_mars2 = node_mars.clone() - element_mars = torch.from_numpy(data["element_mars"]).to(device) - params = torch.from_numpy(data["params"]).to(device) - nids = torch.from_numpy(data["nids"]).to(device) - cids = torch.from_numpy(data["cids"]).to(device) - pids = torch.from_numpy(data["pids"]).to(device) - tot_n_nodes = int(data["tot_n_nodes"]) - tot_n_eles = int(data["tot_n_eles"]) - n_nodes = int(data["n_nodes"]) - n_edges = int(data["n_edges"]) - batch_size = int(data["batch_size"]) - BLOCK_M = int(data["BLOCK_M"]) - BLOCK_N = int(data["BLOCK_N"]) - - # ddd = torch.zeros([n_nodes * n_edges]).to(device) - - BLOCK_M = 128 - BLOCK_N = 64 - - grid = (triton.cdiv(n_nodes * n_edges, BLOCK_M), triton.cdiv(batch_size, BLOCK_N), 1) - - ts = [] - for i in range(5): - t0 = time.time() - _forward_triton_kernel[grid]( - node_mars_ptr = node_mars, - element_mars_ptr = element_mars, - params_ptr = params, - nids_ptr = nids, - cids_ptr = cids, - pids_ptr = pids, - tot_n_nodes = tot_n_nodes, - tot_n_eles = tot_n_eles, - n_nodes = n_nodes, - n_edges = n_edges, - batch_size = batch_size, - n_nodes_per_block_m = BLOCK_M // n_edges, - BLOCK_M = BLOCK_M, - BLOCK_N = BLOCK_N - ) - torch.cuda.synchronize() - t1 = time.time() - - if i > 0: - ts.append(t1 - t0) - - aveg_t, std_t = torch.tensor(ts).mean().item() * 1000, torch.tensor(ts).std().item() * 1000 - print(f"{aveg_t:.3f}±{std_t:.3f}ms") - - # node_mars_gt = node_mars.clone() - # ch_mars = element_mars[cids] - # maxval = ch_mars.max(dim = 1, keepdim = True).values - # aaa = (((ch_mars - maxval).exp() * params[pids].unsqueeze(-1)).sum( - # dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) - - # bbb = node_mars[nids] - - # print(torch.max((aaa - bbb).abs())) - - -def main_blocksparse(): - - GROUP_SIZE = 128 - - data = np.load("temp.npz") - - device = torch.device("cuda:0") - - node_mars = torch.from_numpy(data["node_mars"]).permute(1, 0).contiguous().to(device) - element_mars = torch.from_numpy(data["element_mars"]).permute(1, 0).contiguous().to(device) - params = torch.from_numpy(data["params"]).to(device) - - # Convert `nids`, `cids`, and `pids` into block sparse format - nids = torch.from_numpy(data["nids"]).to(device) - cids = torch.from_numpy(data["cids"]) - pids = torch.from_numpy(data["pids"]) - - cids = cids[:,::GROUP_SIZE].contiguous() - pids = 
pids[:,::GROUP_SIZE].contiguous() - - cids_start = cids[:,0].contiguous().to(device) - pids_start = pids[:,0].contiguous().to(device) - cids_increment = torch.cat((cids[:,1:] - cids[:,:-1], cids[:,0:1] * 0), dim = 1).contiguous().to(device) - pids_increment = torch.cat((pids[:,1:] - pids[:,:-1], pids[:,0:1] * 0), dim = 1).contiguous().to(device) - - tot_n_nodes = int(data["tot_n_nodes"]) - tot_n_eles = int(data["tot_n_eles"]) - layer_n_nodes = int(data["n_nodes"]) - layer_n_edges = int(data["n_edges"]) - batch_size = int(data["batch_size"]) - - BLOCK_M = 16 - - grid = (triton.cdiv(layer_n_nodes, BLOCK_M), batch_size) - - ddd = torch.zeros([layer_n_nodes, batch_size], dtype = torch.long, device = device) - - ts = [] - for i in range(5): - t0 = time.time() - block_sparse_kernel[grid]( - ddd, - node_mars, - element_mars, - params, - nids, - cids_start, - cids_increment, - pids_start, - pids_increment, - tot_n_nodes, - tot_n_eles, - layer_n_nodes, - layer_n_edge_groups = layer_n_edges // GROUP_SIZE, - batch_size = batch_size, - BLOCK_M = BLOCK_M, - GROUP_SIZE = GROUP_SIZE - ) - torch.cuda.synchronize() - t1 = time.time() - - if i > 0: - ts.append(t1 - t0) - - aveg_t, std_t = torch.tensor(ts).mean().item() * 1000, torch.tensor(ts).std().item() * 1000 - print(f"{aveg_t:.3f}±{std_t:.3f}ms") - - # import pdb; pdb.set_trace() - - -@triton.jit -def block_sparse_2d_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, pids_start, - layer_n_edge_groups, batch_size, stride_pa, stride_pb, - BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): - - pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches - pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes - - # Get inferred node group id from `pid_m` - ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) - ntile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) - - # initialize pointers to `params` - offs_node = tl.arange(0, TILE_SIZE_M) - offs_edge = tl.arange(0, TILE_SIZE_K) - par_start = tl.load(pids_start + ngroup_id * stride_pa + ntile_id * TILE_SIZE_M * stride_pb + offs_node * stride_pb) - epars_ptr = params + par_start[:,None] + offs_edge[None,:] - - # initialize pointers to `element_mars` - offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B - mask_batch = offs_batch < batch_size - edge_start = tl.load(cids_start + ngroup_id * TILE_SIZE_K + offs_edge) - emars_ptr = element_mars + \ - edge_start[:,None] * batch_size + \ - offs_batch[None,:] - - # Inner loop - acc = tl.zeros((TILE_SIZE_M, BLOCK_B), dtype = tl.float32) - float("inf") - - cids_inc_ptr = cids_increment + ngroup_id * (layer_n_edge_groups * TILE_SIZE_K) + offs_edge - for k in range(0, layer_n_edge_groups): - epars = tl.load(epars_ptr) - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) - - emars_max = tl.max(emars, axis = 0)[None,:] - emars = tl.exp(emars - emars_max) - epars = epars.to(tl.float16) - emars = emars.to(tl.float16) - nmars = tl.dot(epars, emars).to(tl.float32) - - # if TILE_SIZE_M < 16: - # epars = tl.view(tl.broadcast_to(epars[:,None,:], (TILE_SIZE_M, 16 // TILE_SIZE_M, TILE_SIZE_K)), (16, TILE_SIZE_K)) - # nmars = tl.dot(epars, emars).to(tl.float32) - # nmars = tl.max(tl.view(nmars, (TILE_SIZE_M, 16 // TILE_SIZE_M, BLOCK_B)), axis = 1) - - acc = tl.where(emars_max > acc, - tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, - tl.log(tl.exp(emars_max - acc) * nmars + 1.0) + acc - ) - - cids_inc = tl.load(cids_inc_ptr) - emars_ptr += cids_inc[:,None] * batch_size - cids_inc += TILE_SIZE_K - - 
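# The `acc` update in these block-sparse kernels keeps a running log-sum-exp
# while streaming over edge groups, so weighted child probabilities are never
# materialized in linear space all at once. A standalone NumPy sketch of that
# accumulation rule (the function name and shapes are illustrative only):

import numpy as np

def streaming_weighted_logsumexp(log_child_mars, child_params):
    # Returns log(sum_k params_k * exp(log_child_mars_k)), accumulated one
    # edge group (one row of the inputs) at a time.
    acc = -np.inf
    for emars, epars in zip(log_child_mars, child_params):
        emars_max = emars.max()
        nmars = np.sum(np.exp(emars - emars_max) * epars)   # shifted partial sum
        if emars_max > acc:
            acc = np.log(nmars + np.exp(acc - emars_max)) + emars_max
        else:
            acc = np.log(np.exp(emars_max - acc) * nmars + 1.0) + acc
    return acc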
epars_ptr += TILE_SIZE_K - - # Write back - offs_nids = tl.load(nids + ngroup_id * GROUP_SIZE_M + ntile_id * TILE_SIZE_M + offs_node) - offs_nmars = offs_nids[:,None] * batch_size + offs_batch[None,:] - tl.store(node_mars + offs_nmars, acc, mask = mask_batch[None,:]) - - -def main_blocksparse_2d(): - - GROUP_SIZE_M = 32 - - TILE_SIZE_M = 16 - TILE_SIZE_K = 64 - - BLOCK_B = max(128, 16) - - data = np.load("temp.npz") - - device = torch.device("cuda:0") - - node_mars = torch.from_numpy(data["node_mars"]).to(device) - element_mars = torch.from_numpy(data["element_mars"]).to(device) - params = torch.from_numpy(data["params"]).to(device) - - # Convert `nids`, `cids`, and `pids` into block sparse format - nids = torch.from_numpy(data["nids"])# .to(device) - cids = torch.from_numpy(data["cids"])# .to(device) - pids = torch.from_numpy(data["pids"])# .to(device) - - node_mars_gt = node_mars.clone() - ch_mars = element_mars[cids] - maxval = ch_mars.max(dim = 1, keepdim = True).values - aaa = (((ch_mars - maxval).exp() * params[pids].unsqueeze(-1)).sum( - dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) - - nids = nids.reshape(-1, GROUP_SIZE_M).contiguous().to(device) - cids = cids[::GROUP_SIZE_M,:].reshape(nids.size(0), -1, TILE_SIZE_K).contiguous() - pids_start = pids.reshape(nids.size(0), GROUP_SIZE_M, -1)[:,:,0].contiguous().to(device) - - cids_start = cids[:,0,:].contiguous().to(device) - cids_increment = torch.cat((cids[:,1:,:] - cids[:,:-1,:], cids[:,0:1,:] * 0), dim = 1).contiguous().to(device) - - tot_n_nodes = int(data["tot_n_nodes"]) - tot_n_eles = int(data["tot_n_eles"]) - layer_n_nodes = int(data["n_nodes"]) - layer_n_edges = int(data["n_edges"]) - batch_size = int(data["batch_size"]) - - layer_n_node_groups = layer_n_nodes // GROUP_SIZE_M - layer_n_edge_groups = layer_n_edges // TILE_SIZE_K - - grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - - ts = [] - for i in range(50): - # print("enter") - t0 = time.time() - block_sparse_2d_kernel[grid]( - node_mars, - element_mars, - params, - nids, - cids_start, - cids_increment, - pids_start, - layer_n_edge_groups, - batch_size, - stride_pa = pids_start.stride(0), - stride_pb = pids_start.stride(1), # Do not provide pids.stride(2) since it is 1 - BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = GROUP_SIZE_M - ) - torch.cuda.synchronize() - t1 = time.time() - - if i > 0: - ts.append(t1 - t0) - - aveg_t, std_t = torch.tensor(ts).mean().item() * 1000, torch.tensor(ts).std().item() * 1000 - print(f"{aveg_t:.3f}±{std_t:.3f}ms") - - bbb = node_mars[nids] - - print(torch.max((aaa - bbb.flatten(0, 1)).abs())) - - # import pdb; pdb.set_trace() - - -if __name__ == "__main__": - # main_baseline() - # main_blocksparse() - main_blocksparse_2d() \ No newline at end of file From ded715fe7601752a85f9a863e7eb7192bffc3afd Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Mon, 11 Dec 2023 20:53:42 +0800 Subject: [PATCH 046/162] move par_update.py and parflow_fusing.py --- src/pyjuice/{layer => model}/backend/par_update.py | 0 src/pyjuice/{layer => model}/backend/parflow_fusing.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/pyjuice/{layer => model}/backend/par_update.py (100%) rename src/pyjuice/{layer => model}/backend/parflow_fusing.py (100%) diff --git a/src/pyjuice/layer/backend/par_update.py b/src/pyjuice/model/backend/par_update.py similarity index 100% rename from src/pyjuice/layer/backend/par_update.py rename to src/pyjuice/model/backend/par_update.py diff 
--git a/src/pyjuice/layer/backend/parflow_fusing.py b/src/pyjuice/model/backend/parflow_fusing.py similarity index 100% rename from src/pyjuice/layer/backend/parflow_fusing.py rename to src/pyjuice/model/backend/parflow_fusing.py From 35c144915f5436ed23e6fb786f9d0932107985f5 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 12 Dec 2023 05:00:58 +0800 Subject: [PATCH 047/162] reinstate `TensorCircuit` --- src/pyjuice/functional/__init__.py | 3 - src/pyjuice/functional/softmax.py | 176 ----------- src/pyjuice/functional/tying.py | 108 ------- src/pyjuice/layer/backend/node_partition.py | 3 +- src/pyjuice/layer/compilation.py | 4 +- src/pyjuice/layer/input_layer.py | 6 +- src/pyjuice/layer/sum_layer.py | 4 + src/pyjuice/model/backend/__init__.py | 3 + src/pyjuice/model/backend/normalize.py | 148 +++++++++ src/pyjuice/model/backend/par_update.py | 35 ++- src/pyjuice/model/backend/parflow_fusing.py | 17 +- src/pyjuice/model/tensorcircuit.py | 287 ++++++++++-------- src/pyjuice/nodes/backend/__init__.py | 1 + .../backend}/normalize.py | 4 +- src/pyjuice/nodes/nodes.py | 27 +- src/pyjuice/nodes/sum_nodes.py | 10 +- tests/functional/tying_test.py | 204 ------------- tests/model/numba_test.py | 21 ++ tests/model/simple_model_test.py | 143 +++++++++ 19 files changed, 557 insertions(+), 647 deletions(-) delete mode 100644 src/pyjuice/functional/__init__.py delete mode 100644 src/pyjuice/functional/softmax.py delete mode 100644 src/pyjuice/functional/tying.py create mode 100644 src/pyjuice/model/backend/__init__.py create mode 100644 src/pyjuice/model/backend/normalize.py create mode 100644 src/pyjuice/nodes/backend/__init__.py rename src/pyjuice/{functional => nodes/backend}/normalize.py (96%) delete mode 100644 tests/functional/tying_test.py create mode 100644 tests/model/numba_test.py create mode 100644 tests/model/simple_model_test.py diff --git a/src/pyjuice/functional/__init__.py b/src/pyjuice/functional/__init__.py deleted file mode 100644 index 57affb90..00000000 --- a/src/pyjuice/functional/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .normalize import normalize_parameters -from .tying import tie_param_flows -from .softmax import flat_softmax_fw, flat_softmax_bp \ No newline at end of file diff --git a/src/pyjuice/functional/softmax.py b/src/pyjuice/functional/softmax.py deleted file mode 100644 index ba994496..00000000 --- a/src/pyjuice/functional/softmax.py +++ /dev/null @@ -1,176 +0,0 @@ -import torch -import triton -import triton.language as tl - - -@triton.jit -def _fw_cum_logits_kernel(logits_ptr, cum_weights_ptr, node_ids_ptr, tot_num_logits, batch_size, BLOCK_SIZE: tl.constexpr): - - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE - - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < tot_num_logits * batch_size - - param_offsets = offsets // batch_size - batch_offsets = offsets % batch_size - - n_offsets = tl.load(node_ids_ptr + param_offsets, mask = mask, other = 0) - n_offsets = n_offsets * batch_size + batch_offsets - - logits = tl.load(logits_ptr + offsets, mask = mask, other = 0) - logits = tl.exp(logits) - - tl.atomic_add(cum_weights_ptr + n_offsets, logits, mask = mask) - - -@triton.jit -def _fw_norm_logits_kernel(logits_ptr, targets_ptr, cum_weights_ptr, node_ids_ptr, tot_num_logits, - batch_size, BLOCK_SIZE: tl.constexpr): - - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE - - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < tot_num_logits * batch_size - - param_offsets = offsets // batch_size - 
batch_offsets = offsets % batch_size - - n_offsets = tl.load(node_ids_ptr + param_offsets, mask = mask, other = 0) - n_offsets = n_offsets * batch_size + batch_offsets - - logits = tl.load(logits_ptr + offsets, mask = mask, other = 0) - cum_weights = tl.load(cum_weights_ptr + n_offsets, mask = mask, other = 1) - - normed_logits = tl.exp(logits) / cum_weights - tl.store(targets_ptr + offsets, normed_logits, mask = mask) - - -@triton.jit -def _bp_cum_logits_kernel(grads_ptr, normed_values_ptr, cum_grads_ptr, node_ids_ptr, tot_num_logits, batch_size, BLOCK_SIZE: tl.constexpr): - - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE - - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < tot_num_logits * batch_size - - param_offsets = offsets // batch_size - batch_offsets = offsets % batch_size - - n_offsets = tl.load(node_ids_ptr + param_offsets, mask = mask, other = 0) - n_offsets = n_offsets * batch_size + batch_offsets - - grads = tl.load(grads_ptr + offsets, mask = mask, other = 0) - normed_values = tl.load(normed_values_ptr + offsets, mask = mask, other = 0) - cum_grads = grads * normed_values - - tl.atomic_add(cum_grads_ptr + n_offsets, cum_grads, mask = mask) - - -@triton.jit -def _bp_norm_grads_p_kernel(grads_ptr, targets_ptr, normed_values_ptr, cum_grads_ptr, node_ids_ptr, tot_num_logits, - batch_size, BLOCK_SIZE: tl.constexpr): - - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE - - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < tot_num_logits * batch_size - - param_offsets = offsets // batch_size - batch_offsets = offsets % batch_size - - n_offsets = tl.load(node_ids_ptr + param_offsets, mask = mask, other = 0) - n_offsets = n_offsets * batch_size + batch_offsets - - grads = tl.load(grads_ptr + offsets, mask = mask, other = 0) - normed_values = tl.load(normed_values_ptr + offsets, mask = mask, other = 0) - cum_grads = tl.load(cum_grads_ptr + n_offsets, mask = mask, other = 1) - - grads = normed_values * (grads - cum_grads) - tl.store(targets_ptr + offsets, grads, mask = mask) - - -@triton.jit -def _bp_norm_grads_logp_kernel(grads_ptr, targets_ptr, cum_grads_ptr, node_ids_ptr, tot_num_logits, - batch_size, BLOCK_SIZE: tl.constexpr): - - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE - - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < tot_num_logits * batch_size - - param_offsets = offsets // batch_size - batch_offsets = offsets % batch_size - - n_offsets = tl.load(node_ids_ptr + param_offsets, mask = mask, other = 0) - n_offsets = n_offsets * batch_size + batch_offsets - - grads = tl.load(grads_ptr + offsets, mask = mask, other = 0) - cum_grads = tl.load(cum_grads_ptr + n_offsets, mask = mask, other = 1) - - grads = grads - cum_grads - tl.store(targets_ptr + offsets, grads, mask = mask) - - -def flat_softmax_fw(logits: torch.Tensor, node_ids: torch.Tensor, inplace: bool = False): - - num_logits = logits.size(0) - num_nodes = torch.max(node_ids).detach().cpu().item() + 1 - - if inplace: - targets = logits - else: - targets = torch.empty_like(logits) - - assert logits.is_cuda, "Input `logits` should be on GPU." 
- - if logits.dim() == 1: - logits = logits.unsqueeze(1) - - batch_size = logits.size(1) - - cum_weights = torch.zeros([num_nodes, batch_size], dtype = torch.float32, device = logits.device) - - grid1 = lambda meta: (triton.cdiv(num_logits * batch_size, meta['BLOCK_SIZE']),) - grid2 = lambda meta: (triton.cdiv(num_logits * batch_size, meta['BLOCK_SIZE']),) - - _fw_cum_logits_kernel[grid1](logits, cum_weights, node_ids, num_logits, batch_size, BLOCK_SIZE = 1024) - _fw_norm_logits_kernel[grid2](logits, targets, cum_weights, node_ids, num_logits, batch_size, BLOCK_SIZE = 1024) - - return targets - - -def flat_softmax_bp(grads: torch.Tensor, normed_values: torch.Tensor, node_ids: torch.Tensor, - log_param_grad: bool = False, inplace: bool = False): - - num_logits = grads.size(0) - num_nodes = torch.max(node_ids).detach().cpu().item() + 1 - - if inplace: - target_grads = grads - else: - target_grads = torch.empty_like(grads) - - assert grads.is_cuda, "Input `grads` should be on GPU." - - if grads.dim() == 1: - grads = grads.unsqueeze(1) - - batch_size = grads.size(1) - - cum_grads = torch.zeros([num_nodes, batch_size], dtype = torch.float32, device = grads.device) - - grid1 = lambda meta: (triton.cdiv(num_logits * batch_size, meta['BLOCK_SIZE']),) - grid2 = lambda meta: (triton.cdiv(num_logits * batch_size, meta['BLOCK_SIZE']),) - - _bp_cum_logits_kernel[grid1](grads, normed_values, cum_grads, node_ids, num_logits, batch_size, BLOCK_SIZE = 1024) - if not log_param_grad: - _bp_norm_grads_p_kernel[grid2](grads, target_grads, normed_values, cum_grads, node_ids, num_logits, batch_size, BLOCK_SIZE = 1024) - else: - _bp_norm_grads_logp_kernel[grid2](grads, target_grads, cum_grads, node_ids, num_logits, batch_size, BLOCK_SIZE = 1024) - - return target_grads diff --git a/src/pyjuice/functional/tying.py b/src/pyjuice/functional/tying.py deleted file mode 100644 index 60e4e1df..00000000 --- a/src/pyjuice/functional/tying.py +++ /dev/null @@ -1,108 +0,0 @@ -import torch -import triton -import triton.language as tl - -from typing import Optional - - -@triton.jit -def _aggregate_flows_kernel(param_flows_ptr, tied_param_flows_ptr, tied_param_ids_ptr, tied_param_group_ids_ptr, - num_params: tl.constexpr, batch_size: tl.constexpr, BLOCK_SIZE: tl.constexpr): - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE - - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < num_params * batch_size - - param_offsets = offsets // batch_size - batch_offsets = offsets % batch_size - - p_offsets = tl.load(tied_param_ids_ptr + param_offsets, mask = mask, other = 0) - p_offsets = p_offsets * batch_size + batch_offsets - - g_offsets = tl.load(tied_param_group_ids_ptr + param_offsets, mask = mask, other = 0) - g_offsets = g_offsets * batch_size + batch_offsets - - param_flows = tl.load(param_flows_ptr + p_offsets, mask = mask, other = 0) - - tl.atomic_add(tied_param_flows_ptr + g_offsets, param_flows, mask = mask) - - -@triton.jit -def _assign_flows_kernel(param_flows_ptr, tied_param_flows_ptr, tied_param_ids_ptr, tied_param_group_ids_ptr, - num_params: tl.constexpr, batch_size: tl.constexpr, BLOCK_SIZE: tl.constexpr): - pid = tl.program_id(axis = 0) - block_start = pid * BLOCK_SIZE - - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < num_params * batch_size - - param_offsets = offsets // batch_size - batch_offsets = offsets % batch_size - - p_offsets = tl.load(tied_param_ids_ptr + param_offsets, mask = mask, other = 0) - p_offsets = p_offsets * batch_size + batch_offsets - - 
g_offsets = tl.load(tied_param_group_ids_ptr + param_offsets, mask = mask, other = 0) - g_offsets = g_offsets * batch_size + batch_offsets - - gparam_flows = tl.load(tied_param_flows_ptr + g_offsets, mask = mask, other = 0) - - tl.store(param_flows_ptr + p_offsets, gparam_flows, mask = mask) - - -def tie_param_flows(param_flows: torch.Tensor, num_tied_params: int, - tied_param_ids: torch.Tensor, tied_param_group_ids: torch.Tensor, - tied_param_flows: Optional[torch.Tensor] = None, BLOCK_SIZE: int = 1024): - - if param_flows.dim() == 1: - param_flows = param_flows.unsqueeze(1) - - num_params = tied_param_ids.size(0) - batch_size = param_flows.size(1) - - # Allocate buffer if not already - if tied_param_flows is None: - tied_param_flows = torch.zeros([num_tied_params, batch_size], device = param_flows.device) - else: - assert tied_param_flows.size(0) == num_tied_params and tied_param_flows.size(1) == batch_size, "Size of `tied_param_flows` is incorrect." - tied_param_flows = tied_param_flows[:,:] - - if param_flows.is_cuda: - assert tied_param_flows.is_cuda and tied_param_ids.is_cuda and tied_param_group_ids.is_cuda - - grid = lambda meta: (triton.cdiv(num_params * batch_size, meta['BLOCK_SIZE']),) - - _aggregate_flows_kernel[grid]( - param_flows_ptr = param_flows, - tied_param_flows_ptr = tied_param_flows, - tied_param_ids_ptr = tied_param_ids, - tied_param_group_ids_ptr = tied_param_group_ids, - num_params = num_params, - batch_size = batch_size, - BLOCK_SIZE = BLOCK_SIZE - ) - - grid = lambda meta: (triton.cdiv(num_params * batch_size, meta['BLOCK_SIZE']),) - - _assign_flows_kernel[grid]( - param_flows_ptr = param_flows, - tied_param_flows_ptr = tied_param_flows, - tied_param_ids_ptr = tied_param_ids, - tied_param_group_ids_ptr = tied_param_group_ids, - num_params = num_params, - batch_size = batch_size, - BLOCK_SIZE = BLOCK_SIZE - ) - - else: - cum_matrix = torch.sparse_coo_tensor( - torch.stack((tied_param_group_ids, tied_param_ids), dim = 0), - torch.ones([num_params], dtype = torch.float32, device = param_flows.device), - (num_tied_params, param_flows.size(0)) - ) - par_group_buffer = torch.sparse.mm(cum_matrix, param_flows) # [num_tied_params, B] - - param_flows[tied_param_ids] = par_group_buffer[tied_param_group_ids] - - return None \ No newline at end of file diff --git a/src/pyjuice/layer/backend/node_partition.py b/src/pyjuice/layer/backend/node_partition.py index 3f424dcf..ec846dd2 100644 --- a/src/pyjuice/layer/backend/node_partition.py +++ b/src/pyjuice/layer/backend/node_partition.py @@ -197,8 +197,7 @@ def partition_nodes_by_n_edges(node_n_edges: Union[np.ndarray, torch.Tensor], if isinstance(node_n_edges, torch.Tensor): node_n_edges = node_n_edges.detach().cpu().numpy() - max_num_edges = node_n_edges.max() - target_overhead = None if sparsity_tolerance is None else int(math.ceil(node_n_edges.shape[0] * max_num_edges * sparsity_tolerance)) + target_overhead = None if sparsity_tolerance is None else int(math.ceil(node_n_edges.sum() * (1.0 + sparsity_tolerance))) if max_num_partitions == 1: partitions = np.zeros([1], dtype = np.int64) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index edb0737f..f1f18c8d 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -894,11 +894,11 @@ def get_prod_layer_parstats(flat_cids: torch.Tensor, global_nid_start: int): u_cids, par_counts = torch.unique(flat_cids, sorted = True, return_counts = True) - c_sids = torch.arange(0, u_cids.size(0))[u_cids == global_nid_start] + 
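# Illustration of the dummy-node stripping performed just below in
# `get_prod_layer_parstats`: child ids smaller than `global_nid_start` refer to
# dummy nodes and must not receive parent-count statistics. Toy values chosen
# here for demonstration only:

import torch

global_nid_start = 4
flat_cids = torch.tensor([0, 0, 4, 5, 5, 6])             # two references to a dummy node
u_cids, par_counts = torch.unique(flat_cids, sorted = True, return_counts = True)

c_sids = torch.arange(0, u_cids.size(0))[u_cids < global_nid_start]
if c_sids.numel() > 0:
    c_sid = c_sids.max() + 1                              # first non-dummy position
    u_cids, par_counts = u_cids[c_sid:], par_counts[c_sid:]

assert u_cids.tolist() == [4, 5, 6] and par_counts.tolist() == [1, 2, 1]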
c_sids = torch.arange(0, u_cids.size(0))[u_cids < global_nid_start] if c_sids.numel() > 0: # Strip away dummy nodes - c_sid = c_sids.min() + 1 + c_sid = c_sids.max() + 1 u_cids = u_cids[c_sid:] par_counts = par_counts[c_sid:] diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index f2eb2824..6cdfc767 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -88,7 +88,7 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, max_tied_ns_ node2tiednodes[source_ns][1] += 1 self._output_ind_range = (cum_nodes - layer_num_nodes, cum_nodes) - self.num_params = cum_params + self.num_parameters = cum_params self.num_param_flows = cum_param_flows self.num_nodes = layer_num_nodes self.dist_signature = dist_signature @@ -114,7 +114,7 @@ def __init__(self, nodes: Sequence[InputNodes], cum_nodes: int = 0, max_tied_ns_ source_nids = torch.empty([cum_source_ns], dtype = torch.long) # Parameters of this layer - params = torch.empty([self.num_params], dtype = torch.float32) + params = torch.empty([self.num_parameters], dtype = torch.float32) n_start = 0 source_n_start = 0 @@ -442,7 +442,7 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): raise NotImplementedError("CPU minibatch em fn for input nodes is not implemented.") def get_param_specs(self): - return {"params": torch.Size([self.num_params])} + return {"params": torch.Size([self.num_parameters])} def enable_partial_evaluation(self, fw_scopes: Optional[Union[Sequence[BitSet],Sequence[int]]] = None, bk_scopes: Optional[Union[Sequence[BitSet],Sequence[int]]] = None, return_ids: bool = False): diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index c8a90f2a..e0d349f7 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -192,6 +192,10 @@ def to(self, device): new_v = [tensor.to(device) for tensor in v] self._cached_fw_compiled_pcids[k] = new_v + @property + def num_parameters(self): + return self._layer_pid_range[1] - self._layer_pid_range[0] + def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor) -> None: """ Computes the forward pass of a sum layer: diff --git a/src/pyjuice/model/backend/__init__.py b/src/pyjuice/model/backend/__init__.py new file mode 100644 index 00000000..6580dde5 --- /dev/null +++ b/src/pyjuice/model/backend/__init__.py @@ -0,0 +1,3 @@ +from .parflow_fusing import compile_cum_par_flows_fn, compute_cum_par_flows, cum_par_flows_to_device +from .par_update import compile_par_update_fn, em_par_update, par_update_to_device +from .normalize import normalize_parameters diff --git a/src/pyjuice/model/backend/normalize.py b/src/pyjuice/model/backend/normalize.py new file mode 100644 index 00000000..1da31b4c --- /dev/null +++ b/src/pyjuice/model/backend/normalize.py @@ -0,0 +1,148 @@ +from __future__ import annotations + +import torch +import torch.nn as nn +import triton +import triton.language as tl +from numba import njit + + +@triton.jit +def cum_par_kernel(cum_pflows, params, par_start_ids, blk_sizes, blk_intervals, + global_nids, num_blocks, BLOCK_ID: tl.constexpr, BLOCK_SIZE: tl.constexpr): + + pid = tl.program_id(axis = 0) + + offs_m = pid * BLOCK_ID + tl.arange(0, BLOCK_ID) + mask_m = offs_m < num_blocks + + offs_blk = tl.arange(0, BLOCK_SIZE) + + par_start = tl.load(par_start_ids + offs_m, mask = mask_m, other = 0) + blk_size = tl.load(blk_sizes + offs_m, mask = mask_m, other = 0) + blk_interval = tl.load(blk_intervals + offs_m, mask = mask_m, 
other = 0) + global_nid = tl.load(global_nids + offs_m, mask = mask_m, other = 0) + + offs_par = par_start[:,None] + offs_blk[None,:] * blk_interval[:,None] + mask_par = mask_m[:,None] & (offs_blk[None,:] < blk_size[:,None]) + pars = tl.load(params + offs_par, mask = mask_par, other = 0) + sum_pars = tl.sum(pars, axis = 1) + + tl.atomic_add(cum_pflows + global_nid, sum_pars, mask = mask_m) + + +@triton.jit +def par_update_kernel(params, cum_pflows, nchs, par_start_ids, blk_sizes, blk_intervals, + global_nids, constexprs, num_blocks, BLOCK_ID: tl.constexpr, BLOCK_SIZE: tl.constexpr): + + pid = tl.program_id(axis = 0) + + # Retrieve the constants + pseudocount = tl.load(constexprs) + + offs_m = pid * BLOCK_ID + tl.arange(0, BLOCK_ID) + mask_m = offs_m < num_blocks + + offs_blk = tl.arange(0, BLOCK_SIZE) + + par_start = tl.load(par_start_ids + offs_m, mask = mask_m, other = 0) + blk_size = tl.load(blk_sizes + offs_m, mask = mask_m, other = 0) + blk_interval = tl.load(blk_intervals + offs_m, mask = mask_m, other = 0) + global_nid = tl.load(global_nids + offs_m, mask = mask_m, other = 0) + + offs_par = par_start[:,None] + offs_blk[None,:] * blk_interval[:,None] + mask_par = mask_m[:,None] & (offs_blk[None,:] < blk_size[:,None]) + pars = tl.load(params + offs_par, mask = mask_par, other = 0) + + sum_pars = tl.load(cum_pflows + global_nid, mask = mask_m, other = 1) + nch = tl.load(nchs + global_nid, mask = mask_m, other = 1) + + norm_param = (pars + pseudocount / nch[:,None]) / (sum_pars[:,None] + pseudocount) + + tl.store(params + offs_par, norm_param, mask = mask_par) + + +@njit +def cum_par_numba_kernel(cum_pflows, params, par_start_ids, blk_sizes, blk_intervals, global_nids): + for i in range(par_start_ids.shape[0]): + par_start_id = par_start_ids[i] + blk_size = blk_sizes[i] + blk_interval = blk_intervals[i] + global_nid = global_nids[i] + + cum_par = 0.0 + for j in range(blk_size): + cum_par += params[par_start_id+j*blk_interval] + + cum_pflows[global_nid] += cum_par + + +@njit +def par_update_numba_kernel(params, cum_pflows, nchs, par_start_ids, blk_sizes, blk_intervals, global_nids, pseudocount): + for i in range(par_start_ids.shape[0]): + par_start_id = par_start_ids[i] + blk_size = blk_sizes[i] + blk_interval = blk_intervals[i] + global_nid = global_nids[i] + + cum_par = cum_pflows[global_nid] + nch = nchs[global_nid] + + for j in range(blk_size): + par = params[par_start_id+j*blk_interval] + norm_par = (par + pseudocount / nch) + (cum_par + pseudocount) + params[par_start_id+j*blk_interval] = norm_par + + +def normalize_parameters(params, par_update_kwargs, pseudocount: float = 0.0): + + par_start_ids, _, blk_sizes, blk_intervals, global_nids, nchs, cum_pflows, metadata = par_update_kwargs + + tot_num_nodes = metadata["tot_num_nodes"] + BLOCK_SIZE = metadata["BLOCK_SIZE"] + + if cum_pflows is None: + cum_pflows = torch.zeros([tot_num_nodes], dtype = torch.float32, device = params.device) + else: + cum_pflows[:] = 0.0 + + use_cuda = params.is_cuda + + if use_cuda: + + num_blocks = par_start_ids.size(0) + BLOCK_ID = 2048 // BLOCK_SIZE + + grid = (triton.cdiv(num_blocks, BLOCK_ID),) + + cum_par_kernel[grid]( + cum_pflows, params, par_start_ids, blk_sizes, blk_intervals, + global_nids, num_blocks, BLOCK_ID, BLOCK_SIZE + ) + + constexprs = torch.tensor([pseudocount]).to(params.device) + + par_update_kernel[grid]( + params, cum_pflows, nchs, par_start_ids, blk_sizes, blk_intervals, + global_nids, constexprs, num_blocks, BLOCK_ID, BLOCK_SIZE + ) + + else: + + cum_pflows = cum_pflows.numpy() + 
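# Reference semantics of `normalize_parameters`: within every sum node, the
# (Laplace-smoothed) parameters are rescaled to sum to one, matching the
# Triton/numba kernels above. Standalone NumPy sketch; the helper name and the
# flat `node_of_param` indexing are illustrative simplifications of the block
# metadata actually used:

import numpy as np

def normalize_reference(params, node_of_param, nchs, pseudocount = 0.0):
    # params: flat parameter vector; node_of_param: owning node id per parameter;
    # nchs: number of child parameters of each node.
    params = params.copy()
    for nid in np.unique(node_of_param):
        idx = np.where(node_of_param == nid)[0]
        tot = params[idx].sum()
        params[idx] = (params[idx] + pseudocount / nchs[nid]) / (tot + pseudocount)
    return params

p = normalize_reference(np.array([1.0, 3.0]), np.array([0, 0]),
                        nchs = np.array([2]), pseudocount = 2.0)
assert np.allclose(p, [2.0 / 6.0, 4.0 / 6.0]) and np.isclose(p.sum(), 1.0)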
params = params.numpy() + par_start_ids = par_start_ids.numpy() + blk_sizes = blk_sizes.numpy() + blk_intervals = blk_intervals.numpy() + global_nids = global_nids.numpy() + nchs = nchs.numpy() + + cum_par_numba_kernel( + cum_pflows, params, par_start_ids, blk_sizes, + blk_intervals, global_nids + ) + + par_update_numba_kernel( + params, cum_pflows, nchs, par_start_ids, blk_sizes, + blk_intervals, global_nids, pseudocount + ) diff --git a/src/pyjuice/model/backend/par_update.py b/src/pyjuice/model/backend/par_update.py index 0eba2109..a5cc20dc 100644 --- a/src/pyjuice/model/backend/par_update.py +++ b/src/pyjuice/model/backend/par_update.py @@ -1,5 +1,6 @@ from __future__ import annotations +import numpy as np import torch import torch.nn as nn import triton @@ -10,7 +11,7 @@ @njit -def _record_par_blks(par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, +def _record_par_blks(par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, num_edges_per_ng, ns_num_node_groups, ns_group_size, cs_group_size, pid, global_nid, par_start, pflow_start, BLOCK_SIZE): for local_ngid in range(ns_num_node_groups): @@ -31,6 +32,7 @@ def _record_par_blks(par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, g blk_sizes[pid] = blk_size blk_intervals[pid] = ns_group_size global_nids[pid] = global_nid + gid + nchs[pid] = num_edges * cs_group_size pid += 1 @@ -49,6 +51,7 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in blk_sizes = np.zeros([buffer_inc_interval], dtype = np.int64) blk_intervals = np.zeros([buffer_inc_interval], dtype = np.int64) global_nids = np.zeros([buffer_inc_interval], dtype = np.int64) + nchs = np.zeros([buffer_inc_interval], dtype = np.int64) pid = 0 global_nid = 0 @@ -63,7 +66,7 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in num_edges_per_ng = torch.bincount(ns.edge_ids[0,:], minlength = ns.num_node_groups).contiguous().numpy() # Enlarge the buffer if needed - est_num_slots = triton.cdiv(ns.edges.size(1) * ns.group_size * ns.ch_group_size, BLOCK_SIZE) + ns.num_nodes + est_num_slots = triton.cdiv(ns.edge_ids.size(1) * ns.group_size * ns.ch_group_size, BLOCK_SIZE) + ns.num_nodes if pid + est_num_slots > par_start_ids.shape[0]: curr_size = par_start_ids.shape[0] inc_shape = triton.cdiv(pid + est_num_slots - curr_size, buffer_inc_interval) * buffer_inc_interval @@ -73,6 +76,7 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in blk_sizes = np.ascontiguousarray(blk_sizes.resize(curr_size + inc_shape)) blk_intervals = np.ascontiguousarray(blk_intervals.resize(curr_size + inc_shape)) global_nids = np.ascontiguousarray(global_nids.resize(curr_size + inc_shape)) + nchs = np.ascontiguousarray(nchs.resize(curr_size + inc_shape)) if use_numba: @@ -81,7 +85,7 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in cs_group_size = ns.ch_group_size global_nid, pid = _record_par_blks( - par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, + par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, num_edges_per_ng, ns_num_node_groups, ns_group_size, cs_group_size, pid, global_nid, par_start, pflow_start, BLOCK_SIZE ) @@ -106,6 +110,7 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in blk_sizes[pid:pid+ns.group_size] = blk_size blk_intervals[pid:pid+ns.group_size] = ns.group_size global_nids[pid:pid+ns.group_size] = curr_global_nids + nchs[pid:pid+ns.group_size] = 
num_edges * ns.ch_group_size pid += ns.group_size @@ -116,12 +121,29 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in blk_sizes = torch.from_numpy(blk_sizes[:pid]).contiguous() blk_intervals = torch.from_numpy(blk_intervals[:pid]).contiguous() global_nids = torch.from_numpy(global_nids[:pid]).contiguous() + nchs = torch.from_numpy(nchs[:pid]).contiguous() cum_pflows = torch.zeros([global_nids[-1] + 1], dtype = torch.float32) metadata = {"tot_num_nodes": global_nids[-1] + 1, "BLOCK_SIZE": BLOCK_SIZE} - return par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, cum_pflows, metadata + return [par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, cum_pflows, metadata] + + +def par_update_to_device(par_update_kwargs, device): + + par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, cum_pflows, metadata = par_update_kwargs + + return [ + par_start_ids.to(device), + pflow_start_ids.to(device), + blk_sizes.to(device), + blk_intervals.to(device), + global_nids.to(device), + nchs.to(device), + cum_pflows.to(device), + metadata + ] @triton.jit @@ -148,6 +170,7 @@ def cum_pflow_kernel(cum_pflows, param_flows, pflow_start_ids, blk_sizes, blk_in tl.atomic_add(cum_pflows + global_nid, nflows, mask = mask_m) +@triton.jit def par_update_kernel(params, param_flows, cum_pflows, nchs, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, constexprs, num_blocks, BLOCK_ID: tl.constexpr, BLOCK_SIZE: tl.constexpr): @@ -185,7 +208,7 @@ def par_update_kernel(params, param_flows, cum_pflows, nchs, par_start_ids, pflo def em_par_update(params, param_flows, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, - global_nids, metadata, step_size: float, pseudocount: float = 0.0, cum_pflows = None): + global_nids, nchs, metadata, step_size: float, pseudocount: float = 0.0, cum_pflows = None): tot_num_nodes = metadata["tot_num_nodes"] BLOCK_SIZE = metadata["BLOCK_SIZE"] @@ -208,6 +231,6 @@ def em_par_update(params, param_flows, par_start_ids, pflow_start_ids, blk_sizes constexprs = torch.tensor([step_size, pseudocount]).to(params.device) par_update_kernel[grid]( - params, param_flows, cum_pflows, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, + params, param_flows, cum_pflows, nchs, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, constexprs, num_blocks, BLOCK_ID, BLOCK_SIZE ) diff --git a/src/pyjuice/model/backend/parflow_fusing.py b/src/pyjuice/model/backend/parflow_fusing.py index 53198b75..e3c3741d 100644 --- a/src/pyjuice/model/backend/parflow_fusing.py +++ b/src/pyjuice/model/backend/parflow_fusing.py @@ -8,7 +8,7 @@ def compile_cum_par_flows_fn(node2tiednodes, MAX_NGROUPS = 2048, BLOCK_SIZE = 2048): - ngroup2kernel_specs = [] + ngroup2kernel_specs = dict() for source_ns, item in node2tiednodes.items(): if len(item[0]) > 1: # If the length is 1, then everything is already accumulated in the source node's parflow num_par_flows = source_ns._param_flow_range[1] - source_ns._param_flow_range[0] @@ -60,6 +60,21 @@ def compile_cum_par_flows_fn(node2tiednodes, MAX_NGROUPS = 2048, BLOCK_SIZE = 20 return kernels_args +def cum_par_flows_to_device(kernels_args, device): + for i in range(len(kernels_args)): + target_pfids, block_sizes, ch_pfids, BLOCK_G, BLOCK_M = kernels_args[i] + + kernels_args[i] = [ + target_pfids.to(device), + block_sizes.to(device), + ch_pfids.to(device), + BLOCK_G, + BLOCK_M + ] + + return kernels_args + + @triton.jit def 
cum_par_flows_kernel(param_flows, target_pfids, block_sizes, ch_pfids, BLOCK_G: tl.constexpr, BLOCK_M: tl.constexpr): diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index abe08a14..772d010d 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -11,10 +11,13 @@ from pyjuice.nodes import CircuitNodes, InputNodes, ProdNodes, SumNodes, foreach from pyjuice.layer import Layer, InputLayer, ProdLayer, SumLayer -from pyjuice.functional import normalize_parameters, flat_softmax_fw, flat_softmax_bp from pyjuice.utils.grad_fns import ReverseGrad, PseudoHookFunc from pyjuice.utils import BitSet +from .backend import compile_cum_par_flows_fn, compute_cum_par_flows, cum_par_flows_to_device, \ + compile_par_update_fn, em_par_update, par_update_to_device, \ + normalize_parameters + def _pc_model_backward_hook(grad, pc, **kwargs): grad = grad.permute(1, 0) @@ -48,30 +51,43 @@ def _pc_inputs_hook(grad, pc, i): class TensorCircuit(nn.Module): - def __init__(self, root_nodes: CircuitNodes, layer_sparsity_tol: float = 0.5, + def __init__(self, root_ns: CircuitNodes, layer_sparsity_tol: float = 0.5, max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False, + force_gpu_compilation: bool = False, + max_tied_ns_per_parflow_group: int = 8, verbose: bool = True) -> None: """ - Create a tensorized circuit for the circuit rooted at `root_nodes`. + Create a tensorized circuit for the circuit rooted at `root_ns`. Parameters: - `root_nodes`: root node(s) of the circuit - `layer_sparsity_tol`: the minimum allowed sparsity of compiled layers; ranges from 0.0 to 1.0; larger means more strict - `max_num_partitions`: how many groups do we want to split a layer into - `disable_gpu_compilation`: disable GPU compilation of the layers + `root_ns`: root nodes of the circuit + `layer_sparsity_tol`: the minimum allowed sparsity of compiled layers; ranges from 0.0 to 1.0; smaller means more strict + `max_num_partitions`: how many groups do we want to split a layer into + `disable_gpu_compilation`: disable GPU compilation of the layers + `force_gpu_compilation`: always use GPU when compiling the layers + `max_tied_ns_per_parflow_group`: when there are tied nodes, specify at most how many nodes share a parameter flow accumulation buffer """ - super().__init__() + super(TensorCircuit, self).__init__() - self.root_nodes = root_nodes + self.root_ns = root_ns self.device = torch.device("cpu") self._init_pass_tensors() self._init_layers( - layer_sparsity_tol = layer_sparsity_tol, max_num_partitions = max_num_partitions, - disable_gpu_compilation = disable_gpu_compilation, verbose = verbose + layer_sparsity_tol = layer_sparsity_tol, + max_num_partitions = max_num_partitions, + disable_gpu_compilation = disable_gpu_compilation, + force_gpu_compilation = force_gpu_compilation, + max_tied_ns_per_parflow_group = max_tied_ns_per_parflow_group, + verbose = verbose ) - self._init_ad_tensors() + + # Hyperparameters for backward pass + self._optim_hyperparams = { + "compute_param_flows": True, + "flows_memory": 0.0 + } def _init_pass_tensors(self): self.node_mars = None @@ -79,17 +95,6 @@ def _init_pass_tensors(self): self.node_flows = None self.element_flows = None self.param_flows = None - - def _init_ad_tensors(self): - self._inputs = [None, None] - self._inputs_grad = [None, None] - self._backward_buffer = dict() - - self._optim_hyperparams = { - "compute_param_flows": True, - "flows_memory": 0.0 - } - self._used_external_sum_params = False def 
forward(self, inputs: torch.Tensor, params: Optional[torch.Tensor] = None, @@ -426,6 +431,12 @@ def to(self, device): self.device = device + # For parameter flow accumulation + self.parflow_fusing_kwargs = cum_par_flows_to_device(self.parflow_fusing_kwargs, device) + + # For parameter update + self.par_update_kwargs = par_update_to_device(self.par_update_kwargs, device) + return self def get_param_specs(self): @@ -522,139 +533,156 @@ def disable_partial_evaluation(self, forward: bool = True, backward: bool = True self._pv_node_flows_mask = None - def _init_layers(self, init_input_params: Optional[Sequence[torch.Tensor]] = None, - init_inner_params: Optional[torch.Tensor] = None, - layer_sparsity_tol: float = 0.0, max_num_partitions: Optional[int] = None, - disable_gpu_compilation: bool = False, verbose: bool = True): - - self.root_nodes._clear_tensor_circuit_hooks() - depth2nodes, num_layers = self._create_node_layers() + def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_partitions: Optional[int] = None, + disable_gpu_compilation: bool = False, force_gpu_compilation: bool = False, + max_tied_ns_per_parflow_group: int = 8, verbose: bool = True): if hasattr(self, "input_layers") or hasattr(self, "inner_layers"): raise ValueError("Attempting to initialize a TensorCircuit for the second time. " + \ - "Please instead create a new TensorCircuit instance by `TensorCircuit(nodes)`.") + "Please instead create a new TensorCircuit instance by calling `pc = TensorCircuit(root_ns)`.") + + # Clear hooks/pointers used by previous `TensorCircuit`s + self.root_ns._clear_tensor_circuit_hooks() + + # Create layers + depth2nodes, num_layers, max_node_group_size, max_ele_group_size = self._create_node_layers() self.input_layers = [] self.inner_layers = [] - # Nodes include one dummy node and all input/sum nodes in the PC - num_nodes = 1 + self.num_dummy_nodes = max_ele_group_size + self.num_dummy_eles = max_node_group_size + + # Nodes include `max_ele_group_size` dummy nodes and all input/sum nodes in the PC + num_nodes = max_ele_group_size # Total number of edges num_edges = 0 - # Elements include one dummy element and all product nodes in the PC - num_elements = 1 + # Elements include `max_node_group_size` dummy elements and all product nodes in the PC + num_elements = max_node_group_size - # Number of parameters for sum nodes in the PC, plus one dummy parameter - param_ends = [1] + # Number of parameters + num_parameters = max_node_group_size - # Index mapping from original parameter space to a tied parameter space - tied_param_ids = [] - tied_param_group_ids = [] - tied_param_ends = [] + # Number of parameter flows + num_param_flows = 0 - import pdb; pdb.set_trace() + # Stores distributed parameter flows + node2tiednodes = dict() if verbose: print(f"Compiling {num_layers} layers...") + layer_id = 0 for depth in tqdm(range(num_layers), disable = not verbose): if depth == 0: # Input layer - type2nodes = self._categorize_input_nodes(depth2nodes[0]["input"]) + signature2nodes = self._categorize_input_nodes(depth2nodes[0]["input"]) input_layer_id = 0 - for NodeType, nodes in type2nodes.items(): - input_layer = InputLayer(nodes = nodes, cum_nodes = num_nodes) + for signature, nodes in signature2nodes.items(): + input_layer = InputLayer( + nodes = nodes, cum_nodes = num_nodes, + max_tied_ns_per_parflow_group = max_tied_ns_per_parflow_group + ) - num_nodes += input_layer.num_nodes self.input_layers.append(input_layer) self.add_module(f"input_layer_{input_layer_id}", input_layer) + 
input_layer_id += 1 + num_nodes += input_layer.num_nodes else: assert len(depth2nodes[depth]["prod"]) > 0 and len(depth2nodes[depth]["sum"]) > 0, \ - "Depth {}: ({}, {})".format(depth, len(depth2nodes[depth]["prod"]), len(depth2nodes[depth]["sum"])) - - # Product layer - prod_layer = ProdLayer( - nodes = depth2nodes[depth]["prod"], - layer_sparsity_tol = layer_sparsity_tol, - max_num_partitions = max_num_partitions, - disable_gpu_compilation = disable_gpu_compilation - ) - - if prod_layer.num_nodes + 1 > num_elements: - num_elements = prod_layer.num_nodes + 1 - - self.add_module(f"prod_layer_{layer_id}", prod_layer) - self.inner_layers.append(prod_layer) - - # Sum layer - sum_layer = SumLayer( - nodes = depth2nodes[depth]["sum"], - global_nid_start = num_nodes, - param_ends = param_ends, - tied_param_ids = tied_param_ids, - tied_param_group_ids = tied_param_group_ids, - tied_param_ends = tied_param_ends, - ch_prod_layer_size = prod_layer.num_nodes + 1, - layer_sparsity_tol = layer_sparsity_tol, - max_num_partitions = max_num_partitions, - disable_gpu_compilation = disable_gpu_compilation - ) - - num_nodes += sum_layer.num_nodes - num_edges += prod_layer.num_edges + sum_layer.num_edges - - self.add_module(f"sum_layer_{layer_id}", sum_layer) - self.inner_layers.append(sum_layer) + "Depth {}: (# prod nodes: {}, # sum nodes: {})".format(depth, len(depth2nodes[depth]["prod"]), len(depth2nodes[depth]["sum"])) + + # Product layer(s) + gsize2prod_nodes = dict() + for ns in depth2nodes[depth]["prod"]: + gsize = ns.group_size + if gsize not in gsize2prod_nodes: + gsize2prod_nodes[gsize] = [] + gsize2prod_nodes[gsize].append(ns) + + layer_num_elements = max_node_group_size + for gsize, nodes in gsize2prod_nodes.items(): + prod_layer = ProdLayer( + nodes = nodes, + global_nid_start = layer_num_elements, + layer_sparsity_tol = layer_sparsity_tol, + max_num_partitions = max_num_partitions, + disable_gpu_compilation = disable_gpu_compilation, + force_gpu_compilation = force_gpu_compilation + ) + + layer_num_elements += prod_layer.num_nodes + num_edges += prod_layer.num_edges + + self.add_module(f"prod_layer_{layer_id}_{gsize}", prod_layer) + self.inner_layers.append(prod_layer) + + if layer_num_elements > num_elements: + num_elements = layer_num_elements + + # Sum layer(s) + gsize2sum_nodes = dict() + for ns in depth2nodes[depth]["sum"]: + gsize = ns.group_size + if gsize not in gsize2sum_nodes: + gsize2sum_nodes[gsize] = [] + gsize2sum_nodes[gsize].append(ns) + + for gsize, nodes in gsize2sum_nodes.items(): + sum_layer = SumLayer( + nodes = nodes, + global_nid_start = num_nodes, + global_pid_start = num_parameters, + global_pfid_start = num_param_flows, + node2tiednodes = node2tiednodes, + layer_sparsity_tol = layer_sparsity_tol, + max_num_partitions = max_num_partitions, + max_tied_ns_per_parflow_group = max_tied_ns_per_parflow_group, + disable_gpu_compilation = disable_gpu_compilation, + force_gpu_compilation = force_gpu_compilation + ) + + num_nodes += sum_layer.num_nodes + num_edges += sum_layer.num_edges + num_parameters += sum_layer.num_parameters + + self.add_module(f"sum_layer_{layer_id}_{gsize}", sum_layer) + self.inner_layers.append(sum_layer) layer_id += 1 self.num_nodes = num_nodes self.num_edges = num_edges self.num_elements = num_elements - self.num_sum_params = param_ends[-1] - self.param_ends = param_ends - - # For parameter normalization - # Node that input nodes are implicitly omitted as they have no child - node_ids = torch.empty([self.num_sum_params], dtype = torch.long) - node_nchs 
= torch.empty([len(self.param_ends)], dtype = torch.long) - node_ids[:self.param_ends[0]] = 0 - node_nchs[0] = self.param_ends[0] - for i in range(1, len(self.param_ends)): - node_ids[self.param_ends[i-1]:self.param_ends[i]] = i - node_nchs[i] = self.param_ends[i] - self.param_ends[i-1] - - self.register_buffer("node_ids", node_ids) - self.register_buffer("node_nchs", node_nchs) + self.num_sum_params = num_parameters + self.num_param_flows = num_param_flows - # For parameter tying - self.num_tied_params = tied_param_ends[-1] if len(tied_param_ends) > 0 else 0 - if self.num_tied_params > 0: - tied_param_ids = torch.tensor(tied_param_ids).long() - tied_param_group_ids = torch.tensor(tied_param_group_ids).long() - self.register_buffer("tied_param_ids", tied_param_ids) - self.register_buffer("tied_param_group_ids", tied_param_group_ids) + # For parameter flow accumulation + self.parflow_fusing_kwargs = compile_cum_par_flows_fn(node2tiednodes, MAX_NGROUPS = 2048, BLOCK_SIZE = 2048) + + # For parameter update + self.par_update_kwargs = compile_par_update_fn(self.root_ns, BLOCK_SIZE = 32) # Register root nodes - self.num_root_nodes = self.inner_layers[-1].num_nodes + self.num_root_nodes = self.root_ns.num_nodes self._root_node_range = (self.num_nodes - self.num_root_nodes, self.num_nodes) # Initialize parameters self._init_parameters() - def _init_parameters(self, perturbation: float = 4.0, pseudocount: float = 1e-6): + def _init_parameters(self, perturbation: float = 4.0, pseudocount: float = 0.0): params = torch.exp(torch.rand([self.num_sum_params]) * -perturbation) + params[:self.num_dummy_eles] = 0.0 # Copy initial parameters if provided - for layer in self.inner_layers: - if isinstance(layer, SumLayer): - for ns in layer.nodes: - if not ns.is_tied() and ns.has_params(): - sidx, eidx = ns._param_range - params[sidx:eidx] = ns._params[ns._inverse_param_ids].to(params.device) + for ns in self.root_ns: + if ns.is_sum() and not ns.is_tied() and ns.has_params(): + sidx, eidx = ns._param_range + ns_params = ns._params[ns._inverse_param_ids,:,:].permute(0, 2, 1).reshape(-1) + params[sidx:eidx] = ns_params.to(params.device) self._normalize_parameters(params, pseudocount = pseudocount) self.params = nn.Parameter(params) @@ -663,20 +691,28 @@ def _init_parameters(self, perturbation: float = 4.0, pseudocount: float = 1e-6) # gradient of PC parameters by PyTorch. 
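        # A sketch of the initialization scheme above (illustrative, not the layer-wise kernels):
        # every sum-node parameter starts at exp(-perturbation * u) with u ~ U[0, 1), and the
        # subsequent normalization applies additive smoothing, e.g. for a single node with
        # `nch` child parameters
        #   params = torch.exp(torch.rand(nch) * -perturbation)
        #   params = (params + pseudocount / nch) / (params.sum() + pseudocount)
        # so that each sum node's parameters form a distribution over its children.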
self.params.requires_grad = False + # Initialize parameters for input layers for idx, layer in enumerate(self.input_layers): layer._init_parameters(perturbation) def _normalize_parameters(self, params, pseudocount: float = 0.0): if params is not None: - normalize_parameters(params, self.node_ids, self.node_nchs, pseudocount) + normalize_parameters(params, self.par_update_kwargs, pseudocount) def _create_node_layers(self): depth2nodes = dict() nodes2depth = dict() - num_layers = [1] + num_layers = 1 + max_node_group_size = 0 + max_ele_group_size = 0 def dfs(ns: CircuitNodes): + + nonlocal num_layers + nonlocal max_node_group_size + nonlocal max_ele_group_size + if ns in nodes2depth: return if ns.is_input(): @@ -689,7 +725,7 @@ def dfs(ns: CircuitNodes): dfs(cs) depth = max(map(lambda ms: nodes2depth[ms], ns.chs)) + (1 if ns.is_prod() else 0) - num_layers[0] = max(depth + 1, num_layers[0]) + num_layers = max(depth + 1, num_layers) nodes2depth[ns] = depth if depth not in depth2nodes: @@ -697,10 +733,15 @@ def dfs(ns: CircuitNodes): if ns.is_sum(): depth2nodes[depth]["sum"].append(ns) - elif not ns.is_prod(): + if ns.group_size > max_node_group_size: + max_node_group_size = ns.group_size + elif ns.is_prod(): + if ns.group_size > max_ele_group_size: + max_ele_group_size = ns.group_size + else: raise NotImplementedError(f"Unsupported node type {type(n)}.") - dfs(self.root_nodes) + dfs(self.root_ns) pns2layer = dict() for layer in range(1, len(depth2nodes)): @@ -713,17 +754,17 @@ def dfs(ns: CircuitNodes): depth2nodes[layer]["prod"].append(cs) pns2layer[id(cs)] = layer - return depth2nodes, num_layers[0] + return depth2nodes, num_layers, max_node_group_size, max_ele_group_size def _categorize_input_nodes(self, nodes: Sequence[InputNodes]): - type2nodes = dict() + signature2nodes = dict() for ns in nodes: - ltype = ns.dist.get_signature() - if ltype not in type2nodes: - type2nodes[ltype] = [] - type2nodes[ltype].append(ns) + signature = ns.dist.get_signature() + if signature not in signature2nodes: + signature2nodes[signature] = [] + signature2nodes[signature].append(ns) - return type2nodes + return signature2nodes def _create_scope2nid_cache(self): # Input layers diff --git a/src/pyjuice/nodes/backend/__init__.py b/src/pyjuice/nodes/backend/__init__.py new file mode 100644 index 00000000..6a1cf5a3 --- /dev/null +++ b/src/pyjuice/nodes/backend/__init__.py @@ -0,0 +1 @@ +from .normalize import normalize_ns_parameters \ No newline at end of file diff --git a/src/pyjuice/functional/normalize.py b/src/pyjuice/nodes/backend/normalize.py similarity index 96% rename from src/pyjuice/functional/normalize.py rename to src/pyjuice/nodes/backend/normalize.py index 27809b8a..b2be384b 100644 --- a/src/pyjuice/functional/normalize.py +++ b/src/pyjuice/nodes/backend/normalize.py @@ -64,8 +64,8 @@ def _norm_params_kernel(params_ptr, cum_params_ptr, node_ids_ptr, node_nchs_ptr, tl.store(params_ptr + p_offsets, normed_params, mask = mask) -def normalize_parameters(params: torch.Tensor, node_ids: torch.Tensor, group_size: int, ch_group_size: int, - node_nchs: Optional[torch.Tensor] = None, pseudocount: float = 0.0): +def normalize_ns_parameters(params: torch.Tensor, node_ids: torch.Tensor, group_size: int, ch_group_size: int, + node_nchs: Optional[torch.Tensor] = None, pseudocount: float = 0.0): assert 3 <= params.dim() <= 4 and params.size(1) == group_size and params.size(2) == ch_group_size diff --git a/src/pyjuice/nodes/nodes.py b/src/pyjuice/nodes/nodes.py index f87b0160..5d108918 100644 --- 
a/src/pyjuice/nodes/nodes.py +++ b/src/pyjuice/nodes/nodes.py @@ -148,21 +148,24 @@ def has_params(self): return hasattr(source_ns, "_params") and source_ns._params is not None def _clear_tensor_circuit_hooks(self, recursive: bool = True): + + def clear_hooks(ns): + if hasattr(ns, "_param_range"): + ns._param_range = None + if hasattr(ns, "_param_ids"): + ns._param_ids = None + if hasattr(ns, "_inverse_param_ids"): + ns._inverse_param_ids = None + if hasattr(ns, "_param_flow_range"): + ns._param_flow_range = None + if hasattr(ns, "_output_ind_range"): + ns._output_ind_range = None + if recursive: for ns in self: - if hasattr(ns, "_param_range"): - ns._param_range = None - if hasattr(ns, "_param_ids"): - ns._param_ids = None - if hasattr(ns, "_inverse_param_ids"): - ns._inverse_param_ids = None + clear_hooks(ns) else: - if hasattr(self, "_param_range"): - self._param_range = None - if hasattr(self, "_param_ids"): - self._param_ids = None - if hasattr(self, "_inverse_param_ids"): - self._inverse_param_ids = None + clear_hooks(self) def __iter__(self): return node_iterator(self) diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index 83a934ff..6d743b3d 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -7,7 +7,7 @@ from functools import reduce from pyjuice.graph import InnerRegionNode -from pyjuice.functional import normalize_parameters +from .backend import normalize_ns_parameters from .nodes import CircuitNodes from .prod_nodes import ProdNodes @@ -106,8 +106,8 @@ def set_params(self, params: torch.Tensor, normalize: bool = True, pseudocount: raise ValueError("Unsupported parameter input.") if normalize: - normalize_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, - ch_group_size = self.ch_group_size, pseudocount = pseudocount) + normalize_ns_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, + ch_group_size = self.ch_group_size, pseudocount = pseudocount) def set_edges(self, edge_ids: Union[Tensor,Sequence[Tensor]]): self._construct_edges(edge_ids) @@ -118,8 +118,8 @@ def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_ if self._source_node is None: self._params = torch.exp(torch.rand([self.edge_ids.size(1), self.group_size, self.ch_group_size]) * -perturbation) - normalize_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, - ch_group_size = self.ch_group_size, pseudocount = 0.0) + normalize_ns_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, + ch_group_size = self.ch_group_size, pseudocount = 0.0) super(SumNodes, self).init_parameters( perturbation = perturbation, diff --git a/tests/functional/tying_test.py b/tests/functional/tying_test.py deleted file mode 100644 index b2eac11d..00000000 --- a/tests/functional/tying_test.py +++ /dev/null @@ -1,204 +0,0 @@ -import pyjuice as juice -import torch -import pyjuice.nodes.distributions as dists -from pyjuice.functional import tie_param_flows -from pyjuice import inputs, multiply, summate - - -def tie_function_test(): - - device = torch.device("cuda:0") - - N = 20 - - param_flows = torch.rand([1000]).cuda() - tied_param_ids = torch.arange(20).cuda() - tied_param_group_ids = torch.arange(10).unsqueeze(1).repeat(1, 2).reshape(-1).cuda() - - tied_flows = param_flows[:20].reshape(10, 2).sum(dim = 1) - - tie_param_flows( - param_flows = param_flows, - num_tied_params = N, - tied_param_ids = tied_param_ids, - tied_param_group_ids = tied_param_group_ids - ) - - assert 
torch.max(torch.abs(tied_flows.unsqueeze(1).repeat(1, 2).reshape(-1) - param_flows[:20])) < 1e-6 - - -def tie_sum_nodes_test(): - - device = torch.device("cuda:0") - - num_nodes = 2 - - i0 = juice.inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i1 = juice.inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i2 = juice.inputs(2, num_nodes, dists.Categorical(num_cats = 5)) - i3 = juice.inputs(3, num_nodes, dists.Categorical(num_cats = 5)) - - m1 = juice.multiply(i0, i1) - n1 = juice.summate(m1, num_nodes = num_nodes) - - m2 = juice.multiply(i2, i3) - n2 = n1.duplicate(m2, tie_params = True) - - m = juice.multiply(n1, n2) - n = juice.summate(m, num_nodes = 1) - - pc = juice.TensorCircuit(n) - pc.to(device) - - data = torch.randint(0, 2, [16, 4]).to(device) - - lls = pc(data) - - pc.backward(data) - - f11 = (torch.exp(pc.node_mars[1,:] + pc.node_mars[3,:] + torch.log(pc.params[1]) - pc.node_mars[9,:]) * pc.node_flows[9,:]).sum() - f12 = (torch.exp(pc.node_mars[2,:] + pc.node_mars[4,:] + torch.log(pc.params[2]) - pc.node_mars[9,:]) * pc.node_flows[9,:]).sum() - f13 = (torch.exp(pc.node_mars[1,:] + pc.node_mars[3,:] + torch.log(pc.params[3]) - pc.node_mars[10,:]) * pc.node_flows[10,:]).sum() - f14 = (torch.exp(pc.node_mars[2,:] + pc.node_mars[4,:] + torch.log(pc.params[4]) - pc.node_mars[10,:]) * pc.node_flows[10,:]).sum() - - f21 = (torch.exp(pc.node_mars[5,:] + pc.node_mars[7,:] + torch.log(pc.params[1]) - pc.node_mars[11,:]) * pc.node_flows[11,:]).sum() - f22 = (torch.exp(pc.node_mars[6,:] + pc.node_mars[8,:] + torch.log(pc.params[2]) - pc.node_mars[11,:]) * pc.node_flows[11,:]).sum() - f23 = (torch.exp(pc.node_mars[5,:] + pc.node_mars[7,:] + torch.log(pc.params[3]) - pc.node_mars[12,:]) * pc.node_flows[12,:]).sum() - f24 = (torch.exp(pc.node_mars[6,:] + pc.node_mars[8,:] + torch.log(pc.params[4]) - pc.node_mars[12,:]) * pc.node_flows[12,:]).sum() - - assert torch.abs(f11 + f21 - pc.param_flows[1]) < 1e-4 - assert torch.abs(f12 + f22 - pc.param_flows[2]) < 1e-4 - assert torch.abs(f13 + f23 - pc.param_flows[3]) < 1e-4 - assert torch.abs(f14 + f24 - pc.param_flows[4]) < 1e-4 - - -def tie_input_nodes_test(): - - device = torch.device("cuda:0") - - num_nodes = 2 - - i0 = juice.inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i1 = i0.duplicate(1, tie_params = True) - - m = juice.multiply(i0, i1) - n = juice.summate(m, num_nodes = 1) - - n.init_parameters() - - assert i1.is_tied() - assert i1.get_source_ns() == i0 - - pc = juice.TensorCircuit(n) - - assert torch.all(pc.input_layers[0].vids == torch.tensor([0,0,1,1]).reshape(-1, 1)) - assert torch.all(pc.input_layers[0].s_pids == torch.tensor([0,5,0,5])) - assert torch.all((pc.input_layers[0].params - i0._params).abs() < 1e-6) - - pc.to(device) - - data = torch.randint(0, 5, [16, 2]).to(device) - - lls = pc(data) - - pc.backward(data) - - dids = data.clone().cpu() - m1p = i0._params[dids[:,0]] * i0._params[dids[:,1]] - m2p = i0._params[dids[:,0]+5] * i0._params[dids[:,1]+5] - log_np = torch.log(m1p * n._params[0] + m2p * n._params[1]) - - assert torch.all((log_np - lls.reshape(-1).cpu()).abs() < 1e-6) - - m1f = m1p * n._params[0] / (m1p * n._params[0] + m2p * n._params[1]) - m2f = m2p * n._params[1] / (m1p * n._params[0] + m2p * n._params[1]) - - assert torch.all((m1f - pc.node_flows[3,:].cpu()).abs() < 1e-6) - assert torch.all((m2f - pc.node_flows[4,:].cpu()).abs() < 1e-6) - - for i in range(5): - assert (pc.input_layers[0].param_flows[i] - m1f[dids[:,0] == i].sum() - m1f[dids[:,1] == i].sum()).abs() < 1e-3 - assert 
(pc.input_layers[0].param_flows[i+5] - m2f[dids[:,0] == i].sum() - m2f[dids[:,1] == i].sum()).abs() < 1e-3 - - -def tie_sparse_nodes_test(): - - device = torch.device("cuda:0") - - num_nodes = 2 - - i0 = juice.inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i1 = juice.inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i2 = juice.inputs(2, num_nodes, dists.Categorical(num_cats = 5)) - i3 = juice.inputs(3, num_nodes, dists.Categorical(num_cats = 5)) - - m00 = multiply(i0, i1) - m01 = multiply(i0, i1, edge_ids = torch.tensor([[1,0]], dtype = torch.long)) - n0 = summate(m00, m01, edge_ids = torch.tensor([[0,0,0,1,1],[0,1,2,1,2]], dtype = torch.long)) - - m10 = multiply(i2, i3) - m11 = multiply(i2, i3, edge_ids = torch.tensor([[1,0]], dtype = torch.long)) - n1 = n0.duplicate(m10, m11, tie_params = True) - - m = multiply(n0, n1) - n = summate(m, num_nodes = 1) - - n.init_parameters() - - pc = juice.TensorCircuit(n) - - pc.to(device) - - data = torch.randint(0, 5, [1, 4]).to(device) - - lls = pc(data) - - pc.backward(data) - - ## Unit tests for compilation result ## - - assert torch.all(pc.inner_layers[1].grouped_nids[0].cpu() == torch.tensor([10, 12], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_nids[1].cpu() == torch.tensor([9, 11], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_cids[0].cpu() == torch.tensor([[2,3],[5,6]], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_cids[1].cpu() == torch.tensor([[1,2,3,0],[4,5,6,0]], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_pids[0].cpu() == torch.tensor([[4,5],[4,5]], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_pids[1].cpu() == torch.tensor([[1,2,3,0],[1,2,3,0]], dtype = torch.long)) - - assert torch.all(pc.inner_layers[1].grouped_chids[0].cpu() == torch.tensor([1,4], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_chids[1].cpu() == torch.tensor([2,3,5,6], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_parids[0].cpu() == torch.tensor([[9],[11]], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_parids[1].cpu() == torch.tensor([[9,10],[9,10],[11,12],[11,12]], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_parpids[0].cpu() == torch.tensor([[1],[1]], dtype = torch.long)) - assert torch.all(pc.inner_layers[1].grouped_parpids[1].cpu() == torch.tensor([[2,4],[3,5],[2,4],[3,5]], dtype = torch.long)) - - ## Unit tests for parameter flows ## - - p9 = torch.exp(pc.node_mars[9,0]) - p10 = torch.exp(pc.node_mars[10,0]) - p11 = torch.exp(pc.node_mars[11,0]) - p12 = torch.exp(pc.node_mars[12,0]) - - f9 = pc.node_flows[9,0] - f10 = pc.node_flows[10,0] - f11 = pc.node_flows[11,0] - f12 = pc.node_flows[12,0] - - pm1 = torch.exp(pc.node_mars[1,0] + pc.node_mars[3,0]) - pm2 = torch.exp(pc.node_mars[2,0] + pc.node_mars[4,0]) - pm3 = torch.exp(pc.node_mars[2,0] + pc.node_mars[3,0]) - pm4 = torch.exp(pc.node_mars[5,0] + pc.node_mars[7,0]) - pm5 = torch.exp(pc.node_mars[6,0] + pc.node_mars[8,0]) - pm6 = torch.exp(pc.node_mars[6,0] + pc.node_mars[7,0]) - - assert torch.abs(f9 * pm1 * pc.params[1] / p9 + f11 * pm4 * pc.params[1] / p11 - pc.param_flows[1]) < 1e-4 - assert torch.abs(f9 * pm2 * pc.params[2] / p9 + f11 * pm5 * pc.params[2] / p11 - pc.param_flows[2]) < 1e-4 - assert torch.abs(f9 * pm3 * pc.params[3] / p9 + f11 * pm6 * pc.params[3] / p11 - pc.param_flows[3]) < 1e-4 - assert torch.abs(f10 * pm2 * pc.params[4] / p10 + f12 * pm5 * pc.params[4] / p12 - pc.param_flows[4]) < 1e-4 
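The removed checks around here all compute the same quantity the EM kernels accumulate: the flow of a sum edge is the parent flow times the edge parameter times exp(child log-marginal minus parent log-marginal), summed over the batch, and summed over the tied copies when parameters are tied. A minimal sketch of that identity, assuming per-sample 1-D tensors of log-marginals and flows (names are illustrative):

import torch

def edge_flow(theta, child_lls, parent_lls, parent_flows):
    # flow(n -> c) = sum_b F(n, b) * theta * exp(mar(c, b) - mar(n, b))
    return (parent_flows * theta * torch.exp(child_lls - parent_lls)).sum()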
- assert torch.abs(f10 * pm3 * pc.params[5] / p10 + f12 * pm6 * pc.params[5] / p12 - pc.param_flows[5]) < 1e-4 - - -if __name__ == "__main__": - tie_function_test() - tie_sum_nodes_test() - tie_input_nodes_test() - tie_sparse_nodes_test() \ No newline at end of file diff --git a/tests/model/numba_test.py b/tests/model/numba_test.py new file mode 100644 index 00000000..005140de --- /dev/null +++ b/tests/model/numba_test.py @@ -0,0 +1,21 @@ +import numpy as np +from numba import njit, prange + + +@njit(parallel = True) +def ff(a, b): + for i in prange(10000000000): + a[i%1000000000] = b[i%1000000000] + + +if __name__ == "__main__": + a = np.random.uniform(size = [1000000000]) + b = np.random.uniform(size = [1000000000]) + + ff(a, b) + + import time + t0 = time.time() + ff(a, b) + t1 = time.time() + print(t1 - t0) \ No newline at end of file diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py new file mode 100644 index 00000000..d9e6e79f --- /dev/null +++ b/tests/model/simple_model_test.py @@ -0,0 +1,143 @@ +import pyjuice as juice +import torch +import numpy as np +import time +import random + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer, ProdLayer, SumLayer + +import pytest + + +def simple_model_test(): + + device = torch.device("cuda:0") + + group_size = 16 + + with juice.set_group_size(group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 4)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 4)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 6)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 6)) + + np0 = multiply(ni0, ni1) + np1 = multiply(ni2, ni3) + np2 = multiply(ni1, ni2) + np3 = multiply(ni0, ni1) + + ns0 = summate(np0, np3, num_node_groups = 2) + ns1 = summate(np1, num_node_groups = 2) + ns2 = summate(np2, num_node_groups = 2) + + np4 = multiply(ns0, ni2, ni3) + np5 = multiply(ns1, ni0, ni1) + np6 = multiply(ns2, ni0, ni3) + + ns = summate(np4, np5, np6, num_node_groups = 1, group_size = 1) + + ns.init_parameters() + + pc = TensorCircuit(ns, layer_sparsity_tol = 0.1) + + ## Test all compilation-related stuff ## + + input_layer = pc.input_layers[0] + + assert torch.all(input_layer.vids[0:32,0] == 0) + assert torch.all(input_layer.vids[32:64,0] == 1) + assert torch.all(input_layer.vids[64:96,0] == 2) + assert torch.all(input_layer.vids[96:128,0] == 3) + + assert torch.all(input_layer.s_pids[:64] == torch.arange(0, 64*4, 4)) + assert torch.all(input_layer.s_pids[64:] == torch.arange(64*4, 64*(4+6), 6)) + + assert torch.all(input_layer.s_pfids[:64] == torch.arange(0, 64*4, 4)) + assert torch.all(input_layer.s_pfids[64:] == torch.arange(64*4, 64*(4+6), 6)) + + assert torch.all(input_layer.s_mids[0:32] == 0) + assert torch.all(input_layer.s_mids[32:64] == 1) + assert torch.all(input_layer.s_mids[64:96] == 2) + assert torch.all(input_layer.s_mids[96:128] == 3) + + assert torch.all(input_layer.source_nids == torch.arange(0, 128)) + + assert input_layer.num_parameters == 64 * (4 + 6) + + assert torch.all(torch.abs(input_layer.params[:64*4].reshape(64, 4).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(input_layer.params[64*4:].reshape(64, 6).sum(dim = 1) - 1.0) < 1e-4) + + prod_layer0 = pc.inner_layers[0] + + assert prod_layer0.num_nodes == 4 * 16 * 2 + assert 
prod_layer0.num_edges == 8 * 16 * 2 + + assert torch.all(prod_layer0.partitioned_nids[0] == torch.arange(16, 144, 16)) + + assert torch.all(prod_layer0.partitioned_cids[0][0:2,:] == torch.tensor([[16, 48], [32, 64]])) + assert torch.all(prod_layer0.partitioned_cids[0][2:4,:] == torch.tensor([[16, 48], [32, 64]])) + assert torch.all(prod_layer0.partitioned_cids[0][4:6,:] == torch.tensor([[80, 112], [96, 128]])) + assert torch.all(prod_layer0.partitioned_cids[0][6:8,:] == torch.tensor([[48, 80], [64, 96]])) + + assert torch.all(prod_layer0.partitioned_u_cids[0] == torch.tensor([16, 32, 80, 96, 112, 128])) + assert torch.all(prod_layer0.partitioned_u_cids[1] == torch.tensor([48, 64])) + + assert torch.all(prod_layer0.partitioned_parids[0] == torch.tensor([[16, 48], [32, 64], [80, 112], [96, 128], [80, 0], [96, 0]])) + assert torch.all(prod_layer0.partitioned_parids[1] == torch.tensor([[16, 48, 112, 0], [32, 64, 128, 0]])) + + sum_layer0 = pc.inner_layers[1] + + assert torch.all(sum_layer0.partitioned_nids[0] == torch.tensor([176, 192, 208, 224])) + assert torch.all(sum_layer0.partitioned_nids[1] == torch.tensor([144, 160])) + + assert torch.all(sum_layer0.partitioned_cids[0][0,:] == torch.arange(80, 112)) + assert torch.all(sum_layer0.partitioned_cids[0][1,:] == torch.arange(80, 112)) + assert torch.all(sum_layer0.partitioned_cids[0][2,:] == torch.arange(112, 144)) + assert torch.all(sum_layer0.partitioned_cids[0][3,:] == torch.arange(112, 144)) + assert torch.all(sum_layer0.partitioned_cids[1][0,:] == torch.arange(16, 80)) + assert torch.all(sum_layer0.partitioned_cids[1][1,:] == torch.arange(16, 80)) + + assert torch.all(sum_layer0.partitioned_pids[0][0,:] == torch.arange(2064, 2576, 16)) + assert torch.all(sum_layer0.partitioned_pids[0][1,:] == torch.arange(2576, 3088, 16)) + assert torch.all(sum_layer0.partitioned_pids[0][2,:] == torch.arange(3088, 3600, 16)) + assert torch.all(sum_layer0.partitioned_pids[0][3,:] == torch.arange(3600, 4112, 16)) + assert torch.all(sum_layer0.partitioned_pids[1][0,:] == torch.arange(16, 1040, 16)) + assert torch.all(sum_layer0.partitioned_pids[1][1,:] == torch.arange(1040, 2064, 16)) + + assert torch.all(sum_layer0.partitioned_chids[0] == torch.arange(16, 144, 16)) + + assert torch.all(sum_layer0.partitioned_parids[0][:4] == torch.tensor([[144, 160]])) + assert torch.all(sum_layer0.partitioned_parids[0][4:6] == torch.tensor([[176, 192]])) + assert torch.all(sum_layer0.partitioned_parids[0][6:8] == torch.tensor([[208, 224]])) + + assert torch.all(sum_layer0.partitioned_parpids[0][0,:] == torch.tensor([16, 1040])) + assert torch.all(sum_layer0.partitioned_parpids[0][1,:] == torch.tensor([272, 1296])) + assert torch.all(sum_layer0.partitioned_parpids[0][2,:] == torch.tensor([528, 1552])) + assert torch.all(sum_layer0.partitioned_parpids[0][3,:] == torch.tensor([784, 1808])) + assert torch.all(sum_layer0.partitioned_parpids[0][4,:] == torch.tensor([2064, 2576])) + assert torch.all(sum_layer0.partitioned_parpids[0][5,:] == torch.tensor([2320, 2832])) + assert torch.all(sum_layer0.partitioned_parpids[0][6,:] == torch.tensor([3088, 3600])) + assert torch.all(sum_layer0.partitioned_parpids[0][7,:] == torch.tensor([3344, 3856])) + + prod_layer1 = pc.inner_layers[2] + + assert torch.all(prod_layer1.partitioned_nids[0] == torch.arange(16, 112, 16)) + + assert torch.all(prod_layer1.partitioned_cids[0][0,:] == torch.tensor([144, 80, 112, 0])) + assert torch.all(prod_layer1.partitioned_cids[0][1,:] == torch.tensor([160, 96, 128, 0])) + assert 
torch.all(prod_layer1.partitioned_cids[0][2,:] == torch.tensor([176, 16, 48, 0])) + assert torch.all(prod_layer1.partitioned_cids[0][3,:] == torch.tensor([192, 32, 64, 0])) + assert torch.all(prod_layer1.partitioned_cids[0][4,:] == torch.tensor([176, 16, 112, 0])) + assert torch.all(prod_layer1.partitioned_cids[0][5,:] == torch.tensor([192, 32, 128, 0])) + + import pdb; pdb.set_trace() + + +if __name__ == "__main__": + simple_model_test() From 432d9b214da84066992a4b3f74f7c908d73e1910 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 12 Dec 2023 18:05:26 +0800 Subject: [PATCH 048/162] pc compilation preliminary pass --- src/pyjuice/model/backend/normalize.py | 8 ++-- src/pyjuice/model/backend/par_update.py | 4 ++ tests/layer/sum_layer_test.py | 2 +- tests/model/simple_model_test.py | 51 ++++++++++++++++++++++++- 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/src/pyjuice/model/backend/normalize.py b/src/pyjuice/model/backend/normalize.py index 1da31b4c..64ba0716 100644 --- a/src/pyjuice/model/backend/normalize.py +++ b/src/pyjuice/model/backend/normalize.py @@ -80,7 +80,7 @@ def cum_par_numba_kernel(cum_pflows, params, par_start_ids, blk_sizes, blk_inter @njit def par_update_numba_kernel(params, cum_pflows, nchs, par_start_ids, blk_sizes, blk_intervals, global_nids, pseudocount): for i in range(par_start_ids.shape[0]): - par_start_id = par_start_ids[i] + par_start = par_start_ids[i] blk_size = blk_sizes[i] blk_interval = blk_intervals[i] global_nid = global_nids[i] @@ -89,9 +89,9 @@ def par_update_numba_kernel(params, cum_pflows, nchs, par_start_ids, blk_sizes, nch = nchs[global_nid] for j in range(blk_size): - par = params[par_start_id+j*blk_interval] - norm_par = (par + pseudocount / nch) + (cum_par + pseudocount) - params[par_start_id+j*blk_interval] = norm_par + par = params[par_start+j*blk_interval] + norm_par = (par + pseudocount / nch) / (cum_par + pseudocount) + params[par_start+j*blk_interval] = norm_par def normalize_parameters(params, par_update_kwargs, pseudocount: float = 0.0): diff --git a/src/pyjuice/model/backend/par_update.py b/src/pyjuice/model/backend/par_update.py index a5cc20dc..34918dfa 100644 --- a/src/pyjuice/model/backend/par_update.py +++ b/src/pyjuice/model/backend/par_update.py @@ -36,6 +36,8 @@ def _record_par_blks(par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, g pid += 1 + pflow_start += ns_group_size * num_edges * cs_group_size + par_start += ns_group_size * num_edges * cs_group_size global_nid += ns_group_size return global_nid, pid @@ -114,6 +116,8 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in pid += ns.group_size + par_start += ns.group_size * num_edges * ns.ch_group_size + pflow_start += ns.group_size * num_edges * ns.ch_group_size global_nid += ns.group_size par_start_ids = torch.from_numpy(par_start_ids[:pid]).contiguous() diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index b84fc8ea..98a96044 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -202,7 +202,7 @@ def speed_test(): t0 = time.time() torch.cuda.synchronize() - for _ in range(100): + for _ in range(10000000000000000): layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) torch.cuda.synchronize() t1 = time.time() diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index d9e6e79f..d91a6f7f 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -125,6 +125,18 @@ def 
simple_model_test(): assert torch.all(sum_layer0.partitioned_parpids[0][6,:] == torch.tensor([3088, 3600])) assert torch.all(sum_layer0.partitioned_parpids[0][7,:] == torch.tensor([3344, 3856])) + assert torch.all(torch.abs(ns0._params.reshape(2, 4, 16, 16).sum(dim = 3).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(ns1._params.reshape(2, 2, 16, 16).sum(dim = 3).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(ns2._params.reshape(2, 2, 16, 16).sum(dim = 3).sum(dim = 1) - 1.0) < 1e-4) + + assert torch.all(torch.abs(pc.params[:16] - 0.0) < 1e-4) + assert torch.all(torch.abs(pc.params[16:1040].reshape(1, 4, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[1040:2064].reshape(1, 4, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[2064:2576].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[2576:3088].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[3088:3600].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[3600:4112].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + prod_layer1 = pc.inner_layers[2] assert torch.all(prod_layer1.partitioned_nids[0] == torch.arange(16, 112, 16)) @@ -133,8 +145,43 @@ def simple_model_test(): assert torch.all(prod_layer1.partitioned_cids[0][1,:] == torch.tensor([160, 96, 128, 0])) assert torch.all(prod_layer1.partitioned_cids[0][2,:] == torch.tensor([176, 16, 48, 0])) assert torch.all(prod_layer1.partitioned_cids[0][3,:] == torch.tensor([192, 32, 64, 0])) - assert torch.all(prod_layer1.partitioned_cids[0][4,:] == torch.tensor([176, 16, 112, 0])) - assert torch.all(prod_layer1.partitioned_cids[0][5,:] == torch.tensor([192, 32, 128, 0])) + assert torch.all(prod_layer1.partitioned_cids[0][4,:] == torch.tensor([208, 16, 112, 0])) + assert torch.all(prod_layer1.partitioned_cids[0][5,:] == torch.tensor([224, 32, 128, 0])) + + assert torch.all(prod_layer1.partitioned_u_cids[0] == torch.tensor([48, 64, 80, 96, 144, 160, 176, 192, 208, 224])) + assert torch.all(prod_layer1.partitioned_u_cids[1] == torch.tensor([16, 32, 112, 128])) + + assert torch.all(prod_layer1.partitioned_parids[0][0:2,:] == torch.tensor([[48], [64]])) + assert torch.all(prod_layer1.partitioned_parids[0][2:4,:] == torch.tensor([[16], [32]])) + assert torch.all(prod_layer1.partitioned_parids[0][4:6,:] == torch.tensor([[16], [32]])) + assert torch.all(prod_layer1.partitioned_parids[0][6:8,:] == torch.tensor([[48], [64]])) + assert torch.all(prod_layer1.partitioned_parids[0][8:10,:] == torch.tensor([[80], [96]])) + assert torch.all(prod_layer1.partitioned_parids[1][0:2,:] == torch.tensor([[48, 80], [64, 96]])) + assert torch.all(prod_layer1.partitioned_parids[1][2:4,:] == torch.tensor([[16, 80], [32, 96]])) + + sum_layer1 = pc.inner_layers[3] + + assert sum_layer1.group_size == 1 + + assert torch.all(sum_layer1.partitioned_nids[0] == torch.tensor([240])) + + assert torch.all(sum_layer1.partitioned_cids[0][0,:96] == torch.arange(16, 112)) + assert torch.all(sum_layer1.partitioned_cids[0][0,96:] == 0) + + assert torch.all(sum_layer1.partitioned_pids[0][0,:96] == torch.arange(4112, 4208)) + assert torch.all(sum_layer1.partitioned_pids[0][0,96:] == 0) + + assert torch.all(sum_layer1.partitioned_chids[0] == torch.arange(16, 112, 16)) + + assert torch.all(sum_layer1.partitioned_parids[0] == 240) + + assert torch.all(sum_layer1.partitioned_parpids[0] 
== torch.arange(4112, 4208, 16)[:,None]) + + assert torch.abs(pc.params[4112:4208].sum() - 1.0) < 1e-4 + + ## Forward pass ## + + pc.to(device) import pdb; pdb.set_trace() From a6e18cb0ad866cc301aaa1050c99556b6ec3bff8 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 12 Dec 2023 23:37:33 +0800 Subject: [PATCH 049/162] test cases for forward pass --- src/pyjuice/layer/sum_layer.py | 17 +-- src/pyjuice/model/tensorcircuit.py | 166 +++++++++++------------------ tests/model/simple_model_test.py | 81 ++++++++++++++ 3 files changed, 157 insertions(+), 107 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index e0d349f7..cf00a362 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -392,7 +392,7 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, use_fp16: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -444,9 +444,13 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s emars_max = tl.max(emars, axis = 0)[None,:] emars = tl.exp(emars - emars_max) - epars = epars.to(tl.float16) - emars = emars.to(tl.float16) - nmars = tl.dot(epars, emars).to(tl.float32) + + if use_fp16 == 1: + epars = epars.to(tl.float16) * (2**12) + emars = emars.to(tl.float16) + nmars = tl.dot(epars, emars).to(tl.float32) / (2**12) + else: + nmars = tl.dot(epars, emars) acc = tl.where(emars_max > acc, tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, @@ -471,7 +475,7 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1) -> None: + partition_id: int = -1, use_fp16: bool = True) -> None: """ Forward pass of sum layers with the block-sparse processing kernel. 
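The `use_fp16` branch added above scales the parameters by 2**12 before the half-precision `tl.dot` and scales the result back to fp32 afterwards, presumably to limit round-off once the operands are cast to fp16. The surrounding loop folds each tile of child log-marginals into the running accumulator entirely in log space; a minimal PyTorch sketch of that update rule (tensor names are illustrative, not the layer's actual buffers):

import torch

def accumulate_tile(acc, epars, emars):
    # One streaming step: acc <- log(exp(acc) + epars @ exp(emars)), where
    # `acc` is [M, B] (running log-marginals of the sum nodes), `epars` is
    # [M, K] (parameters of the current tile) and `emars` is [K, B]
    # (child log-marginals of the current tile).
    emars_max = emars.max(dim = 0, keepdim = True).values
    nmars = epars @ torch.exp(emars - emars_max)
    return torch.where(emars_max > acc,
                       torch.log(nmars + torch.exp(acc - emars_max)) + emars_max,
                       torch.log1p(torch.exp(emars_max - acc) * nmars) + acc)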
@@ -546,7 +550,8 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten TILE_SIZE_K = TILE_SIZE_K, K_NUM_TILES = K_NUM_TILES, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = self.group_size + GROUP_SIZE_M = self.group_size, + use_fp16 = 1 if use_fp16 else 0 ) return None diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index 772d010d..c40ae630 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -7,7 +7,7 @@ import triton.language as tl from tqdm import tqdm from functools import partial -from typing import Optional, Sequence, Callable, Union +from typing import Optional, Sequence, Callable, Union, Tuple, Dict from pyjuice.nodes import CircuitNodes, InputNodes, ProdNodes, SumNodes, foreach from pyjuice.layer import Layer, InputLayer, ProdLayer, SumLayer @@ -28,28 +28,9 @@ def _pc_model_backward_hook(grad, pc, **kwargs): **kwargs ) - pc._backward_buffer.clear() - return None -def _pc_inputs_hook(grad, pc, i): - - if pc._inputs_grad[i] is not None: - if grad is not None: - grad = grad + pc._inputs_grad[i] - else: - grad = pc._inputs_grad[i] - - if pc._inputs[i] is not None: - pc._inputs[i] = None - - if pc._inputs_grad[i] is not None: - pc._inputs_grad[i] = None - - return grad - - class TensorCircuit(nn.Module): def __init__(self, root_ns: CircuitNodes, layer_sparsity_tol: float = 0.5, max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False, @@ -96,20 +77,13 @@ def _init_pass_tensors(self): self.element_flows = None self.param_flows = None - def forward(self, inputs: torch.Tensor, - params: Optional[torch.Tensor] = None, - input_params: Optional[Dict[str,torch.Tensor]] = None, - input_layer_fn: Optional[Union[str,Callable]] = None, - cache: Optional[dict] = None, - return_cache: bool = False, - **kwargs): + def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Callable]] = None, + cache: Optional[dict] = None, return_cache: bool = False, **kwargs): """ Forward the circuit. 
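        A typical call, as a sketch (`data` here is a hypothetical [B, num_vars] integer tensor):

            pc = TensorCircuit(root_ns)
            pc.to(torch.device("cuda:0"))
            lls = pc(data)            # per-sample log-likelihoods
            lls.mean().backward()     # triggers `_pc_model_backward_hook`, which runs the
                                      # circuit backward pass and accumulates parameter flows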
Parameters: `inputs`: [B, num_vars] - `params`: None or [B, num_params] - `input_params`: A dictionary of input parameters `input_layer_fn`: Custom forward function for input layers; if it is a string, then try to call the corresponding member function of `input_layer` @@ -121,68 +95,32 @@ def forward(self, inputs: torch.Tensor, ## Initialize buffers for forward pass ## - if not isinstance(self.node_mars, torch.Tensor) or self.node_mars.size(0) != self.num_nodes or \ - self.node_mars.size(1) != B or self.node_mars.device != self.device: - self.node_mars = torch.empty([self.num_nodes, B], device = self.device) - - if not isinstance(self.element_mars, torch.Tensor) or self.element_mars.size(0) != self.num_elements or \ - self.element_mars.size(1) != B or self.element_mars.device != self.device: - self.element_mars = torch.empty([self.num_elements, B], device = self.device) + self._init_buffer(name = "node_mars", shape = (self.num_nodes, B), set_value = 0.0) + self._init_buffer(name = "element_mars", shape = (self.num_elements, B), set_value = -torch.inf) # Load cached node marginals - if cache is not None and "node_mars" in cache: - assert cache["node_mars"].dim() == 2 and cache["node_mars"].size(0) == self.node_mars.size(0) and \ - cache["node_mars"].size(1) == self.node_mars.size(1) + if self._buffer_matches(name = "node_mars", cache = cache): self.node_mars[:,:] = cache["node_mars"] - self.node_mars[0,:] = 0.0 - self.element_mars[0,:] = -torch.inf - - ## Preprocess parameters ## - - if params is None: - params = self.params - else: - if params.dim() == 2: - if params.size(1) == self.num_sum_params: - params = params.permute(1, 0) - else: - assert params.size(0) == self.num_sum_params, "Size of `params` does not match the number of sum parameters." - - self._inputs[1] = ReverseGrad.apply(params) - - # normalize - params = flat_softmax_fw(logits = params, node_ids = self.node_ids, inplace = False) - params[0] = 1.0 - self._backward_buffer["normalized_params"] = params - - if input_params is not None: - grad_hook_idx = 2 - self._backward_buffer["external_input_layers"] = set() - ## Run forward pass ## with torch.no_grad(): - # Compute forward pass for all input layers + # Input layers for idx, layer in enumerate(self.input_layers): - if input_params is not None and f"input_{idx}" in input_params: - layer_params = input_params[f"input_{idx}"] - - self._backward_buffer["external_input_layers"].add(idx) - grad_hook_idx = layer._hook_params(grad_hook_idx, self._inputs, layer_params) - else: - layer_params = None - if input_layer_fn is None: - layer(inputs, self.node_mars, params = layer_params, **kwargs) + layer(inputs, self.node_mars, **kwargs) + elif isinstance(input_layer_fn, str): assert hasattr(layer, input_layer_fn), f"Custom input function `{input_layer_fn}` not found for layer type {type(layer)}." - getattr(layer, input_layer_fn)(inputs, self.node_mars, params = layer_params, **kwargs) + getattr(layer, input_layer_fn)(inputs, self.node_mars, **kwargs) + + elif isinstance(input_layer_fn, Callable): + input_layer_fn(layer, inputs, self.node_mars, **kwargs) + else: - assert isinstance(input_layer_fn, Callable), f"Custom input function should be either a `str` or a `Callable`. " + \ - f"Found {type(input_layer_fn)} instead." - input_layer_fn(layer, inputs, self.node_mars, params = layer_params, **kwargs) + raise ValueError(f"Custom input function should be either a `str` or a `Callable`. 
Found {type(input_layer_fn)} instead.") + # Inner layers for layer in self.inner_layers: if isinstance(layer, ProdLayer): # Prod layer @@ -190,7 +128,7 @@ def forward(self, inputs: torch.Tensor, elif isinstance(layer, SumLayer): # Sum layer - layer(self.node_mars, self.element_mars, params) + layer(self.node_mars, self.element_mars, self.params) else: raise ValueError(f"Unknown layer type {type(layer)}.") @@ -213,20 +151,6 @@ def forward(self, inputs: torch.Tensor, lls.requires_grad = True lls.register_hook(partial(_pc_model_backward_hook, pc = self, **kwargs)) - self._inputs[0] = ReverseGrad.apply(inputs) # Record inputs for backward - - tensors = [] - for i in range(len(self._inputs)): - if self._inputs[i] is not None and self._inputs[i].requires_grad: - self._inputs[i].register_hook(partial(_pc_inputs_hook, pc = self, i = i)) - tensors.append(self._inputs[i]) - tensors.append(lls) - - if return_cache: - return PseudoHookFunc.apply(*tensors).clone(), cache - else: - return PseudoHookFunc.apply(*tensors).clone() - if return_cache: return lls.clone(), cache else: @@ -439,14 +363,6 @@ def to(self, device): return self - def get_param_specs(self): - param_specs = dict() - param_specs["inner"] = torch.Size([self.num_sum_params]) - for i, layer in enumerate(self.input_layers): - param_specs[f"input_{i}"] = layer.get_param_specs() - - return param_specs - def update_parameters(self, clone_params: bool = True, update_flows: bool = False): """ Copy parameters from this `TensorCircuit` to the original `CircuitNodes` @@ -483,6 +399,7 @@ def print_statistics(self): print(f"> Number of sum parameters: {self.num_sum_params}") def copy_param_flows(self, clone_param_flows: bool = True, target_name: str = "_scores"): + raise NotImplementedError("To be updated") param_flows = self.param_flows.detach().cpu() for ns in self.root_nodes: @@ -495,6 +412,7 @@ def copy_param_flows(self, clone_param_flows: bool = True, target_name: str = "_ def enable_partial_evaluation(self, scopes: Union[Sequence[BitSet],Sequence[int]], forward: bool = False, backward: bool = False): + raise NotImplementedError("To be updated") # Create scope2nid cache self._create_scope2nid_cache() @@ -523,6 +441,8 @@ def enable_partial_evaluation(self, scopes: Union[Sequence[BitSet],Sequence[int] self._pv_node_flows_mask = _pv_node_flows_mask.to(self.device) def disable_partial_evaluation(self, forward: bool = True, backward: bool = True): + raise NotImplementedError("To be updated") + # Input layers for layer in self.input_layers: layer.disable_partial_evaluation(forward = forward, backward = backward) @@ -533,6 +453,50 @@ def disable_partial_evaluation(self, forward: bool = True, backward: bool = True self._pv_node_flows_mask = None + def _init_buffer(self, name: str, shape: Tuple, set_value: Optional[float] = None, check_device: bool = True): + flag = False + if not name in self.__dict__: + flag = True + + tensor = self.__dict__[name] + if not flag and not isinstance(tensor, torch.Tensor): + flag = True + + if not flag and tensor.dim() != len(shape): + flag = True + + for i, d in enumerate(shape): + if not flag and tensor.size(i) != d: + flag = True + + if not flag and check_device and tensor.device != self.device: + flag = True + + if flag: + self.__dict__[name] = torch.empty(shape, device = self.device) + + if set_value: + self.__dict__[name][:] = set_value + + def _buffer_matches(self, name: str, cache: Optional[dict], check_device: bool = True): + if cache is None: + return False + + assert name in self.__dict__ + + tensor = 
self.__dict__[name] + + if name not in cache: + return False + + if tensor.size() != cache[name].size(): + return False + + if check_device and tensor.device != cache[name].device: + return False + + return True + def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False, force_gpu_compilation: bool = False, max_tied_ns_per_parflow_group: int = 8, verbose: bool = True): diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index d91a6f7f..abdc6984 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -183,6 +183,87 @@ def simple_model_test(): pc.to(device) + data = torch.randint(0, 4, [512, 4], device = device) + + lls = pc(data) + + node_mars = pc.node_mars.cpu() + data = data.cpu() + + sid, eid = ni0._output_ind_range + ni0_lls = node_mars[sid:eid,:] + assert torch.all(torch.abs(ni0_lls - ni0._params.reshape(-1, 4)[:,data[:,0]].log()) < 1e-4) + + sid, eid = ni1._output_ind_range + ni1_lls = node_mars[sid:eid,:] + assert torch.all(torch.abs(ni1_lls - ni1._params.reshape(-1, 4)[:,data[:,1]].log()) < 1e-4) + + sid, eid = ni2._output_ind_range + ni2_lls = node_mars[sid:eid,:] + assert torch.all(torch.abs(ni2_lls - ni2._params.reshape(-1, 6)[:,data[:,2]].log()) < 1e-4) + + sid, eid = ni3._output_ind_range + ni3_lls = node_mars[sid:eid,:] + assert torch.all(torch.abs(ni3_lls - ni3._params.reshape(-1, 6)[:,data[:,3]].log()) < 1e-4) + + np0_lls = ni0_lls + ni1_lls + np1_lls = ni2_lls + ni3_lls + np2_lls = ni1_lls + ni2_lls + np3_lls = ni0_lls + ni1_lls + + pc.inner_layers[0].forward(pc.node_mars, pc.element_mars) + element_mars = pc.element_mars.cpu() + + sid, eid = np0._output_ind_range + assert torch.all(torch.abs(np0_lls - element_mars[sid:eid,:]) < 1e-4) + + sid, eid = np1._output_ind_range + assert torch.all(torch.abs(np1_lls - element_mars[sid:eid,:]) < 1e-4) + + sid, eid = np2._output_ind_range + assert torch.all(torch.abs(np2_lls - element_mars[sid:eid,:]) < 1e-4) + + sid, eid = np3._output_ind_range + assert torch.all(torch.abs(np3_lls - element_mars[sid:eid,:]) < 1e-4) + + ch_lls = torch.cat((np0_lls, np3_lls), dim = 0) + epars = ns0._params.reshape(2, 4, 16, 16).permute(0, 2, 1, 3).reshape(32, 64) + ns0_lls = torch.matmul(epars, ch_lls.exp()).log() + sid, eid = ns0._output_ind_range + assert torch.all(torch.abs(ns0_lls - node_mars[sid:eid,:]) < 1e-3) + + epars = ns1._params.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + ns1_lls = torch.matmul(epars, np1_lls.exp()).log() + sid, eid = ns1._output_ind_range + assert torch.all(torch.abs(ns1_lls - node_mars[sid:eid,:]) < 1e-3) + + epars = ns2._params.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + ns2_lls = torch.matmul(epars, np2_lls.exp()).log() + sid, eid = ns2._output_ind_range + assert torch.all(torch.abs(ns2_lls - node_mars[sid:eid,:]) < 1e-3) + + np4_lls = ns0_lls + ni2_lls + ni3_lls + np5_lls = ns1_lls + ni0_lls + ni1_lls + np6_lls = ns2_lls + ni0_lls + ni3_lls + + pc.inner_layers[2].forward(pc.node_mars, pc.element_mars) + element_mars = pc.element_mars.cpu() + + sid, eid = np4._output_ind_range + assert torch.all(torch.abs(np4_lls - element_mars[sid:eid,:]) < 1e-3) + + sid, eid = np5._output_ind_range + assert torch.all(torch.abs(np5_lls - element_mars[sid:eid,:]) < 1e-3) + + sid, eid = np6._output_ind_range + assert torch.all(torch.abs(np6_lls - element_mars[sid:eid,:]) < 1e-3) + + ch_lls = torch.cat((np4_lls, np5_lls, np6_lls), dim = 0) + epars = 
ns._params.reshape(1, 6, 1, 16).permute(0, 2, 1, 3).reshape(1, 96) + ns_lls = torch.matmul(epars, ch_lls.exp()).log() + sid, eid = ns._output_ind_range + assert torch.all(torch.abs(ns_lls - node_mars[sid:eid,:]) < 1e-3) + import pdb; pdb.set_trace() From 1a6723b899f8799cef5bd5882e1da51455e925c7 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 12 Dec 2023 23:39:45 +0800 Subject: [PATCH 050/162] change default flows_memory to 1 --- src/pyjuice/model/tensorcircuit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index c40ae630..3aaa5a50 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -67,7 +67,7 @@ def __init__(self, root_ns: CircuitNodes, layer_sparsity_tol: float = 0.5, # Hyperparameters for backward pass self._optim_hyperparams = { "compute_param_flows": True, - "flows_memory": 0.0 + "flows_memory": 1.0 } def _init_pass_tensors(self): From 848e9d97b5213b1bba37e2d7c2e6358ff35cd089 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 13 Dec 2023 03:40:38 +0800 Subject: [PATCH 051/162] use `LayerGroup` + fix backward hook --- src/pyjuice/layer/__init__.py | 3 +- src/pyjuice/layer/layer.py | 2 +- src/pyjuice/layer/layer_group.py | 79 +++++++++++++ src/pyjuice/layer/sum_layer.py | 4 + src/pyjuice/model/tensorcircuit.py | 183 ++++++++++++----------------- tests/model/simple_model_test.py | 18 +-- 6 files changed, 169 insertions(+), 120 deletions(-) create mode 100644 src/pyjuice/layer/layer_group.py diff --git a/src/pyjuice/layer/__init__.py b/src/pyjuice/layer/__init__.py index bfb657d8..c56f9eb5 100644 --- a/src/pyjuice/layer/__init__.py +++ b/src/pyjuice/layer/__init__.py @@ -1,4 +1,5 @@ from .layer import Layer from .input_layer import InputLayer from .prod_layer import ProdLayer -from .sum_layer import SumLayer \ No newline at end of file +from .sum_layer import SumLayer +from .layer_group import LayerGroup \ No newline at end of file diff --git a/src/pyjuice/layer/layer.py b/src/pyjuice/layer/layer.py index fd77c103..5d57f6d4 100644 --- a/src/pyjuice/layer/layer.py +++ b/src/pyjuice/layer/layer.py @@ -59,4 +59,4 @@ def disable_partial_evaluation(self, forward: bool = True, backward: bool = True self.bk_group_local_ids = None def provided(self, var_name): - return hasattr(self, var_name) and getattr(self, var_name) is not None \ No newline at end of file + return hasattr(self, var_name) and getattr(self, var_name) is not None diff --git a/src/pyjuice/layer/layer_group.py b/src/pyjuice/layer/layer_group.py new file mode 100644 index 00000000..f2467e45 --- /dev/null +++ b/src/pyjuice/layer/layer_group.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +import torch +import torch.nn as nn + +from typing import Sequence + +from .layer import Layer +from .input_layer import InputLayer +from .prod_layer import ProdLayer +from .sum_layer import SumLayer + + +class LayerGroup(nn.Module): + def __init__(self, layers: Sequence[Layer]): + super(LayerGroup, self).__init__() + + assert len(layers) >= 1, "A `LayerGroup` must contains at least 1 layer." 
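        # All layers in a group are required to share the same type (checked below):
        # the group then exposes single `forward` / `backward` entry points, and
        # `TensorCircuit` can dispatch on `is_prod()` / `is_sum()` per group rather
        # than per individual layer.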
+ + for i in range(1, len(layers)): + assert type(layers[i]) == type(layers[0]) + + if isinstance(layers[0], InputLayer): + self.layer_type = "input" + elif isinstance(layers[0], ProdLayer): + self.layer_type = "prod" + else: + assert isinstance(layers[0], SumLayer) + self.layer_type = "sum" + + self.num_layers = len(layers) + + self.layers = [] + for i, layer in enumerate(layers): + self.add_module(f"layer_{i}", layer) + self.layers.append(layer) + + def to(self, device): + super(LayerGroup, self).to(device) + + for layer in self.layers: + layer.to(device) + + def forward(self, *args, **kwargs): + + for layer in self.layers: + layer.forward(*args, **kwargs) + + def backward(self, *args, **kwargs): + + for layer in self.layers: + layer.backward(*args, **kwargs) + + def is_input(self): + return self.layer_type == "input" + + def is_prod(self): + return self.layer_type == "prod" + + def is_sum(self): + return self.layer_type == "sum" + + def __len__(self): + self.num_layers + + def __getitem__(self, idx): + return self.layers[idx] + + def __iter__(self): + self.iter_idx = 0 + return self + + def __next__(self): + if self.iter_idx < self.num_layers: + layer = self.layers[self.iter_idx] + self.iter_idx += 1 + return layer + else: + raise StopIteration diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index cf00a362..70f9af82 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -196,6 +196,10 @@ def to(self, device): def num_parameters(self): return self._layer_pid_range[1] - self._layer_pid_range[0] + @property + def num_param_flows(self): + return self._layer_pfid_range[1] - self._layer_pfid_range[0] + def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor) -> None: """ Computes the forward pass of a sum layer: diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index 3aaa5a50..c55bf562 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -10,7 +10,7 @@ from typing import Optional, Sequence, Callable, Union, Tuple, Dict from pyjuice.nodes import CircuitNodes, InputNodes, ProdNodes, SumNodes, foreach -from pyjuice.layer import Layer, InputLayer, ProdLayer, SumLayer +from pyjuice.layer import Layer, InputLayer, ProdLayer, SumLayer, LayerGroup from pyjuice.utils.grad_fns import ReverseGrad, PseudoHookFunc from pyjuice.utils import BitSet @@ -19,9 +19,10 @@ normalize_parameters -def _pc_model_backward_hook(grad, pc, **kwargs): +def _pc_model_backward_hook(grad, pc, inputs, **kwargs): grad = grad.permute(1, 0) pc.backward( + inputs = inputs, ll_weights = grad / grad.sum() * grad.size(1), compute_param_flows = pc._optim_hyperparams["compute_param_flows"], flows_memory = pc._optim_hyperparams["flows_memory"], @@ -106,7 +107,7 @@ def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Calla with torch.no_grad(): # Input layers - for idx, layer in enumerate(self.input_layers): + for idx, layer in enumerate(self.input_layer_group): if input_layer_fn is None: layer(inputs, self.node_mars, **kwargs) @@ -121,14 +122,14 @@ def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Calla raise ValueError(f"Custom input function should be either a `str` or a `Callable`. 
Found {type(input_layer_fn)} instead.") # Inner layers - for layer in self.inner_layers: - if isinstance(layer, ProdLayer): + for layer_group in self.inner_layer_groups: + if layer_group.is_prod(): # Prod layer - layer(self.node_mars, self.element_mars) + layer_group(self.node_mars, self.element_mars) - elif isinstance(layer, SumLayer): + elif layer_group.is_sum(): # Sum layer - layer(self.node_mars, self.element_mars, self.params) + layer_group(self.node_mars, self.element_mars, self.params) else: raise ValueError(f"Unknown layer type {type(layer)}.") @@ -149,7 +150,7 @@ def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Calla if torch.is_grad_enabled(): lls.requires_grad = True - lls.register_hook(partial(_pc_model_backward_hook, pc = self, **kwargs)) + lls.register_hook(partial(_pc_model_backward_hook, pc = self, inputs = inputs, **kwargs)) if return_cache: return lls.clone(), cache @@ -159,7 +160,7 @@ def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Calla def backward(self, inputs: Optional[torch.Tensor] = None, ll_weights: Optional[torch.Tensor] = None, compute_param_flows: bool = True, - flows_memory: float = 0.0, + flows_memory: float = 1.0, input_layer_fn: Optional[Union[str,Callable]] = None, cache: Optional[dict] = None, return_cache: bool = False, @@ -182,16 +183,8 @@ def backward(self, inputs: Optional[torch.Tensor] = None, ## Initialize buffers for backward pass ## - if not isinstance(self.node_flows, torch.Tensor) or self.node_flows.size(0) != self.num_nodes or \ - self.node_flows.size(1) != B or self.node_flows.device != self.device: - self.node_flows = torch.zeros([self.num_nodes, B], device = self.device) - - if not isinstance(self.element_flows, torch.Tensor) or self.element_flows.size(0) != self.num_elements or \ - self.element_flows.size(1) != B or self.element_flows.device != self.device: - self.element_flows = torch.zeros([self.num_elements, B], device = self.device) - - # Clear node flows - self.node_flows[:,:] = 0.0 + self._init_buffer(name = "node_flows", shape = (self.num_nodes, B), set_value = 0.0) + self._init_buffer(name = "element_flows", shape = (self.num_elements, B), set_value = 0.0) # Set root node flows if ll_weights is None: @@ -205,95 +198,50 @@ def backward(self, inputs: Optional[torch.Tensor] = None, self.node_flows[self._root_node_range[0]:self._root_node_range[1],:] = ll_weights # Load cached node flows - if cache is not None and "node_flows" in cache: - assert cache["node_flows"].dim() == 2 and cache["node_flows"].size(0) == self.node_flows.size(0) and \ - cache["node_flows"].size(1) == self.node_flows.size(1) - - if "replace_root_flows" in cache and cache["replace_root_flows"]: - if hasattr(self, "_pv_node_flows_mask") and getattr(self, "_pv_node_flows_mask") is not None: - self.node_flows[self._pv_node_flows_mask,:] = 0.0 - - self.node_flows[:self._root_node_range[0],:] = cache["node_flows"][:self._root_node_range[0],:].to(self.device) - self.node_flows[self._root_node_range[1]:,:] = cache["node_flows"][self._root_node_range[1]:,:].to(self.device) - else: - self.node_flows[:,:] = cache["node_flows"] - - if hasattr(self, "_pv_node_flows_mask") and getattr(self, "_pv_node_flows_mask") is not None: - self.node_flows[self._pv_node_flows_mask,:] = 0.0 - self.node_flows[self._root_node_range[0]:self._root_node_range[1],:] = cache["node_flows"][self._root_node_range[0]:self._root_node_range[1],:] - - ## Retrieve parameters and initialize parameter flows ## - if self._inputs[1] is not None: - params = 
self._backward_buffer["normalized_params"] - else: - params = self.params + if self._buffer_matches(name = "node_flows", cache = cache): + self.node_flows[:,:] = cache["node_flows"] + ## Initialize parameter flows ## if compute_param_flows: self.init_param_flows(flows_memory = flows_memory) ## Run backward pass ## with torch.no_grad(): - for layer_id in range(len(self.inner_layers) - 1, -1, -1): - layer = self.inner_layers[layer_id] + for layer_id in range(len(self.inner_layer_groups) - 1, -1, -1): + layer_group = self.inner_layer_groups[layer_id] - if isinstance(layer, ProdLayer): + if layer_group.is_prod(): # Prod layer - layer.backward(self.node_flows, self.element_flows) + layer_group.backward(self.node_flows, self.element_flows) - elif isinstance(layer, SumLayer): + elif layer_group.is_sum(): # Sum layer # First recompute the previous product layer - self.inner_layers[layer_id-1].forward(self.node_mars, self.element_mars, _for_backward = True) + self.inner_layer_groups[layer_id-1].forward(self.node_mars, self.element_mars, _for_backward = True) # Backward sum layer - layer.backward(self.node_flows, self.element_flows, self.node_mars, self.element_mars, params, - param_flows = self.param_flows if compute_param_flows else None) + layer_group.backward(self.node_flows, self.element_flows, self.node_mars, self.element_mars, self.params, + param_flows = self.param_flows if compute_param_flows else None) else: raise ValueError(f"Unknown layer type {type(layer)}.") - if inputs is None: - inputs = self._inputs[0] - else: - inputs = inputs.permute(1, 0) - # Compute backward pass for all input layers - grad_hook_idx = 2 - for idx, layer in enumerate(self.input_layers): + for idx, layer in enumerate(self.input_layer_group): if input_layer_fn is None: layer.backward(inputs, self.node_flows, self.node_mars, **kwargs) + elif isinstance(input_layer_fn, str): assert hasattr(layer, input_layer_fn), f"Custom input function `{input_layer_fn}` not found for layer type {type(layer)}." getattr(layer, input_layer_fn)(inputs, self.node_flows, self.node_mars, **kwargs) - else: - assert isinstance(input_layer_fn, Callable), f"Custom input function should be either a `str` or a `Callable`. " + \ - f"Found {type(input_layer_fn)} instead." 
- input_layer_fn(layer, inputs, self.node_flows, self.node_mars, **kwargs) - - if "external_input_layers" in self._backward_buffer and idx in self._backward_buffer["external_input_layers"]: - grad_hook_idx = layer._hook_param_grads(grad_hook_idx, self._inputs, self._inputs_grad) - - if self._inputs[1] is not None: - B = self._inputs[0].size(0) - # Below computes the parameter gradients derived from flows - # grads = self.param_flows / params / B - # grads[0] = 0.0 - # self._inputs_grad[1] = flat_softmax_bp(grads, params, self.node_ids, log_param_grad = False, inplace = False) - - # However, using the gradients directly generally leads to slow convergence - # Instead, we use a scaled version of the gradient, as shown below - flows = self.param_flows - self._normalize_parameters(flows, pseudocount = self._pseudocount) - flows[0] = 1.0 - grads = 0.5 * (torch.log(flows) - torch.log(params)) - self._inputs_grad[1] = flat_softmax_bp(grads, params, self.node_ids, log_param_grad = True, inplace = False) + elif isinstance(input_layer_fn, Callable): + input_layer_fn(layer, inputs, self.node_flows, self.node_mars, **kwargs) - self._used_external_sum_params = True - else: - self._used_external_sum_params = False + else: + raise ValueError(f"Custom input function should be either a `str` or a `Callable`. Found {type(input_layer_fn)} instead.") if return_cache: if cache is None: @@ -303,12 +251,11 @@ def backward(self, inputs: Optional[torch.Tensor] = None, cache["node_flows"] = self.node_flows.clone() return cache - else: return None def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): - for layer in self.input_layers: + for layer in self.input_layer_group: layer.mini_batch_em(step_size = step_size, pseudocount = pseudocount) # Only apply parameter update if external parameters are not used in the previous forward/backward pass @@ -327,22 +274,22 @@ def cumulate_flows(self, inputs: torch.Tensor, params: Optional[torch.Tensor] = self.forward(inputs, params) self.backward(inputs = inputs, compute_param_flows = True, flows_memory = 1.0) - def init_param_flows(self, flows_memory: float = 0.0): - batch_size = self._inputs[1].size(1) if self._inputs[1] is not None and self._inputs[1].dim() == 2 else 1 - if self.param_flows is None or self.param_flows.size(0) != self.params.size(0) \ - or (self.param_flows.dim() == 1 and batch_size > 1) \ - or (self.param_flows.dim() == 2 and batch_size != self.param_flows.size(1)): - if batch_size == 1: - shape = [self.params.size(0)] - else: - shape = [self.params.size(0), batch_size] - self.param_flows = torch.zeros(shape, device = self.device) + def init_param_flows(self, flows_memory: float = 1.0, batch_size: Optional[int] = None): + + assert 0.0 <= flows_memory <= 1.0, f"`flows_memory` should be in [0.0, 1.0]" + + if batch_size is None: + pflow_shape = (self.num_param_flows,) else: - assert self.param_flows.size(0) == self.params.size(0) + pflow_shape = (self.num_param_flows, batch_size) + + self._init_buffer(name = "param_flows", shape = pflow_shape) + + if flows_memory < 1.0: self.param_flows[:] *= flows_memory # For input layers - for layer in self.input_layers: + for layer in self.input_layer_group: layer.init_param_flows(flows_memory = flows_memory) return None @@ -350,8 +297,7 @@ def init_param_flows(self, flows_memory: float = 0.0): def to(self, device): super(TensorCircuit, self).to(device) - for layer in self.input_layers: - layer.device = device + self.input_layer_group.to(device) self.device = device @@ -385,7 +331,7 @@ def 
update_parameters(self, clone_params: bool = True, update_flows: bool = Fals else: ns._flows = param_flows[ns._param_ids] - for layer in self.input_layers: + for layer in self.input_layer_group: layer.update_parameters() return None @@ -473,7 +419,7 @@ def _init_buffer(self, name: str, shape: Tuple, set_value: Optional[float] = Non flag = True if flag: - self.__dict__[name] = torch.empty(shape, device = self.device) + self.__dict__[name] = torch.zeros(shape, device = self.device) if set_value: self.__dict__[name][:] = set_value @@ -501,7 +447,7 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti disable_gpu_compilation: bool = False, force_gpu_compilation: bool = False, max_tied_ns_per_parflow_group: int = 8, verbose: bool = True): - if hasattr(self, "input_layers") or hasattr(self, "inner_layers"): + if hasattr(self, "input_layer_group") or hasattr(self, "inner_layer_groups"): raise ValueError("Attempting to initialize a TensorCircuit for the second time. " + \ "Please instead create a new TensorCircuit instance by calling `pc = TensorCircuit(root_ns)`.") @@ -511,8 +457,8 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti # Create layers depth2nodes, num_layers, max_node_group_size, max_ele_group_size = self._create_node_layers() - self.input_layers = [] - self.inner_layers = [] + self.input_layer_group = None + self.inner_layer_groups = [] self.num_dummy_nodes = max_ele_group_size self.num_dummy_eles = max_node_group_size @@ -544,17 +490,20 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti # Input layer signature2nodes = self._categorize_input_nodes(depth2nodes[0]["input"]) input_layer_id = 0 + input_layers = [] for signature, nodes in signature2nodes.items(): input_layer = InputLayer( nodes = nodes, cum_nodes = num_nodes, max_tied_ns_per_parflow_group = max_tied_ns_per_parflow_group ) - self.input_layers.append(input_layer) - self.add_module(f"input_layer_{input_layer_id}", input_layer) + input_layers.append(input_layer) input_layer_id += 1 num_nodes += input_layer.num_nodes + + self.input_layer_group = LayerGroup(input_layers) + else: assert len(depth2nodes[depth]["prod"]) > 0 and len(depth2nodes[depth]["sum"]) > 0, \ "Depth {}: (# prod nodes: {}, # sum nodes: {})".format(depth, len(depth2nodes[depth]["prod"]), len(depth2nodes[depth]["sum"])) @@ -568,6 +517,7 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti gsize2prod_nodes[gsize].append(ns) layer_num_elements = max_node_group_size + prod_layers = [] for gsize, nodes in gsize2prod_nodes.items(): prod_layer = ProdLayer( nodes = nodes, @@ -581,8 +531,11 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti layer_num_elements += prod_layer.num_nodes num_edges += prod_layer.num_edges - self.add_module(f"prod_layer_{layer_id}_{gsize}", prod_layer) - self.inner_layers.append(prod_layer) + prod_layers.append(prod_layer) + + prod_layer_group = LayerGroup(prod_layers) + self.inner_layer_groups.append(prod_layer_group) + self.add_module(f"prod_layer_{layer_id}", prod_layer_group) if layer_num_elements > num_elements: num_elements = layer_num_elements @@ -595,6 +548,7 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti gsize2sum_nodes[gsize] = [] gsize2sum_nodes[gsize].append(ns) + sum_layers = [] for gsize, nodes in gsize2sum_nodes.items(): sum_layer = SumLayer( nodes = nodes, @@ -612,9 +566,13 @@ def _init_layers(self, layer_sparsity_tol: 
Optional[float] = None, max_num_parti num_nodes += sum_layer.num_nodes num_edges += sum_layer.num_edges num_parameters += sum_layer.num_parameters + num_param_flows += sum_layer.num_param_flows + + sum_layers.append(sum_layer) - self.add_module(f"sum_layer_{layer_id}_{gsize}", sum_layer) - self.inner_layers.append(sum_layer) + sum_layer_group = LayerGroup(sum_layers) + self.inner_layer_groups.append(sum_layer_group) + self.add_module(f"sum_layer_{layer_id}", sum_layer_group) layer_id += 1 @@ -656,7 +614,7 @@ def _init_parameters(self, perturbation: float = 4.0, pseudocount: float = 0.0): self.params.requires_grad = False # Initialize parameters for input layers - for idx, layer in enumerate(self.input_layers): + for idx, layer in enumerate(self.input_layer_group): layer._init_parameters(perturbation) def _normalize_parameters(self, params, pseudocount: float = 0.0): @@ -731,8 +689,11 @@ def _categorize_input_nodes(self, nodes: Sequence[InputNodes]): return signature2nodes def _create_scope2nid_cache(self): + + raise NotImplementedError() + # Input layers - for idx, layer in enumerate(self.input_layers): + for idx, layer in enumerate(self.input_layer_group): layer._prepare_scope2nids() # Inner layers diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index abdc6984..a916d7df 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -48,7 +48,7 @@ def simple_model_test(): ## Test all compilation-related stuff ## - input_layer = pc.input_layers[0] + input_layer = pc.input_layer_group[0] assert torch.all(input_layer.vids[0:32,0] == 0) assert torch.all(input_layer.vids[32:64,0] == 1) @@ -73,7 +73,7 @@ def simple_model_test(): assert torch.all(torch.abs(input_layer.params[:64*4].reshape(64, 4).sum(dim = 1) - 1.0) < 1e-4) assert torch.all(torch.abs(input_layer.params[64*4:].reshape(64, 6).sum(dim = 1) - 1.0) < 1e-4) - prod_layer0 = pc.inner_layers[0] + prod_layer0 = pc.inner_layer_groups[0][0] assert prod_layer0.num_nodes == 4 * 16 * 2 assert prod_layer0.num_edges == 8 * 16 * 2 @@ -91,7 +91,7 @@ def simple_model_test(): assert torch.all(prod_layer0.partitioned_parids[0] == torch.tensor([[16, 48], [32, 64], [80, 112], [96, 128], [80, 0], [96, 0]])) assert torch.all(prod_layer0.partitioned_parids[1] == torch.tensor([[16, 48, 112, 0], [32, 64, 128, 0]])) - sum_layer0 = pc.inner_layers[1] + sum_layer0 = pc.inner_layer_groups[1][0] assert torch.all(sum_layer0.partitioned_nids[0] == torch.tensor([176, 192, 208, 224])) assert torch.all(sum_layer0.partitioned_nids[1] == torch.tensor([144, 160])) @@ -137,7 +137,7 @@ def simple_model_test(): assert torch.all(torch.abs(pc.params[3088:3600].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) assert torch.all(torch.abs(pc.params[3600:4112].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) - prod_layer1 = pc.inner_layers[2] + prod_layer1 = pc.inner_layer_groups[2][0] assert torch.all(prod_layer1.partitioned_nids[0] == torch.arange(16, 112, 16)) @@ -159,7 +159,7 @@ def simple_model_test(): assert torch.all(prod_layer1.partitioned_parids[1][0:2,:] == torch.tensor([[48, 80], [64, 96]])) assert torch.all(prod_layer1.partitioned_parids[1][2:4,:] == torch.tensor([[16, 80], [32, 96]])) - sum_layer1 = pc.inner_layers[3] + sum_layer1 = pc.inner_layer_groups[3][0] assert sum_layer1.group_size == 1 @@ -211,7 +211,7 @@ def simple_model_test(): np2_lls = ni1_lls + ni2_lls np3_lls = ni0_lls + ni1_lls - pc.inner_layers[0].forward(pc.node_mars, pc.element_mars) + 
pc.inner_layer_groups[0][0].forward(pc.node_mars, pc.element_mars) element_mars = pc.element_mars.cpu() sid, eid = np0._output_ind_range @@ -246,7 +246,7 @@ def simple_model_test(): np5_lls = ns1_lls + ni0_lls + ni1_lls np6_lls = ns2_lls + ni0_lls + ni3_lls - pc.inner_layers[2].forward(pc.node_mars, pc.element_mars) + pc.inner_layer_groups[2][0].forward(pc.node_mars, pc.element_mars) element_mars = pc.element_mars.cpu() sid, eid = np4._output_ind_range @@ -264,6 +264,10 @@ def simple_model_test(): sid, eid = ns._output_ind_range assert torch.all(torch.abs(ns_lls - node_mars[sid:eid,:]) < 1e-3) + ## Backward pass ## + + lls.mean().backward() + import pdb; pdb.set_trace() From f6f78ac15e1886355afcb0623b94f6178ee6a200 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 13 Dec 2023 05:18:28 +0800 Subject: [PATCH 052/162] "sparse" mode backward for sum layers --- src/pyjuice/layer/sum_layer.py | 288 +++++++++++++++++++++++++++++-- tests/model/simple_model_test.py | 2 + 2 files changed, 277 insertions(+), 13 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 70f9af82..8aa5af9a 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -563,7 +563,7 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten @staticmethod @triton.jit def _fw_triton_sparse_kernel(node_mars, element_mars, params, nids, cids, pids, - local_ids, batch_size, partial_eval: tl.constexpr, n_edges: tl.constexpr, + local_ids, batch_size, partial_eval: tl.constexpr, num_edges: tl.constexpr, BLOCK_B: tl.constexpr, BLOCK_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): pid_b = tl.program_id(axis = 0) # ID of size-`BLOCK_B` batches @@ -578,20 +578,20 @@ def _fw_triton_sparse_kernel(node_mars, element_mars, params, nids, cids, pids, ngroup_id = tl.load(local_ids + ngroup_id) # Initialize pointers to `params` - offs_edge = tl.arange(0, n_edges) - par_start = tl.load(pids + ngroup_id * n_edges + offs_edge) - epars_ptr = params + tile_id * BLOCK_M + par_start # [n_edges] + offs_edge = tl.arange(0, num_edges) + par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) + epars_ptr = params + tile_id * BLOCK_M + par_start # [num_edges] # Batch offsets and mask offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B mask_batch = offs_batch < batch_size # Initialize and load edge mars - edge_ids = tl.load(cids + ngroup_id * n_edges + offs_edge) + edge_ids = tl.load(cids + ngroup_id * num_edges + offs_edge) emars_ptr = element_mars + \ edge_ids[:,None] * batch_size + \ offs_batch[None,:] - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [n_edges, BLOCK_B] + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] # Compute max and subtract emars_max = tl.max(emars, axis = 0) @@ -635,13 +635,13 @@ def _forward_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, num_ngroups = nids.size(0) if local_ids is None else local_ids.size(0) layer_n_nodes = num_ngroups * self.group_size - n_edges = cids.size(1) + num_edges = cids.size(1) batch_size = node_mars.size(1) BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) - assert n_edges <= 16384 + assert num_edges <= 16384, "The sparse forward kernel only support nodes with # edges smaller than 16384." 
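+        # Each program instance loads all `num_edges` edges of a node at once, so the
+        # batch tile computed below shrinks as the number of edges per node grows.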
- BLOCK_B = max(min(2048 // n_edges, BATCH_SIZE_NP2), 1) + BLOCK_B = max(min(2048 // num_edges, BATCH_SIZE_NP2), 1) BLOCK_M = self.group_size grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, BLOCK_M)) @@ -656,7 +656,7 @@ def _forward_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, local_ids = local_ids, batch_size = batch_size, partial_eval = 1 if local_ids is not None else 0, - n_edges = n_edges, + num_edges = num_edges, BLOCK_B = BLOCK_B, BLOCK_M = BLOCK_M, GROUP_SIZE_M = self.group_size @@ -696,10 +696,14 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, if mode is not None: assert mode in ["block_sparse", "sparse"] - elif params.dim() == 1 and self.group_size >= 16 and num_edges >= 16 and batch_size >= 16: # In this case, we should definitely use the block-sparse implementation mode = "block_sparse" + elif self.group_size * num_edges < 4 and num_edges * batch_size < 4: + # In this case, we should definitely use the sparse implementation + mode = "sparse" + else: + mode = "sparse" if mode == "block_sparse": self._backward_block_sparse( @@ -707,7 +711,12 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, partition_id = partition_id ) - + elif mode == "sparse": + self._backward_sparse( + node_flows, element_flows, params, node_mars, element_mars, param_flows, + nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, + partition_id = partition_id + ) elif mode == "pytorch": self._backward_pytorch( node_flows, element_flows, params, node_mars, @@ -1004,7 +1013,7 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para curr_pflows = acc * epars - tl.atomic_add(param_flows + epars_offsets, curr_pflows) # TODO: reimplement with the lock mechanism + tl.atomic_add(param_flows + epars_offsets, curr_pflows) def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, @@ -1067,6 +1076,259 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor GROUP_SIZE_M = self.group_size ) + def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, + params: torch.Tensor, node_mars: torch.Tensor, + element_mars: torch.Tensor, param_flows: torch.Tensor, + nids: Optional[torch.Tensor], cids: Optional[torch.Tensor], pids: Optional[torch.Tensor], pfids: Optional[torch.Tensor], + chids: Optional[torch.Tensor], parids: Optional[torch.Tensor], parpids: Optional[torch.Tensor], + cs_group_size: int, local_ids: Optional[torch.Tensor] = None, + partition_id: int = -1) -> None: + """ + Back pass of sum layers with sparse processing kernel. + + Parameters: + `node_flows`: [N, B] + `element_flows: [M, B] + `params`: [E] + `node_mars`: [N, B] + `element_mars`: [M, B] + `param_flows`: [E] + `chids`: [ng] + `parids`: [ng, c] + `parpids`: [ng, c] + """ + + # Flows w.r.t. input elements (product nodes) + if chids is not None: + self._backward_sparse_ele_flows( + node_flows, element_flows, params, node_mars, element_mars, + chids = chids, parids = parids, parpids = parpids, + cs_group_size = cs_group_size, local_ids = local_ids + ) + + # Flows w.r.t. 
parameters + if param_flows is not None and nids is not None: + self._backward_sparse_par_flows( + node_flows, params, node_mars, element_mars, param_flows, + nids = nids, cids = cids, pids = pids, pfids = pfids + ) + + return None + + @staticmethod + @triton.jit + def _bk_triton_sparse_ele_kernel(node_flows, element_flows, node_mars, element_mars, params, + chids, parids, parpids, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, + n_edge_groups: tl.constexpr, BLOCK_B: tl.constexpr, BLOCK_M: tl.constexpr, + GROUP_SIZE_K: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + elegroup_id = pid_m + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + elegroup_id = tl.load(local_ids + elegroup_id) + + # Initialize pointers to `params` + offs_edge = tl.arange(0, n_edge_groups * GROUP_SIZE_K) # I.e., [0, num_edges) + offs_edge_gid = offs_edge // GROUP_SIZE_K + offs_edge_nid = (offs_edge % GROUP_SIZE_K) + par_start = tl.load(parpids + elegroup_id * n_edge_groups + offs_edge_gid) + epars_ptr = params + par_start + offs_edge_nid # [num_edges] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize `node_flows` and `node_mars` + edge_start = tl.load(parids + elegroup_id * n_edge_groups + offs_edge_gid) + nmars_ptr = node_mars + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] + nflows_ptr = node_flows + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] + + # Initialize pointers to `element_flows` and `element_mars` + off_eleids = tl.load(chids + elegroup_id) + eflows_ptr = element_flows + off_eleids * batch_size + offs_batch # [BLOCK_B] + emars_ptr = element_mars + off_eleids * batch_size + offs_batch # [BLOCK_B] + + # Inner loop + for i in range(0, BLOCK_M): + epars = tl.load(epars_ptr) # [num_edges] + emars = tl.load(emars_ptr, mask = mask_batch) # [BLOCK_B] + + eflows = tl.sum(nflows * epars[:,None] * tl.exp(emars[None,:] - nmars), axis = 0) + + tl.store(eflows_ptr, eflows, mask = mask_batch) + + # Increment `emars_ptr` and `eflows_ptr` + emars_ptr += batch_size + eflows_ptr += batch_size + + def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: torch.Tensor, + params: torch.Tensor, node_mars: torch.Tensor, + element_mars: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, + parpids: torch.Tensor, cs_group_size: int, local_ids: Optional[torch.Tensor] = None) -> None: + + assert params.dim() == 1, "Expecting a 1D `params`." + + num_ngroups = chids.size(0) if local_ids is None else local_ids.size(0) + layer_n_nodes = num_ngroups * cs_group_size + n_edge_groups = parids.size(1) + num_edges = n_edge_groups * self.group_size + batch_size = node_flows.size(1) + BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) + + assert num_edges <= 16384, "The sparse backward kernel only support nodes with # edges smaller than 16384." 
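+        # One program is launched per (batch tile, element-node group); each program
+        # loops over the nodes of its group and accumulates flows from all parent edges.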
+ + BLOCK_B = max(min(2048 // num_edges, BATCH_SIZE_NP2), 1) + BLOCK_M = cs_group_sizes + + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, BLOCK_M)) + + self._bk_triton_sparse_ele_kernel[grid]( + node_flows = node_flows, + element_flows = element_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + chids = chids, + parids = parids, + parpids = parpids, + local_ids = local_ids, + batch_size = batch_size, + partial_eval = 1 if local_ids is not None else 0, + n_edge_groups = n_edge_groups, + BLOCK_B = BLOCK_B, + BLOCK_M = BLOCK_M, + GROUP_SIZE_K = self.group_size + ) + + return None + + @staticmethod + @triton.jit + def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, + batch_size: tl.constexpr, num_edges: tl.constexpr, BLOCK_M: tl.constexpr, + BLOCK_B: tl.constexpr, B_NUM_BLOCKS: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + + pid_m = tl.program_id(0) # ID of size-`BLOCK_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // GROUP_SIZE_M + tile_id = pid_m % GROUP_SIZE_M + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + mask_batch = offs_batch < batch_size + + # Initialize pointers to `element_mars` + offs_edge = tl.arange(0, num_edges) + edge_start = tl.load(cids + ngroup_id * num_edges + offs_edge) + emars_ptr = element_mars + \ + edge_start[:,None] * batch_size + \ + offs_batch[None,:] # [num_edges, BLOCK_B] + + # Initialize pointers to `node_flows` and `node_mars` + off_nids = tl.load(nids + ngroup_id) + nmars_ptr = node_mars + (off_nids + tile_id) * batch_size + offs_batch # [BLOCK_B] + nflows_ptr = node_flows + off_nids * batch_size + offs_batch # [BLOCK_B] + + # Initialize `params` + par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) + epars_ptr = params + par_start + tile_id + epars = tl.load(epars_ptr) # [num_edges] + + # Inner loop + acc = tl.zeros([num_edges], dtype = tl.float32) + + for b in range(0, B_NUM_BLOCKS): + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [BLOCK_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [BLOCK_B] + + pflows = tl.sum(nflows[None,:] * tl.exp(emars - nmars[None,:]), axis = 1) + + acc += pflows + + # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` + emars_ptr += BLOCK_B + nmars_ptr += BLOCK_B + nflows_ptr += BLOCK_B + + # Update batch mask + offs_batch += BLOCK_B + mask_batch = offs_batch < batch_size + + par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) + epars_ptr = params + par_start + tile_id + epars = tl.load(epars_ptr) # [num_edges] + + parflow_start = tl.load(pfids + ngroup_id * num_edges + offs_edge) + eparflows_ptr = param_flows + par_start + tile_id + + curr_pflows = acc * epars + + tl.atomic_add(eparflows_ptr, curr_pflows) + + def _backward_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, + element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, + cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor) -> None: + """ + Backward pass of sum layers w.r.t. sum parameters with the block-sparse processing kernel. + + Parameters: + `node_flows`: [N, B] + `element_flows`: [M, B] + `params`: [E] + `node_mars`: [N, B] + `element_mars`: [M, B] + `param_flows`: [E] + `nids`: [ng] + `cids`: [ng, c] + `pids`: [ng, c] + """ + + assert params.dim() == 1, "Expecting a 1D `params`." 
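+        # One program per node: it sweeps the batch in tiles of `BLOCK_B`, accumulates
+        # the per-edge flows in registers, and commits them with a single atomic add.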
+ + num_ngroups = nids.size(0) + layer_n_nodes = num_ngroups * self.group_size + num_edges = cids.size(1) + batch_size = node_mars.size(1) + BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) + + assert num_edges <= 16384, "The sparse backward kernel only support nodes with # edges smaller than 16384." + + BLOCK_B = max(min(2048 // num_edges, BATCH_SIZE_NP2), 1) + BLOCK_M = self.group_sizes + + grid = (layer_n_nodes,) + + self._bk_triton_sparse_par_kernel[grid]( + node_flows = node_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + param_flows = param_flows, + nids = nids, + cids = cids, + pids = pids, + pfids = pfids, + batch_size = batch_size, + num_edges = num_edges, + BLOCK_M = BLOCK_M, + BLOCK_B = BLOCK_B, + B_NUM_BLOCKS = triton.cdiv(batch_size, BLOCK_B), + GROUP_SIZE_M = self.group_size + ) + @torch.compile(mode = "reduce-overhead", fullgraph = True) def _backward_pytorch(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index a916d7df..15baae73 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -268,6 +268,8 @@ def simple_model_test(): lls.mean().backward() + node_flows = pc.node_flows.cpu() + import pdb; pdb.set_trace() From e2dd7dca5433acea66cd1767910f04c67d1b2b95 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 13 Dec 2023 16:30:29 +0800 Subject: [PATCH 053/162] fix dummy param offsets --- src/pyjuice/model/tensorcircuit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index c55bf562..cbeb622b 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -473,10 +473,10 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti num_elements = max_node_group_size # Number of parameters - num_parameters = max_node_group_size + num_parameters = max_node_group_size * max_ele_group_size # Number of parameter flows - num_param_flows = 0 + num_param_flows = max_node_group_size * max_ele_group_size # Stores distributed parameter flows node2tiednodes = dict() From 4a928298415935c32e81483114a6640693e10c0c Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 13 Dec 2023 16:53:48 +0800 Subject: [PATCH 054/162] fix runtests for sum layer --- src/pyjuice/layer/compilation.py | 3 ++- src/pyjuice/layer/sum_layer.py | 17 +++++++++---- tests/layer/sum_layer_test.py | 43 ++++++++++++++++++-------------- 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index f1f18c8d..2a714d20 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -326,7 +326,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, # Global pid and pfid start index for `ns` ns_pid_start = ns._param_range[0] - ns_pfid_start = ns._param_range[0] + ns_pfid_start = ns._param_flow_range[0] else: source_ns = ns.get_source_ns() @@ -517,6 +517,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, # Restore `cids` and `pids` target_cids = target_cids.cpu() target_pids = target_pids.cpu() + target_pfids = target_pfids.cpu() cids = [] pids = [] pfids = [] diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 8aa5af9a..1ae707a3 100644 --- a/src/pyjuice/layer/sum_layer.py +++ 
b/src/pyjuice/layer/sum_layer.py @@ -695,7 +695,7 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, batch_size = node_flows.size(1) if mode is not None: - assert mode in ["block_sparse", "sparse"] + assert mode in ["block_sparse", "sparse", "pytorch"] elif params.dim() == 1 and self.group_size >= 16 and num_edges >= 16 and batch_size >= 16: # In this case, we should definitely use the block-sparse implementation mode = "block_sparse" @@ -705,6 +705,8 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, else: mode = "sparse" + # mode = "sparse" ##### debug + if mode == "block_sparse": self._backward_block_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, @@ -723,6 +725,8 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, element_mars, param_flows, chids, parids, parpids, cs_group_size ) + else: + raise ValueError(f"Not supported mode `{mode}`.") def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, @@ -1168,6 +1172,9 @@ def _bk_triton_sparse_ele_kernel(node_flows, element_flows, node_mars, element_m tl.store(eflows_ptr, eflows, mask = mask_batch) + # Increment `epars_ptr` + epars_ptr += GROUP_SIZE_K + # Increment `emars_ptr` and `eflows_ptr` emars_ptr += batch_size eflows_ptr += batch_size @@ -1189,7 +1196,7 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to assert num_edges <= 16384, "The sparse backward kernel only support nodes with # edges smaller than 16384." BLOCK_B = max(min(2048 // num_edges, BATCH_SIZE_NP2), 1) - BLOCK_M = cs_group_sizes + BLOCK_M = cs_group_size grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, BLOCK_M)) @@ -1251,8 +1258,8 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa for b in range(0, B_NUM_BLOCKS): emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [BLOCK_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [BLOCK_B] + nmars = tl.load(nmars_ptr, mask = mask_batch) # [BLOCK_B] + nflows = tl.load(nflows_ptr, mask = mask_batch) # [BLOCK_B] pflows = tl.sum(nflows[None,:] * tl.exp(emars - nmars[None,:]), axis = 1) @@ -1307,7 +1314,7 @@ def _backward_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Ten assert num_edges <= 16384, "The sparse backward kernel only support nodes with # edges smaller than 16384." 
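+        # `BLOCK_M` equals the element-group size, so each program's inner loop visits
+        # every node of one group exactly once.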
BLOCK_B = max(min(2048 // num_edges, BATCH_SIZE_NP2), 1) - BLOCK_M = self.group_sizes + BLOCK_M = self.group_size grid = (layer_n_nodes,) diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 98a96044..c0d02976 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -41,7 +41,8 @@ def sum_layer_test(): prod_layer = ProdLayer([np0, np1, np2]) layer = SumLayer([ns0, ns1, ns2], global_nid_start = group_size, - global_pid_start = 1, global_pfid_start = 0, node2tiednodes = dict(), ) + global_pid_start = group_size ** 2, + global_pfid_start = group_size ** 2, node2tiednodes = dict(), ) assert torch.all(layer.partitioned_nids[0] == torch.arange(group_size, 7 * group_size, group_size)) assert torch.all(layer.partitioned_cids[0][0:2,0] == group_size) @@ -50,8 +51,10 @@ def sum_layer_test(): assert torch.all(layer.partitioned_cids[0][0:2,1] == group_size + 1) assert torch.all(layer.partitioned_cids[0][2:4,1] == 3 * group_size + 1) assert torch.all(layer.partitioned_cids[0][4:6,1] == 5 * group_size + 1) - assert torch.all(layer.partitioned_pids[0][:,0] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) - group_size + 1) - assert torch.all(layer.partitioned_pids[0][:,1] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) + 1) + assert torch.all(layer.partitioned_pids[0][:,0] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) - group_size + group_size ** 2) + assert torch.all(layer.partitioned_pids[0][:,1] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) + group_size ** 2) + assert torch.all(layer.partitioned_pfids[0][:,0] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) - group_size + group_size ** 2) + assert torch.all(layer.partitioned_pfids[0][:,1] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) + group_size ** 2) assert torch.all(layer.partitioned_chids[0] == torch.arange(group_size, 7 * group_size, group_size)) assert torch.all(layer.partitioned_parids[0][0:2,0] == group_size) @@ -60,18 +63,18 @@ def sum_layer_test(): assert torch.all(layer.partitioned_parids[0][2:4,1] == 4 * group_size) assert torch.all(layer.partitioned_parids[0][4:6,0] == 5 * group_size) assert torch.all(layer.partitioned_parids[0][4:6,1] == 6 * group_size) - assert torch.all(layer.partitioned_parpids[0][0,0] == 1) - assert torch.all(layer.partitioned_parpids[0][1,0] == 1 + group_size**2) - assert torch.all(layer.partitioned_parpids[0][0,1] == 1 + 2 * group_size**2) - assert torch.all(layer.partitioned_parpids[0][1,1] == 1 + 3 * group_size**2) - assert torch.all(layer.partitioned_parpids[0][2,0] == 1 + 4 * group_size**2) - assert torch.all(layer.partitioned_parpids[0][3,0] == 1 + 5 * group_size**2) - assert torch.all(layer.partitioned_parpids[0][2,1] == 1 + 6 * group_size**2) - assert torch.all(layer.partitioned_parpids[0][3,1] == 1 + 7 * group_size**2) - assert torch.all(layer.partitioned_parpids[0][4,0] == 1 + 8 * group_size**2) - assert torch.all(layer.partitioned_parpids[0][5,0] == 1 + 9 * group_size**2) - assert torch.all(layer.partitioned_parpids[0][4,1] == 1 + 10 * group_size**2) - assert torch.all(layer.partitioned_parpids[0][5,1] == 1 + 11 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][0,0] == group_size**2) + assert torch.all(layer.partitioned_parpids[0][1,0] == 2 * 
group_size**2) + assert torch.all(layer.partitioned_parpids[0][0,1] == 3 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][1,1] == 4 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][2,0] == 5 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][3,0] == 6 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][2,1] == 7 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][3,1] == 8 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][4,0] == 9 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][5,0] == 10 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][4,1] == 11 * group_size**2) + assert torch.all(layer.partitioned_parpids[0][5,1] == 12 * group_size**2) layer.to(device) @@ -81,7 +84,7 @@ def sum_layer_test(): element_mars[:group_size,:] = -float("inf") node_mars = torch.zeros([group_size + group_size * 2 * 3, batch_size]).to(device) - params = torch.rand([1 + 3 * 4 * group_size * group_size]).to(device) + params = torch.rand([group_size ** 2 + 3 * 4 * group_size * group_size]).to(device) layer(node_mars, element_mars, params) @@ -96,7 +99,7 @@ def sum_layer_test(): node_flows = torch.rand([group_size + group_size * 2 * 3, batch_size]).to(device) element_flows = torch.zeros([group_size + 3 * 2 * 2 * group_size, batch_size]).to(device) - param_flows = torch.zeros([1 + 3 * 4 * group_size * group_size]).to(device) + param_flows = torch.zeros([group_size ** 2 + 3 * 4 * group_size * group_size]).to(device) layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) @@ -118,6 +121,7 @@ def sum_layer_test(): epars = params[parpids[j,:]+i] eflows = (nflows * epars[:,None] * emars[None,:] / nmars).sum(dim = 0) + # import pdb; pdb.set_trace() assert torch.all(torch.abs(eflows - element_flows[(j+1)*group_size+i,:]) < 1e-2) my_pflows = torch.zeros_like(param_flows) @@ -130,7 +134,7 @@ def sum_layer_test(): nflows = node_flows[(j+1)*group_size+i,:] pflows = epars * (nflows[None,:] * emars / nmars[None,:]).sum(dim = 1) - my_pflows[layer.partitioned_pids[0][j,:]+i] = pflows + my_pflows[layer.partitioned_pfids[0][j,:]+i] = pflows assert torch.all(torch.abs(my_pflows - param_flows) < 2e-3) @@ -204,6 +208,7 @@ def speed_test(): torch.cuda.synchronize() for _ in range(10000000000000000): layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) + time.sleep(0.002) torch.cuda.synchronize() t1 = time.time() backward_ms = (t1 - t0) / 100 * 1000 @@ -216,4 +221,4 @@ def speed_test(): if __name__ == "__main__": torch.manual_seed(3890) sum_layer_test() - speed_test() \ No newline at end of file + # speed_test() \ No newline at end of file From a8abe35029ba344f7d8b6ec9677747e3de8aa556 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 13 Dec 2023 17:25:06 +0800 Subject: [PATCH 055/162] reinstate pytorch kernel --- src/pyjuice/layer/sum_layer.py | 34 ++++++++++++++++++------------ src/pyjuice/model/tensorcircuit.py | 2 +- tests/layer/sum_layer_test.py | 14 +++++++----- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 1ae707a3..387a0137 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -705,7 +705,7 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, else: mode = "sparse" - # mode = "sparse" ##### debug + mode = "pytorch" ##### debug if mode == "block_sparse": self._backward_block_sparse( @@ -722,8 +722,8 @@ def 
_backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, elif mode == "pytorch": self._backward_pytorch( node_flows, element_flows, params, node_mars, - element_mars, param_flows, chids, parids, parpids, - cs_group_size + element_mars, param_flows, nids, cids, pids, pfids, + chids, parids, parpids, cs_group_size ) else: raise ValueError(f"Not supported mode `{mode}`.") @@ -794,8 +794,8 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele offs_edge_nid = (offs_edge % GROUP_SIZE_K) par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) epars_ptr = params + \ - offs_ele[:,None] + \ - (par_start + offs_edge_nid * GROUP_SIZE_K)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + offs_ele[:,None] * GROUP_SIZE_K + \ + (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] # Batch offsets and mask offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B @@ -1336,25 +1336,31 @@ def _backward_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Ten GROUP_SIZE_M = self.group_size ) + def _backward_pytorch(self, node_flows, element_flows, params, node_mars, + element_mars, param_flows, nids, cids, pids, pfids, + chids, parids, parpids, cs_group_size): + + + @torch.compile(mode = "reduce-overhead", fullgraph = True) - def _backward_pytorch(self, node_flows: torch.Tensor, element_flows: torch.Tensor, - params: torch.Tensor, node_mars: torch.Tensor, - element_mars: torch.Tensor, param_flows: Optional[torch.Tensor], - chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, - cs_group_size: int): + def _backward_pytorch_ele_kernel(self, node_flows: torch.Tensor, element_flows: torch.Tensor, + params: torch.Tensor, node_mars: torch.Tensor, + element_mars: torch.Tensor, param_flows: Optional[torch.Tensor], + chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, + cs_group_size: int): - if param_flows is not None: - raise ValueError("PyTorch kernel does not support computing parameter flows.") + # if param_flows is not None: + # raise ValueError("PyTorch kernel does not support computing parameter flows.") num_ngroups = chids.size(0) num_egroups = parids.size(1) parids = (parids[:,:,None].repeat(1, 1, self.group_size) + torch.arange(0, self.group_size, device = parids.device)).reshape(num_ngroups, num_egroups * self.group_size) - parpids = (parpids[:,:,None] + torch.arange(0, self.group_size * cs_group_size, cs_group_size, device = parids.device)).reshape( + parpids = (parpids[:,:,None] + torch.arange(0, self.group_size, device = parids.device)).reshape( num_ngroups, num_egroups * self.group_size) chids = (chids[:,None].repeat(1, cs_group_size) + torch.arange(0, cs_group_size, device = chids.device)).reshape(num_ngroups * cs_group_size) parids = parids[:,None,:].repeat(1, cs_group_size, 1).reshape(num_ngroups * cs_group_size, num_egroups * self.group_size) - parpids = (parpids[:,None,:].repeat(1, cs_group_size, 1) + torch.arange(0, cs_group_size, device = parpids.device)[None,:,None]).reshape( + parpids = (parpids[:,None,:].repeat(1, cs_group_size, 1) + torch.arange(0, cs_group_size * self.group_size, self.group_size, device = parpids.device)[None,:,None]).reshape( num_ngroups * cs_group_size, num_egroups * self.group_size ) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index cbeb622b..f3bdd482 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -476,7 +476,7 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, 
max_num_parti num_parameters = max_node_group_size * max_ele_group_size # Number of parameter flows - num_param_flows = max_node_group_size * max_ele_group_size + num_param_flows = 0 # Stores distributed parameter flows node2tiednodes = dict() diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index c0d02976..bb9393e2 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -110,20 +110,22 @@ def sum_layer_test(): num_ngroups = chids.size(0) num_egroups = parids.size(1) parids = (parids[:,:,None].repeat(1, 1, group_size) + torch.arange(0, group_size, device = parids.device)).reshape(num_ngroups, num_egroups * group_size) - parpids = (parpids[:,:,None] + torch.arange(0, group_size * group_size, group_size, device = parids.device)).reshape( + parpids_start = (parpids[:,:,None] + torch.arange(0, group_size, device = parids.device)).reshape( num_ngroups, num_egroups * group_size) - for i in range(group_size): - for j in range(6): + for j in range(6): + parpids = parpids_start.clone() + for i in range(group_size): nmars = node_mars[parids[j,:]].exp() nflows = node_flows[parids[j,:]] emars = element_mars[(j+1)*group_size+i,:].exp() - epars = params[parpids[j,:]+i] + epars = params[parpids[j,:]] eflows = (nflows * epars[:,None] * emars[None,:] / nmars).sum(dim = 0) - # import pdb; pdb.set_trace() assert torch.all(torch.abs(eflows - element_flows[(j+1)*group_size+i,:]) < 1e-2) + parpids += group_size + my_pflows = torch.zeros_like(param_flows) for i in range(group_size): @@ -136,6 +138,8 @@ def sum_layer_test(): my_pflows[layer.partitioned_pfids[0][j,:]+i] = pflows + import pdb; pdb.set_trace() + assert torch.all(torch.abs(my_pflows - param_flows) < 2e-3) From d9134db1f942dbe8a452d3671c1a7f945ae090ab Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 13 Dec 2023 20:27:37 +0800 Subject: [PATCH 056/162] fix sparse backward pass for sum layers --- src/pyjuice/layer/sum_layer.py | 48 +++++++++++++++++++++++++--------- tests/layer/sum_layer_test.py | 8 +++--- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 387a0137..4d4cccdd 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -705,7 +705,7 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, else: mode = "sparse" - mode = "pytorch" ##### debug + mode = "sparse" ##### debug if mode == "block_sparse": self._backward_block_sparse( @@ -1224,13 +1224,13 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to @triton.jit def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, batch_size: tl.constexpr, num_edges: tl.constexpr, BLOCK_M: tl.constexpr, - BLOCK_B: tl.constexpr, B_NUM_BLOCKS: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + BLOCK_B: tl.constexpr, B_NUM_BLOCKS: tl.constexpr): pid_m = tl.program_id(0) # ID of size-`BLOCK_M` nodes # Get inferred node group id from `pid_m` - ngroup_id = pid_m // GROUP_SIZE_M - tile_id = pid_m % GROUP_SIZE_M + ngroup_id = pid_m // BLOCK_M + tile_id = pid_m % BLOCK_M # Batch offsets and mask offs_batch = tl.arange(0, BLOCK_B) @@ -1246,7 +1246,7 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa # Initialize pointers to `node_flows` and `node_mars` off_nids = tl.load(nids + ngroup_id) nmars_ptr = node_mars + (off_nids + tile_id) * batch_size + offs_batch # [BLOCK_B] - nflows_ptr = node_flows + off_nids * batch_size + 
offs_batch # [BLOCK_B] + nflows_ptr = node_flows + (off_nids + tile_id) * batch_size + offs_batch # [BLOCK_B] # Initialize `params` par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) @@ -1279,7 +1279,7 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa epars = tl.load(epars_ptr) # [num_edges] parflow_start = tl.load(pfids + ngroup_id * num_edges + offs_edge) - eparflows_ptr = param_flows + par_start + tile_id + eparflows_ptr = param_flows + parflow_start + tile_id curr_pflows = acc * epars @@ -1332,26 +1332,45 @@ def _backward_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Ten num_edges = num_edges, BLOCK_M = BLOCK_M, BLOCK_B = BLOCK_B, - B_NUM_BLOCKS = triton.cdiv(batch_size, BLOCK_B), - GROUP_SIZE_M = self.group_size + B_NUM_BLOCKS = triton.cdiv(batch_size, BLOCK_B) ) def _backward_pytorch(self, node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, chids, parids, parpids, cs_group_size): - + """ + Back pass of sum layers with native pytorch. + Parameters: + `node_flows`: [N, B] + `element_flows: [M, B] + `params`: [E] + `node_mars`: [N, B] + `element_mars`: [M, B] + `param_flows`: [E] + `chids`: [ng] + `parids`: [ng, c] + `parpids`: [ng, c] + """ - @torch.compile(mode = "reduce-overhead", fullgraph = True) + # Flows w.r.t. input elements (product nodes) + if chids is not None: + self._backward_pytorch_ele_kernel( + node_flows, element_flows, params, node_mars, element_mars, + param_flows, chids, parids, parpids, cs_group_size + ) + + # Flows w.r.t. parameters + if param_flows is not None and nids is not None: + pass + + @torch.compile(mode = "reduce-overhead") def _backward_pytorch_ele_kernel(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: Optional[torch.Tensor], chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, cs_group_size: int): - # if param_flows is not None: - # raise ValueError("PyTorch kernel does not support computing parameter flows.") - num_ngroups = chids.size(0) num_egroups = parids.size(1) parids = (parids[:,:,None].repeat(1, 1, self.group_size) + torch.arange(0, self.group_size, device = parids.device)).reshape(num_ngroups, num_egroups * self.group_size) @@ -1369,6 +1388,9 @@ def _backward_pytorch_ele_kernel(self, node_flows: torch.Tensor, element_flows: return None + def _backward_pytorch_par_kernel(self, node_flows): + pass + def _prepare_scope2nids(self, prod_scope_eleids: Sequence[Tuple[BitSet, torch.Tensor]]): if not (hasattr(self, "fw_scope2localids") and hasattr(self, "bk_scope2localids")): fw_scope2localids = dict() diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index bb9393e2..6de0994d 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -42,7 +42,7 @@ def sum_layer_test(): layer = SumLayer([ns0, ns1, ns2], global_nid_start = group_size, global_pid_start = group_size ** 2, - global_pfid_start = group_size ** 2, node2tiednodes = dict(), ) + global_pfid_start = 0, node2tiednodes = dict(), ) assert torch.all(layer.partitioned_nids[0] == torch.arange(group_size, 7 * group_size, group_size)) assert torch.all(layer.partitioned_cids[0][0:2,0] == group_size) @@ -53,8 +53,8 @@ def sum_layer_test(): assert torch.all(layer.partitioned_cids[0][4:6,1] == 5 * group_size + 1) assert torch.all(layer.partitioned_pids[0][:,0] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * 
group_size) - group_size + group_size ** 2) assert torch.all(layer.partitioned_pids[0][:,1] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) + group_size ** 2) - assert torch.all(layer.partitioned_pfids[0][:,0] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) - group_size + group_size ** 2) - assert torch.all(layer.partitioned_pfids[0][:,1] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) + group_size ** 2) + assert torch.all(layer.partitioned_pfids[0][:,0] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size) - group_size) + assert torch.all(layer.partitioned_pfids[0][:,1] == torch.arange(group_size, (group_size * 2 * 6 + 1) * group_size, 2 * group_size * group_size)) assert torch.all(layer.partitioned_chids[0] == torch.arange(group_size, 7 * group_size, group_size)) assert torch.all(layer.partitioned_parids[0][0:2,0] == group_size) @@ -138,8 +138,6 @@ def sum_layer_test(): my_pflows[layer.partitioned_pfids[0][j,:]+i] = pflows - import pdb; pdb.set_trace() - assert torch.all(torch.abs(my_pflows - param_flows) < 2e-3) From 9d82eee0ef3fdbbeb8c66922445302828f27f54f Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 13 Dec 2023 23:54:25 +0800 Subject: [PATCH 057/162] fix weird triton bug that causes kernels to stall --- src/pyjuice/layer/sum_layer.py | 100 +++++++++++++++++++-------------- tests/layer/sum_layer_test.py | 13 ++--- 2 files changed, 64 insertions(+), 49 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 4d4cccdd..4c6ba42c 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -314,32 +314,6 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, return None - @staticmethod - @torch.compile(mode = "reduce-overhead", fullgraph = True) - def _forward_pytorch_kernel(node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, - nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, - local_ids: torch.Tensor): - - if local_ids is not None: - nids = nids[local_ids] - cids = cids[local_ids] - pids = pids[local_ids] - - num_ngroups = nids.size(0) - num_edges = cids.size(1) - nids = (nids[:,None].repeat(1, self.group_size) + \ - torch.arange(0, self.group_size, device = nids.device)[None,:]).reshape(num_ngroups * self.group_size) - cids = cids[:,None,:].repeat(1, self.group_size, 1).reshape(num_ngroups * self.group_size, num_edges) - pids = (pids[:,None,:].repeat(1, self.group_size, 1) + \ - torch.arange(0, self.group_size, device = cids.device)[None,:,None]).reshape(num_ngroups * self.group_size, num_edges) - - ch_mars = element_mars[cids] - maxval = ch_mars.max(dim = 1, keepdim = True).values - node_mars[nids] = (((ch_mars - maxval).exp() * params[pids].unsqueeze(-1)).sum( - dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) - - return None - def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, @@ -664,6 +638,32 @@ def _forward_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, return None + @staticmethod + @torch.compile(mode = "reduce-overhead", fullgraph = True) + def _forward_pytorch_kernel(node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, + nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, + 
local_ids: torch.Tensor): + + if local_ids is not None: + nids = nids[local_ids] + cids = cids[local_ids] + pids = pids[local_ids] + + num_ngroups = nids.size(0) + num_edges = cids.size(1) + nids = (nids[:,None].repeat(1, self.group_size) + \ + torch.arange(0, self.group_size, device = nids.device)[None,:]).reshape(num_ngroups * self.group_size) + cids = cids[:,None,:].repeat(1, self.group_size, 1).reshape(num_ngroups * self.group_size, num_edges) + pids = (pids[:,None,:].repeat(1, self.group_size, 1) + \ + torch.arange(0, self.group_size, device = cids.device)[None,:,None]).reshape(num_ngroups * self.group_size, num_edges) + + ch_mars = element_mars[cids] + maxval = ch_mars.max(dim = 1, keepdim = True).values + node_mars[nids] = (((ch_mars - maxval).exp() * params[pids].unsqueeze(-1)).sum( + dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) + + return None + def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, @@ -705,8 +705,6 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, else: mode = "sparse" - mode = "sparse" ##### debug - if mode == "block_sparse": self._backward_block_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, @@ -984,6 +982,10 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) for b in range(0, B_NUM_TILES): + # Update batch mask + offs_batch = tl.arange(0, TILE_SIZE_B) + b * TILE_SIZE_B + mask_batch = offs_batch < batch_size + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, TILE_SIZE_B] nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] @@ -1004,20 +1006,16 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para nmars_ptr += TILE_SIZE_B nflows_ptr += TILE_SIZE_B - # Update batch mask - offs_batch += TILE_SIZE_B - mask_batch = offs_batch < batch_size - par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] epars = tl.load(params + epars_offsets) parflow_start = tl.load(pfids + ngroup_id * num_edges + offs_edge) - epars_offsets = offs_node[:,None] + parflow_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + eparflows_offsets = offs_node[:,None] + parflow_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] curr_pflows = acc * epars - tl.atomic_add(param_flows + epars_offsets, curr_pflows) + tl.atomic_add(param_flows + eparflows_offsets, curr_pflows) def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, @@ -1248,11 +1246,6 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa nmars_ptr = node_mars + (off_nids + tile_id) * batch_size + offs_batch # [BLOCK_B] nflows_ptr = node_flows + (off_nids + tile_id) * batch_size + offs_batch # [BLOCK_B] - # Initialize `params` - par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) - epars_ptr = params + par_start + tile_id - epars = tl.load(epars_ptr) # [num_edges] - # Inner loop acc = tl.zeros([num_edges], dtype = tl.float32) @@ -1362,7 +1355,10 @@ def _backward_pytorch(self, node_flows, element_flows, params, node_mars, # Flows w.r.t. 
parameters if param_flows is not None and nids is not None: - pass + self._backward_pytorch_par_kernel( + node_flows, params, node_mars, element_mars, param_flows, + nids, cids, pids, pfids, self.group_size + ) @torch.compile(mode = "reduce-overhead") def _backward_pytorch_ele_kernel(self, node_flows: torch.Tensor, element_flows: torch.Tensor, @@ -1388,8 +1384,28 @@ def _backward_pytorch_ele_kernel(self, node_flows: torch.Tensor, element_flows: return None - def _backward_pytorch_par_kernel(self, node_flows): - pass + @torch.compile(mode = "reduce-overhead") + def _backward_pytorch_par_kernel(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, + element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, + cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor, ns_group_size: int): + + num_ngroups = nids.size(0) + num_edges = cids.size(1) + nids = (nids[:,None].repeat(1, self.group_size) + \ + torch.arange(0, self.group_size, device = nids.device)[None,:]).reshape(num_ngroups * self.group_size) + cids = cids[:,None,:].repeat(1, self.group_size, 1).reshape(num_ngroups * self.group_size, num_edges) + pids = (pids[:,None,:].repeat(1, self.group_size, 1) + \ + torch.arange(0, self.group_size, device = cids.device)[None,:,None]).reshape(num_ngroups * self.group_size, num_edges) + pfids = (pfids[:,None,:].repeat(1, self.group_size, 1) + \ + torch.arange(0, self.group_size, device = cids.device)[None,:,None]).reshape(num_ngroups * self.group_size, num_edges) + + parflows = (node_flows[nids].unsqueeze(1) * params[pids].unsqueeze(-1) * (element_mars[cids] - node_mars[nids].unsqueeze(1)).exp()).sum(dim = 2) + + for i in range(num_ngroups): + sid, eid = ns_group_size * i, ns_group_size * (i + 1) + param_flows[pfids[sid:eid,:]] += parflows[sid:eid,:] + + return None def _prepare_scope2nids(self, prod_scope_eleids: Sequence[Tuple[BitSet, torch.Tensor]]): if not (hasattr(self, "fw_scope2localids") and hasattr(self, "bk_scope2localids")): diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 6de0994d..97ec82af 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -176,7 +176,7 @@ def speed_test(): prod_layer = ProdLayer(nps, layer_sparsity_tol = 0.1) layer = SumLayer(nodes, global_nid_start = group_size, - global_pid_start = 1, global_pfid_start = 0, node2tiednodes = dict(), ) + global_pid_start = group_size ** 2, global_pfid_start = 0, node2tiednodes = dict(), ) layer.to(device) @@ -197,30 +197,29 @@ def speed_test(): forward_ms = (t1 - t0) / 100 * 1000 print(f"Forward pass on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 0.871ms.") + print("Reference computation time on RTX 4090: 0.669ms.") print("--------------------------------------------------------------") node_flows = torch.rand([group_size + group_size * num_node_groups * num_prod_nodes, batch_size]).to(device) element_flows = torch.zeros([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) - param_flows = torch.zeros([layer.partitioned_pids[0].max() + group_size]).to(device) + param_flows = torch.zeros([group_size ** 2 + layer.partitioned_pids[0].max() + group_size]).to(device) layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) t0 = time.time() torch.cuda.synchronize() - for _ in range(10000000000000000): + for _ in range(100): layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) - 
time.sleep(0.002) torch.cuda.synchronize() t1 = time.time() backward_ms = (t1 - t0) / 100 * 1000 print(f"Backward pass on average takes {backward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 1.200ms.") + print("Reference computation time on RTX 4090: 1.593ms.") print("--------------------------------------------------------------") if __name__ == "__main__": torch.manual_seed(3890) sum_layer_test() - # speed_test() \ No newline at end of file + speed_test() \ No newline at end of file From ada5f3929cd2c84f83c8558183893e6abf3665c6 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 00:13:56 +0800 Subject: [PATCH 058/162] update runtests --- tests/model/simple_model_test.py | 50 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index 15baae73..44204894 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -103,12 +103,12 @@ def simple_model_test(): assert torch.all(sum_layer0.partitioned_cids[1][0,:] == torch.arange(16, 80)) assert torch.all(sum_layer0.partitioned_cids[1][1,:] == torch.arange(16, 80)) - assert torch.all(sum_layer0.partitioned_pids[0][0,:] == torch.arange(2064, 2576, 16)) - assert torch.all(sum_layer0.partitioned_pids[0][1,:] == torch.arange(2576, 3088, 16)) - assert torch.all(sum_layer0.partitioned_pids[0][2,:] == torch.arange(3088, 3600, 16)) - assert torch.all(sum_layer0.partitioned_pids[0][3,:] == torch.arange(3600, 4112, 16)) - assert torch.all(sum_layer0.partitioned_pids[1][0,:] == torch.arange(16, 1040, 16)) - assert torch.all(sum_layer0.partitioned_pids[1][1,:] == torch.arange(1040, 2064, 16)) + assert torch.all(sum_layer0.partitioned_pids[0][0,:] == torch.arange(2304, 2816, 16)) + assert torch.all(sum_layer0.partitioned_pids[0][1,:] == torch.arange(2816, 3328, 16)) + assert torch.all(sum_layer0.partitioned_pids[0][2,:] == torch.arange(3328, 3840, 16)) + assert torch.all(sum_layer0.partitioned_pids[0][3,:] == torch.arange(3840, 4352, 16)) + assert torch.all(sum_layer0.partitioned_pids[1][0,:] == torch.arange(256, 1280, 16)) + assert torch.all(sum_layer0.partitioned_pids[1][1,:] == torch.arange(1280, 2304, 16)) assert torch.all(sum_layer0.partitioned_chids[0] == torch.arange(16, 144, 16)) @@ -116,26 +116,26 @@ def simple_model_test(): assert torch.all(sum_layer0.partitioned_parids[0][4:6] == torch.tensor([[176, 192]])) assert torch.all(sum_layer0.partitioned_parids[0][6:8] == torch.tensor([[208, 224]])) - assert torch.all(sum_layer0.partitioned_parpids[0][0,:] == torch.tensor([16, 1040])) - assert torch.all(sum_layer0.partitioned_parpids[0][1,:] == torch.tensor([272, 1296])) - assert torch.all(sum_layer0.partitioned_parpids[0][2,:] == torch.tensor([528, 1552])) - assert torch.all(sum_layer0.partitioned_parpids[0][3,:] == torch.tensor([784, 1808])) - assert torch.all(sum_layer0.partitioned_parpids[0][4,:] == torch.tensor([2064, 2576])) - assert torch.all(sum_layer0.partitioned_parpids[0][5,:] == torch.tensor([2320, 2832])) - assert torch.all(sum_layer0.partitioned_parpids[0][6,:] == torch.tensor([3088, 3600])) - assert torch.all(sum_layer0.partitioned_parpids[0][7,:] == torch.tensor([3344, 3856])) + assert torch.all(sum_layer0.partitioned_parpids[0][0,:] == torch.tensor([256, 1280])) + assert torch.all(sum_layer0.partitioned_parpids[0][1,:] == torch.tensor([512, 1536])) + assert torch.all(sum_layer0.partitioned_parpids[0][2,:] == torch.tensor([768, 1792])) + assert 
torch.all(sum_layer0.partitioned_parpids[0][3,:] == torch.tensor([1024, 2048])) + assert torch.all(sum_layer0.partitioned_parpids[0][4,:] == torch.tensor([2304, 2816])) + assert torch.all(sum_layer0.partitioned_parpids[0][5,:] == torch.tensor([2560, 3072])) + assert torch.all(sum_layer0.partitioned_parpids[0][6,:] == torch.tensor([3328, 3840])) + assert torch.all(sum_layer0.partitioned_parpids[0][7,:] == torch.tensor([3584, 4096])) assert torch.all(torch.abs(ns0._params.reshape(2, 4, 16, 16).sum(dim = 3).sum(dim = 1) - 1.0) < 1e-4) assert torch.all(torch.abs(ns1._params.reshape(2, 2, 16, 16).sum(dim = 3).sum(dim = 1) - 1.0) < 1e-4) assert torch.all(torch.abs(ns2._params.reshape(2, 2, 16, 16).sum(dim = 3).sum(dim = 1) - 1.0) < 1e-4) - assert torch.all(torch.abs(pc.params[:16] - 0.0) < 1e-4) - assert torch.all(torch.abs(pc.params[16:1040].reshape(1, 4, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) - assert torch.all(torch.abs(pc.params[1040:2064].reshape(1, 4, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) - assert torch.all(torch.abs(pc.params[2064:2576].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) - assert torch.all(torch.abs(pc.params[2576:3088].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) - assert torch.all(torch.abs(pc.params[3088:3600].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) - assert torch.all(torch.abs(pc.params[3600:4112].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[:256] - 0.0) < 1e-4) + assert torch.all(torch.abs(pc.params[256:1280].reshape(1, 4, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[1280:2304].reshape(1, 4, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[2304:2816].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[2816:3328].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[3328:3840].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) + assert torch.all(torch.abs(pc.params[3840:4352].reshape(1, 2, 16, 16).sum(dim = 2).sum(dim = 1) - 1.0) < 1e-4) prod_layer1 = pc.inner_layer_groups[2][0] @@ -168,16 +168,16 @@ def simple_model_test(): assert torch.all(sum_layer1.partitioned_cids[0][0,:96] == torch.arange(16, 112)) assert torch.all(sum_layer1.partitioned_cids[0][0,96:] == 0) - assert torch.all(sum_layer1.partitioned_pids[0][0,:96] == torch.arange(4112, 4208)) + assert torch.all(sum_layer1.partitioned_pids[0][0,:96] == torch.arange(4352, 4448)) assert torch.all(sum_layer1.partitioned_pids[0][0,96:] == 0) assert torch.all(sum_layer1.partitioned_chids[0] == torch.arange(16, 112, 16)) assert torch.all(sum_layer1.partitioned_parids[0] == 240) - assert torch.all(sum_layer1.partitioned_parpids[0] == torch.arange(4112, 4208, 16)[:,None]) + assert torch.all(sum_layer1.partitioned_parpids[0] == torch.arange(4352, 4448, 16)[:,None]) - assert torch.abs(pc.params[4112:4208].sum() - 1.0) < 1e-4 + assert torch.abs(pc.params[4352:4448].sum() - 1.0) < 1e-4 ## Forward pass ## @@ -268,7 +268,7 @@ def simple_model_test(): lls.mean().backward() - node_flows = pc.node_flows.cpu() + # node_flows = pc.node_flows.cpu() import pdb; pdb.set_trace() From b53e06e5e1adca535031176c5e1c8e0cda13b3a0 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 00:14:10 +0800 Subject: [PATCH 059/162] fix triton bug for sparse backward kernels --- src/pyjuice/layer/sum_layer.py | 8 ++++---- 
src/pyjuice/model/tensorcircuit.py | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 4c6ba42c..93488782 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -1250,6 +1250,10 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa acc = tl.zeros([num_edges], dtype = tl.float32) for b in range(0, B_NUM_BLOCKS): + # Update batch mask + offs_batch = tl.arange(0, BLOCK_B) + b * BLOCK_B + mask_batch = offs_batch < batch_size + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] nmars = tl.load(nmars_ptr, mask = mask_batch) # [BLOCK_B] nflows = tl.load(nflows_ptr, mask = mask_batch) # [BLOCK_B] @@ -1263,10 +1267,6 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa nmars_ptr += BLOCK_B nflows_ptr += BLOCK_B - # Update batch mask - offs_batch += BLOCK_B - mask_batch = offs_batch < batch_size - par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) epars_ptr = params + par_start + tile_id epars = tl.load(epars_ptr) # [num_edges] diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index f3bdd482..e18acd8d 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -462,18 +462,19 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti self.num_dummy_nodes = max_ele_group_size self.num_dummy_eles = max_node_group_size + self.num_dummy_params = max_node_group_size * max_ele_group_size # Nodes include `max_ele_group_size` dummy nodes and all input/sum nodes in the PC - num_nodes = max_ele_group_size + num_nodes = self.num_dummy_nodes # Total number of edges num_edges = 0 # Elements include `max_node_group_size` dummy elements and all product nodes in the PC - num_elements = max_node_group_size + num_elements = self.num_dummy_eles # Number of parameters - num_parameters = max_node_group_size * max_ele_group_size + num_parameters = self.num_dummy_params # Number of parameter flows num_param_flows = 0 @@ -597,7 +598,7 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti def _init_parameters(self, perturbation: float = 4.0, pseudocount: float = 0.0): params = torch.exp(torch.rand([self.num_sum_params]) * -perturbation) - params[:self.num_dummy_eles] = 0.0 + params[:self.num_dummy_params] = 0.0 # Copy initial parameters if provided for ns in self.root_ns: From d125c30bfebcf73e81726bc4e5ee189a769a0803 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 04:51:13 +0800 Subject: [PATCH 060/162] update dependencies --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 18b91d3e..0637cb56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,9 +8,9 @@ version="0.0.1" description = "Probabilistic Circuits Library" dependencies = [ "numpy", - "torch", + "torch>=2.0.0", "typing", - "triton", + "triton>=2.1.0", "networkx", "numba" ] From b4e62294d190e34b6b2c5c1ab16044d3e0346d65 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 04:51:52 +0800 Subject: [PATCH 061/162] fix triton issues --- src/pyjuice/layer/sum_layer.py | 22 ++++++++++++---------- tests/model/simple_model_test.py | 7 +++++++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 93488782..01345c88 100644 --- a/src/pyjuice/layer/sum_layer.py 
+++ b/src/pyjuice/layer/sum_layer.py @@ -705,6 +705,8 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, else: mode = "sparse" + # mode = "pytorch" # debug + if mode == "block_sparse": self._backward_block_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, @@ -980,12 +982,8 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para # Inner loop acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) - + for b in range(0, B_NUM_TILES): - # Update batch mask - offs_batch = tl.arange(0, TILE_SIZE_B) + b * TILE_SIZE_B - mask_batch = offs_batch < batch_size - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, TILE_SIZE_B] nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] @@ -1006,6 +1004,10 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para nmars_ptr += TILE_SIZE_B nflows_ptr += TILE_SIZE_B + # Update batch mask + offs_batch += TILE_SIZE_B + mask_batch = offs_batch < batch_size + par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] epars = tl.load(params + epars_offsets) @@ -1044,7 +1046,7 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) # Heuristic to set `TILE_SIZE_M`, `TILE_SIZE_K`, and `BLOCK_B` - base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 128) + base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 64) if base_size >= 64: TILE_SIZE_B = base_size TILE_SIZE_M = 2048 // base_size @@ -1250,10 +1252,6 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa acc = tl.zeros([num_edges], dtype = tl.float32) for b in range(0, B_NUM_BLOCKS): - # Update batch mask - offs_batch = tl.arange(0, BLOCK_B) + b * BLOCK_B - mask_batch = offs_batch < batch_size - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] nmars = tl.load(nmars_ptr, mask = mask_batch) # [BLOCK_B] nflows = tl.load(nflows_ptr, mask = mask_batch) # [BLOCK_B] @@ -1267,6 +1265,10 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa nmars_ptr += BLOCK_B nflows_ptr += BLOCK_B + # Update batch mask + offs_batch += TILE_SIZE_B + mask_batch = offs_batch < batch_size + par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) epars_ptr = params + par_start + tile_id epars = tl.load(epars_ptr) # [num_edges] diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index 44204894..4d3de3ce 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -110,6 +110,13 @@ def simple_model_test(): assert torch.all(sum_layer0.partitioned_pids[1][0,:] == torch.arange(256, 1280, 16)) assert torch.all(sum_layer0.partitioned_pids[1][1,:] == torch.arange(1280, 2304, 16)) + assert torch.all(sum_layer0.partitioned_pfids[0][0,:] == torch.arange(2048, 2560, 16)) + assert torch.all(sum_layer0.partitioned_pfids[0][1,:] == torch.arange(2560, 3072, 16)) + assert torch.all(sum_layer0.partitioned_pfids[0][2,:] == torch.arange(3072, 3584, 16)) + assert torch.all(sum_layer0.partitioned_pfids[0][3,:] == torch.arange(3584, 4096, 16)) + assert torch.all(sum_layer0.partitioned_pfids[1][0,:] == torch.arange(0, 1024, 16)) + assert torch.all(sum_layer0.partitioned_pfids[1][1,:] == 
torch.arange(1024, 2048, 16)) + assert torch.all(sum_layer0.partitioned_chids[0] == torch.arange(16, 144, 16)) assert torch.all(sum_layer0.partitioned_parids[0][:4] == torch.tensor([[144, 160]])) From 0ec430c760386ef53cd9b3c52cec9cb5d3f1902c Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 04:54:15 +0800 Subject: [PATCH 062/162] fix typo --- src/pyjuice/layer/sum_layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 01345c88..eef0ba0f 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -1266,7 +1266,7 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa nflows_ptr += BLOCK_B # Update batch mask - offs_batch += TILE_SIZE_B + offs_batch += BLOCK_B mask_batch = offs_batch < batch_size par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) From 8dbfdff918d2d24a7c4a837b30dd7f478eb29701 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 06:49:02 +0800 Subject: [PATCH 063/162] more runtests for backward --- src/pyjuice/layer/sum_layer.py | 4 +- src/pyjuice/model/tensorcircuit.py | 1 + tests/layer/sum_layer_test.py | 4 +- tests/model/simple_model_test.py | 112 ++++++++++++++++++++++++++++- 4 files changed, 116 insertions(+), 5 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index eef0ba0f..32eab00f 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -990,10 +990,10 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para emars_max = tl.max(emars, axis = 0) nflows_div_mars = nflows * tl.exp(emars_max[None,:] - nmars) - nflows_div_mars = nflows_div_mars.to(tl.bfloat16) + nflows_div_mars = nflows_div_mars.to(tl.float16) emars = tl.exp(emars - emars_max[None,:]) - emars = emars.to(tl.bfloat16) + emars = emars.to(tl.float16) pflows = tl.dot(nflows_div_mars, tl.trans(emars)).to(tl.float32) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index e18acd8d..9924d7c2 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -313,6 +313,7 @@ def update_parameters(self, clone_params: bool = True, update_flows: bool = Fals """ Copy parameters from this `TensorCircuit` to the original `CircuitNodes` """ + raise NotImplementedError() params = self.params.detach().cpu() if update_flows: param_flows = self.param_flows.detach().cpu() diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 97ec82af..99f191b0 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -197,7 +197,7 @@ def speed_test(): forward_ms = (t1 - t0) / 100 * 1000 print(f"Forward pass on average takes {forward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 0.669ms.") + print("Reference computation time on RTX 4090: 0.635ms.") print("--------------------------------------------------------------") node_flows = torch.rand([group_size + group_size * num_node_groups * num_prod_nodes, batch_size]).to(device) @@ -215,7 +215,7 @@ def speed_test(): backward_ms = (t1 - t0) / 100 * 1000 print(f"Backward pass on average takes {backward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 1.593ms.") + print("Reference computation time on RTX 4090: 1.274ms.") print("--------------------------------------------------------------") diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index 4d3de3ce..d258e6d8 
100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -275,10 +275,120 @@ def simple_model_test(): lls.mean().backward() - # node_flows = pc.node_flows.cpu() + node_flows = pc.node_flows.cpu() + param_flows = pc.param_flows.cpu() + + sid, eid = ns._output_ind_range + assert torch.all(torch.abs(node_flows[sid:eid,:] - 1.0) < 1e-4) + + pc.inner_layer_groups[2][0].forward(pc.node_mars, pc.element_mars) + pc.inner_layer_groups[3][0].backward(pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, pc.params, pc.param_flows) + element_flows = pc.element_flows.cpu() + + ch_lls = torch.cat((np4_lls, np5_lls, np6_lls), dim = 0) + epars = ns._params.reshape(1, 6, 1, 16).permute(0, 2, 1, 3).reshape(96, 1) + eflows = epars * (ch_lls - ns_lls).exp() + + sid, eid = np4._output_ind_range + np4_flows = eflows[0:32,:] + assert torch.all(torch.abs(np4_flows - element_flows[sid:eid,:]) < 1e-4) + + sid, eid = np5._output_ind_range + np5_flows = eflows[32:64,:] + assert torch.all(torch.abs(np5_flows - element_flows[sid:eid,:]) < 1e-4) + + sid, eid = np6._output_ind_range + np6_flows = eflows[64:96,:] + assert torch.all(torch.abs(np6_flows - element_flows[sid:eid,:]) < 1e-4) + + ns_parflows = eflows.sum(dim = 1) + ref_parflows = param_flows[4096:4192] + assert torch.all(torch.abs(ns_parflows - ref_parflows) < 1e-3) + + sid, eid = ns0._output_ind_range + ns0_flows = np4_flows + assert torch.all(torch.abs(ns0_flows - node_flows[sid:eid,:]) < 1e-4) + + sid, eid = ns1._output_ind_range + ns1_flows = np5_flows + assert torch.all(torch.abs(ns1_flows - node_flows[sid:eid,:]) < 1e-4) + + sid, eid = ns2._output_ind_range + ns2_flows = np6_flows + assert torch.all(torch.abs(ns2_flows - node_flows[sid:eid,:]) < 1e-4) + + pc.inner_layer_groups[0][0].forward(pc.node_mars, pc.element_mars) + pc.inner_layer_groups[1][0].backward(pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, pc.params, pc.param_flows) + element_flows = pc.element_flows.cpu() + + ch_lls = torch.cat((np0_lls, np3_lls), dim = 0) + epars = ns0._params.reshape(2, 4, 16, 16).permute(0, 2, 1, 3).reshape(32, 64) + ch_lls_max = ch_lls.max(dim = 0).values + nflow_div_mar = ns0_flows * (ch_lls_max[None,:] - ns0_lls).exp() + emars = (ch_lls - ch_lls_max[None,:]).exp() + eflows = emars * torch.matmul(epars.permute(1, 0), nflow_div_mar) + + sid, eid = np0._output_ind_range + np0_flows = eflows[0:32,:] + assert torch.all(torch.abs(np0_flows - element_flows[sid:eid,:]) < 1e-4) + + sid, eid = np3._output_ind_range + np3_flows = eflows[32:64,:] + assert torch.all(torch.abs(np3_flows - element_flows[sid:eid,:]) < 1e-4) + + ns0_parflows = epars * torch.matmul(nflow_div_mar, emars.permute(1, 0)) + ref_parflows = param_flows[0:2048].reshape(2, 64, 16).permute(0, 2, 1).reshape(32, 64) + assert torch.all(torch.abs(ns0_parflows - ref_parflows) < 1e-4) + + ch_lls = np1_lls + epars = ns1._params.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + ch_lls_max = ch_lls.max(dim = 0).values + nflow_div_mar = ns1_flows * (ch_lls_max[None,:] - ns1_lls).exp() + emars = (ch_lls - ch_lls_max[None,:]).exp() + eflows = emars * torch.matmul(epars.permute(1, 0), nflow_div_mar) + + sid, eid = np1._output_ind_range + np1_flows = eflows + assert torch.all(torch.abs(np1_flows - element_flows[sid:eid,:]) < 1e-4) + + ns1_parflows = epars * torch.matmul(nflow_div_mar, emars.permute(1, 0)) + ref_parflows = param_flows[2048:3072].reshape(2, 32, 16).permute(0, 2, 1).reshape(32, 32) + assert torch.all(torch.abs(ns1_parflows - ref_parflows) < 
1e-4) + + ch_lls = np2_lls + epars = ns2._params.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + ch_lls_max = ch_lls.max(dim = 0).values + nflow_div_mar = ns2_flows * (ch_lls_max[None,:] - ns2_lls).exp() + emars = (ch_lls - ch_lls_max[None,:]).exp() + eflows = emars * torch.matmul(epars.permute(1, 0), nflow_div_mar) + + sid, eid = np2._output_ind_range + np2_flows = eflows + assert torch.all(torch.abs(np2_flows - element_flows[sid:eid,:]) < 1e-4) + + ns2_parflows = epars * torch.matmul(nflow_div_mar, emars.permute(1, 0)) + ref_parflows = param_flows[3072:4096].reshape(2, 32, 16).permute(0, 2, 1).reshape(32, 32) + assert torch.all(torch.abs(ns2_parflows - ref_parflows) < 3e-4) + + sid, eid = ni0._output_ind_range + ni0_flows = np0_flows + np3_flows + np5_flows + np6_flows + assert torch.all(torch.abs(ni0_flows - node_flows[sid:eid,:]) < 2e-4) + + sid, eid = ni1._output_ind_range + ni1_flows = np0_flows + np2_flows + np3_flows + np5_flows + assert torch.all(torch.abs(ni1_flows - node_flows[sid:eid,:]) < 2e-4) + + sid, eid = ni2._output_ind_range + ni2_flows = np1_flows + np2_flows + np4_flows + assert torch.all(torch.abs(ni2_flows - node_flows[sid:eid,:]) < 2e-4) + + sid, eid = ni3._output_ind_range + ni3_flows = np1_flows + np4_flows + np6_flows + assert torch.all(torch.abs(ni3_flows - node_flows[sid:eid,:]) < 2e-4) import pdb; pdb.set_trace() if __name__ == "__main__": + torch.manual_seed(23892) simple_model_test() From 68f30d367ebcc416a91554b6e7c107e1ba38ac30 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 07:00:53 +0800 Subject: [PATCH 064/162] runtests for input layers --- tests/model/simple_model_test.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index d258e6d8..84d8d1fa 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -386,6 +386,34 @@ def simple_model_test(): ni3_flows = np1_flows + np4_flows + np6_flows assert torch.all(torch.abs(ni3_flows - node_flows[sid:eid,:]) < 2e-4) + input_layer = pc.input_layer_group[0] + input_pflows = input_layer.param_flows.cpu() + data = data.cpu() + + ni0_pflows = input_pflows[0:128].reshape(32, 4) + ref_pflows = torch.zeros_like(ni0_pflows) + for b in range(512): + ref_pflows[:,data[b,0]] += ni0_flows[:,b] + assert torch.all(torch.abs(ni0_pflows - ref_pflows) < 4e-3) + + ni1_pflows = input_pflows[128:256].reshape(32, 4) + ref_pflows = torch.zeros_like(ni1_pflows) + for b in range(512): + ref_pflows[:,data[b,1]] += ni1_flows[:,b] + assert torch.all(torch.abs(ni1_pflows - ref_pflows) < 4e-3) + + ni2_pflows = input_pflows[256:448].reshape(32, 6) + ref_pflows = torch.zeros_like(ni2_pflows) + for b in range(512): + ref_pflows[:,data[b,2]] += ni2_flows[:,b] + assert torch.all(torch.abs(ni2_pflows - ref_pflows) < 4e-3) + + ni3_pflows = input_pflows[448:640].reshape(32, 6) + ref_pflows = torch.zeros_like(ni3_pflows) + for b in range(512): + ref_pflows[:,data[b,3]] += ni3_flows[:,b] + assert torch.all(torch.abs(ni3_pflows - ref_pflows) < 4e-3) + import pdb; pdb.set_trace() From 12a93127a0ccc2197845733ca3d5395255a68d17 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 17:14:48 +0800 Subject: [PATCH 065/162] update parameters and parflows --- src/pyjuice/layer/compilation.py | 45 ++++++++++++++++++++++++++---- src/pyjuice/layer/layer.py | 3 -- src/pyjuice/model/tensorcircuit.py | 31 +++++++++----------- src/pyjuice/nodes/sum_nodes.py | 45 ++++++++++++++++++++++++++++++ 4 files 
changed, 98 insertions(+), 26 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 2a714d20..5a5e528b 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -186,8 +186,9 @@ def _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids): def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, target_cids_ptr, pcids_partition_start_ptr, target_pids_ptr, target_pfids_ptr, edge_ids_ptr, chs_offsets_ptr, n_partition_ids_ptr, n_id_in_partition_ptr, cs_ele_id_start_ptr, cs_node_cum_ids_ptr, fw_partition_max_chs_ptr, - cum_n_chs_ptr, ns_param_ids_ptr, constexprs_ptr, num_chs: tl.constexpr, - num_chs_np2: tl.constexpr, add_params_flag: tl.constexpr, BLOCK_SIZE: tl.constexpr): + cum_n_chs_ptr, ns_param_ids_ptr, ns_param_flow_ids_ptr, constexprs_ptr, num_chs: tl.constexpr, + num_chs_np2: tl.constexpr, add_params_flag: tl.constexpr, add_param_flows_flag: tl.constexpr, + BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) block_start = pid * BLOCK_SIZE @@ -252,6 +253,10 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ if add_params_flag: tl.store(ns_param_ids_ptr + offsets, global_pid, mask = mask) + # Global parameter flow indices for all edges + if add_param_flows_flag: + tl.store(ns_param_flow_ids_ptr + offsets, global_pfid, mask = mask) + @torch.no_grad() def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, n_chs, @@ -300,6 +305,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, ngid_in_partition = torch.zeros([len(num_ngs_in_partition)], dtype = torch.long) all_ns_param_ids = dict() + all_ns_param_flow_ids = dict() original_param_nids = [] # `ns` with their original parameters (i.e., not tied) # This is the main loop: iterate over `ns` in the layer @@ -317,10 +323,12 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, global_pfid_start = global_pfid_end add_params_flag = True + add_param_flows_flag = True else: assert ns.provided("_param_flow_range") add_params_flag = False + add_param_flows_flag = False original_param_nids.append(ns_idx) @@ -358,11 +366,15 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, node2tiednodes[source_ns][0].append(ns) node2tiednodes[source_ns][1] = 1 + + add_param_flows_flag = True else: ns._param_flow_range = deepcopy(node2tiednodes[source_ns][2]) node2tiednodes[source_ns][1] += 1 + add_param_flows_flag = False + # Global pid and pfid start index for `ns` ns_pid_start = source_ns._param_range[0] ns_pfid_start = ns._param_flow_range[0] @@ -411,6 +423,11 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, else: ns_param_ids = None + if add_param_flows_flag: + ns_param_flow_ids = torch.zeros([ns_num_edges], dtype = torch.long).to(device) + else: + ns_param_flow_ids = None + # The following kernel assigns the corresponding indices to `nids`, `cids`, and `pids` # We first move necessary buffers to GPU nids_partition_start = nids_partition_start.to(device) @@ -435,8 +452,8 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, target_nids, nids_partition_start, target_cids, pcids_partition_start, target_pids, target_pfids, edge_ids, chs_offsets, n_partition_ids, n_id_in_partition, cs_ele_id_start, cs_node_cum_ids, fw_partition_max_chs, - cum_n_chs, ns_param_ids, constexprs, ns.num_chs, num_chs_np2, - add_params_flag, 
BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) + cum_n_chs, ns_param_ids, ns_param_flow_ids, constexprs, ns.num_chs, num_chs_np2, + add_params_flag, add_param_flows_flag, BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) ) ngroup_start += ns_num_ngroups @@ -457,6 +474,11 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, else: ns_param_ids = None + if add_param_flows_flag: + ns_param_flow_ids = torch.zeros([ns_num_edges], dtype = torch.long).to(device) + else: + ns_param_flow_ids = None + # Iterate over node groups cum_n_chs = 0 for ng_id in range(ns_num_ngroups): @@ -490,15 +512,28 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, if add_params_flag: ns_param_ids[criterion] = global_pids + if add_param_flows_flag: + ns_param_flow_ids[criterion] = global_pfids + ngid_in_partition[partition_id] = local_id + 1 if add_params_flag: all_ns_param_ids[ns_idx] = ns_param_ids + if add_param_flows_flag: + all_ns_param_flow_ids[ns_idx] = ns_param_flow_ids + # Store global -> local parameter id mapping in `ns` for ns_idx, param_ids in all_ns_param_ids.items(): ns = nodes[ns_idx] - ns._param_ids = param_ids.cpu()[0::ns.ch_group_size] # Every edge specify the start id of [ch_group_size, group_size] parameters + # Every edge specify the start id of [ch_group_size, group_size] parameters + ns._param_ids = param_ids.cpu()[0::ns.ch_group_size] + + # Store global -> local parameter flow id mapping in `ns` + for ns_idx, param_flow_ids in all_ns_param_flow_ids.items(): + ns = nodes[ns_idx] + # Every edge specify the start id of [ch_group_size, group_size] parameter flows + ns._param_flow_ids = param_flow_ids.cpu()[0::ns.ch_group_size] # Store local -> global parameter id mapping in `ns` for ns_idx in original_param_nids: diff --git a/src/pyjuice/layer/layer.py b/src/pyjuice/layer/layer.py index 5d57f6d4..bbf0ea25 100644 --- a/src/pyjuice/layer/layer.py +++ b/src/pyjuice/layer/layer.py @@ -16,9 +16,6 @@ def __init__(self, nodes: Sequence[CircuitNodes]) -> None: self.device = torch.device("cpu") - def init_layer(self, params: Union[torch.Tensor,None]): - raise NotImplementedError() - def enable_partial_evaluation(self, fw_scopes: Optional[Sequence[BitSet]] = None, bk_scopes: Optional[Sequence[BitSet]] = None): if not self.provided("fw_scope2localids") or not self.provided("bk_scope2localids"): raise ValueError("Please initialize node cache by calling `pc._create_scope2nid_cache()` first.") diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index 9924d7c2..36db210d 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -309,34 +309,31 @@ def to(self, device): return self - def update_parameters(self, clone_params: bool = True, update_flows: bool = False): + def update_parameters(self, clone: bool = True): """ Copy parameters from this `TensorCircuit` to the original `CircuitNodes` """ - raise NotImplementedError() params = self.params.detach().cpu() - if update_flows: - param_flows = self.param_flows.detach().cpu() for ns in self.root_nodes: if ns.is_sum() and not ns.is_tied(): - psid, peid = ns._param_range - if clone_params: - ns._params = params[ns._param_ids].clone() - else: - ns._params = params[ns._param_ids] - - if update_flows: - if clone_params: - ns._flows = param_flows[ns._param_ids].clone() - else: - ns._flows = param_flows[ns._param_ids] + ns.update_parameters(params, clone = clone) for layer in self.input_layer_group: layer.update_parameters() return None + def 
update_param_flows(self, clone: bool = True, origin_ns_only: bool = True): + """ + Copy parameter flows from this `TensorCircuit` to the original `CircuitNodes` + """ + param_flows = self.param_flows.detach().cpu() + + for ns in self.root_nodes: + if ns.is_sum() and not ns.is_tied(): + ns.update_param_flows(params, clone = clone, origin_ns_only = origin_ns_only) + def print_statistics(self): """ Print the statistics of the PC. @@ -604,9 +601,7 @@ def _init_parameters(self, perturbation: float = 4.0, pseudocount: float = 0.0): # Copy initial parameters if provided for ns in self.root_ns: if ns.is_sum() and not ns.is_tied() and ns.has_params(): - sidx, eidx = ns._param_range - ns_params = ns._params[ns._inverse_param_ids,:,:].permute(0, 2, 1).reshape(-1) - params[sidx:eidx] = ns_params.to(params.device) + ns.gather_parameters(params) self._normalize_parameters(params, pseudocount = pseudocount) self.params = nn.Parameter(params) diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index 6d743b3d..591997ec 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -128,6 +128,51 @@ def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_ **kwargs ) + def update_parameters(self, params: torch.Tensor, clone: bool = True): + assert self.provided("_param_range"), "The `SumNodes` has not been compiled into a `TensorCircuit`." + + if self.is_tied(): + # Do not update parameters for tied nodes + return None + + psid, peid = self._param_range + if clone: + ns_params = params[psid:peid].cpu().clone() + else: + ns_params = params[psid:peid].cpu() + + local_parids = (self._param_ids - psid) // (self.group_size * self.ch_group_size) + num_pargroups = local_parids.size() + ns_params = ns_params.reshape(num_pargroups, self.ch_group_size, self.group_size) + ns._params = ns_params[local_parids,:,:].permute(0, 2, 1) + + def update_param_flows(self, param_flows: torch.Tensor, origin_ns_only: bool = True, clone: bool = True): + assert self.provided("_param_flow_range"), "The `SumNodes` has not been compiled into a `TensorCircuit`." + + if origin_ns_only and self.is_tied(): + return None + + pfsid, pfeid = self._param_flow_range + if clone: + ns_param_flows = param_flows[pfsid:pfeid].cpu().clone() + else: + ns_param_flows = param_flows[pfsid:pfeid].cpu() + + local_parfids = (self._param_flow_ids - pfsid) // (self.group_size * self.ch_group_size) + num_parfgroups = local_parfids.size() + ns_param_flows = ns_param_flows.reshape(num_parfgroups, self.ch_group_size, self.group_size) + ns._param_flows = ns_param_flows[local_parfids,:,:].permute(0, 2, 1) + + def gather_parameters(self, params: torch.Tensor): + assert self.provided("_param_range"), "The `SumNodes` has not been compiled into a `TensorCircuit`." 
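# A minimal usage sketch (not part of this patch) of the parameter read-back API this
# commit introduces; `root_ns` and `data` are placeholders for a compiled PC root node
# and an integer data batch, and the EM hyperparameters are illustrative only.
pc = TensorCircuit(root_ns)
pc.to(torch.device("cuda:0"))
lls = pc(data)                                    # forward pass: per-sample log-likelihoods
lls.mean().backward()                             # backward pass accumulates parameter flows
pc.mini_batch_em(step_size = 0.1, pseudocount = 0.01)
pc.update_parameters(clone = True)                # copy learned params back into each SumNodes
pc.update_param_flows(clone = True)               # optionally copy the accumulated flows back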
+ + if self.is_tied() or not self.has_params(): + return None + + psid, peid = self._param_range + ns_params = self._params[self._inverse_param_ids,:,:].permute(0, 2, 1).reshape(-1) + params[psid:peid] = ns_params.to(params.device) + def _get_edges_as_mask(self): mask = torch.zeros([self.num_node_groups, self.num_ch_nodes], dtype = torch.bool) mask[self.edge_ids[0,:], self.edge_ids[1,:]] = True From 4e4a85fc9e97a48aad537facf8ffbcbd8eb252ed Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 17:57:55 +0800 Subject: [PATCH 066/162] minibatch EM --- src/pyjuice/model/backend/par_update.py | 7 ++- src/pyjuice/model/tensorcircuit.py | 74 ++++++++++--------------- 2 files changed, 33 insertions(+), 48 deletions(-) diff --git a/src/pyjuice/model/backend/par_update.py b/src/pyjuice/model/backend/par_update.py index 34918dfa..d8ed343b 100644 --- a/src/pyjuice/model/backend/par_update.py +++ b/src/pyjuice/model/backend/par_update.py @@ -6,6 +6,7 @@ import triton import triton.language as tl from numba import njit +from typing import Sequence from pyjuice.nodes import CircuitNodes @@ -211,8 +212,10 @@ def par_update_kernel(params, param_flows, cum_pflows, nchs, par_start_ids, pflo tl.store(params + offs_par, updated_param, mask = mask_pflow) -def em_par_update(params, param_flows, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, - global_nids, nchs, metadata, step_size: float, pseudocount: float = 0.0, cum_pflows = None): +def em_par_update(params: torch.Tensor, param_flows: torch.Tensor, parflow_fusing_kwargs: Sequence, + step_size: float, pseudocount: float = 0.0): + + par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, cum_pflows, metadata = parflow_fusing_kwargs tot_num_nodes = metadata["tot_num_nodes"] BLOCK_SIZE = metadata["BLOCK_SIZE"] diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index 36db210d..be9421c0 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -11,7 +11,7 @@ from pyjuice.nodes import CircuitNodes, InputNodes, ProdNodes, SumNodes, foreach from pyjuice.layer import Layer, InputLayer, ProdLayer, SumLayer, LayerGroup -from pyjuice.utils.grad_fns import ReverseGrad, PseudoHookFunc +from pyjuice.utils.grad_fns import ReverseGrad from pyjuice.utils import BitSet from .backend import compile_cum_par_flows_fn, compute_cum_par_flows, cum_par_flows_to_device, \ @@ -55,7 +55,12 @@ def __init__(self, root_ns: CircuitNodes, layer_sparsity_tol: float = 0.5, self.root_ns = root_ns self.device = torch.device("cpu") - self._init_pass_tensors() + self.node_mars = None + self.element_mars = None + self.node_flows = None + self.element_flows = None + self.param_flows = None + self._init_layers( layer_sparsity_tol = layer_sparsity_tol, max_num_partitions = max_num_partitions, @@ -71,12 +76,20 @@ def __init__(self, root_ns: CircuitNodes, layer_sparsity_tol: float = 0.5, "flows_memory": 1.0 } - def _init_pass_tensors(self): - self.node_mars = None - self.element_mars = None - self.node_flows = None - self.element_flows = None - self.param_flows = None + def to(self, device): + super(TensorCircuit, self).to(device) + + self.input_layer_group.to(device) + + self.device = device + + # For parameter flow accumulation + self.parflow_fusing_kwargs = cum_par_flows_to_device(self.parflow_fusing_kwargs, device) + + # For parameter update + self.par_update_kwargs = par_update_to_device(self.par_update_kwargs, device) + + return self def forward(self, inputs: torch.Tensor, input_layer_fn: 
Optional[Union[str,Callable]] = None, cache: Optional[dict] = None, return_cache: bool = False, **kwargs): @@ -255,19 +268,15 @@ def backward(self, inputs: Optional[torch.Tensor] = None, return None def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): + # Update input layers for layer in self.input_layer_group: layer.mini_batch_em(step_size = step_size, pseudocount = pseudocount) - - # Only apply parameter update if external parameters are not used in the previous forward/backward pass - if not self._used_external_sum_params: - # Normalize and update parameters - with torch.no_grad(): - flows = self.param_flows - if flows is None: - return None - self._normalize_parameters(flows, pseudocount = pseudocount) - self.params.data = (1.0 - step_size) * self.params.data + step_size * flows - self.params[0] = 1.0 + + # Accumulate parameter flows of tied nodes + compute_cum_par_flows(self.parflow_fusing_kwargs) + + # Normalize and update parameters + em_par_update(self.param, self.param_flows, self.par_update_kwargs, step_size = step_size, pseudocount = pseudocount) def cumulate_flows(self, inputs: torch.Tensor, params: Optional[torch.Tensor] = None): with torch.no_grad(): @@ -294,21 +303,6 @@ def init_param_flows(self, flows_memory: float = 1.0, batch_size: Optional[int] return None - def to(self, device): - super(TensorCircuit, self).to(device) - - self.input_layer_group.to(device) - - self.device = device - - # For parameter flow accumulation - self.parflow_fusing_kwargs = cum_par_flows_to_device(self.parflow_fusing_kwargs, device) - - # For parameter update - self.par_update_kwargs = par_update_to_device(self.par_update_kwargs, device) - - return self - def update_parameters(self, clone: bool = True): """ Copy parameters from this `TensorCircuit` to the original `CircuitNodes` @@ -342,18 +336,6 @@ def print_statistics(self): print(f"> Number of edges: {self.num_edges}") print(f"> Number of sum parameters: {self.num_sum_params}") - def copy_param_flows(self, clone_param_flows: bool = True, target_name: str = "_scores"): - raise NotImplementedError("To be updated") - param_flows = self.param_flows.detach().cpu() - - for ns in self.root_nodes: - if clone_params: - setattr(ns, target_name, param_flows[ns._param_ids].clone()) - else: - setattr(ns, target_name, param_flows[ns._param_ids]) - - return None - def enable_partial_evaluation(self, scopes: Union[Sequence[BitSet],Sequence[int]], forward: bool = False, backward: bool = False): raise NotImplementedError("To be updated") From 2307df5e4e5f0c7ac8fca87de8fc7079f2400a69 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 20:20:40 +0800 Subject: [PATCH 067/162] simulate tl.dot when necessary --- src/pyjuice/layer/sum_layer.py | 123 +++++++++++++++++++++++------- tests/layer/matmul_kernel_test.py | 28 ++++--- tests/model/numba_test.py | 21 ----- 3 files changed, 113 insertions(+), 59 deletions(-) delete mode 100644 tests/model/numba_test.py diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 32eab00f..67014be1 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -423,12 +423,22 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s emars_max = tl.max(emars, axis = 0)[None,:] emars = tl.exp(emars - emars_max) - if use_fp16 == 1: - epars = epars.to(tl.float16) * (2**12) - emars = emars.to(tl.float16) - nmars = tl.dot(epars, emars).to(tl.float32) / (2**12) + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: + # We can use the 
built-in matmul kernel of triton + if use_fp16 == 1: + epars = (epars * (2**12)).to(tl.float16) + emars = emars.to(tl.float16) + nmars = tl.dot(epars, emars).to(tl.float32) / (2**12) + else: + nmars = tl.dot(epars, emars) else: - nmars = tl.dot(epars, emars) + # We have to simulate matmul + if use_fp16 == 1: + epars = (epars * (2**12)).to(tl.float16) + emars = emars.to(tl.float16) + nmars = tl.sum(epars[:,:,None] * emars[None,:,:], axis = 1).to(tl.float32) / (2**12) + else: + nmars = tl.sum(epars[:,:,None] * emars[None,:,:], axis = 1) acc = tl.where(emars_max > acc, tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, @@ -453,7 +463,8 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1, use_fp16: bool = True) -> None: + partition_id: int = -1, force_use_fp16: bool = False, + force_use_fp32: bool = False) -> None: """ Forward pass of sum layers with the block-sparse processing kernel. @@ -510,6 +521,17 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten else: cids_start, cids_increment, pids_start, pids_increment = self._cached_fw_pcids[signature] + if force_use_fp16: + assert not force_use_fp32 + use_fp16 = True + elif force_use_fp32: + use_fp16 = False + else: + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: + use_fp16 = True + else: + use_fp16 = False + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) self._fw_triton_block_sparse_kernel[grid]( @@ -774,7 +796,7 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele chids, parids_start, parids_increment, parpids_start, parpids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): + GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr, use_fp16: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -828,13 +850,24 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - # Set a hard upper bound of 1e20 to avoid overflow - # However, this should not happen unless we have extremely small parameters - nflows_div_mars = nflows * tl.minimum(tl.exp(emars_max[None,:] - nmars), 1.0e20) + nflows_div_mars = nflows * tl.exp(emars_max[None,:] - nmars) - epars = epars.to(tl.bfloat16) - nflows_div_mars = nflows_div_mars.to(tl.bfloat16) - eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: + # We can use the built-in matmul kernel of triton + if use_fp16 == 1: + epars = epars.to(tl.float16) + nflows_div_mars = nflows_div_mars.to(tl.float16) + eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) + else: + eflows = tl.dot(epars, nflows_div_mars) + else: + # We have to simulate matmul + if use_fp16 == 1: + epars = epars.to(tl.float16) + nflows_div_mars = nflows_div_mars.to(tl.float16) + eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], 
axis = 1).to(tl.float32) + else: + eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1) acc += eflows @@ -861,7 +894,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, cs_group_size: int, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1) -> None: + partition_id: int = -1, force_use_fp16: bool = False, force_use_fp32: bool = False) -> None: assert params.dim() == 1, "Expecting a 1D `params`." @@ -922,6 +955,17 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo else: parids_start, parids_increment, parpids_start, parpids_increment, ptr_inc_step = self._cached_bk_parids[signature] + if force_use_fp16: + assert not force_use_fp32 + use_fp16 = True + elif force_use_fp32: + use_fp16 = False + else: + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: + use_fp16 = True + else: + use_fp16 = False + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) self._bk_triton_block_sparse_ele_kernel[grid]( @@ -944,7 +988,8 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo K_NUM_TILES = K_NUM_TILES, TILE_SIZE_M = TILE_SIZE_M, GROUP_SIZE_M = cs_group_size, - GROUP_SIZE_K = self.group_size + GROUP_SIZE_K = self.group_size, + use_fp16 = use_fp16 ) return None @@ -954,7 +999,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, batch_size: tl.constexpr, num_edges: tl.constexpr, TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, use_fp16: tl.constexpr): pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -971,8 +1016,8 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para offs_edge = tl.arange(0, TILE_SIZE_K) + pid_k * TILE_SIZE_K edge_start = tl.load(cids + ngroup_id * num_edges + offs_edge) emars_ptr = element_mars + \ - edge_start[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, TILE_SIZE_B] + edge_start[None,:] * batch_size + \ + offs_batch[:,None] # [TILE_SIZE_B, TILE_SIZE_K] # Initialize pointers to `node_flows` and `node_mars` offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M @@ -984,18 +1029,31 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) for b in range(0, B_NUM_TILES): - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, TILE_SIZE_B] + emars = tl.load(emars_ptr, mask = mask_batch[:,None]) # [TILE_SIZE_B, TILE_SIZE_K] nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] - emars_max = tl.max(emars, axis = 0) + emars_max = tl.max(emars, axis = 1) nflows_div_mars = nflows * tl.exp(emars_max[None,:] - nmars) - nflows_div_mars = nflows_div_mars.to(tl.float16) - emars = tl.exp(emars - emars_max[None,:]) - emars = emars.to(tl.float16) + emars = tl.exp(emars - emars_max[:,None]) - pflows = tl.dot(nflows_div_mars, tl.trans(emars)).to(tl.float32) + if 
TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and TILE_SIZE_B >= 16: + # We can use the built-in matmul kernel of triton + if use_fp16 == 1: + nflows_div_mars = nflows_div_mars.to(tl.float16) + emars = emars.to(tl.float16) + pflows = tl.dot(nflows_div_mars, emars).to(tl.float32) + else: + pflows = tl.dot(nflows_div_mars, emars).to(tl.float32) + else: + # We have to simulate matmul + if use_fp16 == 1: + nflows_div_mars = nflows_div_mars.to(tl.float16) + emars = emars.to(tl.float16) + pflows = tl.sum(nflows_div_mars[:,:,None] * emars[None,:,:], axis = 1).to(tl.float32) + else: + pflows = tl.sum(nflows_div_mars[:,:,None] * emars[None,:,:], axis = 1) acc += pflows @@ -1021,7 +1079,8 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, - cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor) -> None: + cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor, + force_use_fp16: bool = False, force_use_fp32: bool = False) -> None: """ Backward pass of sum layers w.r.t. sum parameters with the block-sparse processing kernel. @@ -1059,6 +1118,17 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor TILE_SIZE_K = min(2048 // TILE_SIZE_B, num_edges) B_NUM_TILES = batch_size // TILE_SIZE_B + if force_use_fp16: + assert not force_use_fp32 + use_fp16 = True + elif force_use_fp32: + use_fp16 = False + else: + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and TILE_SIZE_B >= 16: + use_fp16 = True + else: + use_fp16 = False + grid = (triton.cdiv(num_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) self._bk_triton_block_sparse_par_kernel[grid]( @@ -1077,7 +1147,8 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor B_NUM_TILES = B_NUM_TILES, TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = self.group_size + GROUP_SIZE_M = self.group_size, + use_fp16 = use_fp16 ) def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, diff --git a/tests/layer/matmul_kernel_test.py b/tests/layer/matmul_kernel_test.py index f169e1f4..f5c628d9 100644 --- a/tests/layer/matmul_kernel_test.py +++ b/tests/layer/matmul_kernel_test.py @@ -23,12 +23,12 @@ def kernel1(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): pid = tl.program_id(axis = 0) offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a) # .to(tl.bfloat16) + aa = tl.load(a + offs_a).to(tl.float16) offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b) # .to(tl.bfloat16) + bb = tl.load(b + offs_b).to(tl.float16) - cc = tl.dot(aa, bb, allow_tf32 = True) # .to(tl.float32) + cc = tl.dot(aa, bb).to(tl.float32) offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] tl.store(c + offs_c, cc) @@ -39,12 +39,12 @@ def kernel2(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): pid = tl.program_id(axis = 0) offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a) # .to(tl.bfloat16) + aa = tl.load(a + offs_a)#.to(tl.float16) offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b) # .to(tl.bfloat16) + bb = tl.load(b + offs_b)#.to(tl.float16) - cc = tl.sum(aa[:,:,None] * bb[None,:,:], axis = 1) # .to(tl.float32) + cc = tl.sum(aa[:,:,None] * bb[None,:,:], axis = 1)#.to(tl.float32) # cc = 
tl.dot(aa, bb) @@ -75,15 +75,15 @@ def kernel3(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): M = 16 N = 16 - K = 16 + K = 8 a = torch.rand([M, N]).cuda() b = torch.rand([N, K]).cuda() - c = torch.rand([M, K]).cuda() + c = torch.zeros([M, K]).cuda() - grid = (1000,) + grid = (1,) - kernel1[grid](a, b, c, M, N, K) + # kernel1[grid](a, b, c, M, N, K) # torch.cuda.synchronize() # t0 = time.time() @@ -105,6 +105,10 @@ def kernel3(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): # print((t1 - t0) / 100 * 1000) - # cc = torch.matmul(a, b) + cc = torch.matmul(a, b) - # print((c - cc).abs().max()) \ No newline at end of file + print((c - cc).abs().max()) + + ccc = c + + import pdb; pdb.set_trace() \ No newline at end of file diff --git a/tests/model/numba_test.py b/tests/model/numba_test.py deleted file mode 100644 index 005140de..00000000 --- a/tests/model/numba_test.py +++ /dev/null @@ -1,21 +0,0 @@ -import numpy as np -from numba import njit, prange - - -@njit(parallel = True) -def ff(a, b): - for i in prange(10000000000): - a[i%1000000000] = b[i%1000000000] - - -if __name__ == "__main__": - a = np.random.uniform(size = [1000000000]) - b = np.random.uniform(size = [1000000000]) - - ff(a, b) - - import time - t0 = time.time() - ff(a, b) - t1 = time.time() - print(t1 - t0) \ No newline at end of file From 7c8d4c646561304244a0540bc5808312ea221410 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 21:58:34 +0800 Subject: [PATCH 068/162] fix init_buffer set value bug --- src/pyjuice/model/tensorcircuit.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index be9421c0..b8b9931e 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -309,7 +309,7 @@ def update_parameters(self, clone: bool = True): """ params = self.params.detach().cpu() - for ns in self.root_nodes: + for ns in self.root_ns: if ns.is_sum() and not ns.is_tied(): ns.update_parameters(params, clone = clone) @@ -324,9 +324,9 @@ def update_param_flows(self, clone: bool = True, origin_ns_only: bool = True): """ param_flows = self.param_flows.detach().cpu() - for ns in self.root_nodes: + for ns in self.root_ns: if ns.is_sum() and not ns.is_tied(): - ns.update_param_flows(params, clone = clone, origin_ns_only = origin_ns_only) + ns.update_param_flows(param_flows, clone = clone, origin_ns_only = origin_ns_only) def print_statistics(self): """ @@ -401,8 +401,19 @@ def _init_buffer(self, name: str, shape: Tuple, set_value: Optional[float] = Non if flag: self.__dict__[name] = torch.zeros(shape, device = self.device) - if set_value: - self.__dict__[name][:] = set_value + if set_value is not None: + if len(shape) == 1: + self.__dict__[name][:] = set_value + elif len(shape) == 2: + self.__dict__[name][:,:] = set_value + elif len(shape) == 3: + self.__dict__[name][:,:,:] = set_value + elif len(shape) == 4: + self.__dict__[name][:,:,:,:] = set_value + elif len(shape) == 5: + self.__dict__[name][:,:,:,:,:] = set_value + else: + raise ValueError(f"Too many dimensions ({len(shape)}).") def _buffer_matches(self, name: str, cache: Optional[dict], check_device: bool = True): if cache is None: From 1e068baa5a34418516a08125faf5ab41f9c0d898 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 21:58:48 +0800 Subject: [PATCH 069/162] update kernel selection criterion --- src/pyjuice/layer/sum_layer.py | 169 +++++++++++++++++++++------------ 
src/pyjuice/nodes/sum_nodes.py | 8 +- 2 files changed, 113 insertions(+), 64 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 67014be1..5b56f659 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -339,11 +339,14 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, elif params.dim() == 1 and self.group_size >= 16 and num_edges >= 16 and batch_size >= 16: # In this case, we should definitely use the block-sparse implementation mode = "block_sparse" - elif self.group_size * num_edges < 16 and num_edges * batch_size < 16: + elif self.group_size == 1 and num_edges < 16384: # In this case, we should definitely use the sparse implementation mode = "sparse" - else: + elif num_edges < 4: + # In this case, the block-sparse kernel will have compilation issues mode = "sparse" + else: + mode = "block_sparse" if mode == "block_sparse": self._forward_block_sparse( @@ -370,7 +373,7 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, use_fp16: tl.constexpr): + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, OP_MODE: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -423,22 +426,22 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s emars_max = tl.max(emars, axis = 0)[None,:] emars = tl.exp(emars - emars_max) - if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: - # We can use the built-in matmul kernel of triton - if use_fp16 == 1: - epars = (epars * (2**12)).to(tl.float16) - emars = emars.to(tl.float16) - nmars = tl.dot(epars, emars).to(tl.float32) / (2**12) - else: - nmars = tl.dot(epars, emars) - else: - # We have to simulate matmul - if use_fp16 == 1: - epars = (epars * (2**12)).to(tl.float16) - emars = emars.to(tl.float16) - nmars = tl.sum(epars[:,:,None] * emars[None,:,:], axis = 1).to(tl.float32) / (2**12) - else: - nmars = tl.sum(epars[:,:,None] * emars[None,:,:], axis = 1) + if OP_MODE == 0: + # Built-in matmul kernel of triton + float16 + epars = (epars * (2**12)).to(tl.float16) + emars = emars.to(tl.float16) + nmars = tl.dot(epars, emars).to(tl.float32) / (2**12) + if OP_MODE == 1: + # Built-in matmul kernel of triton + float32 + nmars = tl.dot(epars, emars) + if OP_MODE == 2: + # Simulated matmul kernel + float16 + epars = (epars * (2**12)).to(tl.float16) + emars = emars.to(tl.float16) + nmars = tl.sum(epars[:,:,None] * emars[None,:,:], axis = 1).to(tl.float32) / (2**12) + if OP_MODE == 3: + # Simulated matmul kernel + float32 + nmars = tl.sum(epars[:,:,None] * emars[None,:,:], axis = 1) acc = tl.where(emars_max > acc, tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, @@ -499,6 +502,10 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) K_NUM_TILES = num_edges // TILE_SIZE_K + assert TILE_SIZE_K >= 4, f"`TILE_SIZE_K` should be greater than 4 (but got {TILE_SIZE_K}) in order to use the block-sparse kernel. " \ + "This is an internal error of PyJuice. 
Please consider checking the kernel dispatching criterions and use the " \ + "corresponding sparse kernel instead." + signature = ("block_sparse", partition_id, TILE_SIZE_K) if signature not in self._cached_fw_pcids: # Pre-compute pointer increments for `cids` and `pids` @@ -532,6 +539,17 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten else: use_fp16 = False + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: + if use_fp16: + OP_MODE = 0 + else: + OP_MODE = 1 + else: + if use_fp16: + OP_MODE = 2 + else: + OP_MODE = 3 + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) self._fw_triton_block_sparse_kernel[grid]( @@ -551,7 +569,7 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten K_NUM_TILES = K_NUM_TILES, TILE_SIZE_M = TILE_SIZE_M, GROUP_SIZE_M = self.group_size, - use_fp16 = 1 if use_fp16 else 0 + OP_MODE = OP_MODE ) return None @@ -721,13 +739,14 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, elif params.dim() == 1 and self.group_size >= 16 and num_edges >= 16 and batch_size >= 16: # In this case, we should definitely use the block-sparse implementation mode = "block_sparse" - elif self.group_size * num_edges < 4 and num_edges * batch_size < 4: + elif (cs_group_size == 1 or self.group_size == 1) and num_edges < 16384: # In this case, we should definitely use the sparse implementation mode = "sparse" - else: + elif num_edges < 4 or batch_size < 4: + # In this case, the block-sparse kernel will have compilation issues mode = "sparse" - - # mode = "pytorch" # debug + else: + mode = "block_sparse" if mode == "block_sparse": self._backward_block_sparse( @@ -796,7 +815,7 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele chids, parids_start, parids_increment, parpids_start, parpids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr, use_fp16: tl.constexpr): + GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr, OP_MODE: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -852,22 +871,22 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele nflows_div_mars = nflows * tl.exp(emars_max[None,:] - nmars) - if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: - # We can use the built-in matmul kernel of triton - if use_fp16 == 1: - epars = epars.to(tl.float16) - nflows_div_mars = nflows_div_mars.to(tl.float16) - eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) - else: - eflows = tl.dot(epars, nflows_div_mars) - else: - # We have to simulate matmul - if use_fp16 == 1: - epars = epars.to(tl.float16) - nflows_div_mars = nflows_div_mars.to(tl.float16) - eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1).to(tl.float32) - else: - eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1) + if OP_MODE == 0: + # Built-in matmul kernel of triton + float16 + epars = epars.to(tl.float16) + nflows_div_mars = nflows_div_mars.to(tl.float16) + eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) + if OP_MODE == 1: + # Built-in matmul kernel of triton + float32 + eflows = tl.dot(epars, nflows_div_mars) + if OP_MODE == 2: + # Simulated matmul kernel + float16 + epars = 
epars.to(tl.float16) + nflows_div_mars = nflows_div_mars.to(tl.float16) + eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1).to(tl.float32) + if OP_MODE == 3: + # Simulated matmul kernel + float32 + eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1) acc += eflows @@ -918,6 +937,10 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) K_NUM_TILES = num_edges // TILE_SIZE_K + assert TILE_SIZE_K >= 4, f"`TILE_SIZE_K` should be greater than 4 (but got {TILE_SIZE_K}) in order to use the block-sparse kernel. " \ + "This is an internal error of PyJuice. Please consider checking the kernel dispatching criterions and use the " \ + "corresponding sparse kernel instead." + signature = ("block_sparse", partition_id, TILE_SIZE_K) if signature not in self._cached_bk_parids: # Pre-compute pointer increments for `parids` and `parpids` @@ -966,6 +989,17 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo else: use_fp16 = False + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: + if use_fp16: + OP_MODE = 0 + else: + OP_MODE = 1 + else: + if use_fp16: + OP_MODE = 2 + else: + OP_MODE = 3 + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) self._bk_triton_block_sparse_ele_kernel[grid]( @@ -989,7 +1023,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo TILE_SIZE_M = TILE_SIZE_M, GROUP_SIZE_M = cs_group_size, GROUP_SIZE_K = self.group_size, - use_fp16 = use_fp16 + OP_MODE = OP_MODE ) return None @@ -999,7 +1033,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, batch_size: tl.constexpr, num_edges: tl.constexpr, TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, use_fp16: tl.constexpr): + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, OP_MODE: tl.constexpr): pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -1038,22 +1072,22 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para emars = tl.exp(emars - emars_max[:,None]) - if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and TILE_SIZE_B >= 16: - # We can use the built-in matmul kernel of triton - if use_fp16 == 1: - nflows_div_mars = nflows_div_mars.to(tl.float16) - emars = emars.to(tl.float16) - pflows = tl.dot(nflows_div_mars, emars).to(tl.float32) - else: - pflows = tl.dot(nflows_div_mars, emars).to(tl.float32) - else: - # We have to simulate matmul - if use_fp16 == 1: - nflows_div_mars = nflows_div_mars.to(tl.float16) - emars = emars.to(tl.float16) - pflows = tl.sum(nflows_div_mars[:,:,None] * emars[None,:,:], axis = 1).to(tl.float32) - else: - pflows = tl.sum(nflows_div_mars[:,:,None] * emars[None,:,:], axis = 1) + if OP_MODE == 0: + # Built-in matmul kernel of triton + float16 + nflows_div_mars = nflows_div_mars.to(tl.float16) + emars = emars.to(tl.float16) + pflows = tl.dot(nflows_div_mars, emars).to(tl.float32) + if OP_MODE == 1: + # Built-in matmul kernel of triton + float32 + pflows = tl.dot(nflows_div_mars, emars).to(tl.float32) + if OP_MODE == 2: + # Simulated matmul kernel + float16 + nflows_div_mars = nflows_div_mars.to(tl.float16) + emars = emars.to(tl.float16) + pflows = 
tl.sum(nflows_div_mars[:,:,None] * emars[None,:,:], axis = 1).to(tl.float32) + if OP_MODE == 3: + # Simulated matmul kernel + float32 + pflows = tl.sum(nflows_div_mars[:,:,None] * emars[None,:,:], axis = 1) acc += pflows @@ -1118,6 +1152,10 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor TILE_SIZE_K = min(2048 // TILE_SIZE_B, num_edges) B_NUM_TILES = batch_size // TILE_SIZE_B + assert TILE_SIZE_B >= 4, f"`TILE_SIZE_B` should be greater than 4 (but got {TILE_SIZE_B}) in order to use the block-sparse kernel. " \ + "This is an internal error of PyJuice. Please consider checking the kernel dispatching criterions and use the " \ + "corresponding sparse kernel instead." + if force_use_fp16: assert not force_use_fp32 use_fp16 = True @@ -1129,6 +1167,17 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor else: use_fp16 = False + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and TILE_SIZE_B >= 16: + if use_fp16: + OP_MODE = 0 + else: + OP_MODE = 1 + else: + if use_fp16: + OP_MODE = 2 + else: + OP_MODE = 3 + grid = (triton.cdiv(num_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) self._bk_triton_block_sparse_par_kernel[grid]( @@ -1148,7 +1197,7 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, GROUP_SIZE_M = self.group_size, - use_fp16 = use_fp16 + OP_MODE = OP_MODE ) def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index 591997ec..efcdabda 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -142,9 +142,9 @@ def update_parameters(self, params: torch.Tensor, clone: bool = True): ns_params = params[psid:peid].cpu() local_parids = (self._param_ids - psid) // (self.group_size * self.ch_group_size) - num_pargroups = local_parids.size() + num_pargroups = local_parids.size(0) ns_params = ns_params.reshape(num_pargroups, self.ch_group_size, self.group_size) - ns._params = ns_params[local_parids,:,:].permute(0, 2, 1) + self._params = ns_params[local_parids,:,:].permute(0, 2, 1) def update_param_flows(self, param_flows: torch.Tensor, origin_ns_only: bool = True, clone: bool = True): assert self.provided("_param_flow_range"), "The `SumNodes` has not been compiled into a `TensorCircuit`." @@ -159,9 +159,9 @@ def update_param_flows(self, param_flows: torch.Tensor, origin_ns_only: bool = T ns_param_flows = param_flows[pfsid:pfeid].cpu() local_parfids = (self._param_flow_ids - pfsid) // (self.group_size * self.ch_group_size) - num_parfgroups = local_parfids.size() + num_parfgroups = local_parfids.size(0) ns_param_flows = ns_param_flows.reshape(num_parfgroups, self.ch_group_size, self.group_size) - ns._param_flows = ns_param_flows[local_parfids,:,:].permute(0, 2, 1) + self._param_flows = ns_param_flows[local_parfids,:,:].permute(0, 2, 1) def gather_parameters(self, params: torch.Tensor): assert self.provided("_param_range"), "The `SumNodes` has not been compiled into a `TensorCircuit`." 
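[Note] The hunks above fold the per-kernel fp16/fp32 branches into a single `OP_MODE` constexpr shared by the forward and backward block-sparse kernels (0: `tl.dot` + fp16, 1: `tl.dot` + fp32, 2: simulated matmul + fp16, 3: simulated matmul + fp32). Below is a minimal standalone sketch of the host-side selection; the `select_op_mode` helper is hypothetical and introduced only for illustration, but the thresholds and the fp16 default mirror the diff (the batch-tile argument is `BLOCK_B` for the marginal/element-flow kernels and `TILE_SIZE_B` for the parameter-flow kernel).

def select_op_mode(tile_size_m: int, tile_size_k: int, batch_tile: int,
                   force_use_fp16: bool = False, force_use_fp32: bool = False) -> int:
    """Pick the OP_MODE constexpr handed to the block-sparse Triton kernels."""
    assert not (force_use_fp16 and force_use_fp32)

    # tl.dot requires every participating tile dimension to be >= 16;
    # otherwise the kernels fall back to a broadcast-and-sum "simulated" matmul.
    can_use_tl_dot = tile_size_m >= 16 and tile_size_k >= 16 and batch_tile >= 16

    if force_use_fp16:
        use_fp16 = True
    elif force_use_fp32:
        use_fp16 = False
    else:
        # Default criterion from the diff: fp16 only in the tl.dot regime.
        use_fp16 = can_use_tl_dot

    if can_use_tl_dot:
        return 0 if use_fp16 else 1   # built-in matmul kernel of triton
    else:
        return 2 if use_fp16 else 3   # simulated matmul kernel

Passing the mode as a constexpr lets Triton specialize away the unused branches at compile time, which is presumably why the kernels switch on `OP_MODE` rather than re-deriving the tile-size condition inside the kernel body.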
From 42629cb7493bc58e3c840da0508828dafd6cb67a Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 14 Dec 2023 23:30:32 +0800 Subject: [PATCH 070/162] fix parameter update --- src/pyjuice/model/backend/par_update.py | 6 +- src/pyjuice/model/tensorcircuit.py | 4 +- tests/model/simple_model_test.py | 121 +++++++++++++++++++++--- 3 files changed, 115 insertions(+), 16 deletions(-) diff --git a/src/pyjuice/model/backend/par_update.py b/src/pyjuice/model/backend/par_update.py index d8ed343b..2e7026a8 100644 --- a/src/pyjuice/model/backend/par_update.py +++ b/src/pyjuice/model/backend/par_update.py @@ -130,7 +130,7 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in cum_pflows = torch.zeros([global_nids[-1] + 1], dtype = torch.float32) - metadata = {"tot_num_nodes": global_nids[-1] + 1, "BLOCK_SIZE": BLOCK_SIZE} + metadata = {"tot_num_nodes": global_nids[-1].item() + 1, "BLOCK_SIZE": BLOCK_SIZE} return [par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, cum_pflows, metadata] @@ -212,10 +212,10 @@ def par_update_kernel(params, param_flows, cum_pflows, nchs, par_start_ids, pflo tl.store(params + offs_par, updated_param, mask = mask_pflow) -def em_par_update(params: torch.Tensor, param_flows: torch.Tensor, parflow_fusing_kwargs: Sequence, +def em_par_update(params: torch.Tensor, param_flows: torch.Tensor, par_update_kwargs: Sequence, step_size: float, pseudocount: float = 0.0): - par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, cum_pflows, metadata = parflow_fusing_kwargs + par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, cum_pflows, metadata = par_update_kwargs tot_num_nodes = metadata["tot_num_nodes"] BLOCK_SIZE = metadata["BLOCK_SIZE"] diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index b8b9931e..46d34277 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -273,10 +273,10 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): layer.mini_batch_em(step_size = step_size, pseudocount = pseudocount) # Accumulate parameter flows of tied nodes - compute_cum_par_flows(self.parflow_fusing_kwargs) + compute_cum_par_flows(self.param_flows, self.parflow_fusing_kwargs) # Normalize and update parameters - em_par_update(self.param, self.param_flows, self.par_update_kwargs, step_size = step_size, pseudocount = pseudocount) + em_par_update(self.params, self.param_flows, self.par_update_kwargs, step_size = step_size, pseudocount = pseudocount) def cumulate_flows(self, inputs: torch.Tensor, params: Optional[torch.Tensor] = None): with torch.no_grad(): diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index 84d8d1fa..5bd3d9a1 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -195,23 +195,23 @@ def simple_model_test(): lls = pc(data) node_mars = pc.node_mars.cpu() - data = data.cpu() + data_cpu = data.cpu() sid, eid = ni0._output_ind_range ni0_lls = node_mars[sid:eid,:] - assert torch.all(torch.abs(ni0_lls - ni0._params.reshape(-1, 4)[:,data[:,0]].log()) < 1e-4) + assert torch.all(torch.abs(ni0_lls - ni0._params.reshape(-1, 4)[:,data_cpu[:,0]].log()) < 1e-4) sid, eid = ni1._output_ind_range ni1_lls = node_mars[sid:eid,:] - assert torch.all(torch.abs(ni1_lls - ni1._params.reshape(-1, 4)[:,data[:,1]].log()) < 1e-4) + assert torch.all(torch.abs(ni1_lls - ni1._params.reshape(-1, 4)[:,data_cpu[:,1]].log()) < 1e-4) sid, eid = ni2._output_ind_range 
ni2_lls = node_mars[sid:eid,:] - assert torch.all(torch.abs(ni2_lls - ni2._params.reshape(-1, 6)[:,data[:,2]].log()) < 1e-4) + assert torch.all(torch.abs(ni2_lls - ni2._params.reshape(-1, 6)[:,data_cpu[:,2]].log()) < 1e-4) sid, eid = ni3._output_ind_range ni3_lls = node_mars[sid:eid,:] - assert torch.all(torch.abs(ni3_lls - ni3._params.reshape(-1, 6)[:,data[:,3]].log()) < 1e-4) + assert torch.all(torch.abs(ni3_lls - ni3._params.reshape(-1, 6)[:,data_cpu[:,3]].log()) < 1e-4) np0_lls = ni0_lls + ni1_lls np1_lls = ni2_lls + ni3_lls @@ -388,33 +388,132 @@ def simple_model_test(): input_layer = pc.input_layer_group[0] input_pflows = input_layer.param_flows.cpu() - data = data.cpu() + data_cpu = data.cpu() ni0_pflows = input_pflows[0:128].reshape(32, 4) ref_pflows = torch.zeros_like(ni0_pflows) for b in range(512): - ref_pflows[:,data[b,0]] += ni0_flows[:,b] + ref_pflows[:,data_cpu[b,0]] += ni0_flows[:,b] assert torch.all(torch.abs(ni0_pflows - ref_pflows) < 4e-3) ni1_pflows = input_pflows[128:256].reshape(32, 4) ref_pflows = torch.zeros_like(ni1_pflows) for b in range(512): - ref_pflows[:,data[b,1]] += ni1_flows[:,b] + ref_pflows[:,data_cpu[b,1]] += ni1_flows[:,b] assert torch.all(torch.abs(ni1_pflows - ref_pflows) < 4e-3) ni2_pflows = input_pflows[256:448].reshape(32, 6) ref_pflows = torch.zeros_like(ni2_pflows) for b in range(512): - ref_pflows[:,data[b,2]] += ni2_flows[:,b] + ref_pflows[:,data_cpu[b,2]] += ni2_flows[:,b] assert torch.all(torch.abs(ni2_pflows - ref_pflows) < 4e-3) ni3_pflows = input_pflows[448:640].reshape(32, 6) ref_pflows = torch.zeros_like(ni3_pflows) for b in range(512): - ref_pflows[:,data[b,3]] += ni3_flows[:,b] + ref_pflows[:,data_cpu[b,3]] += ni3_flows[:,b] assert torch.all(torch.abs(ni3_pflows - ref_pflows) < 4e-3) - import pdb; pdb.set_trace() + ## EM Optimization tests ## + + pc.backward(data.permute(1, 0), flows_memory = 0.0) + + ns0_old_params = ns0._params.clone().reshape(2, 4, 16, 16).permute(0, 2, 1, 3).reshape(32, 64) + ns1_old_params = ns1._params.clone().reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + ns2_old_params = ns2._params.clone().reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + + ns_old_params = ns._params.clone().reshape(96) + + pc.update_param_flows() + + ref_parflows = ns0._param_flows.reshape(2, 4, 16, 16).permute(0, 2, 1, 3).reshape(32, 64) + assert torch.all(torch.abs(ns0_parflows - ref_parflows) < 1e-3) + + ref_parflows = ns1._param_flows.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + assert torch.all(torch.abs(ns1_parflows - ref_parflows) < 1e-3) + + ref_parflows = ns2._param_flows.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + assert torch.all(torch.abs(ns2_parflows - ref_parflows) < 1e-3) + + par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, cum_pflows, metadata = pc.par_update_kwargs + + if metadata["BLOCK_SIZE"] == 32: + par_start_ids = par_start_ids.cpu() + assert torch.all(par_start_ids[0:16] == torch.arange(256, 272)) + assert torch.all(par_start_ids[16:32] == torch.arange(768, 784)) + assert torch.all(par_start_ids[32:48] == torch.arange(1280, 1296)) + assert torch.all(par_start_ids[48:64] == torch.arange(1792, 1808)) + assert torch.all(par_start_ids[64:80] == torch.arange(2304, 2320)) + assert torch.all(par_start_ids[80:96] == torch.arange(2816, 2832)) + assert torch.all(par_start_ids[96:112] == torch.arange(3328, 3344)) + assert torch.all(par_start_ids[112:128] == torch.arange(3840, 3856)) + assert torch.all(par_start_ids[128:131] == torch.tensor([4352, 
4384, 4416])) + + pflow_start_ids = pflow_start_ids.cpu() + assert torch.all(par_start_ids - pflow_start_ids == 256) + + blk_sizes = blk_sizes.cpu() + assert torch.all(blk_sizes[0:128] == 32) + assert torch.all(blk_sizes[128:131] == 32) + + blk_intervals = blk_intervals.cpu() + assert torch.all(blk_intervals[0:128] == 16) + assert torch.all(blk_intervals[128:131] == 1) + + global_nids = global_nids.cpu() + assert torch.all(global_nids[0:16] == torch.arange(0, 16)) + assert torch.all(global_nids[16:32] == torch.arange(0, 16)) + assert torch.all(global_nids[32:48] == torch.arange(16, 32)) + assert torch.all(global_nids[48:64] == torch.arange(16, 32)) + assert torch.all(global_nids[64:80] == torch.arange(32, 48)) + assert torch.all(global_nids[80:96] == torch.arange(48, 64)) + assert torch.all(global_nids[96:112] == torch.arange(64, 80)) + assert torch.all(global_nids[112:128] == torch.arange(80, 96)) + assert torch.all(global_nids[128:131] == 96) + + nchs = nchs.cpu() + assert torch.all(nchs[0:32] == 64) + assert torch.all(nchs[32:64] == 64) + assert torch.all(nchs[64:128] == 32) + assert torch.all(nchs[128:131] == 96) + + assert cum_pflows.size(0) == 97 + + step_size = 0.25 + pseudocount = 0.01 + + pc.mini_batch_em(step_size = step_size, pseudocount = pseudocount) + + cum_pflows = pc.par_update_kwargs[6].cpu() + + assert torch.all(torch.abs(ns0_parflows.sum(dim = 1) - cum_pflows[0:32]) < 1e-3) + assert torch.all(torch.abs(ns1_parflows.sum(dim = 1) - cum_pflows[32:64]) < 1e-3) + assert torch.all(torch.abs(ns2_parflows.sum(dim = 1) - cum_pflows[64:96]) < 1e-3) + assert torch.abs(ns_parflows.sum() - cum_pflows[96]) < 1e-3 + + ns0_new_params = (ns0_parflows + pseudocount / 64) / (ns0_parflows.sum(dim = 1, keepdim = True) + pseudocount) + ns1_new_params = (ns1_parflows + pseudocount / 32) / (ns1_parflows.sum(dim = 1, keepdim = True) + pseudocount) + ns2_new_params = (ns2_parflows + pseudocount / 32) / (ns2_parflows.sum(dim = 1, keepdim = True) + pseudocount) + ns_new_params = (ns_parflows + pseudocount / 96) / (ns_parflows.sum() + pseudocount) + + ns0_updated_params = (1.0 - step_size) * ns0_old_params + step_size * ns0_new_params + ns1_updated_params = (1.0 - step_size) * ns1_old_params + step_size * ns1_new_params + ns2_updated_params = (1.0 - step_size) * ns2_old_params + step_size * ns2_new_params + ns_updated_params = (1.0 - step_size) * ns_old_params + step_size * ns_new_params + + pc.update_parameters() + + ref_params = ns0._params.clone().reshape(2, 4, 16, 16).permute(0, 2, 1, 3).reshape(32, 64) + assert torch.all(torch.abs(ns0_updated_params - ref_params) < 1e-4) + + ref_params = ns1._params.clone().reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + assert torch.all(torch.abs(ns1_updated_params - ref_params) < 1e-4) + + ref_params = ns2._params.clone().reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + assert torch.all(torch.abs(ns2_updated_params - ref_params) < 1e-4) + + ref_params = ns._params.clone().reshape(96) + assert torch.all(torch.abs(ns_updated_params - ref_params) < 1e-4) if __name__ == "__main__": From 807adc8837b18b1fbef2c106c543f27cabfca6a5 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 15 Dec 2023 00:25:19 +0800 Subject: [PATCH 071/162] stage temporal changes --- src/pyjuice/layer/sum_layer.py | 2 + src/pyjuice/model/backend/par_update.py | 29 ++++-- src/pyjuice/optim/optim.py | 24 ++--- src/pyjuice/optim/scheduler.py | 6 +- src/pyjuice/structures/compilation.py | 61 +++++++----- src/pyjuice/structures/hclt.py | 12 ++- src/pyjuice/utils/util.py | 17 
++++ tests/layer/matmul_kernel_test.py | 6 +- tests/structures/hclt_test_new.py | 127 ++++++++++++++++++++++++ 9 files changed, 231 insertions(+), 53 deletions(-) create mode 100644 src/pyjuice/utils/util.py create mode 100644 tests/structures/hclt_test_new.py diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 5b56f659..afa26957 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -348,6 +348,8 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, else: mode = "block_sparse" + mode = "sparse" #### debug + if mode == "block_sparse": self._forward_block_sparse( node_mars, element_mars, params, nids, cids, pids, local_ids, diff --git a/src/pyjuice/model/backend/par_update.py b/src/pyjuice/model/backend/par_update.py index 2e7026a8..d21c1b4d 100644 --- a/src/pyjuice/model/backend/par_update.py +++ b/src/pyjuice/model/backend/par_update.py @@ -74,12 +74,29 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in curr_size = par_start_ids.shape[0] inc_shape = triton.cdiv(pid + est_num_slots - curr_size, buffer_inc_interval) * buffer_inc_interval - par_start_ids = np.ascontiguousarray(par_start_ids.resize(curr_size + inc_shape)) - pflow_start_ids = np.ascontiguousarray(pflow_start_ids.resize(curr_size + inc_shape)) - blk_sizes = np.ascontiguousarray(blk_sizes.resize(curr_size + inc_shape)) - blk_intervals = np.ascontiguousarray(blk_intervals.resize(curr_size + inc_shape)) - global_nids = np.ascontiguousarray(global_nids.resize(curr_size + inc_shape)) - nchs = np.ascontiguousarray(nchs.resize(curr_size + inc_shape)) + par_start_ids_new = np.zeros([curr_size + inc_shape], dtype = np.int64) + par_start_ids_new[:curr_size] = par_start_ids[:curr_size] + par_start_ids = par_start_ids_new + + pflow_start_ids_new = np.zeros([curr_size + inc_shape], dtype = np.int64) + pflow_start_ids_new[:curr_size] = pflow_start_ids[:curr_size] + pflow_start_ids = pflow_start_ids_new + + blk_sizes_new = np.zeros([curr_size + inc_shape], dtype = np.int64) + blk_sizes_new[:curr_size] = blk_sizes[:curr_size] + blk_sizes = blk_sizes_new + + blk_intervals_new = np.zeros([curr_size + inc_shape], dtype = np.int64) + blk_intervals_new[:curr_size] = blk_intervals[:curr_size] + blk_intervals = blk_intervals_new + + global_nids_new = np.zeros([curr_size + inc_shape], dtype = np.int64) + global_nids_new[:curr_size] = global_nids[:curr_size] + global_nids = global_nids_new + + nchs_new = np.zeros([curr_size + inc_shape], dtype = np.int64) + nchs_new[:curr_size] = nchs[:curr_size] + nchs = nchs_new if use_numba: diff --git a/src/pyjuice/optim/optim.py b/src/pyjuice/optim/optim.py index 5504b766..a2eefe41 100644 --- a/src/pyjuice/optim/optim.py +++ b/src/pyjuice/optim/optim.py @@ -10,14 +10,14 @@ class CircuitOptimizer(): SUPPORTED_OPTIM_METHODS = ["EM"] - def __init__(self, circuit: TensorCircuit, base_optimizer: Optional[Optimizer] = None, method: str = "EM", lr: float = 0.1, + def __init__(self, pc: TensorCircuit, base_optimizer: Optional[Optimizer] = None, method: str = "EM", lr: float = 0.1, pseudocount: float = 0.1): - self.circuit = circuit + self.pc = pc self.base_optimizer = base_optimizer - assert method in self.SUPPORTED_OPTIM_METHODS, f"Unsupported optimization method {method} for circuits." + assert method in self.SUPPORTED_OPTIM_METHODS, f"Unsupported optimization method {method} for PCs." 
self.method = method self.lr = lr @@ -27,19 +27,19 @@ def zero_grad(self, flows_memory: float = 0.0): if self.base_optimizer is not None: self.base_optimizer.zero_grad() - self.circuit._optim_hyperparams["flows_memory"] = flows_memory + self.pc._optim_hyperparams["flows_memory"] = flows_memory def step(self, closure = None): if self.base_optimizer is not None: self.base_optimizer.step() if self.method == "EM": - self.circuit.mini_batch_em( + self.pc.mini_batch_em( step_size = self.lr, pseudocount = self.pseudocount ) else: - raise ValueError(f"Unknown circuit optimization method {self.method}.") + raise ValueError(f"Unknown PC optimization method {self.method}.") def state_dict(self): if self.base_optimizer is not None: @@ -47,20 +47,20 @@ def state_dict(self): else: state_dict = dict() - state_dict["circuit_states"] = { + state_dict["pc_states"] = { "method": self.method, "lr": self.lr, "pseudocount": self.pseudocount } def load_state_dict(self, state_dict: Dict): - circuit_states = state_dict["circuit_states"] + pc_states = state_dict["pc_states"] - self.method = circuit_states["method"] - self.lr = circuit_states["lr"] - self.pseudocount = circuit_states["pseudocount"] + self.method = pc_states["method"] + self.lr = pc_states["lr"] + self.pseudocount = pc_states["pseudocount"] - del state_dict["circuit_states"] + del state_dict["pc_states"] if self.base_optimizer is not None: self.base_optimizer.load_state_dict(state_dict) diff --git a/src/pyjuice/optim/scheduler.py b/src/pyjuice/optim/scheduler.py index e5a39022..e40e0f85 100644 --- a/src/pyjuice/optim/scheduler.py +++ b/src/pyjuice/optim/scheduler.py @@ -79,17 +79,17 @@ def state_dict(self): else: state_dict = dict() - state_dict["circuit_states"] = dict() + state_dict["pc_states"] = dict() for key, value in self.__dict__.items(): if key != "optimizer" and key != "base_scheduler": - state_dict["circuit_states"][key] = value + state_dict["pc_states"][key] = value def load_state_dict(self, state_dict: Dict): for key, value in state_dict["circuit_states"].items(): self.__dict__[key] = value - del state_dict["circuit_states"] + del state_dict["pc_states"] if self.base_optimizer is not None: self.base_optimizer.load_state_dict(state_dict) \ No newline at end of file diff --git a/src/pyjuice/structures/compilation.py b/src/pyjuice/structures/compilation.py index 24702a40..20c6f215 100644 --- a/src/pyjuice/structures/compilation.py +++ b/src/pyjuice/structures/compilation.py @@ -2,10 +2,11 @@ import torch import networkx as nx -from typing import Type +from typing import Type, Optional -from pyjuice.nodes import multiply, summate, inputs +from pyjuice.nodes import multiply, summate, inputs, set_group_size from pyjuice.nodes.distributions import Distribution +from pyjuice.utils.util import max_cdf_power_of_2 def BayesianTreeToHiddenRegionGraph(tree: nx.Graph, @@ -13,7 +14,8 @@ def BayesianTreeToHiddenRegionGraph(tree: nx.Graph, num_latents: int, InputDist: Type[Distribution], dist_params: dict, - num_root_ns: int = 1) -> RegionGraph: + num_root_ns: int = 1, + group_size: Optional[int] = None) -> RegionGraph: """ Given a Tree Bayesian Network tree T1 (i.e. 
at most one parents), @@ -31,6 +33,12 @@ def BayesianTreeToHiddenRegionGraph(tree: nx.Graph, z1 -> z2 """ + # Specify group size + if group_size is None: + group_size = min(64, max_cdf_power_of_2(num_latents)) + + num_node_groups = num_latents // group_size + # Root the tree at `root` clt = nx.bfs_tree(tree, root) def children(n: int): @@ -43,31 +51,32 @@ def children(n: int): # Compile the region graph for the circuit equivalent to T2 node_seq = list(nx.dfs_postorder_nodes(tree, root)) var2rnode = dict() - for v in node_seq: - chs = children(v) - - if len(chs) == 0: - # Input Region - r = inputs(v, num_nodes = num_latents, dist = InputDist(**dist_params)) - var2rnode[v] = r - else: - # Inner Region - - # children(z_v) - ch_regions = [var2rnode[c] for c in chs] - - # Add x_v to children(z_v) - leaf_r = inputs(v, num_nodes = num_latents, dist = InputDist(**dist_params)) - ch_regions.append(leaf_r) - - rp = multiply(*ch_regions) - - if v == root: - r = summate(rp, num_nodes = num_root_ns) + with set_group_size(group_size): + for v in node_seq: + chs = children(v) + + if len(chs) == 0: + # Input Region + r = inputs(v, num_node_groups = num_node_groups, dist = InputDist(**dist_params)) + var2rnode[v] = r else: - r = summate(rp, num_nodes = num_latents) + # Inner Region + + # children(z_v) + ch_regions = [var2rnode[c] for c in chs] + + # Add x_v to children(z_v) + leaf_r = inputs(v, num_node_groups = num_node_groups, dist = InputDist(**dist_params)) + ch_regions.append(leaf_r) + + rp = multiply(*ch_regions) + + if v == root: + r = summate(rp, num_node_groups = num_root_ns, group_size = 1) + else: + r = summate(rp, num_node_groups = num_node_groups) - var2rnode[v] = r + var2rnode[v] = r root_r = var2rnode[root] return root_r \ No newline at end of file diff --git a/src/pyjuice/structures/hclt.py b/src/pyjuice/structures/hclt.py index 37f94e1d..0cfad935 100644 --- a/src/pyjuice/structures/hclt.py +++ b/src/pyjuice/structures/hclt.py @@ -3,9 +3,9 @@ import torch import numpy as np import networkx as nx -from typing import Type +from typing import Type, Optional + from pyjuice.nodes.distributions import * -from typing import Optional from .compilation import BayesianTreeToHiddenRegionGraph @@ -68,12 +68,18 @@ def HCLT(x: torch.Tensor, num_bins: int, sigma: float, chunk_size: int, num_latents: int, num_root_ns: int = 1, + group_size: Optional[int] = None, input_layer_type: Type[Distribution] = Categorical, input_layer_params: dict = {"num_cats": 256}): mi = mutual_information_chunked(x, x, num_bins, sigma, chunk_size = chunk_size).detach().cpu().numpy() T = chow_liu_tree(mi) root = nx.center(T)[0] - root_r = BayesianTreeToHiddenRegionGraph(T, root, num_latents, input_layer_type, input_layer_params, num_root_ns = num_root_ns) + + root_r = BayesianTreeToHiddenRegionGraph( + T, root, num_latents, input_layer_type, + input_layer_params, num_root_ns = num_root_ns, + group_size = group_size + ) return root_r \ No newline at end of file diff --git a/src/pyjuice/utils/util.py b/src/pyjuice/utils/util.py new file mode 100644 index 00000000..442dc488 --- /dev/null +++ b/src/pyjuice/utils/util.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +import math + + +def max_cdf_power_of_2(val: int): + count = 0 + while True: + halfval = val // 2 + + if halfval * 2 != val: + break + + val = halfval + count += 1 + + return 2 ** count diff --git a/tests/layer/matmul_kernel_test.py b/tests/layer/matmul_kernel_test.py index f5c628d9..8a476f36 100644 --- a/tests/layer/matmul_kernel_test.py +++ 
b/tests/layer/matmul_kernel_test.py @@ -73,9 +73,9 @@ def kernel3(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): if __name__ == "__main__": import time - M = 16 - N = 16 - K = 8 + M = 1 + N = 4 + K = 1 a = torch.rand([M, N]).cuda() b = torch.rand([N, K]).cuda() diff --git a/tests/structures/hclt_test_new.py b/tests/structures/hclt_test_new.py new file mode 100644 index 00000000..bc996733 --- /dev/null +++ b/tests/structures/hclt_test_new.py @@ -0,0 +1,127 @@ +import torch +import torchvision +import time +from torch.utils.data import TensorDataset, DataLoader + +import pyjuice as juice +import pyjuice.nodes.distributions as dists + + +def evaluate(pc, loader): + lls_total = 0.0 + for batch in loader: + x = batch[0].to(pc.device) + lls = pc(x) + lls_total += lls.mean().detach().cpu().numpy().item() + + lls_total /= len(loader) + return lls_total + + +def mini_batch_em_epoch(num_epochs, pc, optimizer, scheduler, train_loader, test_loader, device): + for epoch in range(num_epochs): + t0 = time.time() + train_ll = 0.0 + for batch in train_loader: + x = batch[0].to(device) + + optimizer.zero_grad() + + lls = pc(x) + lls.mean().backward() + + train_ll += lls.mean().detach().cpu().numpy().item() + + optimizer.step() + scheduler.step() + + train_ll /= len(train_loader) + + t1 = time.time() + test_ll = evaluate(pc, loader=test_loader) + t2 = time.time() + + print(f"[Epoch {epoch}/{num_epochs}][train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") + + +def full_batch_em_epoch(pc, train_loader, test_loader, device): + with torch.no_grad(): + t0 = time.time() + train_ll = 0.0 + for batch in train_loader: + x = batch[0].to(device) + + lls = pc(x) + pc.backward(x, flows_memory = 1.0) + + train_ll += lls.mean().detach().cpu().numpy().item() + + pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1) + + train_ll /= len(train_loader) + + t1 = time.time() + test_ll = evaluate(pc, loader=test_loader) + t2 = time.time() + print(f"[train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") + + +def hclt_test(): + + device = torch.device("cuda:0") + + train_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = True, download = True) + test_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = False, download = True) + + train_data = train_dataset.data.reshape(60000, 28*28) + test_data = test_dataset.data.reshape(10000, 28*28) + + num_features = train_data.size(1) + + train_loader = DataLoader( + dataset = TensorDataset(train_data), + batch_size = 512, + shuffle = True, + drop_last = True + ) + test_loader = DataLoader( + dataset = TensorDataset(test_data), + batch_size = 512, + shuffle = False, + drop_last = True + ) + + ns = juice.structures.HCLT( + train_data.float().to(device), + num_bins = 32, + sigma = 0.5 / 32, + num_latents = 128, + chunk_size = 32 + ) + pc = juice.TensorCircuit(ns) + + pc.to(device) + + optimizer = juice.optim.CircuitOptimizer(pc, lr = 0.1, pseudocount = 0.1) + scheduler = juice.optim.CircuitScheduler( + optimizer, + method = "multi_linear", + lrs = [0.9, 0.1, 0.05], + milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] + ) + + + for _ in range(1000): + t0 = time.time() + lls_total = evaluate(pc, train_loader) + torch.cuda.synchronize() + t1 = time.time() + print(t1 - t0, lls_total) + + + # mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) + # 
full_batch_em_epoch(pc, train_loader, test_loader, device) + + +if __name__ == "__main__": + hclt_test() From 7900c6759966d416628967e440970c2ed4419670 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 15 Dec 2023 17:04:36 +0800 Subject: [PATCH 072/162] reverse node traverse (parents before children) --- src/pyjuice/nodes/nodes.py | 64 +++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/src/pyjuice/nodes/nodes.py b/src/pyjuice/nodes/nodes.py index 5d108918..bae535f2 100644 --- a/src/pyjuice/nodes/nodes.py +++ b/src/pyjuice/nodes/nodes.py @@ -2,17 +2,16 @@ import numpy as np import torch -from typing import Sequence, Union, Optional +from typing import Sequence, Union, Optional, Callable from copy import deepcopy +from collections import deque + from pyjuice.utils import BitSet from pyjuice.graph import RegionGraph, PartitionNode, InnerRegionNode, InputRegionNode -def node_iterator(root_ns: CircuitNodes): - visited = set() - node_list = list() - - def dfs(ns: CircuitNodes): +def node_iterator(root_ns: CircuitNodes, reverse: bool = False): + def dfs(ns: CircuitNodes, fn: Callable, visited: set = set()): if ns in visited: return @@ -21,14 +20,48 @@ def dfs(ns: CircuitNodes): # Recursively traverse children if ns.is_sum() or ns.is_prod(): for cs in ns.chs: - dfs(cs) + dfs(cs, fn = fn, visited = visited) + + fn(ns) + + if not reverse: + visited = set() + node_list = list() + + def record_fn(ns): + node_list.append(ns) + + dfs(root_ns, record_fn) + + for ns in node_list: + yield ns + + else: + parcount = dict() + node_list = list() + + def inc_parcount(ns): + for cs in ns.chs: + if cs not in parcount: + parcount[cs] = 0 + parcount[cs] += 1 - node_list.append(ns) + dfs(root_ns, inc_parcount) + + queue = deque() + queue.append(root_ns) + while len(queue) > 0: + ns = queue.popleft() + node_list.append(ns) + for cs in ns.chs: + parcount[cs] -= 1 + if parcount[cs] == 0: + queue.append(cs) - dfs(root_ns) + assert len(parcount) + 1 == len(node_list) - for ns in node_list: - yield ns + for ns in node_list: + yield ns class CircuitNodes(): @@ -67,6 +100,8 @@ def __init__(self, num_node_groups: int, region_node: RegionGraph, group_size: i self._tied_param_group_ids = None + self._reverse_iter = False + def _run_init_callbacks(self, **kwargs): for func in self.INIT_CALLBACKS: func(self, **kwargs) @@ -168,7 +203,12 @@ def clear_hooks(ns): clear_hooks(self) def __iter__(self): - return node_iterator(self) + return node_iterator(self, self._reverse_iter) + + def __call__(self, reverse: bool = False): + self._reverse_iter = reverse + + return self def provided(self, var_name): return hasattr(self, var_name) and getattr(self, var_name) is not None From 14a379eb6372c821376e9f2c3caea48a88a91cb3 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 15 Dec 2023 17:05:11 +0800 Subject: [PATCH 073/162] fix kernel for custom forward matmul --- src/pyjuice/layer/sum_layer.py | 201 +++++++++++++++++++++++++-------- 1 file changed, 152 insertions(+), 49 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index afa26957..8331006c 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -348,7 +348,7 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, else: mode = "block_sparse" - mode = "sparse" #### debug + # mode = "sparse" #### debug if mode == "block_sparse": self._forward_block_sparse( @@ -372,10 +372,10 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, 
@staticmethod @triton.jit - def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, - pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, - BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, OP_MODE: tl.constexpr): + def _fw_triton_block_sparse_tlmm_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, + pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, use_fp16: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -426,24 +426,16 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s emars = tl.load(emars_ptr, mask = mask_batch[None,:]) emars_max = tl.max(emars, axis = 0)[None,:] - emars = tl.exp(emars - emars_max) + emars_sub = tl.exp(emars - emars_max) - if OP_MODE == 0: + if use_fp16 == 1: # Built-in matmul kernel of triton + float16 - epars = (epars * (2**12)).to(tl.float16) - emars = emars.to(tl.float16) - nmars = tl.dot(epars, emars).to(tl.float32) / (2**12) - if OP_MODE == 1: + epars_fp16 = (epars * (2**12)).to(tl.float16) + emars_fp16 = emars_sub.to(tl.float16) + nmars = tl.dot(epars_fp16, emars_fp16).to(tl.float32) / (2**12) + else: # Built-in matmul kernel of triton + float32 - nmars = tl.dot(epars, emars) - if OP_MODE == 2: - # Simulated matmul kernel + float16 - epars = (epars * (2**12)).to(tl.float16) - emars = emars.to(tl.float16) - nmars = tl.sum(epars[:,:,None] * emars[None,:,:], axis = 1).to(tl.float32) / (2**12) - if OP_MODE == 3: - # Simulated matmul kernel + float32 - nmars = tl.sum(epars[:,:,None] * emars[None,:,:], axis = 1) + nmars = tl.dot(epars, emars_sub) acc = tl.where(emars_max > acc, tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, @@ -465,6 +457,93 @@ def _fw_triton_block_sparse_kernel(node_mars, element_mars, params, nids, cids_s offs_nmars = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] tl.store(node_mars + offs_nmars, acc, mask = mask_batch[None,:]) + @staticmethod + @triton.jit + def _fw_triton_block_sparse_csmm_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, + pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, use_fp16: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + ngroup_id = tl.load(local_ids + ngroup_id) + + # Node offsets + offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_node = tl.max_contiguous(offs_node, TILE_SIZE_M) + + # Edge offsets + offs_edge = tl.arange(0, TILE_SIZE_K) + + # Initialize pointers to `params` + offs_estart = ngroup_id * TILE_SIZE_K + offs_edge + offs_estart = tl.max_contiguous(offs_estart, TILE_SIZE_K) + par_start = tl.load(pids_start + offs_estart) + epars_ptr = params + \ + offs_node[:,None] 
+ \ + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + offs_batch = tl.max_contiguous(offs_batch, BLOCK_B) + mask_batch = offs_batch < batch_size + + # Initialize pointers to `element_mars` + edge_start = tl.load(cids_start + offs_estart) + emars_ptr = element_mars + \ + edge_start[None,:] * batch_size + \ + offs_batch[:,None] # [BLOCK_B, TILE_SIZE_K] + + # Batch increment pointers + pids_inc_ptr = pids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge + cids_inc_ptr = cids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - float("inf") + + for k in range(0, K_NUM_TILES): + epars = tl.load(epars_ptr) + emars = tl.load(emars_ptr, mask = mask_batch[:,None]) + + emars_max = tl.max(emars, axis = 1) + emars_sub = tl.exp(emars - emars_max[:,None]) + + if use_fp16 == 1: + # Simulated matmul kernel + float16 + epars = (epars * (2**12)).to(tl.float16) + emars = emars.to(tl.float16) + nmars = tl.sum(epars[:,:,None] * tl.trans(emars_sub)[None,:,:], axis = 1).to(tl.float32) / (2**12) + else: + # Simulated matmul kernel + float32 + nmars = tl.sum(epars[:,:,None] * tl.trans(emars_sub)[None,:,:], axis = 1) + + acc = tl.where(emars_max[None,:] > acc, + tl.log(nmars + tl.exp(acc - emars_max[None,:])) + emars_max[None,:], + tl.log(tl.exp(emars_max[None,:] - acc) * nmars + 1.0) + acc + ) + + # Increment `epars_ptr` + pids_inc = tl.load(pids_inc_ptr) + epars_ptr += pids_inc[None,:] + pids_inc_ptr += TILE_SIZE_K + + # Increment `emars_ptr` + cids_inc = tl.load(cids_inc_ptr) + emars_ptr += cids_inc[None,:] * batch_size + cids_inc_ptr += TILE_SIZE_K + + # Write back + off_nids = tl.load(nids + ngroup_id) + offs_nmars = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + tl.store(node_mars + offs_nmars, acc, mask = mask_batch[None,:]) + def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, @@ -541,38 +620,62 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten else: use_fp16 = False + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + + # print("========") + # print(layer_n_nodes) + # print(grid, grid[0] * grid[1]) + # print(TILE_SIZE_M, TILE_SIZE_K, BLOCK_B) + + # import time + # torch.cuda.synchronize() + # t0 = time.time() + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: - if use_fp16: - OP_MODE = 0 - else: - OP_MODE = 1 + self._fw_triton_block_sparse_tlmm_kernel[grid]( + node_mars, + element_mars, + params, + nids, + cids_start, + cids_increment, + pids_start, + pids_increment, + local_ids, + batch_size, + partial_eval = 1 if local_ids is not None else 0, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = self.group_size, + use_fp16 = use_fp16 + ) else: - if use_fp16: - OP_MODE = 2 - else: - OP_MODE = 3 + self._fw_triton_block_sparse_csmm_kernel[grid]( + node_mars, + element_mars, + params, + nids, + cids_start, + cids_increment, + pids_start, + pids_increment, + local_ids, + batch_size, + partial_eval = 1 if local_ids is not None else 0, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = self.group_size, + use_fp16 = use_fp16 + ) 
- grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - - self._fw_triton_block_sparse_kernel[grid]( - node_mars, - element_mars, - params, - nids, - cids_start, - cids_increment, - pids_start, - pids_increment, - local_ids, - batch_size, - partial_eval = 1 if local_ids is not None else 0, - BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = K_NUM_TILES, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = self.group_size, - OP_MODE = OP_MODE - ) + # torch.cuda.synchronize() + # t1 = time.time() + + # print(f"kernel time: {(t1-t0)*1000:.3f}ms") return None From cf9a50e574b97598368c70e379e2a40421c2e972 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 15 Dec 2023 17:05:53 +0800 Subject: [PATCH 074/162] cudagraphs for forward pass --- src/pyjuice/model/tensorcircuit.py | 65 ++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index 46d34277..c7aff431 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -55,6 +55,8 @@ def __init__(self, root_ns: CircuitNodes, layer_sparsity_tol: float = 0.5, self.root_ns = root_ns self.device = torch.device("cpu") + self.num_vars = self._get_num_vars(self.root_ns) + self.node_mars = None self.element_mars = None self.node_flows = None @@ -76,6 +78,9 @@ def __init__(self, root_ns: CircuitNodes, layer_sparsity_tol: float = 0.5, "flows_memory": 1.0 } + # Recorded CudaGraphs + self._recorded_cuda_graphs = dict() + def to(self, device): super(TensorCircuit, self).to(device) @@ -92,7 +97,8 @@ def to(self, device): return self def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Callable]] = None, - cache: Optional[dict] = None, return_cache: bool = False, **kwargs): + cache: Optional[dict] = None, return_cache: bool = False, record_cudagraph: bool = False, + apply_cudagraph: bool = False, **kwargs): """ Forward the circuit. @@ -103,6 +109,8 @@ def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Calla the corresponding member function of `input_layer` `kwargs`: Additional arguments for input layers """ + + assert inputs.dim() == 2 and inputs.size(1) == self.num_vars B = inputs.size(0) inputs = inputs.permute(1, 0) @@ -135,17 +143,42 @@ def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Calla raise ValueError(f"Custom input function should be either a `str` or a `Callable`. 
Found {type(input_layer_fn)} instead.") # Inner layers - for layer_group in self.inner_layer_groups: - if layer_group.is_prod(): - # Prod layer - layer_group(self.node_mars, self.element_mars) - - elif layer_group.is_sum(): - # Sum layer - layer_group(self.node_mars, self.element_mars, self.params) - - else: - raise ValueError(f"Unknown layer type {type(layer)}.") + def _run_inner_layers(): + for layer_group in self.inner_layer_groups: + if layer_group.is_prod(): + # Prod layer + layer_group(self.node_mars, self.element_mars) + + elif layer_group.is_sum(): + # Sum layer + layer_group(self.node_mars, self.element_mars, self.params) + + else: + raise ValueError(f"Unknown layer type {type(layer)}.") + + signature = (id(self.node_mars), id(self.element_mars), B) + if record_cudagraph and signature not in self._recorded_cuda_graphs: + # Warmup + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for _ in range(3): + _run_inner_layers() + torch.cuda.current_stream().wait_stream(s) + + # Capture + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + _run_inner_layers() + + # Save + self._recorded_cuda_graphs[signature] = g + + if apply_cudagraph and signature in self._recorded_cuda_graphs: + g = self._recorded_cuda_graphs[signature] + g.replay() + else: + _run_inner_layers() lls = self.node_mars[self._root_node_range[0]:self._root_node_range[1],:] lls = lls.permute(1, 0) @@ -191,6 +224,7 @@ def backward(self, inputs: Optional[torch.Tensor] = None, """ assert self.node_mars is not None and self.element_mars is not None, "Should run forward path first." + assert inputs.size(0) == self.num_vars B = self.node_mars.size(1) @@ -434,6 +468,13 @@ def _buffer_matches(self, name: str, cache: Optional[dict], check_device: bool = return True + def _get_num_vars(self, ns: CircuitNodes): + num_vars = 0 + for v in ns.scope: + if (v + 1) > num_vars: + num_vars = v + 1 + return num_vars + def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_partitions: Optional[int] = None, disable_gpu_compilation: bool = False, force_gpu_compilation: bool = False, max_tied_ns_per_parflow_group: int = 8, verbose: bool = True): From 0ee95abaaf73469bfcd4650024d7469e976e4e62 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 15 Dec 2023 19:39:13 +0800 Subject: [PATCH 075/162] HCLT forward & backward correctness check --- tests/structures/hclt_correctness_test.py | 223 ++++++++++++++++++++++ tests/structures/hclt_test_new.py | 127 ------------ 2 files changed, 223 insertions(+), 127 deletions(-) create mode 100644 tests/structures/hclt_correctness_test.py delete mode 100644 tests/structures/hclt_test_new.py diff --git a/tests/structures/hclt_correctness_test.py b/tests/structures/hclt_correctness_test.py new file mode 100644 index 00000000..67ea8dc1 --- /dev/null +++ b/tests/structures/hclt_correctness_test.py @@ -0,0 +1,223 @@ +import torch +import torchvision +import time +from torch.utils.data import TensorDataset, DataLoader + +import pyjuice as juice +import pyjuice.nodes.distributions as dists + + +def hclt_forward_test(): + + device = torch.device("cuda:0") + + train_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = True, download = True) + + train_data = train_dataset.data.reshape(60000, 28*28)[:5000,:] + + num_features = train_data.size(1) + num_latents = 128 + + root_ns = juice.structures.HCLT( + train_data.float().to(device), + num_bins = 32, + sigma = 0.5 / 32, + num_latents = num_latents, + chunk_size = 32 + ) + 
root_ns.init_parameters() + + pc = juice.TensorCircuit(root_ns) + + pc.to(device) + + group_size = root_ns.chs[0].group_size + num_groups = num_latents // group_size + + batch_data = train_data[:512,:].contiguous().to(device) + data_cpu = batch_data.cpu().long() + batch_size = batch_data.size(0) + + lls = pc(batch_data) + + node_mars = pc.node_mars.cpu() + + ns2mars = dict() + + with torch.no_grad(): + for ns in root_ns: + if ns.is_input(): + v = ns.scope.to_list()[0] + params = ns._params.reshape(num_latents, 256) + + mars = params[:,data_cpu[:,v]].log() + + sid, eid = ns._output_ind_range + assert torch.all(torch.abs(mars - node_mars[sid:eid,:]) < 1e-4) + + ns2mars[ns] = mars + + elif ns.is_prod(): + mars = torch.zeros([num_latents, batch_size]) + for cs in ns.chs: + mars += ns2mars[cs] + + ns2mars[ns] = mars + + elif ns.is_sum() and ns != root_ns: + emars = torch.cat([ns2mars[cs] for cs in ns.chs], dim = 0) + params = ns._params.reshape(num_groups, num_groups * ns.num_chs, group_size, group_size).permute(0, 2, 1, 3) + params = params.reshape(num_latents, num_latents * ns.num_chs) + + emars_max = torch.max(emars, dim = 0).values[None,:] + emars = (emars - emars_max).exp() + + nmars = torch.matmul(params, emars) + nmars = nmars.log() + emars_max + + sid, eid = ns._output_ind_range + assert torch.all(torch.abs(nmars - node_mars[sid:eid,:]) < 4e-3) + + ns2mars[ns] = nmars + + else: + assert ns == root_ns + + emars = torch.cat([ns2mars[cs] for cs in ns.chs], dim = 0) + params = ns._params.reshape(1, num_groups * ns.num_chs, 1, group_size).permute(0, 2, 1, 3) + params = params.reshape(1, num_latents * ns.num_chs) + + emars_max = torch.max(emars, dim = 0).values[None,:] + emars = (emars - emars_max).exp() + + nmars = torch.matmul(params, emars) + nmars = nmars.log() + emars_max + + sid, eid = ns._output_ind_range + assert torch.all(torch.abs(nmars - node_mars[sid:eid,:]) < 4e-3) + + +def hclt_backward_test(): + + device = torch.device("cuda:0") + + train_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = True, download = True) + + train_data = train_dataset.data.reshape(60000, 28*28)[:5000,:] + + num_features = train_data.size(1) + num_latents = 128 + + root_ns = juice.structures.HCLT( + train_data.float().to(device), + num_bins = 32, + sigma = 0.5 / 32, + num_latents = num_latents, + chunk_size = 32 + ) + root_ns.init_parameters() + + pc = juice.TensorCircuit(root_ns) + + pc.to(device) + + group_size = root_ns.chs[0].group_size + num_groups = num_latents // group_size + + batch_data = train_data[:512,:].contiguous().to(device) + data_cpu = batch_data.cpu().long() + batch_size = batch_data.size(0) + + lls = pc(batch_data) + lls.mean().backward() + + pc.update_param_flows() + + node_mars = pc.node_mars.cpu() + node_flows = pc.node_flows.cpu() + + ns2flows = dict() + ns2flows[root_ns] = torch.ones([1, batch_size]) + + with torch.no_grad(): + for ns in root_ns(reverse = True): + if ns == root_ns: + + sid, eid = ns._output_ind_range + assert torch.all(torch.abs(node_flows[sid:eid,:] - 1.0) < 1e-4) + + nflows = ns2flows[ns] + nmars = node_mars[sid:eid,:] + + for i, cs in enumerate(ns.chs): + params = ns._params.reshape(1, num_groups * ns.num_chs, 1, group_size).permute(0, 2, 1, 3) + params = params[:,:,i*num_groups:(i+1)*num_groups,:].reshape(1, num_latents) + + param_flows = ns._param_flows.reshape(1, num_groups * ns.num_chs, 1, group_size).permute(0, 2, 1, 3) + param_flows = param_flows[:,:,i*num_groups:(i+1)*num_groups,:].reshape(1, num_latents) + + if cs.is_prod(): + emars = 
torch.zeros([num_latents, batch_size]) + for cns in cs.chs: + sid, eid = cns._output_ind_range + emars += node_mars[sid:eid,:] + else: + raise ValueError() + + eflows = nflows * params.permute(1, 0) * (emars - nmars).exp() + pflows = eflows.sum(dim = 1) + + assert torch.all(torch.abs(pflows - param_flows[0,:]) < 3e-3) + + if cs not in ns2flows: + ns2flows[cs] = torch.zeros([num_latents, batch_size]) + ns2flows[cs] += eflows + + elif ns.is_prod(): + nflows = ns2flows[ns] + for cs in ns.chs: + if cs not in ns2flows: + ns2flows[cs] = torch.zeros([num_latents, batch_size]) + ns2flows[cs] += nflows + + elif ns.is_sum(): + + nflows = ns2flows[ns] + + sid, eid = ns._output_ind_range + assert torch.all(torch.abs(nflows - node_flows[sid:eid,:]) < 2e-4) + + nmars = node_mars[sid:eid,:] + + for i, cs in enumerate(ns.chs): + params = ns._params.reshape(num_groups, num_groups * ns.num_chs, group_size, group_size).permute(0, 2, 1, 3) + params = params[:,:,i*num_groups:(i+1)*num_groups,:].reshape(num_latents, num_latents) + + param_flows = ns._param_flows.reshape(num_groups, num_groups * ns.num_chs, group_size, group_size).permute(0, 2, 1, 3) + param_flows = param_flows[:,:,i*num_groups:(i+1)*num_groups,:].reshape(num_latents, num_latents) + + if cs.is_prod(): + emars = torch.zeros([num_latents, batch_size]) + for cns in cs.chs: + sid, eid = cns._output_ind_range + emars += node_mars[sid:eid,:] + else: + raise ValueError() + + emars_max = emars.max(dim = 0).values + nflows_div_mars = nflows * (emars_max[None,:] - nmars).exp() + eflows = torch.matmul(params.permute(1, 0), nflows_div_mars) * (emars - emars_max[None,:]).exp() + + pflows = torch.matmul(nflows_div_mars, (emars - emars_max[None,:]).exp().permute(1, 0)) * params + + assert torch.all(torch.abs(pflows - param_flows) < 3e-3) + + if cs not in ns2flows: + ns2flows[cs] = torch.zeros([num_latents, batch_size]) + ns2flows[cs] += eflows + + +if __name__ == "__main__": + torch.manual_seed(320942) + hclt_forward_test() + hclt_backward_test() diff --git a/tests/structures/hclt_test_new.py b/tests/structures/hclt_test_new.py deleted file mode 100644 index bc996733..00000000 --- a/tests/structures/hclt_test_new.py +++ /dev/null @@ -1,127 +0,0 @@ -import torch -import torchvision -import time -from torch.utils.data import TensorDataset, DataLoader - -import pyjuice as juice -import pyjuice.nodes.distributions as dists - - -def evaluate(pc, loader): - lls_total = 0.0 - for batch in loader: - x = batch[0].to(pc.device) - lls = pc(x) - lls_total += lls.mean().detach().cpu().numpy().item() - - lls_total /= len(loader) - return lls_total - - -def mini_batch_em_epoch(num_epochs, pc, optimizer, scheduler, train_loader, test_loader, device): - for epoch in range(num_epochs): - t0 = time.time() - train_ll = 0.0 - for batch in train_loader: - x = batch[0].to(device) - - optimizer.zero_grad() - - lls = pc(x) - lls.mean().backward() - - train_ll += lls.mean().detach().cpu().numpy().item() - - optimizer.step() - scheduler.step() - - train_ll /= len(train_loader) - - t1 = time.time() - test_ll = evaluate(pc, loader=test_loader) - t2 = time.time() - - print(f"[Epoch {epoch}/{num_epochs}][train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") - - -def full_batch_em_epoch(pc, train_loader, test_loader, device): - with torch.no_grad(): - t0 = time.time() - train_ll = 0.0 - for batch in train_loader: - x = batch[0].to(device) - - lls = pc(x) - pc.backward(x, flows_memory = 1.0) - - train_ll += 
lls.mean().detach().cpu().numpy().item() - - pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1) - - train_ll /= len(train_loader) - - t1 = time.time() - test_ll = evaluate(pc, loader=test_loader) - t2 = time.time() - print(f"[train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") - - -def hclt_test(): - - device = torch.device("cuda:0") - - train_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = True, download = True) - test_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = False, download = True) - - train_data = train_dataset.data.reshape(60000, 28*28) - test_data = test_dataset.data.reshape(10000, 28*28) - - num_features = train_data.size(1) - - train_loader = DataLoader( - dataset = TensorDataset(train_data), - batch_size = 512, - shuffle = True, - drop_last = True - ) - test_loader = DataLoader( - dataset = TensorDataset(test_data), - batch_size = 512, - shuffle = False, - drop_last = True - ) - - ns = juice.structures.HCLT( - train_data.float().to(device), - num_bins = 32, - sigma = 0.5 / 32, - num_latents = 128, - chunk_size = 32 - ) - pc = juice.TensorCircuit(ns) - - pc.to(device) - - optimizer = juice.optim.CircuitOptimizer(pc, lr = 0.1, pseudocount = 0.1) - scheduler = juice.optim.CircuitScheduler( - optimizer, - method = "multi_linear", - lrs = [0.9, 0.1, 0.05], - milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] - ) - - - for _ in range(1000): - t0 = time.time() - lls_total = evaluate(pc, train_loader) - torch.cuda.synchronize() - t1 = time.time() - print(t1 - t0, lls_total) - - - # mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) - # full_batch_em_epoch(pc, train_loader, test_loader, device) - - -if __name__ == "__main__": - hclt_test() From 11744e3c5687f18096083d64b9416def9da14237 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 15 Dec 2023 19:49:35 +0800 Subject: [PATCH 076/162] update compilation logging --- src/pyjuice/model/tensorcircuit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index c7aff431..7d80d1f9 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -515,7 +515,7 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti node2tiednodes = dict() if verbose: - print(f"Compiling {num_layers} layers...") + print(f"Compiling {num_layers} TensorCircuit layers...") layer_id = 0 for depth in tqdm(range(num_layers), disable = not verbose): From 7411ce30be8f6253af287cd10114b337a81168b4 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 15 Dec 2023 22:40:58 +0800 Subject: [PATCH 077/162] debug --- src/pyjuice/layer/sum_layer.py | 177 ++++++++++++++++---- tests/structures/debug_bk_kernel.py | 188 ++++++++++++++++++++++ tests/structures/hclt_correctness_test.py | 86 +++++++++- tests/structures/hclt_test.py | 36 ++++- 4 files changed, 453 insertions(+), 34 deletions(-) create mode 100644 tests/structures/debug_bk_kernel.py diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 8331006c..eb43eb5c 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -275,6 +275,16 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, parpids = self.partitioned_parpids[partition_id] cs_group_size = self.cs_group_sizes[partition_id] + torch.cuda.synchronize() + + if 
node_flows.isnan().any() or node_flows.isinf().any(): + import pdb; pdb.set_trace() + a = 0 + + if element_flows.isnan().any() or element_flows.isinf().any(): + import pdb; pdb.set_trace() + a = 0 + self._backward( node_flows, element_flows, params, node_mars, element_mars, param_flows, @@ -282,6 +292,22 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, cs_group_size = cs_group_size ) + torch.cuda.synchronize() + + if node_flows.isnan().any() or node_flows.isinf().any(): + import pdb; pdb.set_trace() + a = 0 + + if element_flows.isnan().any() or element_flows.isinf().any(): + self._backward( + node_flows, element_flows, params, node_mars, + element_mars, param_flows, + chids = chids, parids = parids, parpids = parpids, + cs_group_size = cs_group_size, debug = True + ) + import pdb; pdb.set_trace() + a = 0 + else: # Partial evaluation for partition_id in range(self.num_bk_partitions): @@ -817,7 +843,7 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, chids: Optional[torch.Tensor] = None, parids: Optional[torch.Tensor] = None, parpids: Optional[torch.Tensor] = None, cs_group_size: int = 0, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1, mode: Optional[str] = None) -> None: + partition_id: int = -1, mode: Optional[str] = None, debug = False) -> None: """ Back pass of sum layers. @@ -857,14 +883,16 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, self._backward_block_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, - partition_id = partition_id + partition_id = partition_id, debug = debug ) + elif mode == "sparse": self._backward_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, partition_id = partition_id ) + elif mode == "pytorch": self._backward_pytorch( node_flows, element_flows, params, node_mars, @@ -880,7 +908,7 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. nids: Optional[torch.Tensor], cids: Optional[torch.Tensor], pids: Optional[torch.Tensor], pfids: Optional[torch.Tensor], chids: Optional[torch.Tensor], parids: Optional[torch.Tensor], parpids: Optional[torch.Tensor], cs_group_size: int, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1) -> None: + partition_id: int = -1, debug = False) -> None: """ Back pass of sum layers with block-sparse processing kernel. @@ -902,7 +930,7 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. node_flows, element_flows, params, node_mars, element_mars, chids = chids, parids = parids, parpids = parpids, cs_group_size = cs_group_size, local_ids = local_ids, - partition_id = partition_id + partition_id = partition_id, debug = debug ) # Flows w.r.t. 
parameters @@ -959,8 +987,7 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele # Initialize pointers to `element_mars` off_eleids = tl.load(chids + elegroup_id) offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - tmp_emars = tl.load(element_mars + offs_elemfs, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - emars_max = tl.max(tmp_emars, axis = 0) # [BLOCK_B] + emars = tl.load(element_mars + offs_elemfs, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] # Batch increment pointers parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid @@ -974,24 +1001,27 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - nflows_div_mars = nflows * tl.exp(emars_max[None,:] - nmars) + nmars_max = tl.max(nmars, axis = 0) # [BLOCK_B] + nflows_div_mars = nflows * tl.exp(nmars_max[None,:] - nmars) + + eflows = tl.sum(epars[:,:,None] * tl.exp(emars[:,None,:] - nmars[None,:,:]) * nflows[None,:,:], axis = 1) - if OP_MODE == 0: - # Built-in matmul kernel of triton + float16 - epars = epars.to(tl.float16) - nflows_div_mars = nflows_div_mars.to(tl.float16) - eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) - if OP_MODE == 1: - # Built-in matmul kernel of triton + float32 - eflows = tl.dot(epars, nflows_div_mars) - if OP_MODE == 2: - # Simulated matmul kernel + float16 - epars = epars.to(tl.float16) - nflows_div_mars = nflows_div_mars.to(tl.float16) - eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1).to(tl.float32) - if OP_MODE == 3: - # Simulated matmul kernel + float32 - eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1) + # if OP_MODE == 0: + # # Built-in matmul kernel of triton + float16 + # epars = epars.to(tl.float16) + # nflows_div_mars = nflows_div_mars.to(tl.float16) + # eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) + # if OP_MODE == 1: + # # Built-in matmul kernel of triton + float32 + # eflows = tl.dot(epars, nflows_div_mars) + # if OP_MODE == 2: + # # Simulated matmul kernel + float16 + # epars = epars.to(tl.float16) + # nflows_div_mars = nflows_div_mars.to(tl.float16) + # eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1).to(tl.float32) + # if OP_MODE == 3: + # # Simulated matmul kernel + float32 + # eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1) acc += eflows @@ -1006,19 +1036,16 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele nflows_ptr += parids_inc[:,None] * batch_size parids_inc += ptr_inc_step - # Initialize pointers to `element_mars` + # Write back off_eleids = tl.load(chids + elegroup_id) offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - emars = tl.load(element_mars + offs_elemfs, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - eflows = acc * tl.exp(emars - emars_max[None,:]) - tl.store(element_flows + offs_elemfs, eflows, mask = mask_batch[None,:]) + tl.store(element_flows + offs_elemfs, acc, mask = mask_batch[None,:]) def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, cs_group_size: int, local_ids: Optional[torch.Tensor] = None, - 
partition_id: int = -1, force_use_fp16: bool = False, force_use_fp32: bool = False) -> None: + partition_id: int = -1, force_use_fp16: bool = False, force_use_fp32: bool = False, debug = False) -> None: assert params.dim() == 1, "Expecting a 1D `params`." @@ -1107,6 +1134,78 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + if debug: + import pdb; pdb.set_trace() + + # num_ngroups = chids.size(0) + # num_egroups = parids.size(1) + # parids = (parids[:,:,None].repeat(1, 1, self.group_size) + torch.arange(0, self.group_size, device = parids.device)).reshape(num_ngroups, num_egroups * self.group_size) + # parpids = (parpids[:,:,None] + torch.arange(0, self.group_size, device = parids.device)).reshape( + # num_ngroups, num_egroups * self.group_size) + + # chids = (chids[:,None].repeat(1, cs_group_size) + torch.arange(0, cs_group_size, device = chids.device)).reshape(num_ngroups * cs_group_size) + # parids = parids[:,None,:].repeat(1, cs_group_size, 1).reshape(num_ngroups * cs_group_size, num_egroups * self.group_size) + # parpids = (parpids[:,None,:].repeat(1, cs_group_size, 1) + torch.arange(0, cs_group_size * self.group_size, self.group_size, device = parpids.device)[None,:,None]).reshape( + # num_ngroups * cs_group_size, num_egroups * self.group_size + # ) + + # element_flows[chids] = (node_flows[parids] * params[parpids].unsqueeze(-1) * \ + # (element_mars[chids].unsqueeze(1) - node_mars[parids]).exp()).sum(dim = 1) + + import numpy as np + np.savez("temp.npz", node_flows = node_flows.cpu().numpy(), + element_flow = element_flows.cpu().numpy(), + node_mars = node_mars.cpu().numpy(), + element_mars = element_mars.cpu().numpy(), + params = params.cpu().numpy(), + chids = chids.cpu().numpy(), + parids = parids.cpu().numpy(), + parids_start = parids_start.cpu().numpy(), + parids_increment = parids_increment.cpu().numpy(), + parpids = parpids.cpu().numpy(), + parpids_start = parpids_start.cpu().numpy(), + parpids_increment = parpids_increment.cpu().numpy(), + batch_size = batch_size, + ptr_inc_step = ptr_inc_step, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = cs_group_size, + GROUP_SIZE_K = self.group_size, + OP_MODE = OP_MODE, + layer_n_nodes = layer_n_nodes) + + import pdb; pdb.set_trace() + + OP_MODE = 1 + + self._bk_triton_block_sparse_ele_kernel[grid]( + node_flows = node_flows, + element_flows = element_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + chids = chids, + parids_start = parids_start, + parids_increment = parids_increment, + parpids_start = parpids_start, + parpids_increment = parpids_increment, + local_ids = local_ids, + batch_size = batch_size, + partial_eval = 1 if local_ids is not None else 0, + ptr_inc_step = ptr_inc_step, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = cs_group_size, + GROUP_SIZE_K = self.group_size, + OP_MODE = OP_MODE + ) + + import pdb; pdb.set_trace() + self._bk_triton_block_sparse_ele_kernel[grid]( node_flows = node_flows, element_flows = element_flows, @@ -1425,6 +1524,16 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, BLOCK_M)) + torch.cuda.synchronize() + + if node_flows.isnan().any() or node_flows.isinf().any(): + import pdb; 
pdb.set_trace() + a = 0 + + if element_flows.isnan().any() or element_flows.isinf().any(): + import pdb; pdb.set_trace() + a = 0 + self._bk_triton_sparse_ele_kernel[grid]( node_flows = node_flows, element_flows = element_flows, @@ -1443,6 +1552,16 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to GROUP_SIZE_K = self.group_size ) + torch.cuda.synchronize() + + if node_flows.isnan().any() or node_flows.isinf().any(): + import pdb; pdb.set_trace() + a = 0 + + if element_flows.isnan().any() or element_flows.isinf().any(): + import pdb; pdb.set_trace() + a = 0 + return None @staticmethod diff --git a/tests/structures/debug_bk_kernel.py b/tests/structures/debug_bk_kernel.py new file mode 100644 index 00000000..6311ce7e --- /dev/null +++ b/tests/structures/debug_bk_kernel.py @@ -0,0 +1,188 @@ +import numpy as np +import torch + +import triton +import triton.language as tl + + +@triton.jit +def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, element_mars, params, + chids, parids_start, parids_increment, parpids_start, parpids_increment, + local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr, OP_MODE: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + elegroup_id = tl.load(local_ids + elegroup_id) + + # Initialize pointers to `params` + offs_ele = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_edge = tl.arange(0, TILE_SIZE_K) + offs_edge_gid = offs_edge // GROUP_SIZE_K + offs_edge_nid = (offs_edge % GROUP_SIZE_K) + par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + epars_ptr = params + \ + offs_ele[:,None] * GROUP_SIZE_K + \ + (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize pointers to `node_mars` + edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + nmars_ptr = node_mars + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + nflows_ptr = node_flows + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + # Initialize pointers to `element_mars` + off_eleids = tl.load(chids + elegroup_id) + offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + emars = tl.load(element_mars + offs_elemfs, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + # Batch increment pointers + parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) + + for k in range(0, K_NUM_TILES): + epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) 
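+        # Reference (dense) form of the quantity accumulated below, matching
+        # the PyTorch ground truth computed in main(): for child node m and
+        # batch element b,
+        #   eflows[m, b] = sum_k epars[m, k] * exp(emars[m, b] - nmars[k, b]) * nflows[k, b]
+        # i.e. the EM flow routed from each parent k back to child m.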
# [TILE_SIZE_K, BLOCK_B] + + nmars_max = tl.max(nmars, axis = 0) # [BLOCK_B] + nflows_div_mars = nflows * tl.exp(nmars_max[None,:] - nmars) + + eflows = tl.sum(epars[:,:,None] * tl.exp(emars[:,None,:] - nmars[None,:,:]) * nflows[None,:,:], axis = 1) + + # if OP_MODE == 0: + # # Built-in matmul kernel of triton + float16 + # epars = epars.to(tl.float16) + # nflows_div_mars = nflows_div_mars.to(tl.float16) + # eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) + # if OP_MODE == 1: + # # Built-in matmul kernel of triton + float32 + # eflows = tl.dot(epars, nflows_div_mars) + # if OP_MODE == 2: + # # Simulated matmul kernel + float16 + # epars = epars.to(tl.float16) + # nflows_div_mars = nflows_div_mars.to(tl.float16) + # eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1).to(tl.float32) + # if OP_MODE == 3: + # # Simulated matmul kernel + float32 + # eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1) + + acc += eflows + + # Increment `epars_ptr` + parpids_inc = tl.load(parpids_inc_ptr) + epars_ptr += parpids_inc[None,:] + parpids_inc_ptr += ptr_inc_step + + # Increment `nmars_ptr` + parids_inc = tl.load(parids_inc_ptr) + nmars_ptr += parids_inc[:,None] * batch_size + nflows_ptr += parids_inc[:,None] * batch_size + parids_inc += ptr_inc_step + + # Write back + off_eleids = tl.load(chids + elegroup_id) + offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + tl.store(element_flows + offs_elemfs, acc, mask = mask_batch[None,:]) + + +def main(): + + device = torch.device("cuda:0") + + data = np.load("temp.npz") + + node_flows = torch.from_numpy(data["node_flows"]).to(device) + element_flows = torch.from_numpy(data["element_flow"]).to(device) + node_mars = torch.from_numpy(data["node_mars"]).to(device) + element_mars = torch.from_numpy(data["element_mars"]).to(device) + params = torch.from_numpy(data["params"]).to(device) + chids = torch.from_numpy(data["chids"]).to(device) + parids = torch.from_numpy(data["parids"]).to(device) + parids_start = torch.from_numpy(data["parids_start"]).to(device) + parids_increment = torch.from_numpy(data["parids_increment"]).to(device) + parpids = torch.from_numpy(data["parpids"]).to(device) + parpids_start = torch.from_numpy(data["parpids_start"]).to(device) + parpids_increment = torch.from_numpy(data["parpids_increment"]).to(device) + batch_size = int(data["batch_size"]) + ptr_inc_step = int(data["ptr_inc_step"]) + BLOCK_B = int(data["BLOCK_B"]) + TILE_SIZE_M = int(data["TILE_SIZE_M"]) + TILE_SIZE_K = int(data["TILE_SIZE_K"]) + K_NUM_TILES = int(data["K_NUM_TILES"]) + GROUP_SIZE_M = int(data["GROUP_SIZE_M"]) + GROUP_SIZE_K = int(data["GROUP_SIZE_K"]) + OP_MODE = int(data["OP_MODE"]) + layer_n_nodes = int(data["layer_n_nodes"]) + + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + + import pdb; pdb.set_trace() + + ori_chids = chids + ori_parids = parids + ori_parpids = parpids + + num_ngroups = chids.size(0) + num_egroups = parids.size(1) + parids = (parids[:,:,None].repeat(1, 1, GROUP_SIZE_M) + torch.arange(0, GROUP_SIZE_M, device = parids.device)).reshape(num_ngroups, num_egroups * GROUP_SIZE_M) + parpids = (parpids[:,:,None] + torch.arange(0, GROUP_SIZE_M, device = parids.device)).reshape( + num_ngroups, num_egroups * GROUP_SIZE_M) + + chids = (chids[:,None].repeat(1, GROUP_SIZE_K) + torch.arange(0, GROUP_SIZE_K, device = chids.device)).reshape(num_ngroups * GROUP_SIZE_K) + parids = parids[:,None,:].repeat(1, GROUP_SIZE_K, 1).reshape(num_ngroups * 
GROUP_SIZE_K, num_egroups * GROUP_SIZE_M) + parpids = (parpids[:,None,:].repeat(1, GROUP_SIZE_K, 1) + torch.arange(0, GROUP_SIZE_K * GROUP_SIZE_M, GROUP_SIZE_M, device = parpids.device)[None,:,None]).reshape( + num_ngroups * GROUP_SIZE_K, num_egroups * GROUP_SIZE_M + ) + + element_flows[chids] = (node_flows[parids] * params[parpids].unsqueeze(-1) * (element_mars[chids].unsqueeze(1) - node_mars[parids]).exp()).sum(dim = 1) + + import pdb; pdb.set_trace() + + _bk_triton_block_sparse_ele_kernel[grid]( + node_flows = node_flows, + element_flows = element_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + chids = ori_chids, + parids_start = parids_start, + parids_increment = parids_increment, + parpids_start = parpids_start, + parpids_increment = parpids_increment, + local_ids = None, + batch_size = batch_size, + partial_eval = 0, + ptr_inc_step = ptr_inc_step, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = GROUP_SIZE_M, + GROUP_SIZE_K = GROUP_SIZE_K, + OP_MODE = OP_MODE + ) + + import pdb; pdb.set_trace() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/structures/hclt_correctness_test.py b/tests/structures/hclt_correctness_test.py index 67ea8dc1..b7768cb7 100644 --- a/tests/structures/hclt_correctness_test.py +++ b/tests/structures/hclt_correctness_test.py @@ -167,7 +167,7 @@ def hclt_backward_test(): eflows = nflows * params.permute(1, 0) * (emars - nmars).exp() pflows = eflows.sum(dim = 1) - assert torch.all(torch.abs(pflows - param_flows[0,:]) < 3e-3) + assert torch.all(torch.abs(pflows - param_flows[0,:]) < 6e-3) if cs not in ns2flows: ns2flows[cs] = torch.zeros([num_latents, batch_size]) @@ -210,14 +210,96 @@ def hclt_backward_test(): pflows = torch.matmul(nflows_div_mars, (emars - emars_max[None,:]).exp().permute(1, 0)) * params - assert torch.all(torch.abs(pflows - param_flows) < 3e-3) + assert torch.all(torch.abs(pflows - param_flows) < 6e-3) if cs not in ns2flows: ns2flows[cs] = torch.zeros([num_latents, batch_size]) ns2flows[cs] += eflows +def hclt_em_test(): + + device = torch.device("cuda:0") + + train_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = True, download = True) + + train_data = train_dataset.data.reshape(60000, 28*28)[:5000,:] + + num_features = train_data.size(1) + num_latents = 128 + + root_ns = juice.structures.HCLT( + train_data.float().to(device), + num_bins = 32, + sigma = 0.5 / 32, + num_latents = num_latents, + chunk_size = 32 + ) + root_ns.init_parameters() + + pc = juice.TensorCircuit(root_ns) + + pc.to(device) + + group_size = root_ns.chs[0].group_size + num_groups = num_latents // group_size + + batch_data = train_data[:512,:].contiguous().to(device) + data_cpu = batch_data.cpu().long() + batch_size = batch_data.size(0) + + lls = pc(batch_data) + lls.mean().backward() + + ns2old_params = dict() + for ns in root_ns: + if ns.is_sum() and ns.has_params(): + ns2old_params[ns] = ns._params.clone() + + pseudocount = 0.01 + step_size = 0.24 + + pc.mini_batch_em(step_size = step_size, pseudocount = pseudocount) + + pc.update_parameters() + pc.update_param_flows() + + for ns in root_ns: + if ns.is_sum() and ns != root_ns: + old_params = ns2old_params[ns].reshape(num_groups, num_groups * ns.num_chs, group_size, group_size).permute(0, 2, 1, 3) + old_params = old_params.reshape(num_latents, num_latents * ns.num_chs) + + ref_params = ns._params.reshape(num_groups, num_groups * ns.num_chs, group_size, 
group_size).permute(0, 2, 1, 3) + ref_params = ref_params.reshape(num_latents, num_latents * ns.num_chs) + + par_flows = ns._param_flows.reshape(num_groups, num_groups * ns.num_chs, group_size, group_size).permute(0, 2, 1, 3) + par_flows = par_flows.reshape(num_latents, num_latents * ns.num_chs) + + new_params = (par_flows + pseudocount / par_flows.size(1)) / (par_flows.sum(dim = 1, keepdim = True) + pseudocount) + + updated_params = (1.0 - step_size) * old_params + step_size * new_params + + assert torch.all(torch.abs(ref_params - updated_params) < 1e-4) + + elif ns == root_ns: + old_params = ns2old_params[ns].reshape(1, num_groups * ns.num_chs, 1, group_size).permute(0, 2, 1, 3) + old_params = old_params.reshape(1, num_latents * ns.num_chs) + + ref_params = ns._params.reshape(1, num_groups * ns.num_chs, 1, group_size).permute(0, 2, 1, 3) + ref_params = ref_params.reshape(1, num_latents * ns.num_chs) + + par_flows = ns._param_flows.reshape(1, num_groups * ns.num_chs, 1, group_size).permute(0, 2, 1, 3) + par_flows = par_flows.reshape(1, num_latents * ns.num_chs) + + new_params = (par_flows + pseudocount / par_flows.size(1)) / (par_flows.sum(dim = 1, keepdim = True) + pseudocount) + + updated_params = (1.0 - step_size) * old_params + step_size * new_params + + assert torch.all(torch.abs(ref_params - updated_params) < 1e-4) + + if __name__ == "__main__": torch.manual_seed(320942) hclt_forward_test() hclt_backward_test() + hclt_em_test() diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index 562c304a..9c297218 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -109,8 +109,38 @@ def hclt_test(): milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] ) - mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) - full_batch_em_epoch(pc, train_loader, test_loader, device) + for epoch in range(10): + t0 = time.time() + train_ll = 0.0 + for idx, batch in enumerate(train_loader): + x = batch[0].to(device) + + optimizer.zero_grad() + + lls = pc(x) + + if lls.isnan().any(): + import pdb; pdb.set_trace() + + lls.mean().backward() + + if pc.node_flows.isnan().any(): + import pdb; pdb.set_trace() + + train_ll += lls.mean().detach().cpu().numpy().item() + + optimizer.step() + scheduler.step() + + if pc.params.isnan().any(): + import pdb; pdb.set_trace() + + train_ll /= len(train_loader) + + exit() + + # mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) + # full_batch_em_epoch(pc, train_loader, test_loader, device) def hclt_logistic_test(): @@ -167,4 +197,4 @@ def hclt_logistic_test(): if __name__ == "__main__": hclt_test() - hclt_logistic_test() + # hclt_logistic_test() From 22899eda64038d91a3945bc4500d9460e84c92e2 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 15 Dec 2023 23:35:21 +0800 Subject: [PATCH 078/162] fix numerical stability issue --- src/pyjuice/layer/sum_layer.py | 230 +++------------------------------ tests/structures/hclt_test.py | 42 +++--- 2 files changed, 39 insertions(+), 233 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index eb43eb5c..c9fcfff4 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -275,16 +275,6 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, parpids = self.partitioned_parpids[partition_id] cs_group_size = self.cs_group_sizes[partition_id] - torch.cuda.synchronize() - - if node_flows.isnan().any() or node_flows.isinf().any(): - 
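For reference, the mini-batch EM update that `hclt_em_test` above verifies fits in a few lines. This is a sketch with an illustrative name (`em_update`); it mirrors the pseudocount smoothing and step-size damping used in the test rather than pyjuice's internal update kernels.

def em_update(old_params, par_flows, step_size, pseudocount):
    # Illustrative sketch; mirrors the reference computation in hclt_em_test.
    # M-step estimate from the accumulated flows, with the pseudocount spread
    # uniformly over the entries of each row.
    new_params = (par_flows + pseudocount / par_flows.size(1)) / \
                 (par_flows.sum(dim = 1, keepdim = True) + pseudocount)
    # Damped mini-batch step from the old parameters toward the estimate.
    return (1.0 - step_size) * old_params + step_size * new_params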
import pdb; pdb.set_trace() - a = 0 - - if element_flows.isnan().any() or element_flows.isinf().any(): - import pdb; pdb.set_trace() - a = 0 - self._backward( node_flows, element_flows, params, node_mars, element_mars, param_flows, @@ -294,11 +284,7 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, torch.cuda.synchronize() - if node_flows.isnan().any() or node_flows.isinf().any(): - import pdb; pdb.set_trace() - a = 0 - - if element_flows.isnan().any() or element_flows.isinf().any(): + if param_flows.isnan().any() or param_flows.isinf().any(): self._backward( node_flows, element_flows, params, node_mars, element_mars, param_flows, @@ -930,14 +916,14 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. node_flows, element_flows, params, node_mars, element_mars, chids = chids, parids = parids, parpids = parpids, cs_group_size = cs_group_size, local_ids = local_ids, - partition_id = partition_id, debug = debug + partition_id = partition_id ) # Flows w.r.t. parameters if param_flows is not None and nids is not None: self._backward_block_sparse_par_flows( node_flows, params, node_mars, element_mars, param_flows, - nids = nids, cids = cids, pids = pids, pfids = pfids + nids = nids, cids = cids, pids = pids, pfids = pfids, debug = debug ) return None @@ -948,7 +934,7 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele chids, parids_start, parids_increment, parpids_start, parpids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr, OP_MODE: tl.constexpr): + GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -986,8 +972,8 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele # Initialize pointers to `element_mars` off_eleids = tl.load(chids + elegroup_id) - offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - emars = tl.load(element_mars + offs_elemfs, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] # Batch increment pointers parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid @@ -1001,27 +987,7 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - nmars_max = tl.max(nmars, axis = 0) # [BLOCK_B] - nflows_div_mars = nflows * tl.exp(nmars_max[None,:] - nmars) - eflows = tl.sum(epars[:,:,None] * tl.exp(emars[:,None,:] - nmars[None,:,:]) * nflows[None,:,:], axis = 1) - - # if OP_MODE == 0: - # # Built-in matmul kernel of triton + float16 - # epars = epars.to(tl.float16) - # nflows_div_mars = nflows_div_mars.to(tl.float16) - # eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) - # if OP_MODE == 1: - # # Built-in matmul kernel of triton + float32 - # eflows = tl.dot(epars, nflows_div_mars) - # if OP_MODE == 2: - # # Simulated matmul kernel + float16 - # epars = epars.to(tl.float16) - # nflows_div_mars = 
nflows_div_mars.to(tl.float16) - # eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1).to(tl.float32) - # if OP_MODE == 3: - # # Simulated matmul kernel + float32 - # eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1) acc += eflows @@ -1037,7 +1003,6 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele parids_inc += ptr_inc_step # Write back - off_eleids = tl.load(chids + elegroup_id) offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] tl.store(element_flows + offs_elemfs, acc, mask = mask_batch[None,:]) @@ -1045,7 +1010,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, cs_group_size: int, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1, force_use_fp16: bool = False, force_use_fp32: bool = False, debug = False) -> None: + partition_id: int = -1) -> None: assert params.dim() == 1, "Expecting a 1D `params`." @@ -1110,102 +1075,8 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo else: parids_start, parids_increment, parpids_start, parpids_increment, ptr_inc_step = self._cached_bk_parids[signature] - if force_use_fp16: - assert not force_use_fp32 - use_fp16 = True - elif force_use_fp32: - use_fp16 = False - else: - if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: - use_fp16 = True - else: - use_fp16 = False - - if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: - if use_fp16: - OP_MODE = 0 - else: - OP_MODE = 1 - else: - if use_fp16: - OP_MODE = 2 - else: - OP_MODE = 3 - grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - if debug: - import pdb; pdb.set_trace() - - # num_ngroups = chids.size(0) - # num_egroups = parids.size(1) - # parids = (parids[:,:,None].repeat(1, 1, self.group_size) + torch.arange(0, self.group_size, device = parids.device)).reshape(num_ngroups, num_egroups * self.group_size) - # parpids = (parpids[:,:,None] + torch.arange(0, self.group_size, device = parids.device)).reshape( - # num_ngroups, num_egroups * self.group_size) - - # chids = (chids[:,None].repeat(1, cs_group_size) + torch.arange(0, cs_group_size, device = chids.device)).reshape(num_ngroups * cs_group_size) - # parids = parids[:,None,:].repeat(1, cs_group_size, 1).reshape(num_ngroups * cs_group_size, num_egroups * self.group_size) - # parpids = (parpids[:,None,:].repeat(1, cs_group_size, 1) + torch.arange(0, cs_group_size * self.group_size, self.group_size, device = parpids.device)[None,:,None]).reshape( - # num_ngroups * cs_group_size, num_egroups * self.group_size - # ) - - # element_flows[chids] = (node_flows[parids] * params[parpids].unsqueeze(-1) * \ - # (element_mars[chids].unsqueeze(1) - node_mars[parids]).exp()).sum(dim = 1) - - import numpy as np - np.savez("temp.npz", node_flows = node_flows.cpu().numpy(), - element_flow = element_flows.cpu().numpy(), - node_mars = node_mars.cpu().numpy(), - element_mars = element_mars.cpu().numpy(), - params = params.cpu().numpy(), - chids = chids.cpu().numpy(), - parids = parids.cpu().numpy(), - parids_start = parids_start.cpu().numpy(), - parids_increment = parids_increment.cpu().numpy(), - parpids = parpids.cpu().numpy(), - parpids_start = parpids_start.cpu().numpy(), - parpids_increment = parpids_increment.cpu().numpy(), - batch_size = batch_size, - ptr_inc_step = ptr_inc_step, - 
BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = K_NUM_TILES, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = cs_group_size, - GROUP_SIZE_K = self.group_size, - OP_MODE = OP_MODE, - layer_n_nodes = layer_n_nodes) - - import pdb; pdb.set_trace() - - OP_MODE = 1 - - self._bk_triton_block_sparse_ele_kernel[grid]( - node_flows = node_flows, - element_flows = element_flows, - node_mars = node_mars, - element_mars = element_mars, - params = params, - chids = chids, - parids_start = parids_start, - parids_increment = parids_increment, - parpids_start = parpids_start, - parpids_increment = parpids_increment, - local_ids = local_ids, - batch_size = batch_size, - partial_eval = 1 if local_ids is not None else 0, - ptr_inc_step = ptr_inc_step, - BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = K_NUM_TILES, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = cs_group_size, - GROUP_SIZE_K = self.group_size, - OP_MODE = OP_MODE - ) - - import pdb; pdb.set_trace() - self._bk_triton_block_sparse_ele_kernel[grid]( node_flows = node_flows, element_flows = element_flows, @@ -1226,8 +1097,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo K_NUM_TILES = K_NUM_TILES, TILE_SIZE_M = TILE_SIZE_M, GROUP_SIZE_M = cs_group_size, - GROUP_SIZE_K = self.group_size, - OP_MODE = OP_MODE + GROUP_SIZE_K = self.group_size ) return None @@ -1237,7 +1107,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, batch_size: tl.constexpr, num_edges: tl.constexpr, TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, OP_MODE: tl.constexpr): + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -1263,6 +1133,11 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para nmars_ptr = node_mars + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] nflows_ptr = node_flows + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + # Initialize `params` + par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) + epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + epars = tl.load(params + epars_offsets) + # Inner loop acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) @@ -1271,27 +1146,7 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] - emars_max = tl.max(emars, axis = 1) - nflows_div_mars = nflows * tl.exp(emars_max[None,:] - nmars) - - emars = tl.exp(emars - emars_max[:,None]) - - if OP_MODE == 0: - # Built-in matmul kernel of triton + float16 - nflows_div_mars = nflows_div_mars.to(tl.float16) - emars = emars.to(tl.float16) - pflows = tl.dot(nflows_div_mars, emars).to(tl.float32) - if OP_MODE == 1: - # Built-in matmul kernel of triton + float32 - pflows = tl.dot(nflows_div_mars, emars).to(tl.float32) - if OP_MODE == 2: - # Simulated matmul kernel + float16 - nflows_div_mars = nflows_div_mars.to(tl.float16) - emars = emars.to(tl.float16) - pflows = tl.sum(nflows_div_mars[:,:,None] * emars[None,:,:], axis = 1).to(tl.float32) - if 
OP_MODE == 3: - # Simulated matmul kernel + float32 - pflows = tl.sum(nflows_div_mars[:,:,None] * emars[None,:,:], axis = 1) + pflows = tl.sum(epars[:,None,:] * tl.exp(emars[None,:,:] - nmars[:,:,None]) * nflows[:,:,None], axis = 1) acc += pflows @@ -1304,21 +1159,15 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para offs_batch += TILE_SIZE_B mask_batch = offs_batch < batch_size - par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) - epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - epars = tl.load(params + epars_offsets) - parflow_start = tl.load(pfids + ngroup_id * num_edges + offs_edge) eparflows_offsets = offs_node[:,None] + parflow_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - - curr_pflows = acc * epars - tl.atomic_add(param_flows + eparflows_offsets, curr_pflows) + tl.atomic_add(param_flows + eparflows_offsets, acc) def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor, - force_use_fp16: bool = False, force_use_fp32: bool = False) -> None: + debug = False) -> None: """ Backward pass of sum layers w.r.t. sum parameters with the block-sparse processing kernel. @@ -1360,28 +1209,6 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor "This is an internal error of PyJuice. Please consider checking the kernel dispatching criterions and use the " \ "corresponding sparse kernel instead." - if force_use_fp16: - assert not force_use_fp32 - use_fp16 = True - elif force_use_fp32: - use_fp16 = False - else: - if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and TILE_SIZE_B >= 16: - use_fp16 = True - else: - use_fp16 = False - - if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and TILE_SIZE_B >= 16: - if use_fp16: - OP_MODE = 0 - else: - OP_MODE = 1 - else: - if use_fp16: - OP_MODE = 2 - else: - OP_MODE = 3 - grid = (triton.cdiv(num_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) self._bk_triton_block_sparse_par_kernel[grid]( @@ -1400,8 +1227,7 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor B_NUM_TILES = B_NUM_TILES, TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = self.group_size, - OP_MODE = OP_MODE + GROUP_SIZE_M = self.group_size ) def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, @@ -1524,16 +1350,6 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, BLOCK_M)) - torch.cuda.synchronize() - - if node_flows.isnan().any() or node_flows.isinf().any(): - import pdb; pdb.set_trace() - a = 0 - - if element_flows.isnan().any() or element_flows.isinf().any(): - import pdb; pdb.set_trace() - a = 0 - self._bk_triton_sparse_ele_kernel[grid]( node_flows = node_flows, element_flows = element_flows, @@ -1552,16 +1368,6 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to GROUP_SIZE_K = self.group_size ) - torch.cuda.synchronize() - - if node_flows.isnan().any() or node_flows.isinf().any(): - import pdb; pdb.set_trace() - a = 0 - - if element_flows.isnan().any() or element_flows.isinf().any(): - import pdb; pdb.set_trace() - a = 0 - return None @staticmethod diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index 9c297218..6cec0fb0 100644 --- 
a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -109,38 +109,38 @@ def hclt_test(): milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] ) - for epoch in range(10): - t0 = time.time() - train_ll = 0.0 - for idx, batch in enumerate(train_loader): - x = batch[0].to(device) + # for epoch in range(10): + # t0 = time.time() + # train_ll = 0.0 + # for idx, batch in enumerate(train_loader): + # x = batch[0].to(device) - optimizer.zero_grad() + # optimizer.zero_grad() - lls = pc(x) + # lls = pc(x) - if lls.isnan().any(): - import pdb; pdb.set_trace() + # if lls.isnan().any(): + # import pdb; pdb.set_trace() - lls.mean().backward() + # lls.mean().backward() - if pc.node_flows.isnan().any(): - import pdb; pdb.set_trace() + # if pc.node_flows.isnan().any(): + # import pdb; pdb.set_trace() - train_ll += lls.mean().detach().cpu().numpy().item() + # train_ll += lls.mean().detach().cpu().numpy().item() - optimizer.step() - scheduler.step() + # optimizer.step() + # scheduler.step() - if pc.params.isnan().any(): - import pdb; pdb.set_trace() + # if pc.params.isnan().any(): + # import pdb; pdb.set_trace() - train_ll /= len(train_loader) + # train_ll /= len(train_loader) - exit() + # exit() - # mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) - # full_batch_em_epoch(pc, train_loader, test_loader, device) + mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) + full_batch_em_epoch(pc, train_loader, test_loader, device) def hclt_logistic_test(): From a5efa4ea5211f3f2c134e86d19256b8bf42d0584 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 16 Dec 2023 16:29:28 +0800 Subject: [PATCH 079/162] now this seems to work --- src/pyjuice/layer/sum_layer.py | 283 +++++++++++++++++-- src/pyjuice/model/tensorcircuit.py | 4 +- tests/layer/sum_layer_test.py | 2 +- tests/model/simple_model_test.py | 51 ++-- tests/structures/debug.py | 326 ++++++++++++++++++++++ tests/structures/debug_bk_kernel.py | 188 ------------- tests/structures/hclt_correctness_test.py | 16 +- tests/structures/hclt_test.py | 33 ++- 8 files changed, 637 insertions(+), 266 deletions(-) create mode 100644 tests/structures/debug.py delete mode 100644 tests/structures/debug_bk_kernel.py diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index c9fcfff4..494e2077 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -284,16 +284,6 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, torch.cuda.synchronize() - if param_flows.isnan().any() or param_flows.isinf().any(): - self._backward( - node_flows, element_flows, params, node_mars, - element_mars, param_flows, - chids = chids, parids = parids, parpids = parpids, - cs_group_size = cs_group_size, debug = True - ) - import pdb; pdb.set_trace() - a = 0 - else: # Partial evaluation for partition_id in range(self.num_bk_partitions): @@ -360,8 +350,6 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, else: mode = "block_sparse" - # mode = "sparse" #### debug - if mode == "block_sparse": self._forward_block_sparse( node_mars, element_mars, params, nids, cids, pids, local_ids, @@ -829,7 +817,7 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, chids: Optional[torch.Tensor] = None, parids: Optional[torch.Tensor] = None, parpids: Optional[torch.Tensor] = None, cs_group_size: int = 0, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1, mode: Optional[str] = None, 
debug = False) -> None: + partition_id: int = -1, mode: Optional[str] = None) -> None: """ Back pass of sum layers. @@ -869,7 +857,7 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, self._backward_block_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, - partition_id = partition_id, debug = debug + partition_id = partition_id ) elif mode == "sparse": @@ -894,7 +882,7 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. nids: Optional[torch.Tensor], cids: Optional[torch.Tensor], pids: Optional[torch.Tensor], pfids: Optional[torch.Tensor], chids: Optional[torch.Tensor], parids: Optional[torch.Tensor], parpids: Optional[torch.Tensor], cs_group_size: int, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1, debug = False) -> None: + partition_id: int = -1) -> None: """ Back pass of sum layers with block-sparse processing kernel. @@ -923,7 +911,7 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. if param_flows is not None and nids is not None: self._backward_block_sparse_par_flows( node_flows, params, node_mars, element_mars, param_flows, - nids = nids, cids = cids, pids = pids, pfids = pfids, debug = debug + nids = nids, cids = cids, pids = pids, pfids = pfids ) return None @@ -981,14 +969,25 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele # Inner loop acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) + log_max = tl.zeros([BLOCK_B], dtype = tl.float32) - float("inf") for k in range(0, K_NUM_TILES): epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - eflows = tl.sum(epars[:,:,None] * tl.exp(emars[:,None,:] - nmars[None,:,:]) * nflows[None,:,:], axis = 1) + # log_n_fdm = tl.log(nflows) - nmars + # log_n_fdm_max = tl.max(log_n_fdm, axis = 0) + # n_fdm_sub = tl.exp(log_n_fdm - log_n_fdm_max[None,:]) + # partial_flows = tl.dot(epars, n_fdm_sub) + + # acc = tl.where(log_max[None,:] > log_n_fdm_max[None,:], + # acc + tl.exp(log_n_fdm_max - log_max)[None,:] * partial_flows, + # partial_flows + tl.exp(log_max - log_n_fdm_max)[None,:] * acc) + # log_max = tl.maximum(log_max, log_n_fdm_max) + + eflows = tl.sum(epars[:,:,None] * tl.exp(emars[:,None,:] - nmars[None,:,:]) * nflows[None,:,:], axis = 1) acc += eflows # Increment `epars_ptr` @@ -1002,10 +1001,105 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele nflows_ptr += parids_inc[:,None] * batch_size parids_inc += ptr_inc_step + # # Initialize pointers to `element_mars` + # off_eleids = tl.load(chids + elegroup_id) + # emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + # emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + # eflows = acc * tl.exp(emars + log_max[None,:]) + # Write back offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] tl.store(element_flows + offs_elemfs, acc, mask = mask_batch[None,:]) + @staticmethod + @triton.jit + def my_kernel(node_flows, element_flows, node_mars, element_mars, params, + chids, parids_start, parids_increment, parpids_start, parpids_increment, + local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, + BLOCK_B: tl.constexpr, 
TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + elegroup_id = tl.load(local_ids + elegroup_id) + + # Initialize pointers to `params` + offs_ele = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_edge = tl.arange(0, TILE_SIZE_K) + offs_edge_gid = offs_edge // GROUP_SIZE_K + offs_edge_nid = (offs_edge % GROUP_SIZE_K) + par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + epars_ptr = params + \ + offs_ele[:,None] * GROUP_SIZE_K + \ + (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize pointers to `node_mars` + edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + nmars_ptr = node_mars + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + nflows_ptr = node_flows + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + # Batch increment pointers + parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - float("inf") + + for k in range(0, K_NUM_TILES): + epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + log_n_fdm = tl.log(nflows) - nmars + log_n_fdm_max = tl.max(log_n_fdm, axis = 0) + n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) + + partial_flows = tl.dot(epars, n_fdm_sub, allow_tf32 = True) + # partial_flows = tl.sum(epars[:,:,None] * n_fdm_sub[None,:,:], axis = 1) + + acc = tl.where(log_n_fdm_max[None,:] > acc, + tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + log_n_fdm_max[None,:], + tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc + ) + + # Increment `epars_ptr` + parpids_inc = tl.load(parpids_inc_ptr) + epars_ptr += parpids_inc[None,:] + parpids_inc_ptr += ptr_inc_step + + # Increment `nmars_ptr` + parids_inc = tl.load(parids_inc_ptr) + nmars_ptr += parids_inc[:,None] * batch_size + nflows_ptr += parids_inc[:,None] * batch_size + parids_inc += ptr_inc_step + + # Initialize pointers to `element_mars` + off_eleids = tl.load(chids + elegroup_id) + emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + eflows = tl.exp(acc + emars) + + # Write back + offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + tl.store(element_flows + offs_elemfs, eflows, mask = mask_batch[None,:]) + def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, 
node_mars: torch.Tensor, element_mars: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, @@ -1021,17 +1115,17 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) # Heuristic to set `TILE_SIZE_M`, `TILE_SIZE_K`, and `BLOCK_B` - base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 128) - if base_size >= 64: + base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 64) + if base_size >= 32: TILE_SIZE_K = base_size - TILE_SIZE_M = 2048 // base_size - BLOCK_B = 2048 // base_size + TILE_SIZE_M = 1024 // base_size + BLOCK_B = 1024 // base_size else: - remainder = 2048 // (base_size ** 2) + remainder = 1024 // (base_size ** 2) - TILE_SIZE_K = min(2048 // remainder, base_size * remainder, num_edges) - TILE_SIZE_M = min(2048 // TILE_SIZE_K, cs_group_size) - BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) + TILE_SIZE_K = min(1024 // remainder, base_size * remainder, num_edges) + TILE_SIZE_M = min(1024 // TILE_SIZE_K, cs_group_size) + BLOCK_B = min(1024 // TILE_SIZE_K, BATCH_SIZE_NP2) K_NUM_TILES = num_edges // TILE_SIZE_K assert TILE_SIZE_K >= 4, f"`TILE_SIZE_K` should be greater than 4 (but got {TILE_SIZE_K}) in order to use the block-sparse kernel. " \ @@ -1077,7 +1171,29 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - self._bk_triton_block_sparse_ele_kernel[grid]( + # self._bk_triton_block_sparse_ele_kernel[grid]( + # node_flows = node_flows, + # element_flows = element_flows, + # node_mars = node_mars, + # element_mars = element_mars, + # params = params, + # chids = chids, + # parids_start = parids_start, + # parids_increment = parids_increment, + # parpids_start = parpids_start, + # parpids_increment = parpids_increment, + # local_ids = local_ids, + # batch_size = batch_size, + # partial_eval = 1 if local_ids is not None else 0, + # ptr_inc_step = ptr_inc_step, + # BLOCK_B = BLOCK_B, + # TILE_SIZE_K = TILE_SIZE_K, + # K_NUM_TILES = K_NUM_TILES, + # TILE_SIZE_M = TILE_SIZE_M, + # GROUP_SIZE_M = cs_group_size, + # GROUP_SIZE_K = self.group_size + # ) + self.my_kernel[grid]( node_flows = node_flows, element_flows = element_flows, node_mars = node_mars, @@ -1100,6 +1216,12 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo GROUP_SIZE_K = self.group_size ) + # torch.cuda.synchronize() + element_flows[0,:] = 0.0 + + # if element_flows.isnan().any() or element_flows.isinf().any(): + # import pdb; pdb.set_trace() + return None @staticmethod @@ -1146,8 +1268,16 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] - pflows = tl.sum(epars[:,None,:] * tl.exp(emars[None,:,:] - nmars[:,:,None]) * nflows[:,:,None], axis = 1) + # log_n_fdm = tl.log(nflows) - nmars + # log_n_fdm_max = tl.max(log_n_fdm, axis = 0) + # n_fdm_sub = tl.exp(log_n_fdm - log_n_fdm_max[None,:]) + # scaled_emars = tl.exp(emars + log_n_fdm_max[:,None]) + + # partial_flows = tl.dot(n_fdm_sub, scaled_emars) + # acc += partial_flows + + pflows = tl.sum(epars[:,None,:] * tl.exp(emars[None,:,:] - nmars[:,:,None]) * nflows[:,:,None], axis = 1) acc += pflows # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` @@ -1159,15 +1289,90 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, 
element_mars, para offs_batch += TILE_SIZE_B mask_batch = offs_batch < batch_size + # # Initialize `params` + # par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) + # epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + # epars = tl.load(params + epars_offsets) + + # pflows = acc * epars + parflow_start = tl.load(pfids + ngroup_id * num_edges + offs_edge) eparflows_offsets = offs_node[:,None] + parflow_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] tl.atomic_add(param_flows + eparflows_offsets, acc) + @staticmethod + @triton.jit + def my_kernel2(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, + batch_size: tl.constexpr, num_edges: tl.constexpr, TILE_SIZE_B: tl.constexpr, + B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + + pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Batch offsets and mask + offs_batch = tl.arange(0, TILE_SIZE_B) + mask_batch = offs_batch < batch_size + + # Initialize pointers to `element_mars` + offs_edge = tl.arange(0, TILE_SIZE_K) + pid_k * TILE_SIZE_K + edge_start = tl.load(cids + ngroup_id * num_edges + offs_edge) + emars_ptr = element_mars + \ + edge_start[None,:] * batch_size + \ + offs_batch[:,None] # [TILE_SIZE_B, TILE_SIZE_K] + + # Initialize pointers to `node_flows` and `node_mars` + offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + off_nids = tl.load(nids + ngroup_id) + nmars_ptr = node_mars + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + nflows_ptr = node_flows + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) + + for b in range(0, B_NUM_TILES): + emars = tl.load(emars_ptr, mask = mask_batch[:,None]) # [TILE_SIZE_B, TILE_SIZE_K] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] + + log_n_fdm = tl.log(nflows) - nmars + log_n_fdm_max = tl.max(log_n_fdm, axis = 0) + n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) + + scaled_emars = tl.exp(emars + log_n_fdm_max[:,None]) + + partial_flows = tl.dot(n_fdm_sub, scaled_emars, allow_tf32 = True) + acc += partial_flows + + # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` + emars_ptr += TILE_SIZE_B + nmars_ptr += TILE_SIZE_B + nflows_ptr += TILE_SIZE_B + + # Update batch mask + offs_batch += TILE_SIZE_B + mask_batch = offs_batch < batch_size + + # Initialize `params` + par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) + epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + epars = tl.load(params + epars_offsets) + + pflows = acc * epars + + parflow_start = tl.load(pfids + ngroup_id * num_edges + offs_edge) + eparflows_offsets = offs_node[:,None] + parflow_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + tl.atomic_add(param_flows + eparflows_offsets, pflows) + def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, - cids: torch.Tensor, pids: torch.Tensor, pfids: 
torch.Tensor, - debug = False) -> None: + cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor) -> None: """ Backward pass of sum layers w.r.t. sum parameters with the block-sparse processing kernel. @@ -1211,7 +1416,25 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor grid = (triton.cdiv(num_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - self._bk_triton_block_sparse_par_kernel[grid]( + # self._bk_triton_block_sparse_par_kernel[grid]( + # node_flows = node_flows, + # node_mars = node_mars, + # element_mars = element_mars, + # params = params, + # param_flows = param_flows, + # nids = nids, + # cids = cids, + # pids = pids, + # pfids = pfids, + # batch_size = batch_size, + # num_edges = num_edges, + # TILE_SIZE_B = TILE_SIZE_B, + # B_NUM_TILES = B_NUM_TILES, + # TILE_SIZE_K = TILE_SIZE_K, + # TILE_SIZE_M = TILE_SIZE_M, + # GROUP_SIZE_M = self.group_size + # ) + self.my_kernel2[grid]( node_flows = node_flows, node_mars = node_mars, element_mars = element_mars, diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index 7d80d1f9..bff52499 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -98,7 +98,7 @@ def to(self, device): def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Callable]] = None, cache: Optional[dict] = None, return_cache: bool = False, record_cudagraph: bool = False, - apply_cudagraph: bool = False, **kwargs): + apply_cudagraph: bool = True, **kwargs): """ Forward the circuit. @@ -109,7 +109,7 @@ def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Calla the corresponding member function of `input_layer` `kwargs`: Additional arguments for input layers """ - + assert inputs.dim() == 2 and inputs.size(1) == self.num_vars B = inputs.size(0) diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 99f191b0..e60a30d5 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -222,4 +222,4 @@ def speed_test(): if __name__ == "__main__": torch.manual_seed(3890) sum_layer_test() - speed_test() \ No newline at end of file + # speed_test() \ No newline at end of file diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index 5bd3d9a1..94bbaa18 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -323,10 +323,11 @@ def simple_model_test(): ch_lls = torch.cat((np0_lls, np3_lls), dim = 0) epars = ns0._params.reshape(2, 4, 16, 16).permute(0, 2, 1, 3).reshape(32, 64) - ch_lls_max = ch_lls.max(dim = 0).values - nflow_div_mar = ns0_flows * (ch_lls_max[None,:] - ns0_lls).exp() - emars = (ch_lls - ch_lls_max[None,:]).exp() - eflows = emars * torch.matmul(epars.permute(1, 0), nflow_div_mar) + log_n_fdm = ns0_flows.log() - ns0_lls + log_n_fdm_max = log_n_fdm.max(dim = 0).values + n_fdm_sub = (log_n_fdm - log_n_fdm_max[None,:]).exp() + emars = (ch_lls + log_n_fdm_max[None,:]).exp() + eflows = emars * torch.matmul(epars.permute(1, 0), n_fdm_sub) sid, eid = np0._output_ind_range np0_flows = eflows[0:32,:] @@ -336,39 +337,41 @@ def simple_model_test(): np3_flows = eflows[32:64,:] assert torch.all(torch.abs(np3_flows - element_flows[sid:eid,:]) < 1e-4) - ns0_parflows = epars * torch.matmul(nflow_div_mar, emars.permute(1, 0)) + ns0_parflows = epars * torch.matmul(n_fdm_sub, emars.permute(1, 0)) ref_parflows = param_flows[0:2048].reshape(2, 64, 16).permute(0, 2, 1).reshape(32, 64) - assert torch.all(torch.abs(ns0_parflows - 
ref_parflows) < 1e-4) + assert torch.all(torch.abs(ns0_parflows - ref_parflows) < 1e-3) ch_lls = np1_lls epars = ns1._params.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) - ch_lls_max = ch_lls.max(dim = 0).values - nflow_div_mar = ns1_flows * (ch_lls_max[None,:] - ns1_lls).exp() - emars = (ch_lls - ch_lls_max[None,:]).exp() - eflows = emars * torch.matmul(epars.permute(1, 0), nflow_div_mar) + log_n_fdm = ns1_flows.log() - ns1_lls + log_n_fdm_max = log_n_fdm.max(dim = 0).values + n_fdm_sub = (log_n_fdm - log_n_fdm_max[None,:]).exp() + emars = (ch_lls + log_n_fdm_max[None,:]).exp() + eflows = emars * torch.matmul(epars.permute(1, 0), n_fdm_sub) sid, eid = np1._output_ind_range np1_flows = eflows assert torch.all(torch.abs(np1_flows - element_flows[sid:eid,:]) < 1e-4) - ns1_parflows = epars * torch.matmul(nflow_div_mar, emars.permute(1, 0)) + ns1_parflows = epars * torch.matmul(n_fdm_sub, emars.permute(1, 0)) ref_parflows = param_flows[2048:3072].reshape(2, 32, 16).permute(0, 2, 1).reshape(32, 32) - assert torch.all(torch.abs(ns1_parflows - ref_parflows) < 1e-4) + assert torch.all(torch.abs(ns1_parflows - ref_parflows) < 1e-3) ch_lls = np2_lls epars = ns2._params.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) - ch_lls_max = ch_lls.max(dim = 0).values - nflow_div_mar = ns2_flows * (ch_lls_max[None,:] - ns2_lls).exp() - emars = (ch_lls - ch_lls_max[None,:]).exp() - eflows = emars * torch.matmul(epars.permute(1, 0), nflow_div_mar) + log_n_fdm = ns2_flows.log() - ns2_lls + log_n_fdm_max = log_n_fdm.max(dim = 0).values + n_fdm_sub = (log_n_fdm - log_n_fdm_max[None,:]).exp() + emars = (ch_lls + log_n_fdm_max[None,:]).exp() + eflows = emars * torch.matmul(epars.permute(1, 0), n_fdm_sub) sid, eid = np2._output_ind_range np2_flows = eflows assert torch.all(torch.abs(np2_flows - element_flows[sid:eid,:]) < 1e-4) - ns2_parflows = epars * torch.matmul(nflow_div_mar, emars.permute(1, 0)) + ns2_parflows = epars * torch.matmul(n_fdm_sub, emars.permute(1, 0)) ref_parflows = param_flows[3072:4096].reshape(2, 32, 16).permute(0, 2, 1).reshape(32, 32) - assert torch.all(torch.abs(ns2_parflows - ref_parflows) < 3e-4) + assert torch.all(torch.abs(ns2_parflows - ref_parflows) < 1e-3) sid, eid = ni0._output_ind_range ni0_flows = np0_flows + np3_flows + np5_flows + np6_flows @@ -394,25 +397,25 @@ def simple_model_test(): ref_pflows = torch.zeros_like(ni0_pflows) for b in range(512): ref_pflows[:,data_cpu[b,0]] += ni0_flows[:,b] - assert torch.all(torch.abs(ni0_pflows - ref_pflows) < 4e-3) + assert torch.all(torch.abs(ni0_pflows - ref_pflows) < 6e-3) ni1_pflows = input_pflows[128:256].reshape(32, 4) ref_pflows = torch.zeros_like(ni1_pflows) for b in range(512): ref_pflows[:,data_cpu[b,1]] += ni1_flows[:,b] - assert torch.all(torch.abs(ni1_pflows - ref_pflows) < 4e-3) + assert torch.all(torch.abs(ni1_pflows - ref_pflows) < 6e-3) ni2_pflows = input_pflows[256:448].reshape(32, 6) ref_pflows = torch.zeros_like(ni2_pflows) for b in range(512): ref_pflows[:,data_cpu[b,2]] += ni2_flows[:,b] - assert torch.all(torch.abs(ni2_pflows - ref_pflows) < 4e-3) + assert torch.all(torch.abs(ni2_pflows - ref_pflows) < 6e-3) ni3_pflows = input_pflows[448:640].reshape(32, 6) ref_pflows = torch.zeros_like(ni3_pflows) for b in range(512): ref_pflows[:,data_cpu[b,3]] += ni3_flows[:,b] - assert torch.all(torch.abs(ni3_pflows - ref_pflows) < 4e-3) + assert torch.all(torch.abs(ni3_pflows - ref_pflows) < 6e-3) ## EM Optimization tests ## @@ -486,6 +489,10 @@ def simple_model_test(): cum_pflows = 
pc.par_update_kwargs[6].cpu() + ns0_parflows = ns0._param_flows.reshape(2, 4, 16, 16).permute(0, 2, 1, 3).reshape(32, 64) + ns1_parflows = ns1._param_flows.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + ns2_parflows = ns2._param_flows.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + assert torch.all(torch.abs(ns0_parflows.sum(dim = 1) - cum_pflows[0:32]) < 1e-3) assert torch.all(torch.abs(ns1_parflows.sum(dim = 1) - cum_pflows[32:64]) < 1e-3) assert torch.all(torch.abs(ns2_parflows.sum(dim = 1) - cum_pflows[64:96]) < 1e-3) diff --git a/tests/structures/debug.py b/tests/structures/debug.py new file mode 100644 index 00000000..2c42c63a --- /dev/null +++ b/tests/structures/debug.py @@ -0,0 +1,326 @@ +import numpy as np +import torch + +import triton +import triton.language as tl + + +@triton.jit +def ref_kernel(node_flows, element_flows, node_mars, element_mars, params, + chids, parids_start, parids_increment, parpids_start, parpids_increment, + local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + elegroup_id = tl.load(local_ids + elegroup_id) + + # Initialize pointers to `params` + offs_ele = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_edge = tl.arange(0, TILE_SIZE_K) + offs_edge_gid = offs_edge // GROUP_SIZE_K + offs_edge_nid = (offs_edge % GROUP_SIZE_K) + par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + epars_ptr = params + \ + offs_ele[:,None] * GROUP_SIZE_K + \ + (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize pointers to `node_mars` + edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + nmars_ptr = node_mars + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + nflows_ptr = node_flows + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + # Initialize pointers to `element_mars` + off_eleids = tl.load(chids + elegroup_id) + emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + # Batch increment pointers + parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) + log_max = tl.zeros([BLOCK_B], dtype = tl.float32) - float("inf") + + for k in range(0, K_NUM_TILES): + epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + # log_n_fdm = tl.log(nflows) - nmars + # log_n_fdm_max = 
tl.max(log_n_fdm, axis = 0) + # n_fdm_sub = tl.exp(log_n_fdm - log_n_fdm_max[None,:]) + + # partial_flows = tl.dot(epars, n_fdm_sub) + + # acc = tl.where(log_max[None,:] > log_n_fdm_max[None,:], + # acc + tl.exp(log_n_fdm_max - log_max)[None,:] * partial_flows, + # partial_flows + tl.exp(log_max - log_n_fdm_max)[None,:] * acc) + # log_max = tl.maximum(log_max, log_n_fdm_max) + + eflows = tl.sum(epars[:,:,None] * tl.exp(emars[:,None,:] - nmars[None,:,:]) * nflows[None,:,:], axis = 1) + acc += eflows + + # Increment `epars_ptr` + parpids_inc = tl.load(parpids_inc_ptr) + epars_ptr += parpids_inc[None,:] + parpids_inc_ptr += ptr_inc_step + + # Increment `nmars_ptr` + parids_inc = tl.load(parids_inc_ptr) + nmars_ptr += parids_inc[:,None] * batch_size + nflows_ptr += parids_inc[:,None] * batch_size + parids_inc += ptr_inc_step + + # # Initialize pointers to `element_mars` + # off_eleids = tl.load(chids + elegroup_id) + # emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + # emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + # eflows = acc * tl.exp(emars + log_max[None,:]) + + # Write back + offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + tl.store(element_flows + offs_elemfs, acc, mask = mask_batch[None,:]) + + +@triton.jit +def my_kernel(aaa, bbb, ccc, ddd, eee, node_flows, element_flows, node_mars, element_mars, params, + chids, parids_start, parids_increment, parpids_start, parpids_increment, + local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + elegroup_id = tl.load(local_ids + elegroup_id) + + # Initialize pointers to `params` + offs_ele = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_edge = tl.arange(0, TILE_SIZE_K) + offs_edge_gid = offs_edge // GROUP_SIZE_K + offs_edge_nid = (offs_edge % GROUP_SIZE_K) + par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + epars_ptr = params + \ + offs_ele[:,None] * GROUP_SIZE_K + \ + (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + epars = tl.load(epars_ptr) + offs1 = pid_m * (TILE_SIZE_M * TILE_SIZE_K) + tl.arange(0, TILE_SIZE_M)[:,None] * TILE_SIZE_K + tl.arange(0, TILE_SIZE_K)[None,:] + tl.store(aaa + offs1, epars) + tl.store(bbb + offs1, offs_ele[:,None] * GROUP_SIZE_K + (par_start + offs_edge_nid)[None,:]) + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize pointers to `node_mars` + edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + nmars_ptr = node_mars + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + nflows_ptr = node_flows + \ + (edge_start + offs_edge_nid)[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] + + # Batch increment pointers + parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + 
offs_edge_gid + parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - float("inf") + # acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) + + for k in range(0, K_NUM_TILES): + # for k in range(0, 1): + epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + log_n_fdm = tl.log(nflows) - nmars + log_n_fdm_max = tl.max(log_n_fdm, axis = 0) + n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), + tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) + + offs2 = pid_m * (K_NUM_TILES * TILE_SIZE_K * batch_size) + k * (TILE_SIZE_K * batch_size) + tl.arange(0, TILE_SIZE_K)[:,None] * batch_size + offs_batch[None,:] + tl.store(ccc + offs2, log_n_fdm, mask = mask_batch[None,:]) + tl.store(ddd + offs2, n_fdm_sub, mask = mask_batch[None,:]) + + partial_flows = tl.dot(epars, n_fdm_sub) + # partial_flows = tl.sum(epars[:,:,None] * n_fdm_sub[None,:,:], axis = 1) + + offs3 = pid_m * (K_NUM_TILES * TILE_SIZE_K * batch_size) + k * (TILE_SIZE_K * batch_size) + tl.arange(0, TILE_SIZE_M)[:,None] * batch_size + offs_batch[None,:] + tl.store(eee + offs3, partial_flows, mask = mask_batch[None,:]) + + acc = tl.where(log_n_fdm_max[None,:] > acc, + tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + log_n_fdm_max[None,:], + tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc + ) + # acc += partial_flows + + # Increment `epars_ptr` + parpids_inc = tl.load(parpids_inc_ptr) + epars_ptr += parpids_inc[None,:] + parpids_inc_ptr += ptr_inc_step + + # Increment `nmars_ptr` + parids_inc = tl.load(parids_inc_ptr) + nmars_ptr += parids_inc[:,None] * batch_size + nflows_ptr += parids_inc[:,None] * batch_size + parids_inc += ptr_inc_step + + # Initialize pointers to `element_mars` + off_eleids = tl.load(chids + elegroup_id) + emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + eflows = tl.exp(acc + emars) + # eflows = acc + + # Write back + offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + tl.store(element_flows + offs_elemfs, eflows, mask = mask_batch[None,:]) + + +def main(): + + device = torch.device("cuda:0") + + data = np.load("temp.npz") + + node_flows = torch.from_numpy(data["node_flows"]).to(device) + element_flows = torch.from_numpy(data["element_flow"]).to(device) + node_mars = torch.from_numpy(data["node_mars"]).to(device) + element_mars = torch.from_numpy(data["element_mars"]).to(device) + params = torch.from_numpy(data["params"]).to(device) + chids = torch.from_numpy(data["chids"]).to(device) + parids = torch.from_numpy(data["parids"]).to(device) + parids_start = torch.from_numpy(data["parids_start"]).to(device) + parids_increment = torch.from_numpy(data["parids_increment"]).to(device) + parpids = torch.from_numpy(data["parpids"]).to(device) + parpids_start = torch.from_numpy(data["parpids_start"]).to(device) + parpids_increment = torch.from_numpy(data["parpids_increment"]).to(device) + batch_size = int(data["batch_size"]) + ptr_inc_step = int(data["ptr_inc_step"]) + BLOCK_B = int(data["BLOCK_B"]) + TILE_SIZE_M = int(data["TILE_SIZE_M"]) + TILE_SIZE_K = int(data["TILE_SIZE_K"]) + K_NUM_TILES = int(data["K_NUM_TILES"]) + GROUP_SIZE_M = 
int(data["GROUP_SIZE_M"]) + GROUP_SIZE_K = int(data["GROUP_SIZE_K"]) + OP_MODE = int(data["OP_MODE"]) + layer_n_nodes = int(data["layer_n_nodes"]) + + # node_flows = torch.rand(node_flows.size(), device = device) + + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + # grid = (1, triton.cdiv(layer_n_nodes, TILE_SIZE_M)) + + ref_kernel[grid]( + node_flows = node_flows, + element_flows = element_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + chids = chids, + parids_start = parids_start, + parids_increment = parids_increment, + parpids_start = parpids_start, + parpids_increment = parpids_increment, + local_ids = None, + batch_size = batch_size, + partial_eval = 0, + ptr_inc_step = ptr_inc_step, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = GROUP_SIZE_M, + GROUP_SIZE_K = GROUP_SIZE_K + ) + + torch.cuda.synchronize() + + element_flows_ref = element_flows.clone() + + aaa = torch.zeros([grid[1], TILE_SIZE_M, TILE_SIZE_K]).cuda() + bbb = torch.zeros([grid[1], TILE_SIZE_M, TILE_SIZE_K], dtype = torch.long).cuda() + ccc = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_K, batch_size]).cuda() + ddd = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_K, batch_size]).cuda() + eee = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_M, batch_size]).cuda() + + my_kernel[grid]( + aaa = aaa, + bbb = bbb, + ccc = ccc, + ddd = ddd, + eee = eee, + node_flows = node_flows, + element_flows = element_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + chids = chids, + parids_start = parids_start, + parids_increment = parids_increment, + parpids_start = parpids_start, + parpids_increment = parpids_increment, + local_ids = None, + batch_size = batch_size, + partial_eval = 0, + ptr_inc_step = ptr_inc_step, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = GROUP_SIZE_M, + GROUP_SIZE_K = GROUP_SIZE_K + ) + + # nflows = node_flows[parids[0,0]:parids[0,1],:] # ccc + # nmars = node_mars[parids[0,0]:parids[0,1],:] + # epars = params[bbb[0,:,:]] # aaa + # assert (epars - aaa[0,:,:]).abs().max() < 1e-4 + + # log_n_fdm = nflows.log() - nmars + # log_n_fdm_max = torch.max(log_n_fdm, dim = 0).values + # n_fdm_sub = torch.exp(log_n_fdm - log_n_fdm_max[None,:]) # ddd + # assert (n_fdm_sub[:,:BLOCK_B] - ddd[0,:,:BLOCK_B]).abs().max() < 1e-4 + + # partial_flows = torch.matmul(epars, n_fdm_sub) # eee + # # (partial_flows[:,:BLOCK_B].log() - eee[0,:,:BLOCK_B]).abs() + + # print((element_flows_ref[chids,:] - element_flows[chids,:]).abs().max()) + + element_flows_ref[chids,143] + element_flows[chids,143] + + import pdb; pdb.set_trace() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/structures/debug_bk_kernel.py b/tests/structures/debug_bk_kernel.py deleted file mode 100644 index 6311ce7e..00000000 --- a/tests/structures/debug_bk_kernel.py +++ /dev/null @@ -1,188 +0,0 @@ -import numpy as np -import torch - -import triton -import triton.language as tl - - -@triton.jit -def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, element_mars, params, - chids, parids_start, parids_increment, parpids_start, parpids_increment, - local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, - BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: 
tl.constexpr, GROUP_SIZE_K: tl.constexpr, OP_MODE: tl.constexpr): - - pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches - pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes - - # Get inferred node group id from `pid_m` - elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) - tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) - - # Get the real node group id in the case of partial evaluation - if partial_eval == 1: - elegroup_id = tl.load(local_ids + elegroup_id) - - # Initialize pointers to `params` - offs_ele = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M - offs_edge = tl.arange(0, TILE_SIZE_K) - offs_edge_gid = offs_edge // GROUP_SIZE_K - offs_edge_nid = (offs_edge % GROUP_SIZE_K) - par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) - epars_ptr = params + \ - offs_ele[:,None] * GROUP_SIZE_K + \ - (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - - # Batch offsets and mask - offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B - mask_batch = offs_batch < batch_size - - # Initialize pointers to `node_mars` - edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) - nmars_ptr = node_mars + \ - (edge_start + offs_edge_nid)[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - nflows_ptr = node_flows + \ - (edge_start + offs_edge_nid)[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - - # Initialize pointers to `element_mars` - off_eleids = tl.load(chids + elegroup_id) - offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - emars = tl.load(element_mars + offs_elemfs, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - # Batch increment pointers - parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid - parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid - - # Inner loop - acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - - for k in range(0, K_NUM_TILES): - epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - nmars_max = tl.max(nmars, axis = 0) # [BLOCK_B] - nflows_div_mars = nflows * tl.exp(nmars_max[None,:] - nmars) - - eflows = tl.sum(epars[:,:,None] * tl.exp(emars[:,None,:] - nmars[None,:,:]) * nflows[None,:,:], axis = 1) - - # if OP_MODE == 0: - # # Built-in matmul kernel of triton + float16 - # epars = epars.to(tl.float16) - # nflows_div_mars = nflows_div_mars.to(tl.float16) - # eflows = tl.dot(epars, nflows_div_mars).to(tl.float32) - # if OP_MODE == 1: - # # Built-in matmul kernel of triton + float32 - # eflows = tl.dot(epars, nflows_div_mars) - # if OP_MODE == 2: - # # Simulated matmul kernel + float16 - # epars = epars.to(tl.float16) - # nflows_div_mars = nflows_div_mars.to(tl.float16) - # eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1).to(tl.float32) - # if OP_MODE == 3: - # # Simulated matmul kernel + float32 - # eflows = tl.sum(epars[:,:,None] * nflows_div_mars[None,:,:], axis = 1) - - acc += eflows - - # Increment `epars_ptr` - parpids_inc = tl.load(parpids_inc_ptr) - epars_ptr += parpids_inc[None,:] - parpids_inc_ptr += ptr_inc_step - - # Increment `nmars_ptr` - parids_inc = tl.load(parids_inc_ptr) - nmars_ptr += parids_inc[:,None] * batch_size - nflows_ptr += parids_inc[:,None] * batch_size - parids_inc += ptr_inc_step - - # Write back - 
off_eleids = tl.load(chids + elegroup_id) - offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - tl.store(element_flows + offs_elemfs, acc, mask = mask_batch[None,:]) - - -def main(): - - device = torch.device("cuda:0") - - data = np.load("temp.npz") - - node_flows = torch.from_numpy(data["node_flows"]).to(device) - element_flows = torch.from_numpy(data["element_flow"]).to(device) - node_mars = torch.from_numpy(data["node_mars"]).to(device) - element_mars = torch.from_numpy(data["element_mars"]).to(device) - params = torch.from_numpy(data["params"]).to(device) - chids = torch.from_numpy(data["chids"]).to(device) - parids = torch.from_numpy(data["parids"]).to(device) - parids_start = torch.from_numpy(data["parids_start"]).to(device) - parids_increment = torch.from_numpy(data["parids_increment"]).to(device) - parpids = torch.from_numpy(data["parpids"]).to(device) - parpids_start = torch.from_numpy(data["parpids_start"]).to(device) - parpids_increment = torch.from_numpy(data["parpids_increment"]).to(device) - batch_size = int(data["batch_size"]) - ptr_inc_step = int(data["ptr_inc_step"]) - BLOCK_B = int(data["BLOCK_B"]) - TILE_SIZE_M = int(data["TILE_SIZE_M"]) - TILE_SIZE_K = int(data["TILE_SIZE_K"]) - K_NUM_TILES = int(data["K_NUM_TILES"]) - GROUP_SIZE_M = int(data["GROUP_SIZE_M"]) - GROUP_SIZE_K = int(data["GROUP_SIZE_K"]) - OP_MODE = int(data["OP_MODE"]) - layer_n_nodes = int(data["layer_n_nodes"]) - - grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - - import pdb; pdb.set_trace() - - ori_chids = chids - ori_parids = parids - ori_parpids = parpids - - num_ngroups = chids.size(0) - num_egroups = parids.size(1) - parids = (parids[:,:,None].repeat(1, 1, GROUP_SIZE_M) + torch.arange(0, GROUP_SIZE_M, device = parids.device)).reshape(num_ngroups, num_egroups * GROUP_SIZE_M) - parpids = (parpids[:,:,None] + torch.arange(0, GROUP_SIZE_M, device = parids.device)).reshape( - num_ngroups, num_egroups * GROUP_SIZE_M) - - chids = (chids[:,None].repeat(1, GROUP_SIZE_K) + torch.arange(0, GROUP_SIZE_K, device = chids.device)).reshape(num_ngroups * GROUP_SIZE_K) - parids = parids[:,None,:].repeat(1, GROUP_SIZE_K, 1).reshape(num_ngroups * GROUP_SIZE_K, num_egroups * GROUP_SIZE_M) - parpids = (parpids[:,None,:].repeat(1, GROUP_SIZE_K, 1) + torch.arange(0, GROUP_SIZE_K * GROUP_SIZE_M, GROUP_SIZE_M, device = parpids.device)[None,:,None]).reshape( - num_ngroups * GROUP_SIZE_K, num_egroups * GROUP_SIZE_M - ) - - element_flows[chids] = (node_flows[parids] * params[parpids].unsqueeze(-1) * (element_mars[chids].unsqueeze(1) - node_mars[parids]).exp()).sum(dim = 1) - - import pdb; pdb.set_trace() - - _bk_triton_block_sparse_ele_kernel[grid]( - node_flows = node_flows, - element_flows = element_flows, - node_mars = node_mars, - element_mars = element_mars, - params = params, - chids = ori_chids, - parids_start = parids_start, - parids_increment = parids_increment, - parpids_start = parpids_start, - parpids_increment = parpids_increment, - local_ids = None, - batch_size = batch_size, - partial_eval = 0, - ptr_inc_step = ptr_inc_step, - BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = K_NUM_TILES, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = GROUP_SIZE_M, - GROUP_SIZE_K = GROUP_SIZE_K, - OP_MODE = OP_MODE - ) - - import pdb; pdb.set_trace() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/tests/structures/hclt_correctness_test.py b/tests/structures/hclt_correctness_test.py index b7768cb7..5ad39bd1 100644 --- 
a/tests/structures/hclt_correctness_test.py +++ b/tests/structures/hclt_correctness_test.py @@ -185,7 +185,8 @@ def hclt_backward_test(): nflows = ns2flows[ns] sid, eid = ns._output_ind_range - assert torch.all(torch.abs(nflows - node_flows[sid:eid,:]) < 2e-4) + + assert (torch.abs(nflows - node_flows[sid:eid,:]) > 1e-3).float().mean() < 0.02 nmars = node_mars[sid:eid,:] @@ -204,13 +205,16 @@ def hclt_backward_test(): else: raise ValueError() - emars_max = emars.max(dim = 0).values - nflows_div_mars = nflows * (emars_max[None,:] - nmars).exp() - eflows = torch.matmul(params.permute(1, 0), nflows_div_mars) * (emars - emars_max[None,:]).exp() + log_n_fdm = nflows.log() - nmars + log_n_fdm_max = log_n_fdm.max(dim = 0).values + n_fdm_sub = (log_n_fdm - log_n_fdm_max[None,:]).exp() + + eflows = torch.matmul(params.permute(1, 0), n_fdm_sub) * (emars + log_n_fdm_max[None,:]).exp() - pflows = torch.matmul(nflows_div_mars, (emars - emars_max[None,:]).exp().permute(1, 0)) * params + scaled_emars = (emars + log_n_fdm_max[None,:]).exp() + pflows = torch.matmul(n_fdm_sub, scaled_emars.permute(1, 0)) * params - assert torch.all(torch.abs(pflows - param_flows) < 6e-3) + assert torch.all(torch.abs(pflows - param_flows) < 0.5) if cs not in ns2flows: ns2flows[cs] = torch.zeros([num_latents, batch_size]) diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index 6cec0fb0..d6c9e63b 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -109,35 +109,34 @@ def hclt_test(): milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] ) - # for epoch in range(10): - # t0 = time.time() - # train_ll = 0.0 - # for idx, batch in enumerate(train_loader): - # x = batch[0].to(device) + # for batch in train_loader: + # x = batch[0].to(device) - # optimizer.zero_grad() + # optimizer.zero_grad() - # lls = pc(x) + # lls = pc(x) + # lls.mean().backward() - # if lls.isnan().any(): - # import pdb; pdb.set_trace() + # optimizer.step() + # scheduler.step() - # lls.mean().backward() + # from torch.profiler import profile, record_function, ProfilerActivity + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + # for batch in train_loader: + # x = batch[0].to(device) - # if pc.node_flows.isnan().any(): - # import pdb; pdb.set_trace() + # optimizer.zero_grad() - # train_ll += lls.mean().detach().cpu().numpy().item() + # lls = pc(x) + # lls.mean().backward() # optimizer.step() # scheduler.step() - # if pc.params.isnan().any(): - # import pdb; pdb.set_trace() + # break - # train_ll /= len(train_loader) + # prof.export_chrome_trace("trace_new2.json") - # exit() mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) full_batch_em_epoch(pc, train_loader, test_loader, device) From 82af4c6a5eccbf60774ae7b630614cbeb0910937 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 17 Dec 2023 05:53:27 +0800 Subject: [PATCH 080/162] seems fixed but still have occasional nans --- src/pyjuice/layer/prod_layer.py | 68 ++++--- src/pyjuice/layer/sum_layer.py | 291 ++++------------------------- src/pyjuice/model/tensorcircuit.py | 76 ++++++-- tests/structures/debug.py | 26 ++- tests/structures/hclt_test.py | 27 ++- 5 files changed, 182 insertions(+), 306 deletions(-) diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 6a61c035..c1787df1 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -156,7 +156,9 @@ def forward(self, node_mars: torch.Tensor, element_mars: 
torch.Tensor, _for_back cids = self.partitioned_cids[partition_id] local_ids = self.fw_partition_local_ids[partition_id] - self._forward_backward(element_mars, node_mars, nids, cids, local_ids = local_ids, accum = False) + self._forward_backward( + element_mars, node_mars, nids, cids, local_ids = local_ids, accum = False + ) elif _for_backward and self.provided("bk_fw_partition_local_ids"): # Partial evaluation (for backward pass) @@ -165,7 +167,9 @@ def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, _for_back cids = self.partitioned_cids[partition_id] local_ids = self.bk_fw_partition_local_ids[partition_id] - self._forward_backward(element_mars, node_mars, nids, cids, local_ids = local_ids, accum = False) + self._forward_backward( + element_mars, node_mars, nids, cids, local_ids = local_ids, accum = False + ) else: # Evaluate the whole layer @@ -173,7 +177,9 @@ def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, _for_back nids = self.partitioned_nids[partition_id] cids = self.partitioned_cids[partition_id] - self._forward_backward(element_mars, node_mars, nids, cids, accum = False) + self._forward_backward( + element_mars, node_mars, nids, cids, accum = False + ) return None @@ -228,7 +234,7 @@ def enable_partial_evaluation(self, fw_scopes: Optional[Sequence[BitSet]] = None @staticmethod @triton.jit def _forward_backward_kernel_3d(node_vals_ptr, element_vals_ptr, local_ids_ptr, nids_ptr, cids_ptr, tot_n_nodes, tot_n_eles, n_ngroups, - n_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, + num_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, group_size: tl.constexpr, accum: tl.constexpr, partial_eval: tl.constexpr): """ This kernel implements the function with 3d tensors. However, it only work with `triton==2.0.0`. 
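For orientation, a minimal PyTorch sketch of the reduction this kernel (and the 2D variant below) performs for one partition. The helper name, the Python-level loops, and the `accum` handling are editorial assumptions distilled from the pointer arithmetic, not code from this patch:
```
import torch

def prod_forward_backward_reference(node_vals: torch.Tensor, element_vals: torch.Tensor,
                                    nids: torch.Tensor, cids: torch.Tensor,
                                    group_size: int, accum: bool = False) -> None:
    # nids: [n_ngroups] start indices of the target node groups in `node_vals`
    # cids: [n_ngroups, num_edges] start indices of the child groups in `element_vals`
    for g in range(nids.size(0)):
        for m in range(group_size):
            # Log-space product: sum the child log-values over the edge dimension
            vals = element_vals[cids[g] + m, :].sum(dim = 0)  # [batch_size]
            if accum:
                node_vals[nids[g] + m, :] += vals
            else:
                node_vals[nids[g] + m, :] = vals
```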
@@ -253,9 +259,9 @@ def _forward_backward_kernel_3d(node_vals_ptr, element_vals_ptr, local_ids_ptr, # Get the group start ids for the children # To make the triton compiler happy, we reload every index `BLOCK_M` times - offs_ne = tl.arange(0, n_edges * BLOCK_M) // BLOCK_M - offs_ne = tl.view(offs_ne, (BLOCK_M, n_edges)) - offs_egstart = tl.load(cids_ptr + ngroup_id * n_edges + offs_ne) # [BLOCK_M, n_edges] + offs_ne = tl.arange(0, num_edges * BLOCK_M) // BLOCK_M + offs_ne = tl.view(offs_ne, (BLOCK_M, num_edges)) + offs_egstart = tl.load(cids_ptr + ngroup_id * num_edges + offs_ne) # [BLOCK_M, num_edges] # Get the edge values from child nodes group_nids = tl.arange(0, BLOCK_M) + ntile_id * BLOCK_M @@ -294,9 +300,9 @@ def _forward_backward_kernel_3d(node_vals_ptr, element_vals_ptr, local_ids_ptr, mask_batch = offs_batch < batch_size # Get the group start ids for the children - offs_ne = tl.arange(0, n_edges * BLOCK_M) // BLOCK_M - offs_ne = tl.view(offs_ne, (BLOCK_M, n_edges)) - offs_egstart = tl.load(cids_ptr + ngroup_ids[:,None] * n_edges + offs_ne, mask = mask_node[:,None]) # [BLOCK_M, n_edges] + offs_ne = tl.arange(0, num_edges * BLOCK_M) // BLOCK_M + offs_ne = tl.view(offs_ne, (BLOCK_M, num_edges)) + offs_egstart = tl.load(cids_ptr + ngroup_ids[:,None] * num_edges + offs_ne, mask = mask_node[:,None]) # [BLOCK_M, num_edges] # Get the edge values from child nodes group_nids = (offs_node % group_size) @@ -320,7 +326,7 @@ def _forward_backward_kernel_3d(node_vals_ptr, element_vals_ptr, local_ids_ptr, @staticmethod @triton.jit def _forward_backward_kernel_2d(node_vals_ptr, element_vals_ptr, local_ids_ptr, nids_ptr, cids_ptr, tot_n_nodes, tot_n_eles, n_ngroups, - n_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, + num_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, group_size: tl.constexpr, accum: tl.constexpr, partial_eval: tl.constexpr): """ This kernel implements the function with 2d tensors. It works for all `triton` versions. @@ -342,13 +348,13 @@ def _forward_backward_kernel_2d(node_vals_ptr, element_vals_ptr, local_ids_ptr, mask_batch = offs_batch < batch_size # Get the group start ids for the children - offs_edge = tl.arange(0, n_edges) - offs_egstart = tl.load(cids_ptr + ngroup_id * n_edges + offs_edge) # [n_edges] + offs_edge = tl.arange(0, num_edges) + offs_egstart = tl.load(cids_ptr + ngroup_id * num_edges + offs_edge) # [num_edges] # Base ptr for ch values evals_ptr = element_vals_ptr + \ (offs_egstart[:,None] + ntile_id * BLOCK_M) * batch_size + \ - offs_batch[None,:] # [n_edges, BLOCK_B] + offs_batch[None,:] # [num_edges, BLOCK_B] # Base ptr for par values ngroup_start = tl.load(nids_ptr + ngroup_id) @@ -392,21 +398,25 @@ def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, tot_n_nodes = node_vals.size(0) tot_n_eles = element_vals.size(0) n_ngroups = nids.size(0) if local_ids is None else local_ids.size(0) - n_edges = cids.size(1) + num_edges = cids.size(1) batch_size = node_vals.size(1) - assert n_edges & (n_edges - 1) == 0, "`n_edges` must be power of 2." + group_size = self.group_size + accum = 1 if accum else 0 + partial_eval = 1 if local_ids is not None else 0 + + assert num_edges & (num_edges - 1) == 0, "`num_edges` must be power of 2." 
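        # Power-of-two check: `tl.arange(0, num_edges)` in the kernels above only
        # accepts a compile-time power-of-two extent, so edge counts that are not
        # powers of two cannot be handled by these Triton kernels directly.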
# Fall back to the `torch.compile` kernel in the case where we cannot store child edges within a single block - if n_edges > 1024: + if num_edges > 1024: self._forward_backward_pytorch(node_vals, element_vals, nids, cids, accum = accum) return None if version.parse(triton.__version__) > version.parse("2.0.0"): - BLOCK_B = min(1024 // n_edges, triton.next_power_of_2(batch_size)) - BLOCK_M = min(max(1024 // (BLOCK_B * n_edges), 1), self.group_size) + BLOCK_B = min(1024 // num_edges, triton.next_power_of_2(batch_size)) + BLOCK_M = min(max(1024 // (BLOCK_B * num_edges), 1), self.group_size) grid = (triton.cdiv(n_ngroups * self.group_size, BLOCK_M), triton.cdiv(batch_size, BLOCK_B)) @@ -419,19 +429,19 @@ def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, tot_n_nodes = tot_n_nodes, tot_n_eles = tot_n_eles, n_ngroups = n_ngroups, - n_edges = n_edges, + num_edges = num_edges, batch_size = batch_size, BLOCK_M = BLOCK_M, BLOCK_B = BLOCK_B, - group_size = self.group_size, - accum = 1 if accum else 0, - partial_eval = 1 if local_ids is not None else 0 + group_size = group_size, + accum = accum, + partial_eval = partial_eval ) else: - BLOCK_B = min(1024 // n_edges, triton.next_power_of_2(batch_size)) - BLOCK_M = min(max(1024 // (BLOCK_B * n_edges), 1), triton.next_power_of_2(n_ngroups) * self.group_size) + BLOCK_B = min(1024 // num_edges, triton.next_power_of_2(batch_size)) + BLOCK_M = min(max(1024 // (BLOCK_B * num_edges), 1), triton.next_power_of_2(n_ngroups) * self.group_size) grid = (triton.cdiv(n_ngroups * self.group_size, BLOCK_M), triton.cdiv(batch_size, BLOCK_B)) @@ -444,13 +454,13 @@ def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, tot_n_nodes = tot_n_nodes, tot_n_eles = tot_n_eles, n_ngroups = n_ngroups, - n_edges = n_edges, + num_edges = num_edges, batch_size = batch_size, BLOCK_M = BLOCK_M, BLOCK_B = BLOCK_B, - group_size = self.group_size, - accum = 1 if accum else 0, - partial_eval = 1 if local_ids is not None else 0 + group_size = group_size, + accum = accum, + partial_eval = partial_eval ) return None diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 494e2077..94cc07af 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -202,13 +202,7 @@ def num_param_flows(self): def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor) -> None: """ - Computes the forward pass of a sum layer: - ``` - ch_mars = element_mars[cids] - maxval = ch_mars.max(dim = 1, keepdim = True).values - node_mars[nids] = (((ch_mars - maxval).exp() * params[pids]).sum( - dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1) - ``` + Computes the forward pass of a sum layer. 
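        Concretely, for each node group listed in `nids`, with child element ids `cids` and
        parameter ids `pids`, the kernels evaluate a numerically stable log-sum-exp
        (reference formulation in plain PyTorch indexing, ignoring partitioning):
        ```
        ch_mars = element_mars[cids]
        maxval = ch_mars.max(dim = 1, keepdim = True).values
        node_mars[nids] = (((ch_mars - maxval).exp() * params[pids]).sum(
            dim = 1).clamp(min = 1e-10)).log() + maxval.squeeze(1)
        ```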
Parameters: `node_mars`: [num_nodes, B] @@ -282,8 +276,6 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, cs_group_size = cs_group_size ) - torch.cuda.synchronize() - else: # Partial evaluation for partition_id in range(self.num_bk_partitions): @@ -363,7 +355,7 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, ) elif mode == "pytorch": - self._forward_pytorch_kernel( + self._forward_pytorch( node_mars, element_mars, params, nids, cids, pids, local_ids ) @@ -609,6 +601,9 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten else: cids_start, cids_increment, pids_start, pids_increment = self._cached_fw_pcids[signature] + partial_eval = 1 if local_ids is not None else 0 + GROUP_SIZE_M = self.group_size + if force_use_fp16: assert not force_use_fp32 use_fp16 = True @@ -621,15 +616,6 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten use_fp16 = False grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - - # print("========") - # print(layer_n_nodes) - # print(grid, grid[0] * grid[1]) - # print(TILE_SIZE_M, TILE_SIZE_K, BLOCK_B) - - # import time - # torch.cuda.synchronize() - # t0 = time.time() if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: self._fw_triton_block_sparse_tlmm_kernel[grid]( @@ -643,14 +629,15 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten pids_increment, local_ids, batch_size, - partial_eval = 1 if local_ids is not None else 0, + partial_eval = partial_eval, BLOCK_B = BLOCK_B, TILE_SIZE_K = TILE_SIZE_K, K_NUM_TILES = K_NUM_TILES, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = self.group_size, + GROUP_SIZE_M = GROUP_SIZE_M, use_fp16 = use_fp16 ) + else: self._fw_triton_block_sparse_csmm_kernel[grid]( node_mars, @@ -663,19 +650,14 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten pids_increment, local_ids, batch_size, - partial_eval = 1 if local_ids is not None else 0, + partial_eval = partial_eval, BLOCK_B = BLOCK_B, TILE_SIZE_K = TILE_SIZE_K, K_NUM_TILES = K_NUM_TILES, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = self.group_size, + GROUP_SIZE_M = GROUP_SIZE_M, use_fp16 = use_fp16 ) - - # torch.cuda.synchronize() - # t1 = time.time() - - # print(f"kernel time: {(t1-t0)*1000:.3f}ms") return None @@ -763,6 +745,9 @@ def _forward_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, BLOCK_B = max(min(2048 // num_edges, BATCH_SIZE_NP2), 1) BLOCK_M = self.group_size + partial_eval = 1 if local_ids is not None else 0 + GROUP_SIZE_M = self.group_size + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, BLOCK_M)) self._fw_triton_sparse_kernel[grid]( @@ -774,11 +759,11 @@ def _forward_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, pids = pids, local_ids = local_ids, batch_size = batch_size, - partial_eval = 1 if local_ids is not None else 0, + partial_eval = partial_eval, num_edges = num_edges, BLOCK_B = BLOCK_B, BLOCK_M = BLOCK_M, - GROUP_SIZE_M = self.group_size + GROUP_SIZE_M = GROUP_SIZE_M ) return None @@ -809,6 +794,14 @@ def _forward_pytorch_kernel(node_mars: torch.Tensor, element_mars: torch.Tensor, return None + def _forward_pytorch(node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, + nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, + local_ids: torch.Tensor): + + self._forward_pytorch_kernel( + node_mars, element_mars, params, nids, cids, pids, local_ids + ) + def 
_backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, @@ -949,102 +942,6 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B mask_batch = offs_batch < batch_size - # Initialize pointers to `node_mars` - edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) - nmars_ptr = node_mars + \ - (edge_start + offs_edge_nid)[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - nflows_ptr = node_flows + \ - (edge_start + offs_edge_nid)[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - - # Initialize pointers to `element_mars` - off_eleids = tl.load(chids + elegroup_id) - emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - # Batch increment pointers - parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid - parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid - - # Inner loop - acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - log_max = tl.zeros([BLOCK_B], dtype = tl.float32) - float("inf") - - for k in range(0, K_NUM_TILES): - epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - # log_n_fdm = tl.log(nflows) - nmars - # log_n_fdm_max = tl.max(log_n_fdm, axis = 0) - # n_fdm_sub = tl.exp(log_n_fdm - log_n_fdm_max[None,:]) - - # partial_flows = tl.dot(epars, n_fdm_sub) - - # acc = tl.where(log_max[None,:] > log_n_fdm_max[None,:], - # acc + tl.exp(log_n_fdm_max - log_max)[None,:] * partial_flows, - # partial_flows + tl.exp(log_max - log_n_fdm_max)[None,:] * acc) - # log_max = tl.maximum(log_max, log_n_fdm_max) - - eflows = tl.sum(epars[:,:,None] * tl.exp(emars[:,None,:] - nmars[None,:,:]) * nflows[None,:,:], axis = 1) - acc += eflows - - # Increment `epars_ptr` - parpids_inc = tl.load(parpids_inc_ptr) - epars_ptr += parpids_inc[None,:] - parpids_inc_ptr += ptr_inc_step - - # Increment `nmars_ptr` - parids_inc = tl.load(parids_inc_ptr) - nmars_ptr += parids_inc[:,None] * batch_size - nflows_ptr += parids_inc[:,None] * batch_size - parids_inc += ptr_inc_step - - # # Initialize pointers to `element_mars` - # off_eleids = tl.load(chids + elegroup_id) - # emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - # emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - # eflows = acc * tl.exp(emars + log_max[None,:]) - - # Write back - offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - tl.store(element_flows + offs_elemfs, acc, mask = mask_batch[None,:]) - - @staticmethod - @triton.jit - def my_kernel(node_flows, element_flows, node_mars, element_mars, params, - chids, parids_start, parids_increment, parpids_start, parpids_increment, - local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, - BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): - - pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches - pid_m = 
tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes - - # Get inferred node group id from `pid_m` - elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) - tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) - - # Get the real node group id in the case of partial evaluation - if partial_eval == 1: - elegroup_id = tl.load(local_ids + elegroup_id) - - # Initialize pointers to `params` - offs_ele = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M - offs_edge = tl.arange(0, TILE_SIZE_K) - offs_edge_gid = offs_edge // GROUP_SIZE_K - offs_edge_nid = (offs_edge % GROUP_SIZE_K) - par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) - epars_ptr = params + \ - offs_ele[:,None] * GROUP_SIZE_K + \ - (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - - # Batch offsets and mask - offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B - mask_batch = offs_batch < batch_size - # Initialize pointers to `node_mars` edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) nmars_ptr = node_mars + \ @@ -1070,7 +967,7 @@ def my_kernel(node_flows, element_flows, node_mars, element_mars, params, log_n_fdm_max = tl.max(log_n_fdm, axis = 0) n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) - partial_flows = tl.dot(epars, n_fdm_sub, allow_tf32 = True) + partial_flows = tl.dot(epars, n_fdm_sub) # partial_flows = tl.sum(epars[:,:,None] * n_fdm_sub[None,:,:], axis = 1) acc = tl.where(log_n_fdm_max[None,:] > acc, @@ -1169,31 +1066,13 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo else: parids_start, parids_increment, parpids_start, parpids_increment, ptr_inc_step = self._cached_bk_parids[signature] + partial_eval = 1 if local_ids is not None else 0 + GROUP_SIZE_M = cs_group_size + GROUP_SIZE_K = self.group_size + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - # self._bk_triton_block_sparse_ele_kernel[grid]( - # node_flows = node_flows, - # element_flows = element_flows, - # node_mars = node_mars, - # element_mars = element_mars, - # params = params, - # chids = chids, - # parids_start = parids_start, - # parids_increment = parids_increment, - # parpids_start = parpids_start, - # parpids_increment = parpids_increment, - # local_ids = local_ids, - # batch_size = batch_size, - # partial_eval = 1 if local_ids is not None else 0, - # ptr_inc_step = ptr_inc_step, - # BLOCK_B = BLOCK_B, - # TILE_SIZE_K = TILE_SIZE_K, - # K_NUM_TILES = K_NUM_TILES, - # TILE_SIZE_M = TILE_SIZE_M, - # GROUP_SIZE_M = cs_group_size, - # GROUP_SIZE_K = self.group_size - # ) - self.my_kernel[grid]( + self._bk_triton_block_sparse_ele_kernel[grid]( node_flows = node_flows, element_flows = element_flows, node_mars = node_mars, @@ -1206,17 +1085,18 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo parpids_increment = parpids_increment, local_ids = local_ids, batch_size = batch_size, - partial_eval = 1 if local_ids is not None else 0, + partial_eval = partial_eval, ptr_inc_step = ptr_inc_step, BLOCK_B = BLOCK_B, TILE_SIZE_K = TILE_SIZE_K, K_NUM_TILES = K_NUM_TILES, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = cs_group_size, - GROUP_SIZE_K = self.group_size + GROUP_SIZE_M = GROUP_SIZE_M, + GROUP_SIZE_K = GROUP_SIZE_K ) - # torch.cuda.synchronize() + # This doesn't seem to be necessary, but is necessary to avoid producing nans + # Needs to investigate more element_flows[0,:] = 0.0 # if element_flows.isnan().any() or 
element_flows.isinf().any(): @@ -1255,83 +1135,6 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para nmars_ptr = node_mars + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] nflows_ptr = node_flows + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] - # Initialize `params` - par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) - epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - epars = tl.load(params + epars_offsets) - - # Inner loop - acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) - - for b in range(0, B_NUM_TILES): - emars = tl.load(emars_ptr, mask = mask_batch[:,None]) # [TILE_SIZE_B, TILE_SIZE_K] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] - - # log_n_fdm = tl.log(nflows) - nmars - # log_n_fdm_max = tl.max(log_n_fdm, axis = 0) - # n_fdm_sub = tl.exp(log_n_fdm - log_n_fdm_max[None,:]) - - # scaled_emars = tl.exp(emars + log_n_fdm_max[:,None]) - - # partial_flows = tl.dot(n_fdm_sub, scaled_emars) - # acc += partial_flows - - pflows = tl.sum(epars[:,None,:] * tl.exp(emars[None,:,:] - nmars[:,:,None]) * nflows[:,:,None], axis = 1) - acc += pflows - - # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` - emars_ptr += TILE_SIZE_B - nmars_ptr += TILE_SIZE_B - nflows_ptr += TILE_SIZE_B - - # Update batch mask - offs_batch += TILE_SIZE_B - mask_batch = offs_batch < batch_size - - # # Initialize `params` - # par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) - # epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - # epars = tl.load(params + epars_offsets) - - # pflows = acc * epars - - parflow_start = tl.load(pfids + ngroup_id * num_edges + offs_edge) - eparflows_offsets = offs_node[:,None] + parflow_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - - tl.atomic_add(param_flows + eparflows_offsets, acc) - - @staticmethod - @triton.jit - def my_kernel2(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, - batch_size: tl.constexpr, num_edges: tl.constexpr, TILE_SIZE_B: tl.constexpr, - B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): - - pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges - pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes - - # Get inferred node group id from `pid_m` - ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) - tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) - - # Batch offsets and mask - offs_batch = tl.arange(0, TILE_SIZE_B) - mask_batch = offs_batch < batch_size - - # Initialize pointers to `element_mars` - offs_edge = tl.arange(0, TILE_SIZE_K) + pid_k * TILE_SIZE_K - edge_start = tl.load(cids + ngroup_id * num_edges + offs_edge) - emars_ptr = element_mars + \ - edge_start[None,:] * batch_size + \ - offs_batch[:,None] # [TILE_SIZE_B, TILE_SIZE_K] - - # Initialize pointers to `node_flows` and `node_mars` - offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M - off_nids = tl.load(nids + ngroup_id) - nmars_ptr = node_mars + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] - nflows_ptr = node_flows + (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] - # Inner loop acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) @@ -1346,7 +1149,7 @@ def my_kernel2(node_flows, node_mars, element_mars, params, param_flows, nids, c scaled_emars = 
tl.exp(emars + log_n_fdm_max[:,None]) - partial_flows = tl.dot(n_fdm_sub, scaled_emars, allow_tf32 = True) + partial_flows = tl.dot(n_fdm_sub, scaled_emars) acc += partial_flows # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` @@ -1416,25 +1219,7 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor grid = (triton.cdiv(num_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - # self._bk_triton_block_sparse_par_kernel[grid]( - # node_flows = node_flows, - # node_mars = node_mars, - # element_mars = element_mars, - # params = params, - # param_flows = param_flows, - # nids = nids, - # cids = cids, - # pids = pids, - # pfids = pfids, - # batch_size = batch_size, - # num_edges = num_edges, - # TILE_SIZE_B = TILE_SIZE_B, - # B_NUM_TILES = B_NUM_TILES, - # TILE_SIZE_K = TILE_SIZE_K, - # TILE_SIZE_M = TILE_SIZE_M, - # GROUP_SIZE_M = self.group_size - # ) - self.my_kernel2[grid]( + self._bk_triton_block_sparse_par_kernel[grid]( node_flows = node_flows, node_mars = node_mars, element_mars = element_mars, @@ -1453,6 +1238,10 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor GROUP_SIZE_M = self.group_size ) + # This doesn't seem to be necessary, but is necessary to avoid producing nans + # Needs to investigate more + param_flows[0:1] = param_flows[0:1] * 1 + def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index bff52499..f85f3a60 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -19,13 +19,15 @@ normalize_parameters -def _pc_model_backward_hook(grad, pc, inputs, **kwargs): +def _pc_model_backward_hook(grad, pc, inputs, record_cudagraph, apply_cudagraph, **kwargs): grad = grad.permute(1, 0) pc.backward( inputs = inputs, ll_weights = grad / grad.sum() * grad.size(1), compute_param_flows = pc._optim_hyperparams["compute_param_flows"], flows_memory = pc._optim_hyperparams["flows_memory"], + record_cudagraph = record_cudagraph, + apply_cudagraph = apply_cudagraph, **kwargs ) @@ -78,7 +80,7 @@ def __init__(self, root_ns: CircuitNodes, layer_sparsity_tol: float = 0.5, "flows_memory": 1.0 } - # Recorded CudaGraphs + # CudaGraph options self._recorded_cuda_graphs = dict() def to(self, device): @@ -156,7 +158,7 @@ def _run_inner_layers(): else: raise ValueError(f"Unknown layer type {type(layer)}.") - signature = (id(self.node_mars), id(self.element_mars), B) + signature = (0, id(self.node_mars), id(self.element_mars), id(self.params), B) if record_cudagraph and signature not in self._recorded_cuda_graphs: # Warmup s = torch.cuda.Stream() @@ -196,7 +198,16 @@ def _run_inner_layers(): if torch.is_grad_enabled(): lls.requires_grad = True - lls.register_hook(partial(_pc_model_backward_hook, pc = self, inputs = inputs, **kwargs)) + lls.register_hook( + partial( + _pc_model_backward_hook, + pc = self, + inputs = inputs, + record_cudagraph = record_cudagraph, + apply_cudagraph = apply_cudagraph, + **kwargs + ) + ) if return_cache: return lls.clone(), cache @@ -210,6 +221,8 @@ def backward(self, inputs: Optional[torch.Tensor] = None, input_layer_fn: Optional[Union[str,Callable]] = None, cache: Optional[dict] = None, return_cache: bool = False, + record_cudagraph: bool = False, + apply_cudagraph: bool = True, **kwargs): """ Compute circuit flows. 
@@ -255,25 +268,52 @@ def backward(self, inputs: Optional[torch.Tensor] = None, ## Run backward pass ## with torch.no_grad(): - for layer_id in range(len(self.inner_layer_groups) - 1, -1, -1): - layer_group = self.inner_layer_groups[layer_id] - if layer_group.is_prod(): - # Prod layer - layer_group.backward(self.node_flows, self.element_flows) + # Inner layers + def _run_inner_layers(): + for layer_id in range(len(self.inner_layer_groups) - 1, -1, -1): + layer_group = self.inner_layer_groups[layer_id] - elif layer_group.is_sum(): - # Sum layer + if layer_group.is_prod(): + # Prod layer + layer_group.backward(self.node_flows, self.element_flows) - # First recompute the previous product layer - self.inner_layer_groups[layer_id-1].forward(self.node_mars, self.element_mars, _for_backward = True) + elif layer_group.is_sum(): + # Sum layer - # Backward sum layer - layer_group.backward(self.node_flows, self.element_flows, self.node_mars, self.element_mars, self.params, - param_flows = self.param_flows if compute_param_flows else None) + # First recompute the previous product layer + self.inner_layer_groups[layer_id-1].forward(self.node_mars, self.element_mars, _for_backward = True) - else: - raise ValueError(f"Unknown layer type {type(layer)}.") + # Backward sum layer + layer_group.backward(self.node_flows, self.element_flows, self.node_mars, self.element_mars, self.params, + param_flows = self.param_flows if compute_param_flows else None) + + else: + raise ValueError(f"Unknown layer type {type(layer)}.") + + signature = (1, id(self.node_flows), id(self.element_flows), id(self.node_mars), id(self.element_mars), id(self.params), id(self.param_flows), B) + if record_cudagraph and signature not in self._recorded_cuda_graphs: + # Warmup + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for _ in range(3): + _run_inner_layers() + torch.cuda.current_stream().wait_stream(s) + + # Capture + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + _run_inner_layers() + + # Save + self._recorded_cuda_graphs[signature] = g + + if apply_cudagraph and signature in self._recorded_cuda_graphs: + g = self._recorded_cuda_graphs[signature] + g.replay() + else: + _run_inner_layers() # Compute backward pass for all input layers for idx, layer in enumerate(self.input_layer_group): diff --git a/tests/structures/debug.py b/tests/structures/debug.py index 2c42c63a..0d37d331 100644 --- a/tests/structures/debug.py +++ b/tests/structures/debug.py @@ -240,7 +240,31 @@ def main(): grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) # grid = (1, triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - ref_kernel[grid]( + # ref_kernel[grid]( + # node_flows = node_flows, + # element_flows = element_flows, + # node_mars = node_mars, + # element_mars = element_mars, + # params = params, + # chids = chids, + # parids_start = parids_start, + # parids_increment = parids_increment, + # parpids_start = parpids_start, + # parpids_increment = parpids_increment, + # local_ids = None, + # batch_size = batch_size, + # partial_eval = 0, + # ptr_inc_step = ptr_inc_step, + # BLOCK_B = BLOCK_B, + # TILE_SIZE_K = TILE_SIZE_K, + # K_NUM_TILES = K_NUM_TILES, + # TILE_SIZE_M = TILE_SIZE_M, + # GROUP_SIZE_M = GROUP_SIZE_M, + # GROUP_SIZE_K = GROUP_SIZE_K + # ) + + aaa = ref_kernel[grid] + aaa( node_flows = node_flows, element_flows = element_flows, node_mars = node_mars, diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index d6c9e63b..03843305 100644 --- 
a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -109,16 +109,29 @@ def hclt_test(): milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] ) - # for batch in train_loader: - # x = batch[0].to(device) + for batch in train_loader: + x = batch[0].to(device) - # optimizer.zero_grad() + # optimizer.zero_grad() - # lls = pc(x) - # lls.mean().backward() + lls = pc(x, record_cudagraph = True) + lls.mean().backward() + break - # optimizer.step() - # scheduler.step() + # optimizer.step() + # scheduler.step() + + # for _ in range(100): + # t0 = time.time() + # for batch in train_loader: + # x = batch[0].to(device) + + # lls = pc(x, apply_cudagraph = True) + # lls.mean().backward() + + # torch.cuda.synchronize() + # t1 = time.time() + # print(f"{(t1-t0)*1000:.3f}ms") # from torch.profiler import profile, record_function, ProfilerActivity # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: From 77528aed93d5448138e937caac4075212f4d3cea Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 21 Dec 2023 00:49:24 +0800 Subject: [PATCH 081/162] hclt tests worked & initial effort to speed up kernels --- src/pyjuice/layer/sum_layer.py | 145 ++++++++++++++++++++++++++------- tests/layer/sum_layer_test.py | 8 +- tests/structures/hclt_test.py | 44 ++-------- 3 files changed, 123 insertions(+), 74 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 94cc07af..66801078 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -239,7 +239,8 @@ def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: t def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, - params: torch.Tensor, param_flows: Optional[torch.Tensor] = None) -> None: + params: torch.Tensor, param_flows: Optional[torch.Tensor] = None, + allow_modify_flows: bool = False) -> None: """ Computes the forward pass of a sum layer: ``` @@ -259,6 +260,17 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, `element_mars`: [max_num_els, B] `params`: [num_params, B] or [num_params] """ + + ## Pre-compute `nflows.log() - nmars` if needed ## + if allow_modify_flows: + for partition_id in range(self.num_fw_partitions): + nids = self.partitioned_nids[partition_id] + # TODO: be careful when restoring `local_ids` + local_ids = None + + self._bk_triton_block_sparse_modify_flow( + node_flows, node_mars, nids, local_ids + ) ## Compute flows w.r.t. elements (i.e., product nodes) ## if not self.provided("bk_group_local_ids"): @@ -273,7 +285,8 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, node_flows, element_flows, params, node_mars, element_mars, param_flows, chids = chids, parids = parids, parpids = parpids, - cs_group_size = cs_group_size + cs_group_size = cs_group_size, + allow_modify_flows = allow_modify_flows ) else: @@ -289,7 +302,8 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, node_flows, element_flows, params, node_mars, element_mars, param_flows, chids = chids, parids = parids, parpids = parpids, - cs_group_size = cs_group_size, local_ids = local_ids + cs_group_size = cs_group_size, local_ids = local_ids, + allow_modify_flows = allow_modify_flows ) ## Compute flows w.r.t.
sum parameters ## @@ -810,7 +824,8 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, chids: Optional[torch.Tensor] = None, parids: Optional[torch.Tensor] = None, parpids: Optional[torch.Tensor] = None, cs_group_size: int = 0, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1, mode: Optional[str] = None) -> None: + partition_id: int = -1, mode: Optional[str] = None, + allow_modify_flows: bool = False) -> None: """ Back pass of sum layers. @@ -850,17 +865,19 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, self._backward_block_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, - partition_id = partition_id + partition_id = partition_id, allow_modify_flows = allow_modify_flows ) elif mode == "sparse": self._backward_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, - partition_id = partition_id + partition_id = partition_id, allow_modify_flows = allow_modify_flows ) elif mode == "pytorch": + assert not allow_modify_flows, "Please set `allow_modify_flows` to False when " \ + "using the native PyTorch backward." self._backward_pytorch( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, @@ -869,13 +886,76 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, else: raise ValueError(f"Not supported mode `{mode}`.") + @staticmethod + @triton.jit + def _bk_triton_block_sparse_modify_flow_kernel(node_flows, node_mars, local_ids, nids, batch_size: tl.constexpr, partial_eval: tl.constexpr, + BLOCK_B: tl.constexpr, BLOCK_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` examples + pid_m = tl.program_id(1) # ID of size-`BLOCK_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // BLOCK_M) + tile_id = pid_m % (GROUP_SIZE_M // BLOCK_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + ngroup_id = tl.load(local_ids + ngroup_id) + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + mask_batch = offs_batch < batch_size + + # Initialize pointers to `node_flows` and `node_mars` + offs_node = tl.arange(0, BLOCK_M) + tile_id * BLOCK_M + off_nids = tl.load(nids + ngroup_id) + offs_nmfs = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + + nmars = tl.load(node_mars + offs_nmfs, mask = mask_batch[None,:]) + nflows = tl.load(node_flows + offs_nmfs, mask = mask_batch[None,:]) + + uflows = tl.log(nflows) - nmars + + tl.store(node_flows + offs_nmfs, uflows, mask = mask_batch[None,:]) + + def _bk_triton_block_sparse_modify_flow(self, node_flows: torch.Tensor, node_mars: torch.Tensor, + nids: torch.Tensor, local_ids: Optional[torch.Tensor] = None): + """ + Replace `node_flows[nids]` with `node_flows[nids].log() - node_mars[nids]` + """ + + num_ngroups = nids.size(0) if local_ids is None else local_ids.size(0) + layer_n_nodes = num_ngroups * self.group_size + batch_size = node_mars.size(1) + BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) + + BLOCK_B = min(2048, BATCH_SIZE_NP2) + BLOCK_M = min(2048 // BLOCK_B, self.group_size) + + partial_eval = 1 if local_ids is not None else 0 + GROUP_SIZE_M = self.group_size + + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, BLOCK_M)) + + 
self._bk_triton_block_sparse_modify_flow_kernel[grid]( + node_flows = node_flows, + node_mars = node_mars, + local_ids = local_ids, + nids = nids, + batch_size = batch_size, + partial_eval = partial_eval, + BLOCK_B = BLOCK_B, + BLOCK_M = BLOCK_M, + GROUP_SIZE_M = GROUP_SIZE_M + ) + def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: Optional[torch.Tensor], cids: Optional[torch.Tensor], pids: Optional[torch.Tensor], pfids: Optional[torch.Tensor], chids: Optional[torch.Tensor], parids: Optional[torch.Tensor], parpids: Optional[torch.Tensor], cs_group_size: int, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1) -> None: + partition_id: int = -1, allow_modify_flows: bool = False) -> None: """ Back pass of sum layers with block-sparse processing kernel. @@ -897,14 +977,15 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. node_flows, element_flows, params, node_mars, element_mars, chids = chids, parids = parids, parpids = parpids, cs_group_size = cs_group_size, local_ids = local_ids, - partition_id = partition_id + partition_id = partition_id, allow_modify_flows = allow_modify_flows ) # Flows w.r.t. parameters if param_flows is not None and nids is not None: self._backward_block_sparse_par_flows( node_flows, params, node_mars, element_mars, param_flows, - nids = nids, cids = cids, pids = pids, pfids = pfids + nids = nids, cids = cids, pids = pids, pfids = pfids, + allow_modify_flows = allow_modify_flows ) return None @@ -914,7 +995,8 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, element_mars, params, chids, parids_start, parids_increment, parpids_start, parpids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, - BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, + allow_modify_flows: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, + K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches @@ -970,10 +1052,12 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele partial_flows = tl.dot(epars, n_fdm_sub) # partial_flows = tl.sum(epars[:,:,None] * n_fdm_sub[None,:,:], axis = 1) + neginf_flag = (log_n_fdm_max[None,:] == -float("inf")) & (acc == -float("inf")) acc = tl.where(log_n_fdm_max[None,:] > acc, tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + log_n_fdm_max[None,:], tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc ) + acc = tl.where(neginf_flag, -float("inf"), acc) # Increment `epars_ptr` parpids_inc = tl.load(parpids_inc_ptr) @@ -1001,7 +1085,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, parpids: torch.Tensor, cs_group_size: int, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1) -> None: + partition_id: int = -1, allow_modify_flows: bool = False) -> None: assert params.dim() == 1, "Expecting a 1D `params`." 
@@ -1069,6 +1153,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo partial_eval = 1 if local_ids is not None else 0 GROUP_SIZE_M = cs_group_size GROUP_SIZE_K = self.group_size + allow_modify_flows = 1 if allow_modify_flows else 0 grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) @@ -1087,6 +1172,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo batch_size = batch_size, partial_eval = partial_eval, ptr_inc_step = ptr_inc_step, + allow_modify_flows = allow_modify_flows, BLOCK_B = BLOCK_B, TILE_SIZE_K = TILE_SIZE_K, K_NUM_TILES = K_NUM_TILES, @@ -1095,20 +1181,13 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo GROUP_SIZE_K = GROUP_SIZE_K ) - # This doesn't seem to be necessary, but is necessary to avoid producing nans - # Needs to investigate more - element_flows[0,:] = 0.0 - - # if element_flows.isnan().any() or element_flows.isinf().any(): - # import pdb; pdb.set_trace() - return None @staticmethod @triton.jit def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, - batch_size: tl.constexpr, num_edges: tl.constexpr, TILE_SIZE_B: tl.constexpr, - B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, + batch_size: tl.constexpr, num_edges: tl.constexpr, allow_modify_flows: tl.constexpr, + TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges @@ -1175,7 +1254,8 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, - cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor) -> None: + cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor, + allow_modify_flows: bool = False) -> None: """ Backward pass of sum layers w.r.t. sum parameters with the block-sparse processing kernel. @@ -1213,6 +1293,8 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor TILE_SIZE_K = min(2048 // TILE_SIZE_B, num_edges) B_NUM_TILES = batch_size // TILE_SIZE_B + allow_modify_flows = 1 if allow_modify_flows else 0 + assert TILE_SIZE_B >= 4, f"`TILE_SIZE_B` should be greater than 4 (but got {TILE_SIZE_B}) in order to use the block-sparse kernel. " \ "This is an internal error of PyJuice. Please consider checking the kernel dispatching criterions and use the " \ "corresponding sparse kernel instead." 
@@ -1231,6 +1313,7 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor pfids = pfids, batch_size = batch_size, num_edges = num_edges, + allow_modify_flows = allow_modify_flows, TILE_SIZE_B = TILE_SIZE_B, B_NUM_TILES = B_NUM_TILES, TILE_SIZE_K = TILE_SIZE_K, @@ -1238,17 +1321,13 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor GROUP_SIZE_M = self.group_size ) - # This doesn't seem to be necessary, but is necessary to avoid producing nans - # Needs to investigate more - param_flows[0:1] = param_flows[0:1] * 1 - def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: Optional[torch.Tensor], cids: Optional[torch.Tensor], pids: Optional[torch.Tensor], pfids: Optional[torch.Tensor], chids: Optional[torch.Tensor], parids: Optional[torch.Tensor], parpids: Optional[torch.Tensor], cs_group_size: int, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1) -> None: + partition_id: int = -1, allow_modify_flows: bool = False) -> None: """ Back pass of sum layers with sparse processing kernel. @@ -1269,14 +1348,16 @@ def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor self._backward_sparse_ele_flows( node_flows, element_flows, params, node_mars, element_mars, chids = chids, parids = parids, parpids = parpids, - cs_group_size = cs_group_size, local_ids = local_ids + cs_group_size = cs_group_size, local_ids = local_ids, + allow_modify_flows = allow_modify_flows ) # Flows w.r.t. parameters if param_flows is not None and nids is not None: self._backward_sparse_par_flows( node_flows, params, node_mars, element_mars, param_flows, - nids = nids, cids = cids, pids = pids, pfids = pfids + nids = nids, cids = cids, pids = pids, pfids = pfids, + allow_modify_flows = allow_modify_flows ) return None @@ -1344,7 +1425,8 @@ def _bk_triton_sparse_ele_kernel(node_flows, element_flows, node_mars, element_m def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, - parpids: torch.Tensor, cs_group_size: int, local_ids: Optional[torch.Tensor] = None) -> None: + parpids: torch.Tensor, cs_group_size: int, local_ids: Optional[torch.Tensor] = None, + allow_modify_flows: bool = False) -> None: assert params.dim() == 1, "Expecting a 1D `params`." @@ -1444,7 +1526,8 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa def _backward_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, param_flows: torch.Tensor, nids: torch.Tensor, - cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor) -> None: + cids: torch.Tensor, pids: torch.Tensor, pfids: torch.Tensor, + allow_modify_flows: bool = False) -> None: """ Backward pass of sum layers w.r.t. sum parameters with the block-sparse processing kernel. 
@@ -1619,4 +1702,4 @@ def _prepare_scope2nids(self, prod_scope_eleids: Sequence[Tuple[BitSet, torch.Te ) self.fw_scope2localids = fw_scope2localids - self.bk_scope2localids = bk_scope2localids + self.bk_scope2localids = bk_scope2localids \ No newline at end of file diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index e60a30d5..eeb87e62 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -182,7 +182,7 @@ def speed_test(): node_mars = torch.zeros([group_size + group_size * num_node_groups * num_prod_nodes, batch_size]).to(device) element_mars = torch.rand([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) - params = torch.rand([layer.partitioned_pids[0].max() + group_size]).to(device) + params = torch.rand([layer.partitioned_pids[0].max() + group_size ** 2]).to(device) ## Forward tests ## @@ -215,11 +215,11 @@ def speed_test(): backward_ms = (t1 - t0) / 100 * 1000 print(f"Backward pass on average takes {backward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 1.274ms.") + print("Reference computation time on RTX 4090: 1.814ms.") print("--------------------------------------------------------------") if __name__ == "__main__": torch.manual_seed(3890) - sum_layer_test() - # speed_test() \ No newline at end of file + # sum_layer_test() + speed_test() \ No newline at end of file diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index 03843305..0968507b 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -109,46 +109,12 @@ def hclt_test(): milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] ) - for batch in train_loader: - x = batch[0].to(device) + # for batch in train_loader: + # x = batch[0].to(device) - # optimizer.zero_grad() - - lls = pc(x, record_cudagraph = True) - lls.mean().backward() - break - - # optimizer.step() - # scheduler.step() - - # for _ in range(100): - # t0 = time.time() - # for batch in train_loader: - # x = batch[0].to(device) - - # lls = pc(x, apply_cudagraph = True) - # lls.mean().backward() - - # torch.cuda.synchronize() - # t1 = time.time() - # print(f"{(t1-t0)*1000:.3f}ms") - - # from torch.profiler import profile, record_function, ProfilerActivity - # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: - # for batch in train_loader: - # x = batch[0].to(device) - - # optimizer.zero_grad() - - # lls = pc(x) - # lls.mean().backward() - - # optimizer.step() - # scheduler.step() - - # break - - # prof.export_chrome_trace("trace_new2.json") + # lls = pc(x, record_cudagraph = True) + # lls.mean().backward() + # break mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) From f89a87ec9d28496d8e11e6bc9b5a5fc63c818348 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 15:25:45 +0800 Subject: [PATCH 082/162] temp update --- tests/structures/debug.py | 91 +++++++++++++-------------------------- 1 file changed, 31 insertions(+), 60 deletions(-) diff --git a/tests/structures/debug.py b/tests/structures/debug.py index 0d37d331..d72ee3f3 100644 --- a/tests/structures/debug.py +++ b/tests/structures/debug.py @@ -102,11 +102,11 @@ def ref_kernel(node_flows, element_flows, node_mars, element_mars, params, @triton.jit -def my_kernel(aaa, bbb, ccc, ddd, eee, node_flows, element_flows, node_mars, element_mars, params, - chids, parids_start, parids_increment, parpids_start, parpids_increment, - local_ids, batch_size: tl.constexpr, 
partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, - BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): +def my_kernel(aaa, bbb, ccc, node_flows, element_flows, node_mars, element_mars, params, + chids, parids_start, parids_increment, parpids_start, parpids_increment, + local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -129,11 +129,6 @@ def my_kernel(aaa, bbb, ccc, ddd, eee, node_flows, element_flows, node_mars, ele offs_ele[:,None] * GROUP_SIZE_K + \ (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - epars = tl.load(epars_ptr) - offs1 = pid_m * (TILE_SIZE_M * TILE_SIZE_K) + tl.arange(0, TILE_SIZE_M)[:,None] * TILE_SIZE_K + tl.arange(0, TILE_SIZE_K)[None,:] - tl.store(aaa + offs1, epars) - tl.store(bbb + offs1, offs_ele[:,None] * GROUP_SIZE_K + (par_start + offs_edge_nid)[None,:]) - # Batch offsets and mask offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B mask_batch = offs_batch < batch_size @@ -153,34 +148,35 @@ def my_kernel(aaa, bbb, ccc, ddd, eee, node_flows, element_flows, node_mars, ele # Inner loop acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - float("inf") - # acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - for k in range(0, K_NUM_TILES): - # for k in range(0, 1): + # for k in range(0, K_NUM_TILES): + for k in range(0, 1): epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] log_n_fdm = tl.log(nflows) - nmars log_n_fdm_max = tl.max(log_n_fdm, axis = 0) - n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), - tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) + n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) - offs2 = pid_m * (K_NUM_TILES * TILE_SIZE_K * batch_size) + k * (TILE_SIZE_K * batch_size) + tl.arange(0, TILE_SIZE_K)[:,None] * batch_size + offs_batch[None,:] - tl.store(ccc + offs2, log_n_fdm, mask = mask_batch[None,:]) - tl.store(ddd + offs2, n_fdm_sub, mask = mask_batch[None,:]) + offs_aaa = pid_m * (TILE_SIZE_K * batch_size) + tl.arange(0, TILE_SIZE_K)[:,None] * batch_size + offs_batch[None,:] + tl.store(aaa + offs_aaa, n_fdm_sub, mask = mask_batch[None,:]) partial_flows = tl.dot(epars, n_fdm_sub) # partial_flows = tl.sum(epars[:,:,None] * n_fdm_sub[None,:,:], axis = 1) - offs3 = pid_m * (K_NUM_TILES * TILE_SIZE_K * batch_size) + k * (TILE_SIZE_K * batch_size) + tl.arange(0, TILE_SIZE_M)[:,None] * batch_size + offs_batch[None,:] - tl.store(eee + offs3, partial_flows, mask = mask_batch[None,:]) + offs_bbb = pid_m * (TILE_SIZE_M * batch_size) + tl.arange(0, TILE_SIZE_M)[:,None] * batch_size + offs_batch[None,:] + tl.store(bbb + offs_bbb, partial_flows, mask = mask_batch[None,:]) + offs_ccc = pid_m * batch_size + offs_batch + tl.store(ccc + offs_ccc, log_n_fdm_max, mask = mask_batch) + + neginf_flag = (log_n_fdm_max[None,:] == -float("inf")) & (acc == -float("inf")) acc = tl.where(log_n_fdm_max[None,:] > acc, tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + 
log_n_fdm_max[None,:], tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc ) - # acc += partial_flows + acc = tl.where(neginf_flag, -float("inf"), acc) # Increment `epars_ptr` parpids_inc = tl.load(parpids_inc_ptr) @@ -199,7 +195,6 @@ def my_kernel(aaa, bbb, ccc, ddd, eee, node_flows, element_flows, node_mars, ele emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] eflows = tl.exp(acc + emars) - # eflows = acc # Write back offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] @@ -213,7 +208,7 @@ def main(): data = np.load("temp.npz") node_flows = torch.from_numpy(data["node_flows"]).to(device) - element_flows = torch.from_numpy(data["element_flow"]).to(device) + element_flows = torch.from_numpy(data["element_flows"]).to(device) node_mars = torch.from_numpy(data["node_mars"]).to(device) element_mars = torch.from_numpy(data["element_mars"]).to(device) params = torch.from_numpy(data["params"]).to(device) @@ -232,39 +227,11 @@ def main(): K_NUM_TILES = int(data["K_NUM_TILES"]) GROUP_SIZE_M = int(data["GROUP_SIZE_M"]) GROUP_SIZE_K = int(data["GROUP_SIZE_K"]) - OP_MODE = int(data["OP_MODE"]) layer_n_nodes = int(data["layer_n_nodes"]) - # node_flows = torch.rand(node_flows.size(), device = device) - grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - # grid = (1, triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - - # ref_kernel[grid]( - # node_flows = node_flows, - # element_flows = element_flows, - # node_mars = node_mars, - # element_mars = element_mars, - # params = params, - # chids = chids, - # parids_start = parids_start, - # parids_increment = parids_increment, - # parpids_start = parpids_start, - # parpids_increment = parpids_increment, - # local_ids = None, - # batch_size = batch_size, - # partial_eval = 0, - # ptr_inc_step = ptr_inc_step, - # BLOCK_B = BLOCK_B, - # TILE_SIZE_K = TILE_SIZE_K, - # K_NUM_TILES = K_NUM_TILES, - # TILE_SIZE_M = TILE_SIZE_M, - # GROUP_SIZE_M = GROUP_SIZE_M, - # GROUP_SIZE_K = GROUP_SIZE_K - # ) - - aaa = ref_kernel[grid] - aaa( + + ref_kernel[grid]( node_flows = node_flows, element_flows = element_flows, node_mars = node_mars, @@ -289,20 +256,24 @@ def main(): torch.cuda.synchronize() + import pdb; pdb.set_trace() + element_flows_ref = element_flows.clone() - aaa = torch.zeros([grid[1], TILE_SIZE_M, TILE_SIZE_K]).cuda() - bbb = torch.zeros([grid[1], TILE_SIZE_M, TILE_SIZE_K], dtype = torch.long).cuda() - ccc = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_K, batch_size]).cuda() - ddd = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_K, batch_size]).cuda() - eee = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_M, batch_size]).cuda() + # aaa = torch.zeros([grid[1], TILE_SIZE_M, TILE_SIZE_K]).cuda() + # bbb = torch.zeros([grid[1], TILE_SIZE_M, TILE_SIZE_K], dtype = torch.long).cuda() + # ccc = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_K, batch_size]).cuda() + # ddd = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_K, batch_size]).cuda() + # eee = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_M, batch_size]).cuda() + + aaa = torch.zeros([grid[1], TILE_SIZE_K, batch_size]).cuda() + bbb = torch.zeros([grid[1], TILE_SIZE_M, batch_size]).cuda() + ccc = torch.zeros([grid[1], batch_size]).cuda() my_kernel[grid]( aaa = aaa, bbb = bbb, ccc = ccc, - ddd = ddd, - eee = eee, node_flows = node_flows, element_flows = element_flows, node_mars = node_mars, From 1b45af0452058734b590f9296e07b13e29fa1ee2 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 15:41:27 +0800 Subject: 
[PATCH 083/162] update dependent packages --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0637cb56..fd45b996 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,10 @@ dependencies = [ "typing", "triton>=2.1.0", "networkx", - "numba" + "numba", + "packaging", + "matplotlib", + "tqdm" ] authors = [ {name="StarAI", email="guyvdb@cs.ucla.edu"}, From 413ac044bf6a2465b55f3bac26672c94d4728d24 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 17:35:38 +0800 Subject: [PATCH 084/162] improve thread-block allocation for sparse bk kernels --- src/pyjuice/layer/sum_layer.py | 58 ++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 66801078..3d01b6ce 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -1467,25 +1467,27 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to @staticmethod @triton.jit def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, - batch_size: tl.constexpr, num_edges: tl.constexpr, BLOCK_M: tl.constexpr, - BLOCK_B: tl.constexpr, B_NUM_BLOCKS: tl.constexpr): + num_edges: tl.constexpr, batch_size: tl.constexpr, BLOCK_M: tl.constexpr, + BLOCK_K: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_B: tl.constexpr, B_NUM_BLOCKS: tl.constexpr): - pid_m = tl.program_id(0) # ID of size-`BLOCK_M` nodes + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` samples + pid_e = tl.program_id(1) # ID of size-`BLOCK_K` edges + pid_m = tl.program_id(2) # ID of size-`BLOCK_M` nodes # Get inferred node group id from `pid_m` ngroup_id = pid_m // BLOCK_M tile_id = pid_m % BLOCK_M # Batch offsets and mask - offs_batch = tl.arange(0, BLOCK_B) + offs_batch = tl.arange(0, BLOCK_B) + pid_b * TILE_SIZE_B mask_batch = offs_batch < batch_size # Initialize pointers to `element_mars` - offs_edge = tl.arange(0, num_edges) + offs_edge = tl.arange(0, BLOCK_K) + pid_e * BLOCK_K edge_start = tl.load(cids + ngroup_id * num_edges + offs_edge) emars_ptr = element_mars + \ edge_start[:,None] * batch_size + \ - offs_batch[None,:] # [num_edges, BLOCK_B] + offs_batch[None,:] # [BLOCK_K, BLOCK_B] # Initialize pointers to `node_flows` and `node_mars` off_nids = tl.load(nids + ngroup_id) @@ -1493,10 +1495,13 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa nflows_ptr = node_flows + (off_nids + tile_id) * batch_size + offs_batch # [BLOCK_B] # Inner loop - acc = tl.zeros([num_edges], dtype = tl.float32) + acc = tl.zeros([BLOCK_K], dtype = tl.float32) for b in range(0, B_NUM_BLOCKS): - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] + # Update batch mask + mask_batch = (offs_batch < batch_size) + + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [BLOCK_K, BLOCK_B] nmars = tl.load(nmars_ptr, mask = mask_batch) # [BLOCK_B] nflows = tl.load(nflows_ptr, mask = mask_batch) # [BLOCK_B] @@ -1509,13 +1514,12 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa nmars_ptr += BLOCK_B nflows_ptr += BLOCK_B - # Update batch mask + # Update batch offsets offs_batch += BLOCK_B - mask_batch = offs_batch < batch_size par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) epars_ptr = params + par_start + tile_id - epars = tl.load(epars_ptr) # [num_edges] + epars = tl.load(epars_ptr) # [BLOCK_K] parflow_start = 
tl.load(pfids + ngroup_id * num_edges + offs_edge) eparflows_ptr = param_flows + parflow_start + tile_id @@ -1553,10 +1557,30 @@ def _backward_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Ten assert num_edges <= 16384, "The sparse backward kernel only support nodes with # edges smaller than 16384." - BLOCK_B = max(min(2048 // num_edges, BATCH_SIZE_NP2), 1) - BLOCK_M = self.group_size + if num_edges <= 1024: + BLOCK_B = max(min(2048 // num_edges, BATCH_SIZE_NP2), 1) + BLOCK_K = num_edges + BLOCK_M = max(min(2048 // num_edges, self.group_size), 1) + else: + BLOCK_B = min(512, BATCH_SIZE_NP2) + BLOCK_K = min(2048 // BLOCK_B, num_edges) + BLOCK_M = max(min(2048 // num_edges, self.group_size), 1) + B_NUM_BLOCKS = triton.cdiv(batch_size, BLOCK_B) + K_NUM_BLOCKS = triton.cdiv(num_edges, BLOCK_K) + + # When a thread-block is allocated for too much work, the overhead + # outweigh that incurred by `atomic_add`. Add more thread-blocks + # for parallel processing in this case. + if B_NUM_BLOCKS >= 4: + TILE_SIZE_B = 4 * BLOCK_B + B_NUM_BLOCKS = 4 + else: + TILE_SIZE_B = batch_size + B_NUM_TILES = triton.cdiv(batch_size, TILE_SIZE_B) - grid = (layer_n_nodes,) + allow_modify_flows = 1 if allow_modify_flows else 0 + + grid = (B_NUM_TILES, K_NUM_BLOCKS, layer_n_nodes) self._bk_triton_sparse_par_kernel[grid]( node_flows = node_flows, @@ -1568,11 +1592,13 @@ def _backward_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Ten cids = cids, pids = pids, pfids = pfids, + num_edges = num_edges, batch_size = batch_size, - num_edges = num_edges, BLOCK_M = BLOCK_M, + BLOCK_K = BLOCK_K, BLOCK_B = BLOCK_B, - B_NUM_BLOCKS = triton.cdiv(batch_size, BLOCK_B) + TILE_SIZE_B = TILE_SIZE_B, + B_NUM_BLOCKS = B_NUM_BLOCKS ) def _backward_pytorch(self, node_flows, element_flows, params, node_mars, From dd8428c999617e78e2f351b9519af32a3b549b3b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 17:50:56 +0800 Subject: [PATCH 085/162] implement flow-modified backward pass --- src/pyjuice/layer/sum_layer.py | 49 +++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 3d01b6ce..32b74cf1 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -1043,9 +1043,13 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele for k in range(0, K_NUM_TILES): epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - log_n_fdm = tl.log(nflows) - nmars + if allow_modify_flows == 1: + log_n_fdm = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + else: + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + log_n_fdm = tl.log(nflows) - nmars + log_n_fdm_max = tl.max(log_n_fdm, axis = 0) n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) @@ -1220,9 +1224,13 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para for b in range(0, B_NUM_TILES): emars = tl.load(emars_ptr, mask = mask_batch[:,None]) # [TILE_SIZE_B, TILE_SIZE_K] nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] - log_n_fdm = tl.log(nflows) - nmars + if 
allow_modify_flows == 1: + log_n_fdm = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] + else: + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] + log_n_fdm = tl.log(nflows) - nmars + log_n_fdm_max = tl.max(log_n_fdm, axis = 0) n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) @@ -1366,8 +1374,8 @@ def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor @triton.jit def _bk_triton_sparse_ele_kernel(node_flows, element_flows, node_mars, element_mars, params, chids, parids, parpids, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, - n_edge_groups: tl.constexpr, BLOCK_B: tl.constexpr, BLOCK_M: tl.constexpr, - GROUP_SIZE_K: tl.constexpr): + n_edge_groups: tl.constexpr, allow_modify_flows: tl.constexpr, BLOCK_B: tl.constexpr, + BLOCK_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -1399,7 +1407,10 @@ def _bk_triton_sparse_ele_kernel(node_flows, element_flows, node_mars, element_m nflows_ptr = node_flows + \ (edge_start + offs_edge_nid)[:,None] * batch_size + \ offs_batch[None,:] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] + if allow_modify_flows == 1: + log_n_fdm = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] + else: + nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [num_edges, BLOCK_B] # Initialize pointers to `element_flows` and `element_mars` off_eleids = tl.load(chids + elegroup_id) @@ -1411,7 +1422,10 @@ def _bk_triton_sparse_ele_kernel(node_flows, element_flows, node_mars, element_m epars = tl.load(epars_ptr) # [num_edges] emars = tl.load(emars_ptr, mask = mask_batch) # [BLOCK_B] - eflows = tl.sum(nflows * epars[:,None] * tl.exp(emars[None,:] - nmars), axis = 0) + if allow_modify_flows == 1: + eflows = tl.sum(epars[:,None] * tl.exp(emars[None,:] - log_n_fdm), axis = 0) + else: + eflows = tl.sum(nflows * epars[:,None] * tl.exp(emars[None,:] - nmars), axis = 0) tl.store(eflows_ptr, eflows, mask = mask_batch) @@ -1442,6 +1456,8 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to BLOCK_B = max(min(2048 // num_edges, BATCH_SIZE_NP2), 1) BLOCK_M = cs_group_size + allow_modify_flows = 1 if allow_modify_flows else 0 + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, BLOCK_M)) self._bk_triton_sparse_ele_kernel[grid]( @@ -1457,6 +1473,7 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to batch_size = batch_size, partial_eval = 1 if local_ids is not None else 0, n_edge_groups = n_edge_groups, + allow_modify_flows = allow_modify_flows, BLOCK_B = BLOCK_B, BLOCK_M = BLOCK_M, GROUP_SIZE_K = self.group_size @@ -1467,8 +1484,9 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to @staticmethod @triton.jit def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, - num_edges: tl.constexpr, batch_size: tl.constexpr, BLOCK_M: tl.constexpr, - BLOCK_K: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_B: tl.constexpr, B_NUM_BLOCKS: tl.constexpr): + num_edges: tl.constexpr, batch_size: tl.constexpr, allow_modify_flows: tl.constexpr, + BLOCK_M: tl.constexpr, BLOCK_K: tl.constexpr, BLOCK_B: tl.constexpr, + TILE_SIZE_B: tl.constexpr, B_NUM_BLOCKS: tl.constexpr): pid_b = tl.program_id(0) # ID of 
size-`BLOCK_B` samples pid_e = tl.program_id(1) # ID of size-`BLOCK_K` edges @@ -1503,9 +1521,13 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [BLOCK_K, BLOCK_B] nmars = tl.load(nmars_ptr, mask = mask_batch) # [BLOCK_B] - nflows = tl.load(nflows_ptr, mask = mask_batch) # [BLOCK_B] - pflows = tl.sum(nflows[None,:] * tl.exp(emars - nmars[None,:]), axis = 1) + if allow_modify_flows == 1: + log_n_fdm = tl.load(nflows_ptr, mask = mask_batch) # [BLOCK_B] + pflows = tl.sum(tl.exp(emars - log_n_fdm[None,:]), axis = 1) + else: + nflows = tl.load(nflows_ptr, mask = mask_batch) # [BLOCK_B] + pflows = tl.sum(nflows[None,:] * tl.exp(emars - nmars[None,:]), axis = 1) acc += pflows @@ -1593,7 +1615,8 @@ def _backward_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Ten pids = pids, pfids = pfids, num_edges = num_edges, - batch_size = batch_size, + batch_size = batch_size, + allow_modify_flows = allow_modify_flows, BLOCK_M = BLOCK_M, BLOCK_K = BLOCK_K, BLOCK_B = BLOCK_B, From 2701fd85585305d658e3900c2649cbc406d4ad1c Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 19:04:24 +0800 Subject: [PATCH 086/162] change optimizer behavior --- src/pyjuice/optim/optim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pyjuice/optim/optim.py b/src/pyjuice/optim/optim.py index a2eefe41..16258395 100644 --- a/src/pyjuice/optim/optim.py +++ b/src/pyjuice/optim/optim.py @@ -23,11 +23,11 @@ def __init__(self, pc: TensorCircuit, base_optimizer: Optional[Optimizer] = None self.lr = lr self.pseudocount = pseudocount - def zero_grad(self, flows_memory: float = 0.0): + def zero_grad(self): if self.base_optimizer is not None: self.base_optimizer.zero_grad() - self.pc._optim_hyperparams["flows_memory"] = flows_memory + self.pc.init_param_flows(flows_memory = 0.0) def step(self, closure = None): if self.base_optimizer is not None: From b024bce36087a38bc61df99d9155ec61f7809cce Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 19:04:37 +0800 Subject: [PATCH 087/162] hclt tests passed --- src/pyjuice/layer/sum_layer.py | 10 ++++++---- src/pyjuice/model/tensorcircuit.py | 28 ++++++++++++++++++++-------- tests/structures/hclt_test.py | 13 ++++++------- 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 32b74cf1..afc1e59d 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -317,7 +317,9 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, self._backward( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids = nids, - cids = cids, pids = pids, pfids = pfids, partition_id = partition_id + cids = cids, pids = pids, pfids = pfids, + partition_id = partition_id, + allow_modify_flows = allow_modify_flows ) return None @@ -1423,7 +1425,7 @@ def _bk_triton_sparse_ele_kernel(node_flows, element_flows, node_mars, element_m emars = tl.load(emars_ptr, mask = mask_batch) # [BLOCK_B] if allow_modify_flows == 1: - eflows = tl.sum(epars[:,None] * tl.exp(emars[None,:] - log_n_fdm), axis = 0) + eflows = tl.sum(epars[:,None] * tl.exp(emars[None,:] + log_n_fdm), axis = 0) else: eflows = tl.sum(nflows * epars[:,None] * tl.exp(emars[None,:] - nmars), axis = 0) @@ -1520,12 +1522,12 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa mask_batch = (offs_batch < batch_size) emars = tl.load(emars_ptr, mask = 
mask_batch[None,:]) # [BLOCK_K, BLOCK_B] - nmars = tl.load(nmars_ptr, mask = mask_batch) # [BLOCK_B] if allow_modify_flows == 1: log_n_fdm = tl.load(nflows_ptr, mask = mask_batch) # [BLOCK_B] - pflows = tl.sum(tl.exp(emars - log_n_fdm[None,:]), axis = 1) + pflows = tl.sum(tl.exp(emars + log_n_fdm[None,:]), axis = 1) else: + nmars = tl.load(nmars_ptr, mask = mask_batch) # [BLOCK_B] nflows = tl.load(nflows_ptr, mask = mask_batch) # [BLOCK_B] pflows = tl.sum(nflows[None,:] * tl.exp(emars - nmars[None,:]), axis = 1) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index f85f3a60..f4464b3f 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -223,6 +223,7 @@ def backward(self, inputs: Optional[torch.Tensor] = None, return_cache: bool = False, record_cudagraph: bool = False, apply_cudagraph: bool = True, + allow_modify_flows: bool = True, **kwargs): """ Compute circuit flows. @@ -247,15 +248,19 @@ def backward(self, inputs: Optional[torch.Tensor] = None, self._init_buffer(name = "element_flows", shape = (self.num_elements, B), set_value = 0.0) # Set root node flows - if ll_weights is None: - self.node_flows[self._root_node_range[0]:self._root_node_range[1],:] = 1.0 - else: - if ll_weights.dim() == 1: - ll_weights = ll_weights.unsqueeze(1) + def _set_root_node_flows(): + nonlocal ll_weights + if ll_weights is None: + self.node_flows[self._root_node_range[0]:self._root_node_range[1],:] = 1.0 + else: + if ll_weights.dim() == 1: + ll_weights = ll_weights.unsqueeze(1) - assert ll_weights.size(0) == self.num_root_nodes + assert ll_weights.size(0) == self.num_root_nodes - self.node_flows[self._root_node_range[0]:self._root_node_range[1],:] = ll_weights + self.node_flows[self._root_node_range[0]:self._root_node_range[1],:] = ll_weights + + _set_root_node_flows() # Load cached node flows if self._buffer_matches(name = "node_flows", cache = cache): @@ -271,6 +276,8 @@ def backward(self, inputs: Optional[torch.Tensor] = None, # Inner layers def _run_inner_layers(): + + # Backward pass for inner layers for layer_id in range(len(self.inner_layer_groups) - 1, -1, -1): layer_group = self.inner_layer_groups[layer_id] @@ -286,7 +293,8 @@ def _run_inner_layers(): # Backward sum layer layer_group.backward(self.node_flows, self.element_flows, self.node_mars, self.element_mars, self.params, - param_flows = self.param_flows if compute_param_flows else None) + param_flows = self.param_flows if compute_param_flows else None, + allow_modify_flows = allow_modify_flows) else: raise ValueError(f"Unknown layer type {type(layer)}.") @@ -298,10 +306,14 @@ def _run_inner_layers(): s.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(s): for _ in range(3): + self.node_flows[:,:] = 0.0 + _set_root_node_flows() _run_inner_layers() torch.cuda.current_stream().wait_stream(s) # Capture + self.node_flows[:,:] = 0.0 + _set_root_node_flows() g = torch.cuda.CUDAGraph() with torch.cuda.graph(g): _run_inner_layers() diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index 0968507b..db7f5ad4 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -51,7 +51,7 @@ def full_batch_em_epoch(pc, train_loader, test_loader, device): x = batch[0].to(device) lls = pc(x) - pc.backward(x, flows_memory = 1.0) + lls.mean().backward() train_ll += lls.mean().detach().cpu().numpy().item() @@ -109,13 +109,12 @@ def hclt_test(): milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] ) - # for batch in 
train_loader: - # x = batch[0].to(device) - - # lls = pc(x, record_cudagraph = True) - # lls.mean().backward() - # break + for batch in train_loader: + x = batch[0].to(device) + lls = pc(x, record_cudagraph = True) + lls.mean().backward() + break mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) full_batch_em_epoch(pc, train_loader, test_loader, device) From 46804079fab844144cd762141d8af758b1d9cd5c Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 19:15:38 +0800 Subject: [PATCH 088/162] refactor io --- src/pyjuice/io/serialization.py | 10 ++++++---- tests/io/io_test.py | 24 +++++++++++++----------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/pyjuice/io/serialization.py b/src/pyjuice/io/serialization.py index 55c578fc..0f3a8c71 100644 --- a/src/pyjuice/io/serialization.py +++ b/src/pyjuice/io/serialization.py @@ -22,7 +22,8 @@ def serialize_nodes(root_ns: CircuitNodes): ns_info = { "type": ntype, - "num_nodes": ns.num_nodes, + "num_node_groups": ns.num_node_groups, + "group_size": ns.group_size, "chs": tuple(ns2id[cs] for cs in ns.chs) } @@ -50,14 +51,15 @@ def serialize_nodes(root_ns: CircuitNodes): def deserialize_nodes(nodes_list: Sequence): id2ns = dict() for ns_id, ns_info in enumerate(nodes_list): - num_nodes = ns_info["num_nodes"] + num_node_groups = ns_info["num_node_groups"] + group_size = ns_info["group_size"] chids = ns_info["chs"] if ns_info["type"] == "Input": scope = ns_info["scope"] dist = pickle.loads(ns_info["dist"]) - ns = inputs(scope, num_nodes, dist) + ns = inputs(scope, num_node_groups, dist) if "params" in ns_info: ns._params = torch.from_numpy(ns_info["params"]) @@ -78,7 +80,7 @@ def deserialize_nodes(nodes_list: Sequence): else: params = None - ns = summate(*chs, edge_ids = edge_ids, params = params) + ns = summate(*chs, edge_ids = edge_ids, params = params, group_size = group_size) id2ns[ns_id] = ns diff --git a/tests/io/io_test.py b/tests/io/io_test.py index 22562a11..0cf013a0 100644 --- a/tests/io/io_test.py +++ b/tests/io/io_test.py @@ -12,17 +12,19 @@ def io_test(): - num_nodes = 2 - - i00 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i01 = inputs(0, num_nodes, dists.Categorical(num_cats = 5)) - i10 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - i11 = inputs(1, num_nodes, dists.Categorical(num_cats = 5)) - - m00 = multiply(i00, i10) - m01 = multiply(i01, i11) - - n0 = summate(m00, m01, num_nodes = num_nodes) + num_node_groups = 2 + group_size = 4 + + with juice.set_group_size(group_size): + i00 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i01 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i10 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + i11 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + + m00 = multiply(i00, i10) + m01 = multiply(i01, i11) + + n0 = summate(m00, m01, num_node_groups = num_node_groups) temp_file = tempfile.NamedTemporaryFile(suffix='.jpc') temp_file_name = temp_file.name From f8cf2c4bcbf3b3e17c87029409b2f3c074767b5d Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 20:35:58 +0800 Subject: [PATCH 089/162] restore layer runtests --- tests/layer/matmul_kernel_test.py | 114 ------------------------------ tests/layer/sum_layer_test.py | 10 +-- 2 files changed, 6 insertions(+), 118 deletions(-) delete mode 100644 tests/layer/matmul_kernel_test.py diff --git a/tests/layer/matmul_kernel_test.py b/tests/layer/matmul_kernel_test.py deleted file mode 100644 index 
8a476f36..00000000 --- a/tests/layer/matmul_kernel_test.py +++ /dev/null @@ -1,114 +0,0 @@ -import pyjuice as juice -import torch -import numpy as np -import time -import random - -import pyjuice.nodes.distributions as dists -from pyjuice.utils import BitSet -from pyjuice.nodes import multiply, summate, inputs -from pyjuice.model import TensorCircuit - -from pyjuice.layer import InputLayer, ProdLayer, SumLayer - -import pytest - - -import triton -import triton.language as tl - - -@triton.jit -def kernel1(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a).to(tl.float16) - - offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b).to(tl.float16) - - cc = tl.dot(aa, bb).to(tl.float32) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -@triton.jit -def kernel2(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a)#.to(tl.float16) - - offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b)#.to(tl.float16) - - cc = tl.sum(aa[:,:,None] * bb[None,:,:], axis = 1)#.to(tl.float32) - - # cc = tl.dot(aa, bb) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -@triton.jit -def kernel3(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a) - - offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b) - - aa = tl.view(tl.broadcast_to(aa[:,None,:], (M, 16 // M, N)), (16, N)) - cc = tl.dot(aa, bb) - cc = tl.max(tl.view(cc, (M, 16 // M, N)), axis = 1) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -if __name__ == "__main__": - import time - - M = 1 - N = 4 - K = 1 - - a = torch.rand([M, N]).cuda() - b = torch.rand([N, K]).cuda() - c = torch.zeros([M, K]).cuda() - - grid = (1,) - - # kernel1[grid](a, b, c, M, N, K) - - # torch.cuda.synchronize() - # t0 = time.time() - # for _ in range(100): - # kernel1[grid](a, b, c, M, N, K) - # torch.cuda.synchronize() - # t1 = time.time() - - # print((t1 - t0) / 100 * 1000) - - kernel2[grid](a, b, c, M, N, K) - - # torch.cuda.synchronize() - # t0 = time.time() - # for _ in range(100): - # kernel2[grid](a, b, c, M, N, K) - # torch.cuda.synchronize() - # t1 = time.time() - - # print((t1 - t0) / 100 * 1000) - - cc = torch.matmul(a, b) - - print((c - cc).abs().max()) - - ccc = c - - import pdb; pdb.set_trace() \ No newline at end of file diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index eeb87e62..4efe88c0 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -204,22 +204,24 @@ def speed_test(): element_flows = torch.zeros([group_size + num_prod_nodes * group_size * num_node_groups, batch_size]).log().to(device) param_flows = torch.zeros([group_size ** 2 + layer.partitioned_pids[0].max() + group_size]).to(device) - layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) + layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows, + allow_modify_flows = True) t0 = time.time() torch.cuda.synchronize() for _ in range(100): - 
layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) + layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows, + allow_modify_flows = True) torch.cuda.synchronize() t1 = time.time() backward_ms = (t1 - t0) / 100 * 1000 print(f"Backward pass on average takes {backward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 1.814ms.") + print("Reference computation time on RTX 4090: 2.175ms.") print("--------------------------------------------------------------") if __name__ == "__main__": torch.manual_seed(3890) - # sum_layer_test() + sum_layer_test() speed_test() \ No newline at end of file From 96f24690bd9b963c4b536a896919ea73cba3d99b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 20:49:08 +0800 Subject: [PATCH 090/162] fix `simple_model_test` --- tests/model/simple_model_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model/simple_model_test.py b/tests/model/simple_model_test.py index 94bbaa18..a27072d8 100644 --- a/tests/model/simple_model_test.py +++ b/tests/model/simple_model_test.py @@ -273,7 +273,7 @@ def simple_model_test(): ## Backward pass ## - lls.mean().backward() + pc.backward(data.permute(1, 0), allow_modify_flows = False) node_flows = pc.node_flows.cpu() param_flows = pc.param_flows.cpu() From 785e3ae605dfc75f769952077c7709175ae207e2 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 20:49:33 +0800 Subject: [PATCH 091/162] fix compilation tests --- src/pyjuice/model/backend/parflow_fusing.py | 8 ++++---- src/pyjuice/nodes/construction.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/pyjuice/model/backend/parflow_fusing.py b/src/pyjuice/model/backend/parflow_fusing.py index e3c3741d..e285ff8b 100644 --- a/src/pyjuice/model/backend/parflow_fusing.py +++ b/src/pyjuice/model/backend/parflow_fusing.py @@ -37,7 +37,7 @@ def compile_cum_par_flows_fn(node2tiednodes, MAX_NGROUPS = 2048, BLOCK_SIZE = 20 target_pfids = [] block_sizes = [] - ch_pfids = [] + child_pfids = [] for kernel_spec in kernel_specs: pfid_start, num_par_flows, ch_pfids = kernel_spec for blk_start in range(0, num_par_flows, BLOCK_M): @@ -49,13 +49,13 @@ def compile_cum_par_flows_fn(node2tiednodes, MAX_NGROUPS = 2048, BLOCK_SIZE = 20 target_pfids.append(pfid_start + blk_start) block_sizes.append(blk_size) - ch_pfids.append() + child_pfids.append(ch_pfid) target_pfids = torch.tensor(target_pfids).contiguous() block_sizes = torch.tensor(block_sizes).contiguous() - ch_pfids = torch.tensor(ch_pfids).contiguous() + child_pfids = torch.tensor(child_pfids).contiguous() - kernels_args.append([target_pfids, block_sizes, ch_pfids, BLOCK_G, BLOCK_M]) + kernels_args.append([target_pfids, block_sizes, child_pfids, BLOCK_G, BLOCK_M]) return kernels_args diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index 9cc72d09..fa6a6986 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -67,7 +67,6 @@ def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, **k def summate(nodes1: SumNodesChs, *args, num_nodes: int = 0, num_node_groups: int = 0, edge_ids: Optional[Tensor] = None, group_size: int = 0, **kwargs): if num_nodes > 0: - assert edge_ids is None assert num_node_groups == 0, "Only one of `num_nodes` and `num_node_groups` can be set at the same time." 
if group_size == 0: group_size = CircuitNodes.DEFAULT_GROUP_SIZE From 76c6fc35805f1af30ee30206b91c3d7935b820d9 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 20:54:47 +0800 Subject: [PATCH 092/162] make `juice.inputs` backward compatible --- src/pyjuice/nodes/construction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index fa6a6986..5c3361a9 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -18,8 +18,8 @@ SumNodesChs = Union[ProdNodes,InputNodes] -def inputs(var: Union[int,Sequence[int]], num_node_groups: int, dist: Distribution, params: Optional[Tensor] = None, - num_nodes: int = 0, group_size: int = 0, **kwargs): +def inputs(var: Union[int,Sequence[int]], num_node_groups: int = 0, dist: Distribution = Distribution(), + params: Optional[Tensor] = None, num_nodes: int = 0, group_size: int = 0, **kwargs): if num_nodes > 0: assert num_node_groups == 0, "Only one of `num_nodes` and `num_node_groups` can be set at the same time." From a8b9ee9128be11d42a1cecf14c660021568351bd Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 20:56:05 +0800 Subject: [PATCH 093/162] restore `forward_test` --- tests/model/forward_test.py | 58 ++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/tests/model/forward_test.py b/tests/model/forward_test.py index bcc9ccfc..8ac68e69 100644 --- a/tests/model/forward_test.py +++ b/tests/model/forward_test.py @@ -28,8 +28,6 @@ def forward_test(): pc = TensorCircuit(n) - import pdb; pdb.set_trace() - device = torch.device("cuda:0") pc.to(device) @@ -39,14 +37,14 @@ def forward_test(): ## Unit tests for forward pass ## - assert torch.abs(pc.node_mars[1,0] - torch.log(pc.input_layers[0].params[data[0,0]])) < 1e-4 - assert torch.abs(pc.node_mars[2,0] - torch.log(pc.input_layers[0].params[2+data[0,0]])) < 1e-4 - assert torch.abs(pc.node_mars[3,0] - torch.log(pc.input_layers[0].params[4+data[0,1]])) < 1e-4 - assert torch.abs(pc.node_mars[4,0] - torch.log(pc.input_layers[0].params[6+data[0,1]])) < 1e-4 - assert torch.abs(pc.node_mars[5,0] - torch.log(pc.input_layers[0].params[8+data[0,2]])) < 1e-4 - assert torch.abs(pc.node_mars[6,0] - torch.log(pc.input_layers[0].params[10+data[0,2]])) < 1e-4 - assert torch.abs(pc.node_mars[7,0] - torch.log(pc.input_layers[0].params[12+data[0,3]])) < 1e-4 - assert torch.abs(pc.node_mars[8,0] - torch.log(pc.input_layers[0].params[14+data[0,3]])) < 1e-4 + assert torch.abs(pc.node_mars[1,0] - torch.log(pc.input_layer_group[0].params[data[0,0]])) < 1e-4 + assert torch.abs(pc.node_mars[2,0] - torch.log(pc.input_layer_group[0].params[2+data[0,0]])) < 1e-4 + assert torch.abs(pc.node_mars[3,0] - torch.log(pc.input_layer_group[0].params[4+data[0,1]])) < 1e-4 + assert torch.abs(pc.node_mars[4,0] - torch.log(pc.input_layer_group[0].params[6+data[0,1]])) < 1e-4 + assert torch.abs(pc.node_mars[5,0] - torch.log(pc.input_layer_group[0].params[8+data[0,2]])) < 1e-4 + assert torch.abs(pc.node_mars[6,0] - torch.log(pc.input_layer_group[0].params[10+data[0,2]])) < 1e-4 + assert torch.abs(pc.node_mars[7,0] - torch.log(pc.input_layer_group[0].params[12+data[0,3]])) < 1e-4 + assert torch.abs(pc.node_mars[8,0] - torch.log(pc.input_layer_group[0].params[14+data[0,3]])) < 1e-4 p1 = torch.exp(pc.node_mars[1,0] + pc.node_mars[3,0]) p2 = torch.exp(pc.node_mars[1,0] + pc.node_mars[4,0]) @@ -103,18 +101,18 @@ def non_sd_pc_forward_test(): ## Unit tests for forward 
pass ## - assert torch.abs(pc.node_mars[1,0] - torch.log(pc.input_layers[0].params[data[0,0]])) < 1e-3 - assert torch.abs(pc.node_mars[2,0] - torch.log(pc.input_layers[0].params[2+data[0,0]])) < 1e-3 - assert torch.abs(pc.node_mars[3,0] - torch.log(pc.input_layers[0].params[4+data[0,1]])) < 1e-3 - assert torch.abs(pc.node_mars[4,0] - torch.log(pc.input_layers[0].params[6+data[0,1]])) < 1e-3 - assert torch.abs(pc.node_mars[5,0] - torch.log(pc.input_layers[0].params[8+data[0,2]])) < 1e-3 - assert torch.abs(pc.node_mars[6,0] - torch.log(pc.input_layers[0].params[10+data[0,2]])) < 1e-3 - assert torch.abs(pc.node_mars[7,0] - torch.log(pc.input_layers[0].params[12+data[0,1]])) < 1e-3 - assert torch.abs(pc.node_mars[8,0] - torch.log(pc.input_layers[0].params[14+data[0,1]])) < 1e-3 - assert torch.abs(pc.node_mars[9,0] - torch.log(pc.input_layers[0].params[16+data[0,2]])) < 1e-3 - assert torch.abs(pc.node_mars[10,0] - torch.log(pc.input_layers[0].params[18+data[0,2]])) < 1e-3 - assert torch.abs(pc.node_mars[11,0] - torch.log(pc.input_layers[0].params[20+data[0,0]])) < 1e-3 - assert torch.abs(pc.node_mars[12,0] - torch.log(pc.input_layers[0].params[22+data[0,0]])) < 1e-3 + assert torch.abs(pc.node_mars[1,0] - torch.log(pc.input_layer_group[0].params[data[0,0]])) < 1e-3 + assert torch.abs(pc.node_mars[2,0] - torch.log(pc.input_layer_group[0].params[2+data[0,0]])) < 1e-3 + assert torch.abs(pc.node_mars[3,0] - torch.log(pc.input_layer_group[0].params[4+data[0,1]])) < 1e-3 + assert torch.abs(pc.node_mars[4,0] - torch.log(pc.input_layer_group[0].params[6+data[0,1]])) < 1e-3 + assert torch.abs(pc.node_mars[5,0] - torch.log(pc.input_layer_group[0].params[8+data[0,2]])) < 1e-3 + assert torch.abs(pc.node_mars[6,0] - torch.log(pc.input_layer_group[0].params[10+data[0,2]])) < 1e-3 + assert torch.abs(pc.node_mars[7,0] - torch.log(pc.input_layer_group[0].params[12+data[0,1]])) < 1e-3 + assert torch.abs(pc.node_mars[8,0] - torch.log(pc.input_layer_group[0].params[14+data[0,1]])) < 1e-3 + assert torch.abs(pc.node_mars[9,0] - torch.log(pc.input_layer_group[0].params[16+data[0,2]])) < 1e-3 + assert torch.abs(pc.node_mars[10,0] - torch.log(pc.input_layer_group[0].params[18+data[0,2]])) < 1e-3 + assert torch.abs(pc.node_mars[11,0] - torch.log(pc.input_layer_group[0].params[20+data[0,0]])) < 1e-3 + assert torch.abs(pc.node_mars[12,0] - torch.log(pc.input_layer_group[0].params[22+data[0,0]])) < 1e-3 f1 = torch.exp(pc.node_mars[1,0] + pc.node_mars[3,0]) * pc.params[1] f2 = torch.exp(pc.node_mars[2,0] + pc.node_mars[4,0]) * pc.params[2] @@ -167,14 +165,14 @@ def sparse_pc_forward_test(): ## Unit tests for forward pass ## - assert torch.abs(pc.node_mars[1,0] - torch.log(pc.input_layers[0].params[data[0,0]])) < 1e-4 - assert torch.abs(pc.node_mars[2,0] - torch.log(pc.input_layers[0].params[2+data[0,0]])) < 1e-4 - assert torch.abs(pc.node_mars[3,0] - torch.log(pc.input_layers[0].params[4+data[0,1]])) < 1e-4 - assert torch.abs(pc.node_mars[4,0] - torch.log(pc.input_layers[0].params[6+data[0,1]])) < 1e-4 - assert torch.abs(pc.node_mars[5,0] - torch.log(pc.input_layers[0].params[8+data[0,2]])) < 1e-4 - assert torch.abs(pc.node_mars[6,0] - torch.log(pc.input_layers[0].params[10+data[0,2]])) < 1e-4 - assert torch.abs(pc.node_mars[7,0] - torch.log(pc.input_layers[0].params[12+data[0,3]])) < 1e-4 - assert torch.abs(pc.node_mars[8,0] - torch.log(pc.input_layers[0].params[14+data[0,3]])) < 1e-4 + assert torch.abs(pc.node_mars[1,0] - torch.log(pc.input_layer_group[0].params[data[0,0]])) < 1e-4 + assert torch.abs(pc.node_mars[2,0] - 
torch.log(pc.input_layer_group[0].params[2+data[0,0]])) < 1e-4 + assert torch.abs(pc.node_mars[3,0] - torch.log(pc.input_layer_group[0].params[4+data[0,1]])) < 1e-4 + assert torch.abs(pc.node_mars[4,0] - torch.log(pc.input_layer_group[0].params[6+data[0,1]])) < 1e-4 + assert torch.abs(pc.node_mars[5,0] - torch.log(pc.input_layer_group[0].params[8+data[0,2]])) < 1e-4 + assert torch.abs(pc.node_mars[6,0] - torch.log(pc.input_layer_group[0].params[10+data[0,2]])) < 1e-4 + assert torch.abs(pc.node_mars[7,0] - torch.log(pc.input_layer_group[0].params[12+data[0,3]])) < 1e-4 + assert torch.abs(pc.node_mars[8,0] - torch.log(pc.input_layer_group[0].params[14+data[0,3]])) < 1e-4 p1 = torch.exp(pc.node_mars[1,0] + pc.node_mars[3,0]) p2 = torch.exp(pc.node_mars[1,0] + pc.node_mars[4,0]) From e54b314948eb99f1e288eb0e08adc58f9d04118b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 21:32:16 +0800 Subject: [PATCH 094/162] fix `backward_test` --- tests/model/backward_test.py | 72 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/tests/model/backward_test.py b/tests/model/backward_test.py index 34af0abb..3fd59541 100644 --- a/tests/model/backward_test.py +++ b/tests/model/backward_test.py @@ -35,7 +35,7 @@ def backward_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0), allow_modify_flows = False) ## Unit tests for backward pass ## @@ -72,8 +72,8 @@ def backward_test(): pf11 = (pc.node_flows[13,:] * torch.exp(pc.node_mars[9,:] + pc.node_mars[11,:]) * pc.params[13].unsqueeze(0) / torch.exp(pc.node_mars[13,:])).sum(dim = 0) pf12 = (pc.node_flows[13,:] * torch.exp(pc.node_mars[10,:] + pc.node_mars[12,:]) * pc.params[14].unsqueeze(0) / torch.exp(pc.node_mars[13,:])).sum(dim = 0) - assert torch.abs(pc.param_flows[13] - pf11) < 1e-4 - assert torch.abs(pc.param_flows[14] - pf12) < 1e-4 + assert torch.abs(pc.param_flows[12] - pf11) < 1e-4 + assert torch.abs(pc.param_flows[13] - pf12) < 1e-4 pf21 = pc.node_flows[9,:] * torch.exp(pc.node_mars[1,:] + pc.node_mars[3,:]) * pc.params[1].unsqueeze(0) / torch.exp(pc.node_mars[9,:]) pf22 = pc.node_flows[9,:] * torch.exp(pc.node_mars[1,:] + pc.node_mars[4,:]) * pc.params[2].unsqueeze(0) / torch.exp(pc.node_mars[9,:]) pf23 = pc.node_flows[9,:] * torch.exp(pc.node_mars[2,:] + pc.node_mars[3,:]) * pc.params[3].unsqueeze(0) / torch.exp(pc.node_mars[9,:]) @@ -82,35 +82,37 @@ def backward_test(): pf26 = pc.node_flows[10,:] * torch.exp(pc.node_mars[1,:] + pc.node_mars[4,:]) * pc.params[6].unsqueeze(0) / torch.exp(pc.node_mars[10,:]) pf27 = pc.node_flows[10,:] * torch.exp(pc.node_mars[2,:] + pc.node_mars[3,:]) * pc.params[7].unsqueeze(0) / torch.exp(pc.node_mars[10,:]) pf28 = pc.node_flows[10,:] * torch.exp(pc.node_mars[2,:] + pc.node_mars[4,:]) * pc.params[8].unsqueeze(0) / torch.exp(pc.node_mars[10,:]) - assert torch.abs(pc.param_flows[1] - pf21.sum(dim = 0)) < 1e-4 - assert torch.abs(pc.param_flows[2] - pf22.sum(dim = 0)) < 1e-4 - assert torch.abs(pc.param_flows[3] - pf23.sum(dim = 0)) < 1e-4 - assert torch.abs(pc.param_flows[4] - pf24.sum(dim = 0)) < 1e-4 - assert torch.abs(pc.param_flows[5] - pf25.sum(dim = 0)) < 1e-4 - assert torch.abs(pc.param_flows[6] - pf26.sum(dim = 0)) < 1e-4 - assert torch.abs(pc.param_flows[7] - pf27.sum(dim = 0)) < 1e-4 - assert torch.abs(pc.param_flows[8] - pf28.sum(dim = 0)) < 1e-4 - - assert torch.abs((pc.node_flows[1,:] * (data[:,0] == 0)).sum() - pc.input_layers[0].param_flows[0]) < 1e-4 - assert torch.abs((pc.node_flows[1,:] * (data[:,0] == 1)).sum() - 
pc.input_layers[0].param_flows[1]) < 1e-4 - assert torch.abs((pc.node_flows[2,:] * (data[:,0] == 0)).sum() - pc.input_layers[0].param_flows[2]) < 1e-4 - assert torch.abs((pc.node_flows[2,:] * (data[:,0] == 1)).sum() - pc.input_layers[0].param_flows[3]) < 1e-4 - assert torch.abs((pc.node_flows[3,:] * (data[:,1] == 0)).sum() - pc.input_layers[0].param_flows[4]) < 1e-4 - assert torch.abs((pc.node_flows[3,:] * (data[:,1] == 1)).sum() - pc.input_layers[0].param_flows[5]) < 1e-4 - assert torch.abs((pc.node_flows[4,:] * (data[:,1] == 0)).sum() - pc.input_layers[0].param_flows[6]) < 1e-4 - assert torch.abs((pc.node_flows[4,:] * (data[:,1] == 1)).sum() - pc.input_layers[0].param_flows[7]) < 1e-4 - assert torch.abs((pc.node_flows[5,:] * (data[:,2] == 0)).sum() - pc.input_layers[0].param_flows[8]) < 1e-4 - assert torch.abs((pc.node_flows[5,:] * (data[:,2] == 1)).sum() - pc.input_layers[0].param_flows[9]) < 1e-4 - assert torch.abs((pc.node_flows[6,:] * (data[:,2] == 0)).sum() - pc.input_layers[0].param_flows[10]) < 1e-4 - assert torch.abs((pc.node_flows[6,:] * (data[:,2] == 1)).sum() - pc.input_layers[0].param_flows[11]) < 1e-4 - assert torch.abs((pc.node_flows[7,:] * (data[:,3] == 0)).sum() - pc.input_layers[0].param_flows[12]) < 1e-4 - assert torch.abs((pc.node_flows[7,:] * (data[:,3] == 1)).sum() - pc.input_layers[0].param_flows[13]) < 1e-4 - assert torch.abs((pc.node_flows[8,:] * (data[:,3] == 0)).sum() - pc.input_layers[0].param_flows[14]) < 1e-4 - assert torch.abs((pc.node_flows[8,:] * (data[:,3] == 1)).sum() - pc.input_layers[0].param_flows[15]) < 1e-4 + assert torch.abs(pc.param_flows[0] - pf21.sum(dim = 0)) < 1e-4 + assert torch.abs(pc.param_flows[1] - pf22.sum(dim = 0)) < 1e-4 + assert torch.abs(pc.param_flows[2] - pf23.sum(dim = 0)) < 1e-4 + assert torch.abs(pc.param_flows[3] - pf24.sum(dim = 0)) < 1e-4 + assert torch.abs(pc.param_flows[4] - pf25.sum(dim = 0)) < 1e-4 + assert torch.abs(pc.param_flows[5] - pf26.sum(dim = 0)) < 1e-4 + assert torch.abs(pc.param_flows[6] - pf27.sum(dim = 0)) < 1e-4 + assert torch.abs(pc.param_flows[7] - pf28.sum(dim = 0)) < 1e-4 + + assert torch.abs((pc.node_flows[1,:] * (data[:,0] == 0)).sum() - pc.input_layer_group[0].param_flows[0]) < 1e-4 + assert torch.abs((pc.node_flows[1,:] * (data[:,0] == 1)).sum() - pc.input_layer_group[0].param_flows[1]) < 1e-4 + assert torch.abs((pc.node_flows[2,:] * (data[:,0] == 0)).sum() - pc.input_layer_group[0].param_flows[2]) < 1e-4 + assert torch.abs((pc.node_flows[2,:] * (data[:,0] == 1)).sum() - pc.input_layer_group[0].param_flows[3]) < 1e-4 + assert torch.abs((pc.node_flows[3,:] * (data[:,1] == 0)).sum() - pc.input_layer_group[0].param_flows[4]) < 1e-4 + assert torch.abs((pc.node_flows[3,:] * (data[:,1] == 1)).sum() - pc.input_layer_group[0].param_flows[5]) < 1e-4 + assert torch.abs((pc.node_flows[4,:] * (data[:,1] == 0)).sum() - pc.input_layer_group[0].param_flows[6]) < 1e-4 + assert torch.abs((pc.node_flows[4,:] * (data[:,1] == 1)).sum() - pc.input_layer_group[0].param_flows[7]) < 1e-4 + assert torch.abs((pc.node_flows[5,:] * (data[:,2] == 0)).sum() - pc.input_layer_group[0].param_flows[8]) < 1e-4 + assert torch.abs((pc.node_flows[5,:] * (data[:,2] == 1)).sum() - pc.input_layer_group[0].param_flows[9]) < 1e-4 + assert torch.abs((pc.node_flows[6,:] * (data[:,2] == 0)).sum() - pc.input_layer_group[0].param_flows[10]) < 1e-4 + assert torch.abs((pc.node_flows[6,:] * (data[:,2] == 1)).sum() - pc.input_layer_group[0].param_flows[11]) < 1e-4 + assert torch.abs((pc.node_flows[7,:] * (data[:,3] == 0)).sum() - 
pc.input_layer_group[0].param_flows[12]) < 1e-4 + assert torch.abs((pc.node_flows[7,:] * (data[:,3] == 1)).sum() - pc.input_layer_group[0].param_flows[13]) < 1e-4 + assert torch.abs((pc.node_flows[8,:] * (data[:,3] == 0)).sum() - pc.input_layer_group[0].param_flows[14]) < 1e-4 + assert torch.abs((pc.node_flows[8,:] * (data[:,3] == 1)).sum() - pc.input_layer_group[0].param_flows[15]) < 1e-4 ## Unit tests for params ## - inner_param_flows = pc.param_flows.clone() + inner_param_flows = torch.cat( + (torch.zeros([1], device = device), pc.param_flows.clone()), dim = 0 + ) pc._normalize_parameters(inner_param_flows) assert torch.abs(pf21.sum(dim = 0) / pf22.sum(dim = 0) - inner_param_flows[1] / inner_param_flows[2]) < 1e-4 assert torch.abs(pf11 / pf12 - inner_param_flows[13] / inner_param_flows[14]) < 1e-4 @@ -145,7 +147,7 @@ def non_sd_pc_backward_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0), allow_modify_flows = False) ## Unit tests for backward pass ## @@ -169,10 +171,10 @@ def non_sd_pc_backward_test(): fp4 = (torch.exp(pc.node_mars[16,:] + pc.node_mars[12,:]) * pc.params[12] / \ torch.exp(pc.node_mars[17,:])).sum() - assert torch.abs(pc.param_flows[9] - fp1) < 1e-3 - assert torch.abs(pc.param_flows[10] - fp2) < 1e-3 - assert torch.abs(pc.param_flows[11] - fp3) < 1e-3 - assert torch.abs(pc.param_flows[12] - fp4) < 1e-3 + assert torch.abs(pc.param_flows[8] - fp1) < 1e-3 + assert torch.abs(pc.param_flows[9] - fp2) < 1e-3 + assert torch.abs(pc.param_flows[10] - fp3) < 1e-3 + assert torch.abs(pc.param_flows[11] - fp4) < 1e-3 def sparse_pc_backward_test(): @@ -200,7 +202,7 @@ def sparse_pc_backward_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0), allow_modify_flows = False) ## Unit tests for backward pass ## From f3b837c1fb4463d462cb85b5b6c8e662718b0d7d Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 26 Dec 2023 21:48:12 +0800 Subject: [PATCH 095/162] restore `non_sd_pcs_test` --- tests/model/non_sd_pcs_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/model/non_sd_pcs_test.py b/tests/model/non_sd_pcs_test.py index 94e6ffce..c7be5e37 100644 --- a/tests/model/non_sd_pcs_test.py +++ b/tests/model/non_sd_pcs_test.py @@ -39,7 +39,7 @@ def non_sd_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0), allow_modify_flows = False) pc.update_parameters() @@ -66,7 +66,7 @@ def non_sd_test(): fwnp1 = fw910 * fw1112 fwnp2 = fw12 * fw1314 * fw78 fwnp3 = fw12 * fw34 * fw1112 - fw15 = (ns._params * torch.cat((fwnp1, fwnp2, fwnp3), dim = 0)).sum() + fw15 = (ns._params.reshape(-1) * torch.cat((fwnp1, fwnp2, fwnp3), dim = 0)).sum() assert torch.all(torch.abs(fw15.log() - pc.node_mars[15,0].cpu()) < 1e-4) @@ -74,9 +74,9 @@ def non_sd_test(): assert torch.abs(pc.node_flows[15,0] - 1.0) < 1e-4 - bknp1 = ns._params[0:2] * fwnp1 / fw15 - bknp2 = ns._params[2:4] * fwnp2 / fw15 - bknp3 = ns._params[4:6] * fwnp3 / fw15 + bknp1 = ns._params.reshape(-1)[0:2] * fwnp1 / fw15 + bknp2 = ns._params.reshape(-1)[2:4] * fwnp2 / fw15 + bknp3 = ns._params.reshape(-1)[4:6] * fwnp3 / fw15 bk910 = bknp1 bk1112 = bknp1 + bknp3 @@ -86,9 +86,9 @@ def non_sd_test(): assert torch.all(torch.abs(bk910 - pc.node_flows[9:11,0].cpu()) < 1e-4) assert torch.all(torch.abs(bk910 - pc.node_flows[9:11,0].cpu()) < 1e-4) - bknp12 = bk910[0] * ns12._params[0:2] * fw12 * fw34 / fw910[0] + bk910[1] * ns12._params[2:4] * fw12 * fw34 / fw910[1] - bknp23 = bk1314[0] * ns23._params[0:2] * fw34 * fw56 / fw1314[0] + bk1314[1] * 
ns23._params[2:4] * fw34 * fw56 / fw1314[1] - bknp34 = bk1112[0] * ns34._params[0:2] * fw56 * fw78 / fw1112[0] + bk1112[1] * ns34._params[2:4] * fw56 * fw78 / fw1112[1] + bknp12 = bk910[0] * ns12._params.reshape(-1)[0:2] * fw12 * fw34 / fw910[0] + bk910[1] * ns12._params.reshape(-1)[2:4] * fw12 * fw34 / fw910[1] + bknp23 = bk1314[0] * ns23._params.reshape(-1)[0:2] * fw34 * fw56 / fw1314[0] + bk1314[1] * ns23._params.reshape(-1)[2:4] * fw34 * fw56 / fw1314[1] + bknp34 = bk1112[0] * ns34._params.reshape(-1)[0:2] * fw56 * fw78 / fw1112[0] + bk1112[1] * ns34._params.reshape(-1)[2:4] * fw56 * fw78 / fw1112[1] bk12 = bknp12 + bknp2 + bknp3 bk34 = bknp12 + bknp23 + bknp3 From 780010d8da4ce4291eb7fdd8d87bf64ec4bde5ae Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 00:38:15 +0800 Subject: [PATCH 096/162] examine triton's kernel launching overhead --- tests/structures/kernel_launch_test.py | 99 ++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 tests/structures/kernel_launch_test.py diff --git a/tests/structures/kernel_launch_test.py b/tests/structures/kernel_launch_test.py new file mode 100644 index 00000000..bf3beb07 --- /dev/null +++ b/tests/structures/kernel_launch_test.py @@ -0,0 +1,99 @@ +import pyjuice as juice +import torch +import numpy as np +import time +import random + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer, ProdLayer, SumLayer + +import pytest + + +import triton +import triton.language as tl + + +@triton.jit +def kernel1(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a).to(tl.float16) + + offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] + bb = tl.load(b + offs_b).to(tl.float16) + + cc = tl.dot(aa, bb).to(tl.float32) + + offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +@triton.jit +def kernel2(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a)#.to(tl.float16) + + offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] + bb = tl.load(b + offs_b)#.to(tl.float16) + + cc = tl.sum(aa[:,:,None] * bb[None,:,:], axis = 1)#.to(tl.float32) + + # cc = tl.dot(aa, bb) + + offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +@triton.jit +def kernel3(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a) + + offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] + bb = tl.load(b + offs_b) + + aa = tl.view(tl.broadcast_to(aa[:,None,:], (M, 16 // M, N)), (16, N)) + cc = tl.dot(aa, bb) + cc = tl.max(tl.view(cc, (M, 16 // M, N)), axis = 1) + + offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +if __name__ == "__main__": + import time + + M = 16 + N = 16 + K = 16 + + a = torch.rand([M, N]).cuda() + b = torch.rand([N, K]).cuda() + c = torch.zeros([M, K]).cuda() + + grid = (400,) + + kernel1[grid](a, b, c, M, N, K) + + aaa = [item for item in kernel1.cache[0].values()] + raw_kernel = aaa[0][(grid[0], 1, 1)] + + from torch.profiler import 
profile, record_function, ProfilerActivity + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: + for i in range(50): + # kernel1[grid](a, b, c, M, N, K) + raw_kernel(a, b, c) + + prof.export_chrome_trace("trace.json") + + import pdb; pdb.set_trace() \ No newline at end of file From eb4d58be2537bfe4d095f1b1513b5155bebae3bd Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 02:21:00 +0800 Subject: [PATCH 097/162] refactor partial evaluation --- src/pyjuice/layer/input_layer.py | 24 ++++++------- src/pyjuice/layer/layer.py | 24 ++++++------- src/pyjuice/layer/layer_group.py | 23 +++++++++++++ src/pyjuice/layer/prod_layer.py | 33 ++++++++---------- src/pyjuice/layer/sum_layer.py | 45 ++++++++++++------------ src/pyjuice/model/tensorcircuit.py | 55 +++++++++++++++--------------- tests/model/partial_eval_test.py | 27 +++++++++------ 7 files changed, 131 insertions(+), 100 deletions(-) diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index 6cdfc767..adf97a06 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -452,7 +452,7 @@ def enable_partial_evaluation(self, fw_scopes: Optional[Union[Sequence[BitSet],S # Filter forward nodes if fw_scopes is not None: - fw_local_group_ids = [] + fw_local_ids = [] for scope in fw_scopes: if isinstance(scope, int): scope = BitSet.from_array([scope]) @@ -460,16 +460,16 @@ def enable_partial_evaluation(self, fw_scopes: Optional[Union[Sequence[BitSet],S if scope not in self.scope2localgids: continue - fw_local_group_ids.append(self.scope2localgids[scope]) + fw_local_ids.append(self.scope2localgids[scope]) if return_ids: - return torch.cat(fw_local_group_ids, dim = 0) + return torch.cat(fw_local_ids, dim = 0) else: - self.fw_local_group_ids = torch.cat(fw_local_group_ids, dim = 0) + self.fw_local_ids = torch.cat(fw_local_ids, dim = 0) # Filter backward nodes if bk_scopes is not None: - bk_local_group_ids = [] + bk_local_ids = [] for scope in bk_scopes: if isinstance(scope, int): scope = BitSet.from_array([scope]) @@ -477,19 +477,19 @@ def enable_partial_evaluation(self, fw_scopes: Optional[Union[Sequence[BitSet],S if scope not in self.scope2localgids: continue - bk_local_group_ids.append(self.scope2localgids[scope]) + bk_local_ids.append(self.scope2localgids[scope]) if return_ids: - return torch.cat(bk_local_group_ids, dim = 0) + return torch.cat(bk_local_ids, dim = 0) else: - self.bk_local_group_ids = torch.cat(bk_local_group_ids, dim = 0) + self.bk_local_ids = torch.cat(bk_local_ids, dim = 0) def disable_partial_evaluation(self, forward: bool = True, backward: bool = True): if forward: - self.fw_local_group_ids = None + self.fw_local_ids = None if backward: - self.bk_local_group_ids = None + self.bk_local_ids = None def update_parameters(self): for idx, ns in enumerate(self.nodes): @@ -514,9 +514,9 @@ def _prepare_scope2nids(self): if scope not in scope2localgids: scope2localgids[scope] = [torch.zeros([0], dtype = torch.long)] - scope2localgids[scope].append(torch.arange(s_nid, e_nid)) + scope2localgids[scope].append(torch.arange(s_ngid, e_ngid)) - local_nid += ns.num_nodes + local_ngid += ns.num_node_groups self.scope2localgids = { scope: torch.cat(ids, dim = 0).to(self.params.device) for scope, ids in scope2localgids.items() diff --git a/src/pyjuice/layer/layer.py b/src/pyjuice/layer/layer.py index bbf0ea25..755e3309 100644 --- a/src/pyjuice/layer/layer.py +++ b/src/pyjuice/layer/layer.py @@ -22,38 +22,38 @@ def 
enable_partial_evaluation(self, fw_scopes: Optional[Sequence[BitSet]] = None # Filter forward nodes if fw_scopes is not None: - fw_group_local_ids = [[] for _ in range(self.num_fw_groups)] + fw_partition_local_ids = [[] for _ in range(self.num_fw_partitions)] for scope in fw_scopes: if scope not in self.fw_scope2localids: continue - for group_id, ids in enumerate(self.fw_scope2localids[scope]): - fw_group_local_ids[group_id].append(self.fw_scope2localids[scope][group_id]) + for partition_id, ids in enumerate(self.fw_scope2localids[scope]): + fw_partition_local_ids[partition_id].append(self.fw_scope2localids[scope][partition_id]) - self.fw_group_local_ids = [ - torch.cat(ids, dim = 0) if len(ids) > 0 else torch.zeros([0], dtype = torch.long) for ids in fw_group_local_ids + self.fw_partition_local_ids = [ + torch.cat(ids, dim = 0) if len(ids) > 0 else torch.zeros([0], dtype = torch.long) for ids in fw_partition_local_ids ] # Filter backward nodes if bk_scopes is not None: - bk_group_local_ids = [[] for _ in range(self.num_bk_groups)] + bk_partition_local_ids = [[] for _ in range(self.num_bk_partitions)] for scope in bk_scopes: if scope not in self.bk_scope2localids: continue - for group_id, ids in enumerate(self.bk_scope2localids[scope]): - bk_group_local_ids[group_id].append(self.bk_scope2localids[scope][group_id]) + for partition_id, ids in enumerate(self.bk_scope2localids[scope]): + bk_partition_local_ids[partition_id].append(self.bk_scope2localids[scope][partition_id]) - self.bk_group_local_ids = [ - torch.cat(ids, dim = 0) if len(ids) > 0 else torch.zeros([0], dtype = torch.long) for ids in bk_group_local_ids + self.bk_partition_local_ids = [ + torch.cat(ids, dim = 0) if len(ids) > 0 else torch.zeros([0], dtype = torch.long) for ids in bk_partition_local_ids ] def disable_partial_evaluation(self, forward: bool = True, backward: bool = True): if forward: - self.fw_group_local_ids = None + self.fw_partition_local_ids = None if backward: - self.bk_group_local_ids = None + self.bk_partition_local_ids = None def provided(self, var_name): return hasattr(self, var_name) and getattr(self, var_name) is not None diff --git a/src/pyjuice/layer/layer_group.py b/src/pyjuice/layer/layer_group.py index f2467e45..d2b97237 100644 --- a/src/pyjuice/layer/layer_group.py +++ b/src/pyjuice/layer/layer_group.py @@ -51,6 +51,16 @@ def backward(self, *args, **kwargs): for layer in self.layers: layer.backward(*args, **kwargs) + def enable_partial_evaluation(self, *args, **kwargs): + + for layer in self.layers: + layer.enable_partial_evaluation(*args, **kwargs) + + def disable_partial_evaluation(self, *args, **kwargs): + + for layer in self.layers: + layer.disable_partial_evaluation(*args, **kwargs) + def is_input(self): return self.layer_type == "input" @@ -77,3 +87,16 @@ def __next__(self): return layer else: raise StopIteration + + def _prepare_scope2nids(self, *args, **kwargs): + + if self.is_prod(): + prod_scope_eleids = list() + for layer in self.layers: + prod_scope_eleids.extend(layer._prepare_scope2nids(*args, **kwargs)) + + return prod_scope_eleids + + else: + for layer in self.layers: + layer._prepare_scope2nids(*args, **kwargs) diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index c1787df1..9f99f5c8 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -219,13 +219,13 @@ def enable_partial_evaluation(self, fw_scopes: Optional[Sequence[BitSet]] = None # For product layers, we need a special forward pass during the backward process of the 
circuit if bk_scopes is not None: - bk_fw_partition_local_ids = [[] for _ in range(self.num_fw_groups)] + bk_fw_partition_local_ids = [[] for _ in range(self.num_fw_partitions)] for scope in bk_scopes: if scope not in self.fw_scope2localids: continue - for group_id, ids in enumerate(self.fw_scope2localids[scope]): - bk_fw_partition_local_ids[group_id].append(self.fw_scope2localids[scope][group_id]) + for partition_id, ids in enumerate(self.fw_scope2localids[scope]): + bk_fw_partition_local_ids[partition_id].append(self.fw_scope2localids[scope][partition_id]) self.bk_fw_partition_local_ids = [ torch.cat(ids, dim = 0) if len(ids) > 0 else torch.zeros([0], dtype = torch.long) for ids in bk_fw_partition_local_ids @@ -392,9 +392,6 @@ def _forward_backward_pytorch(node_vals, element_vals, nids, cids, accum: bool = def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, accum: bool = False) -> None: - if local_ids is not None: - raise NotImplementedError() - tot_n_nodes = node_vals.size(0) tot_n_eles = element_vals.size(0) n_ngroups = nids.size(0) if local_ids is None else local_ids.size(0) @@ -493,15 +490,15 @@ def _prepare_scope2nids(self): with torch.no_grad(): if scope not in fw_scope2localids: fw_scope2localids[scope] = [ - torch.zeros([0], dtype = torch.long).to(self.grouped_nids[0].device) for _ in range(self.num_fw_groups) + torch.zeros([0], dtype = torch.long).to(self.partitioned_nids[0].device) for _ in range(self.num_fw_partitions) ] - for group_id in range(self.num_fw_groups): - nids = self.grouped_nids[group_id] - group_local_ids = torch.where((nids >= s_eid) & (nids < e_eid))[0] + for partition_id in range(self.num_fw_partitions): + nids = self.partitioned_nids[partition_id] + partition_local_ids = torch.where((nids >= s_eid) & (nids < e_eid))[0] - fw_scope2localids[scope][group_id] = torch.cat( - (fw_scope2localids[scope][group_id], group_local_ids), dim = 0 + fw_scope2localids[scope][partition_id] = torch.cat( + (fw_scope2localids[scope][partition_id], partition_local_ids), dim = 0 ) global_eid += ns.num_nodes @@ -516,15 +513,15 @@ def _prepare_scope2nids(self): if scope not in bk_scope2localids: bk_scope2localids[scope] = [ - torch.zeros([0], dtype = torch.long).to(self.grouped_nids[0].device) for _ in range(self.num_bk_groups) + torch.zeros([0], dtype = torch.long).to(self.partitioned_nids[0].device) for _ in range(self.num_bk_partitions) ] - for group_id in range(self.num_bk_groups): - u_cids = self.grouped_u_cids[group_id] - group_local_ids = torch.where((u_cids >= s_nid) & (u_cids < e_nid))[0] + for partition_id in range(self.num_bk_partitions): + u_cids = self.partitioned_u_cids[partition_id] + partition_local_ids = torch.where((u_cids >= s_nid) & (u_cids < e_nid))[0] - bk_scope2localids[scope][group_id] = torch.cat( - (bk_scope2localids[scope][group_id], group_local_ids), dim = 0 + bk_scope2localids[scope][partition_id] = torch.cat( + (bk_scope2localids[scope][partition_id], partition_local_ids), dim = 0 ) self.fw_scope2localids = fw_scope2localids diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index afc1e59d..1294d86a 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -261,19 +261,22 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, `params`: [num_params, B] or [num_params] """ + # Disallow modifications of `node_flows` in case of partial evaluation + if 
self.provided("bk_partition_local_ids") and allow_modify_flows: + allow_modify_flows = False + ## Pre-compute `nflows.log() - nmars` if needed ## if allow_modify_flows: + assert not self.provided("bk_partition_local_ids"), "Must set `allow_modify_flows = False` for partial evaluation." for partition_id in range(self.num_fw_partitions): nids = self.partitioned_nids[partition_id] - # TODO: be careful when restoring `local_ids` - local_ids = None self._bk_triton_block_sparse_modify_flow( - node_flows, node_mars, nids, local_ids + node_flows, node_mars, nids, local_ids = None ) ## Compute flows w.r.t. elements (i.e., product nodes) ## - if not self.provided("bk_group_local_ids"): + if not self.provided("bk_partition_local_ids"): # Evaluate the whole layer for partition_id in range(self.num_bk_partitions): chids = self.partitioned_chids[partition_id] @@ -292,11 +295,11 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, else: # Partial evaluation for partition_id in range(self.num_bk_partitions): - chids = self.grouped_chids[partition_id] - parids = self.grouped_parids[partition_id] - parpids = self.grouped_parpids[partition_id] + chids = self.partitioned_chids[partition_id] + parids = self.partitioned_parids[partition_id] + parpids = self.partitioned_parpids[partition_id] cs_group_size = self.cs_group_sizes[partition_id] - local_ids = self.bk_group_local_ids[partition_id] + local_ids = self.bk_partition_local_ids[partition_id] self._backward( node_flows, element_flows, params, node_mars, @@ -1711,7 +1714,7 @@ def _prepare_scope2nids(self, prod_scope_eleids: Sequence[Tuple[BitSet, torch.Te bk_scope2localids = dict() # Forward local indices - global_nid = self.global_nid_range[0] + global_nid = self._layer_nid_range[0] for ns in self.nodes: scope = ns.scope @@ -1721,15 +1724,15 @@ def _prepare_scope2nids(self, prod_scope_eleids: Sequence[Tuple[BitSet, torch.Te with torch.no_grad(): if scope not in fw_scope2localids: fw_scope2localids[scope] = [ - torch.zeros([0], dtype = torch.long).to(self.grouped_nids[0].device) for _ in range(self.num_fw_groups) + torch.zeros([0], dtype = torch.long).to(self.partitioned_nids[0].device) for _ in range(self.num_fw_partitions) ] - for group_id in range(self.num_fw_groups): - nids = self.grouped_nids[group_id] - group_local_ids = torch.where((nids >= s_nid) & (nids < e_nid))[0] + for partition_id in range(self.num_fw_partitions): + nids = self.partitioned_nids[partition_id] + partition_local_ids = torch.where((nids >= s_nid) & (nids < e_nid))[0] - fw_scope2localids[scope][group_id] = torch.cat( - (fw_scope2localids[scope][group_id], group_local_ids), dim = 0 + fw_scope2localids[scope][partition_id] = torch.cat( + (fw_scope2localids[scope][partition_id], partition_local_ids), dim = 0 ) global_nid += ns.num_nodes @@ -1741,15 +1744,15 @@ def _prepare_scope2nids(self, prod_scope_eleids: Sequence[Tuple[BitSet, torch.Te with torch.no_grad(): if scope not in bk_scope2localids: bk_scope2localids[scope] = [ - torch.zeros([0], dtype = torch.long).to(self.grouped_nids[0].device) for _ in range(self.num_bk_groups) + torch.zeros([0], dtype = torch.long).to(self.partitioned_chids[0].device) for _ in range(self.num_bk_partitions) ] - for group_id in range(self.num_bk_groups): - chids = self.grouped_chids[group_id] - group_local_ids = torch.where((chids >= s_eid) & (chids < e_eid))[0] + for partition_id in range(self.num_bk_partitions): + chids = self.partitioned_chids[partition_id] + partition_local_ids = torch.where((chids >= s_eid) & (chids < 
e_eid))[0] - bk_scope2localids[scope][group_id] = torch.cat( - (bk_scope2localids[scope][group_id], group_local_ids), dim = 0 + bk_scope2localids[scope][partition_id] = torch.cat( + (bk_scope2localids[scope][partition_id], partition_local_ids), dim = 0 ) self.fw_scope2localids = fw_scope2localids diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index f4464b3f..3a86641d 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -80,6 +80,10 @@ def __init__(self, root_ns: CircuitNodes, layer_sparsity_tol: float = 0.5, "flows_memory": 1.0 } + # Partial evaluation + self._fw_partial_eval_enabled = False + self._bk_partial_eval_enabled = False + # CudaGraph options self._recorded_cuda_graphs = dict() @@ -423,12 +427,13 @@ def print_statistics(self): print(f"> Number of sum parameters: {self.num_sum_params}") def enable_partial_evaluation(self, scopes: Union[Sequence[BitSet],Sequence[int]], - forward: bool = False, backward: bool = False): - raise NotImplementedError("To be updated") - + forward: bool = False, backward: bool = False, overwrite: bool = False): # Create scope2nid cache self._create_scope2nid_cache() + if not overwrite and (forward and self._fw_partial_eval_enabled or backward and self._bk_partial_eval_enabled): + raise RuntimeError("Partial evaluation already enabled, consider calling `disable_partial_evaluation` first.") + if isinstance(scopes[0], int): scopes = [BitSet.from_array([var]) for var in scopes] @@ -436,34 +441,33 @@ def enable_partial_evaluation(self, scopes: Union[Sequence[BitSet],Sequence[int] bk_scopes = scopes if backward else None # Input layers - for layer in self.input_layers: + for layer in self.input_layer_group: layer.enable_partial_evaluation(fw_scopes = fw_scopes, bk_scopes = bk_scopes) # Inner layers - for layer in self.inner_layers: - layer.enable_partial_evaluation(fw_scopes = fw_scopes, bk_scopes = bk_scopes) + for layer_group in self.inner_layer_groups: + layer_group.enable_partial_evaluation(fw_scopes = fw_scopes, bk_scopes = bk_scopes) + + if forward: + self._fw_partial_eval_enabled = True if backward: - scopes = set(scopes) - _pv_node_flows_mask = torch.zeros([self.num_nodes], dtype = torch.bool) - for ns in self.root_nodes: - if (ns.is_sum() or ns.is_input()) and ns.scope in scopes: - sid, eid = ns._output_ind_range - _pv_node_flows_mask[sid:eid] = True - self._pv_node_flows_mask = _pv_node_flows_mask.to(self.device) + self._bk_partial_eval_enabled = True def disable_partial_evaluation(self, forward: bool = True, backward: bool = True): - raise NotImplementedError("To be updated") - # Input layers - for layer in self.input_layers: + for layer in self.input_layer_group: layer.disable_partial_evaluation(forward = forward, backward = backward) # Inner layers - for layer in self.inner_layers: - layer.disable_partial_evaluation(forward = forward, backward = backward) + for layer_group in self.inner_layer_groups: + layer_group.disable_partial_evaluation(forward = forward, backward = backward) + + if forward: + self._fw_partial_eval_enabled = False - self._pv_node_flows_mask = None + if backward: + self._bk_partial_eval_enabled = False def _init_buffer(self, name: str, shape: Tuple, set_value: Optional[float] = None, check_device: bool = True): flag = False @@ -772,19 +776,16 @@ def _categorize_input_nodes(self, nodes: Sequence[InputNodes]): return signature2nodes def _create_scope2nid_cache(self): - - raise NotImplementedError() - # Input layers for idx, layer in 
enumerate(self.input_layer_group): layer._prepare_scope2nids() # Inner layers prod_scope_eleids = None - for layer in self.inner_layers: - if isinstance(layer, ProdLayer): - prod_scope_eleids = layer._prepare_scope2nids() + for layer_group in self.inner_layer_groups: + if layer_group.is_prod(): + prod_scope_eleids = layer_group._prepare_scope2nids() else: - assert isinstance(layer, SumLayer) + assert layer_group.is_sum() - layer._prepare_scope2nids(prod_scope_eleids) + layer_group._prepare_scope2nids(prod_scope_eleids) diff --git a/tests/model/partial_eval_test.py b/tests/model/partial_eval_test.py index 7c8eed98..e654c787 100644 --- a/tests/model/partial_eval_test.py +++ b/tests/model/partial_eval_test.py @@ -48,9 +48,11 @@ def partial_eval_forward_test(): assert torch.all(torch.abs(lls - lls2) < 1e-4) - assert (pc.input_layers[0].fw_local_ids.cpu() == torch.tensor([2, 3])).all() - assert (pc.inner_layers[0].fw_group_local_ids[0].cpu() == torch.tensor([0, 1, 2, 3])).all() - assert (pc.inner_layers[1].fw_group_local_ids[1].cpu() == torch.tensor([0, 1])).all() + assert (pc.input_layer_group[0].fw_local_ids.cpu() == torch.tensor([2, 3])).all() + assert (pc.inner_layer_groups[0][0].fw_partition_local_ids[0].cpu() == torch.tensor([0, 1, 2, 3])).all() + assert (pc.inner_layer_groups[1][0].fw_partition_local_ids[0].cpu() == torch.tensor([0, 1])).all() + + pc.disable_partial_evaluation() for var in range(4): pseudo_data = data.clone() @@ -61,6 +63,7 @@ def partial_eval_forward_test(): pc.enable_partial_evaluation(scopes = scopes, forward = True) lls2 = pc(data, cache = cache) + pc.disable_partial_evaluation() assert torch.all(torch.abs(lls - lls2) < 1e-4) @@ -89,19 +92,23 @@ def partial_eval_backward_test(): data = torch.randint(0, 2, [16, 4]).to(device) lls, cache = pc(data, return_cache = True) - cache = pc.backward(data, cache = cache, return_cache = True) + cache = pc.backward(data.permute(1, 0), cache = cache, return_cache = True, allow_modify_flows = False) + new_cache = deepcopy(cache) + new_cache["node_flows"][3:5,:] = 0.0 + new_cache["node_flows"][9:11,:] = 0.0 scopes = get_subsumed_scopes(n, [1]) pc.enable_partial_evaluation(scopes = scopes, forward = False, backward = True) - cache2 = pc.backward(data, cache = deepcopy(cache), return_cache = True) + cache2 = pc.backward(data.permute(1, 0), cache = new_cache, return_cache = True) - assert torch.all(cache["node_flows"] == cache2["node_flows"]) + assert torch.all((cache["node_flows"] - cache2["node_flows"]).abs() < 1e-4) - assert (pc.input_layers[0].bk_local_ids.cpu() == torch.tensor([2, 3])).all() - assert (pc.inner_layers[0].bk_group_local_ids[1].cpu() == torch.tensor([2, 3])).all() - assert (pc.inner_layers[1].bk_group_local_ids[0].cpu() == torch.tensor([0, 1, 2, 3])).all() - assert (pc.inner_layers[2].bk_group_local_ids[0].cpu() == torch.tensor([0, 1])).all() + assert (pc.input_layer_group[0].bk_local_ids.cpu() == torch.tensor([2, 3])).all() + assert (pc.inner_layer_groups[0][0].bk_partition_local_ids[0].cpu() == torch.tensor([2, 3])).all() + assert (pc.inner_layer_groups[1][0].bk_partition_local_ids[0].cpu() == torch.tensor([0, 1, 2, 3])).all() + assert (pc.inner_layer_groups[2][0].bk_partition_local_ids[0].cpu() == torch.tensor([0, 1])).all() + assert (pc.inner_layer_groups[3][0].bk_partition_local_ids[0].cpu() == torch.tensor([0, 1])).all() if __name__ == "__main__": From 2ce194b5bd43825994d75a959b6c4545b6ede942 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 02:21:11 +0800 Subject: [PATCH 098/162] update hclt test 
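Run the full-batch EM epoch outside the `torch.no_grad()` wrapper so the `lls.mean().backward()` call can trigger flow accumulation for `pc.mini_batch_em()`. For reference, a minimal self-contained sketch of the forward / backward / EM-step pattern this test exercises is given below; the toy circuit and random data are stand-ins for the HCLT and data loaders used in the actual test, and a CUDA device is assumed:

    import torch
    import pyjuice.nodes.distributions as dists
    from pyjuice.nodes import multiply, summate, inputs
    from pyjuice.model import TensorCircuit

    # Toy 4-variable circuit (mirrors the small circuits used in the unit tests)
    ni0 = inputs(0, num_nodes = 2, dist = dists.Categorical(num_cats = 2))
    ni1 = inputs(1, num_nodes = 2, dist = dists.Categorical(num_cats = 2))
    ni2 = inputs(2, num_nodes = 2, dist = dists.Categorical(num_cats = 2))
    ni3 = inputs(3, num_nodes = 2, dist = dists.Categorical(num_cats = 2))
    n1 = summate(multiply(ni0, ni1), num_nodes = 2)
    n2 = summate(multiply(ni2, ni3), num_nodes = 2)
    root = summate(multiply(n1, n2), num_nodes = 1)

    pc = TensorCircuit(root)
    pc.to(torch.device("cuda:0"))

    x = torch.randint(0, 2, [16, 4]).to(torch.device("cuda:0"))

    lls = pc(x)                                            # forward: per-sample log-likelihoods
    lls.mean().backward()                                  # backward: accumulate node/param flows
    pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1)   # full-batch EM update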
--- tests/structures/hclt_test.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index db7f5ad4..97818db0 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -44,25 +44,24 @@ def mini_batch_em_epoch(num_epochs, pc, optimizer, scheduler, train_loader, test def full_batch_em_epoch(pc, train_loader, test_loader, device): - with torch.no_grad(): - t0 = time.time() - train_ll = 0.0 - for batch in train_loader: - x = batch[0].to(device) + t0 = time.time() + train_ll = 0.0 + for batch in train_loader: + x = batch[0].to(device) - lls = pc(x) - lls.mean().backward() + lls = pc(x) + lls.mean().backward() - train_ll += lls.mean().detach().cpu().numpy().item() + train_ll += lls.mean().detach().cpu().numpy().item() - pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1) + pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1) - train_ll /= len(train_loader) + train_ll /= len(train_loader) - t1 = time.time() - test_ll = evaluate(pc, loader=test_loader) - t2 = time.time() - print(f"[train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") + t1 = time.time() + test_ll = evaluate(pc, loader=test_loader) + t2 = time.time() + print(f"[train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") def hclt_test(): From 18f225f9dde3b7aac73fc1287ea0b24c922c9bf9 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 02:47:47 +0800 Subject: [PATCH 099/162] fix tests --- tests/nodes/input_dists_test.py | 105 +++++++++++----------- tests/nodes/nodes_test.py | 9 -- tests/structures/hclt_correctness_test.py | 4 +- 3 files changed, 55 insertions(+), 63 deletions(-) diff --git a/tests/nodes/input_dists_test.py b/tests/nodes/input_dists_test.py index 4917f2c0..5c0ca73f 100644 --- a/tests/nodes/input_dists_test.py +++ b/tests/nodes/input_dists_test.py @@ -36,19 +36,19 @@ def categorical_nodes_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0)) ## Input node forward tests ## for i in range(16): - assert torch.abs(pc.node_mars[1,i] - torch.log(pc.input_layers[0].params[data[i,0]])) < 1e-4 - assert torch.abs(pc.node_mars[2,i] - torch.log(pc.input_layers[0].params[2+data[i,0]])) < 1e-4 - assert torch.abs(pc.node_mars[3,i] - torch.log(pc.input_layers[0].params[4+data[i,1]])) < 1e-4 - assert torch.abs(pc.node_mars[4,i] - torch.log(pc.input_layers[0].params[6+data[i,1]])) < 1e-4 - assert torch.abs(pc.node_mars[5,i] - torch.log(pc.input_layers[0].params[8+data[i,2]])) < 1e-4 - assert torch.abs(pc.node_mars[6,i] - torch.log(pc.input_layers[0].params[10+data[i,2]])) < 1e-4 - assert torch.abs(pc.node_mars[7,i] - torch.log(pc.input_layers[0].params[12+data[i,3]])) < 1e-4 - assert torch.abs(pc.node_mars[8,i] - torch.log(pc.input_layers[0].params[14+data[i,3]])) < 1e-4 + assert torch.abs(pc.node_mars[1,i] - torch.log(pc.input_layer_group[0].params[data[i,0]])) < 1e-4 + assert torch.abs(pc.node_mars[2,i] - torch.log(pc.input_layer_group[0].params[2+data[i,0]])) < 1e-4 + assert torch.abs(pc.node_mars[3,i] - torch.log(pc.input_layer_group[0].params[4+data[i,1]])) < 1e-4 + assert torch.abs(pc.node_mars[4,i] - torch.log(pc.input_layer_group[0].params[6+data[i,1]])) < 1e-4 + assert torch.abs(pc.node_mars[5,i] - torch.log(pc.input_layer_group[0].params[8+data[i,2]])) < 1e-4 + assert torch.abs(pc.node_mars[6,i] - 
torch.log(pc.input_layer_group[0].params[10+data[i,2]])) < 1e-4 + assert torch.abs(pc.node_mars[7,i] - torch.log(pc.input_layer_group[0].params[12+data[i,3]])) < 1e-4 + assert torch.abs(pc.node_mars[8,i] - torch.log(pc.input_layer_group[0].params[14+data[i,3]])) < 1e-4 ## Input node backward tests ## @@ -64,21 +64,21 @@ def categorical_nodes_test(): gt_param_flows[12+data[i,3]] += pc.node_flows[7,i] gt_param_flows[14+data[i,3]] += pc.node_flows[8,i] - assert torch.all(torch.abs(gt_param_flows - pc.input_layers[0].param_flows) < 1e-4) + assert torch.all(torch.abs(gt_param_flows - pc.input_layer_group[0].param_flows) < 1e-4) ## EM tests ## - original_params = pc.input_layers[0].params.clone() + original_params = pc.input_layer_group[0].params.clone() step_size = 0.3 pseudocount = 0.1 - par_flows = pc.input_layers[0].param_flows.clone().reshape(8, 2) + par_flows = pc.input_layer_group[0].param_flows.clone().reshape(8, 2) new_params = (1.0 - step_size) * original_params + step_size * ((par_flows + pseudocount / 2) / (par_flows.sum(dim = 1, keepdim = True) + pseudocount)).reshape(-1) pc.mini_batch_em(step_size = step_size, pseudocount = pseudocount) - assert torch.all(torch.abs(new_params - pc.input_layers[0].params) < 1e-4) + assert torch.all(torch.abs(new_params - pc.input_layer_group[0].params) < 1e-4) def bernoulli_nodes_test(): @@ -106,19 +106,19 @@ def bernoulli_nodes_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0)) ## Input node forward tests ## for i in range(16): - assert torch.abs(pc.node_mars[1,i].exp() - (pc.input_layers[0].params[0] if data[i,0] == 1 else (1.0 - pc.input_layers[0].params[0]))) < 1e-4 - assert torch.abs(pc.node_mars[2,i].exp() - (pc.input_layers[0].params[1] if data[i,0] == 1 else (1.0 - pc.input_layers[0].params[1]))) < 1e-4 - assert torch.abs(pc.node_mars[3,i].exp() - (pc.input_layers[0].params[2] if data[i,1] == 1 else (1.0 - pc.input_layers[0].params[2]))) < 1e-4 - assert torch.abs(pc.node_mars[4,i].exp() - (pc.input_layers[0].params[3] if data[i,1] == 1 else (1.0 - pc.input_layers[0].params[3]))) < 1e-4 - assert torch.abs(pc.node_mars[5,i].exp() - (pc.input_layers[0].params[4] if data[i,2] == 1 else (1.0 - pc.input_layers[0].params[4]))) < 1e-4 - assert torch.abs(pc.node_mars[6,i].exp() - (pc.input_layers[0].params[5] if data[i,2] == 1 else (1.0 - pc.input_layers[0].params[5]))) < 1e-4 - assert torch.abs(pc.node_mars[7,i].exp() - (pc.input_layers[0].params[6] if data[i,3] == 1 else (1.0 - pc.input_layers[0].params[6]))) < 1e-4 - assert torch.abs(pc.node_mars[8,i].exp() - (pc.input_layers[0].params[7] if data[i,3] == 1 else (1.0 - pc.input_layers[0].params[7]))) < 1e-4 + assert torch.abs(pc.node_mars[1,i].exp() - (pc.input_layer_group[0].params[0] if data[i,0] == 1 else (1.0 - pc.input_layer_group[0].params[0]))) < 1e-4 + assert torch.abs(pc.node_mars[2,i].exp() - (pc.input_layer_group[0].params[1] if data[i,0] == 1 else (1.0 - pc.input_layer_group[0].params[1]))) < 1e-4 + assert torch.abs(pc.node_mars[3,i].exp() - (pc.input_layer_group[0].params[2] if data[i,1] == 1 else (1.0 - pc.input_layer_group[0].params[2]))) < 1e-4 + assert torch.abs(pc.node_mars[4,i].exp() - (pc.input_layer_group[0].params[3] if data[i,1] == 1 else (1.0 - pc.input_layer_group[0].params[3]))) < 1e-4 + assert torch.abs(pc.node_mars[5,i].exp() - (pc.input_layer_group[0].params[4] if data[i,2] == 1 else (1.0 - pc.input_layer_group[0].params[4]))) < 1e-4 + assert torch.abs(pc.node_mars[6,i].exp() - (pc.input_layer_group[0].params[5] if data[i,2] == 1 else (1.0 - 
pc.input_layer_group[0].params[5]))) < 1e-4 + assert torch.abs(pc.node_mars[7,i].exp() - (pc.input_layer_group[0].params[6] if data[i,3] == 1 else (1.0 - pc.input_layer_group[0].params[6]))) < 1e-4 + assert torch.abs(pc.node_mars[8,i].exp() - (pc.input_layer_group[0].params[7] if data[i,3] == 1 else (1.0 - pc.input_layer_group[0].params[7]))) < 1e-4 ## Input node backward tests ## @@ -135,21 +135,21 @@ def bernoulli_nodes_test(): gt_param_flows[12+offsets[i,3]] += pc.node_flows[7,i] gt_param_flows[14+offsets[i,3]] += pc.node_flows[8,i] - assert torch.all(torch.abs(gt_param_flows - pc.input_layers[0].param_flows) < 1e-4) + assert torch.all(torch.abs(gt_param_flows - pc.input_layer_group[0].param_flows) < 1e-4) ## EM tests ## - original_params = pc.input_layers[0].params.clone() + original_params = pc.input_layer_group[0].params.clone() step_size = 0.3 pseudocount = 0.1 - par_flows = pc.input_layers[0].param_flows.clone().reshape(8, 2) + par_flows = pc.input_layer_group[0].param_flows.clone().reshape(8, 2) new_params = (1.0 - step_size) * original_params + step_size * ((par_flows + pseudocount / 2) / (par_flows.sum(dim = 1, keepdim = True) + pseudocount))[:,0] pc.mini_batch_em(step_size = step_size, pseudocount = pseudocount) - assert torch.all(torch.abs(new_params - pc.input_layers[0].params) < 1e-4) + assert torch.all(torch.abs(new_params - pc.input_layer_group[0].params) < 1e-4) def gaussian_nodes_test(): @@ -177,12 +177,12 @@ def gaussian_nodes_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0)) ## Input node forward tests ## for j in range(8): - gt_probs = torch.distributions.normal.Normal(pc.input_layers[0].params[2*j], pc.input_layers[0].params[2*j+1]).log_prob(data[:,j//2]) + gt_probs = torch.distributions.normal.Normal(pc.input_layer_group[0].params[2*j], pc.input_layer_group[0].params[2*j+1]).log_prob(data[:,j//2]) assert torch.all(torch.abs(gt_probs - pc.node_mars[j+1,:]) < 1e-4) ## Input node backward tests ## @@ -194,12 +194,12 @@ def gaussian_nodes_test(): gt_param_flows[3*j+1] = ((data[:,j//2] ** 2) * pc.node_flows[j+1,:]).sum() gt_param_flows[3*j+2] = (pc.node_flows[j+1,:]).sum() - assert torch.all(torch.abs(gt_param_flows - pc.input_layers[0].param_flows) < 1e-2) + assert torch.all(torch.abs(gt_param_flows - pc.input_layer_group[0].param_flows) < 1e-2) ## EM tests ## - mu = pc.input_layers[0].params.reshape(8, 2)[:,0].clone() - sigma = pc.input_layers[0].params.reshape(8, 2)[:,1].clone() + mu = pc.input_layer_group[0].params.reshape(8, 2)[:,0].clone() + sigma = pc.input_layer_group[0].params.reshape(8, 2)[:,1].clone() ori_theta1 = mu ori_theta2 = sigma * sigma + mu * mu @@ -207,9 +207,9 @@ def gaussian_nodes_test(): pseudocount = 0.1 min_sigma = 0.01 - stat1 = pc.input_layers[0].param_flows.reshape(8, 3)[:,0] - stat2 = pc.input_layers[0].param_flows.reshape(8, 3)[:,1] - stat3 = pc.input_layers[0].param_flows.reshape(8, 3)[:,2] + stat1 = pc.input_layer_group[0].param_flows.reshape(8, 3)[:,0] + stat2 = pc.input_layer_group[0].param_flows.reshape(8, 3)[:,1] + stat3 = pc.input_layer_group[0].param_flows.reshape(8, 3)[:,2] new_theta1 = stat1 / (stat3 + 1e-10) new_theta2 = stat2 / (stat3 + 1e-10) @@ -225,8 +225,8 @@ def gaussian_nodes_test(): pc.mini_batch_em(step_size = step_size, pseudocount = pseudocount) - assert torch.all(torch.abs(updated_mu - pc.input_layers[0].params.reshape(8, 2)[:,0]) < 1e-4) - assert torch.all(torch.abs(updated_sigma.clamp(min = 0.01) - pc.input_layers[0].params.reshape(8, 2)[:,1]) < 1e-4) + assert torch.all(torch.abs(updated_mu 
- pc.input_layer_group[0].params.reshape(8, 2)[:,0]) < 1e-4) + assert torch.all(torch.abs(updated_sigma.clamp(min = 0.01) - pc.input_layer_group[0].params.reshape(8, 2)[:,1]) < 1e-4) def discrete_logistic_nodes_test(): @@ -254,7 +254,7 @@ def discrete_logistic_nodes_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0)) ## Input node forward tests ## @@ -268,8 +268,8 @@ def discrete_logistic_nodes_test(): vlow = data[:,j//2] * interval + range_low vhigh = vlow + interval - mu = pc.input_layers[0].params[2*j] - s = pc.input_layers[0].params[2*j+1] + mu = pc.input_layer_group[0].params[2*j] + s = pc.input_layer_group[0].params[2*j+1] cdfhigh = torch.where(data[:,j//2] == num_cats - 1, 1.0, 1.0 / (1.0 + torch.exp((mu - vhigh) / s))) cdflow = torch.where(data[:,j//2] == 0, 0.0, 1.0 / (1.0 + torch.exp((mu - vlow) / s))) @@ -290,12 +290,12 @@ def discrete_logistic_nodes_test(): gt_param_flows[3*j+1] = ((vmid ** 2) * pc.node_flows[j+1,:]).sum() gt_param_flows[3*j+2] = (pc.node_flows[j+1,:]).sum() - assert torch.all(torch.abs(gt_param_flows - pc.input_layers[0].param_flows) < 1e-4) + assert torch.all(torch.abs(gt_param_flows - pc.input_layer_group[0].param_flows) < 1e-4) ## EM tests ## - mu = pc.input_layers[0].params.reshape(8, 2)[:,0].clone() - std = pc.input_layers[0].params.reshape(8, 2)[:,1].clone() * math.pi / math.sqrt(3.0) + mu = pc.input_layer_group[0].params.reshape(8, 2)[:,0].clone() + std = pc.input_layer_group[0].params.reshape(8, 2)[:,1].clone() * math.pi / math.sqrt(3.0) ori_theta1 = mu ori_theta2 = std * std + mu * mu @@ -303,9 +303,9 @@ def discrete_logistic_nodes_test(): pseudocount = 0.1 min_std = 0.01 - stat1 = pc.input_layers[0].param_flows.reshape(8, 3)[:,0] - stat2 = pc.input_layers[0].param_flows.reshape(8, 3)[:,1] - stat3 = pc.input_layers[0].param_flows.reshape(8, 3)[:,2] + stat1 = pc.input_layer_group[0].param_flows.reshape(8, 3)[:,0] + stat2 = pc.input_layer_group[0].param_flows.reshape(8, 3)[:,1] + stat3 = pc.input_layer_group[0].param_flows.reshape(8, 3)[:,2] new_theta1 = stat1 / (stat3 + 1e-10) new_theta2 = stat2 / (stat3 + 1e-10) @@ -322,8 +322,8 @@ def discrete_logistic_nodes_test(): pc.mini_batch_em(step_size = step_size, pseudocount = pseudocount) - assert torch.all(torch.abs(updated_mu - pc.input_layers[0].params.reshape(8, 2)[:,0]) < 1e-4) - assert torch.all(torch.abs(updated_s - pc.input_layers[0].params.reshape(8, 2)[:,1]) < 1e-4) + assert torch.all(torch.abs(updated_mu - pc.input_layer_group[0].params.reshape(8, 2)[:,0]) < 1e-4) + assert torch.all(torch.abs(updated_s - pc.input_layer_group[0].params.reshape(8, 2)[:,1]) < 1e-4) def discrete_logistic_nodes_behavior_test(): @@ -333,8 +333,8 @@ def discrete_logistic_nodes_behavior_test(): ni2 = inputs(2, num_nodes = 2, dist = dists.DiscreteLogistic(val_range = [-1.0, 1.0], num_cats = 5)) ni3 = inputs(3, num_nodes = 2, dist = dists.DiscreteLogistic(val_range = [-1.0, 1.0], num_cats = 5)) - m1 = multiply(ni0, ni1, edge_ids = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype = torch.long)) - n1 = summate(m1, edge_ids = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]], dtype = torch.long)) + m1 = multiply(ni0, ni1, edge_ids = torch.tensor([[0, 0], [1, 1]], dtype = torch.long)) + n1 = summate(m1, edge_ids = torch.tensor([[0, 0, 1, 1], [0, 1, 0, 1]], dtype = torch.long)) m2 = multiply(ni2, ni3, edge_ids = torch.tensor([[0, 0], [1, 1]], dtype = torch.long)) n2 = summate(m2, edge_ids = torch.tensor([[0, 0, 1, 1], [0, 1, 0, 1]], dtype = torch.long)) @@ -359,7 +359,7 @@ def 
discrete_logistic_nodes_behavior_test(): for _ in range(40): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0), flows_memory = 0.0) pc.mini_batch_em(step_size = 1.0, pseudocount = 0.01) @@ -369,11 +369,12 @@ def discrete_logistic_nodes_behavior_test(): assert (ni0._params[0] > 0.05 and ni0._params[2] < -0.6) or (ni0._params[2] > 0.05 and ni0._params[0] < -0.6) assert (ni1._params[0] > 0.05 and ni1._params[2] < -0.6) or (ni1._params[2] > 0.05 and ni1._params[0] < -0.6) - assert (ni2._params[0] > 0.78 and ni2._params[2] < 0.4) or (ni2._params[2] > 0.78 and ni2._params[0] < 0.4) - assert (ni3._params[0] > 0.78 and ni3._params[2] < 0.4) or (ni3._params[2] > 0.78 and ni3._params[0] < 0.4) + assert (ni2._params[0] > 0.65 and ni2._params[2] < 0.4) or (ni2._params[2] > 0.65 and ni2._params[0] < 0.4) + assert (ni3._params[0] > 0.65 and ni3._params[2] < 0.4) or (ni3._params[2] > 0.65 and ni3._params[0] < 0.4) if __name__ == "__main__": + torch.manual_seed(2390) categorical_nodes_test() bernoulli_nodes_test() gaussian_nodes_test() diff --git a/tests/nodes/nodes_test.py b/tests/nodes/nodes_test.py index ec570bc9..177c9dbc 100644 --- a/tests/nodes/nodes_test.py +++ b/tests/nodes/nodes_test.py @@ -5,7 +5,6 @@ import pyjuice.nodes.distributions as dists from pyjuice.utils import BitSet from pyjuice.nodes import multiply, summate, inputs -from pyjuice.functional.normalize import normalize_parameters import pytest @@ -44,14 +43,6 @@ def nodes_test(): assert torch.all(torch.abs(n._params.sum(dim = 2).sum(dim = 0) - 1.0) < 1e-4) - n._params = n._params.to(device) - n.edge_ids = n.edge_ids.to(device) - - normalize_parameters(n._params, n.edge_ids[0,:].contiguous(), group_size = n.group_size, - ch_group_size = n.ch_group_size, pseudocount = 0.0) - - assert torch.all(torch.abs(n._params.sum(dim = 2).sum(dim = 0) - 1.0) < 1e-4) - if __name__ == "__main__": nodes_test() \ No newline at end of file diff --git a/tests/structures/hclt_correctness_test.py b/tests/structures/hclt_correctness_test.py index 5ad39bd1..64ab6d23 100644 --- a/tests/structures/hclt_correctness_test.py +++ b/tests/structures/hclt_correctness_test.py @@ -129,7 +129,7 @@ def hclt_backward_test(): batch_size = batch_data.size(0) lls = pc(batch_data) - lls.mean().backward() + pc.backward(batch_data.permute(1, 0), allow_modify_flows = False) pc.update_param_flows() @@ -253,7 +253,7 @@ def hclt_em_test(): batch_size = batch_data.size(0) lls = pc(batch_data) - lls.mean().backward() + pc.backward(batch_data.permute(1, 0), allow_modify_flows = False) ns2old_params = dict() for ns in root_ns: From b613409f4d7533c923d89cf9bb0ad961876f69ec Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 05:29:25 +0800 Subject: [PATCH 100/162] speedup `triton` kernel launches --- src/pyjuice/layer/prod_layer.py | 19 ++++---- src/pyjuice/layer/sum_layer.py | 40 ++++++++++------- src/pyjuice/utils/kernel_launcher.py | 66 ++++++++++++++++++++++++++++ src/pyjuice/utils/parameter_list.py | 8 ++++ tests/structures/hclt_test.py | 34 +++++++++++--- 5 files changed, 139 insertions(+), 28 deletions(-) create mode 100644 src/pyjuice/utils/kernel_launcher.py create mode 100644 src/pyjuice/utils/parameter_list.py diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 9f99f5c8..ab8bc87d 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -6,10 +6,11 @@ import triton.language as tl import warnings import time -from packaging import version from typing import Sequence, Optional from 
pyjuice.nodes import ProdNodes +from pyjuice.utils.parameter_list import FastParamList +from pyjuice.utils.kernel_launcher import FastJITFunction from .layer import Layer from .backend.node_partition import partition_nodes_by_n_edges from .backend.index_set import batched_index_set, batched_index_cum @@ -79,8 +80,8 @@ def __init__(self, nodes: Sequence[ProdNodes], global_nid_start: Optional[int] = ) # Store buffers for the forward pass - self.partitioned_nids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in nids]) - self.partitioned_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in cids]) + self.partitioned_nids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in nids]) + self.partitioned_cids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in cids]) ## Initialize backward pass ## @@ -134,8 +135,8 @@ def __init__(self, nodes: Sequence[ProdNodes], global_nid_start: Optional[int] = ) # Store buffers for the backward pass - self.partitioned_u_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in u_cids]) - self.partitioned_parids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parids]) + self.partitioned_u_cids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in u_cids]) + self.partitioned_parids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in parids]) def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, _for_backward: bool = False) -> None: """ @@ -232,7 +233,8 @@ def enable_partial_evaluation(self, fw_scopes: Optional[Sequence[BitSet]] = None ] @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction def _forward_backward_kernel_3d(node_vals_ptr, element_vals_ptr, local_ids_ptr, nids_ptr, cids_ptr, tot_n_nodes, tot_n_eles, n_ngroups, num_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, group_size: tl.constexpr, accum: tl.constexpr, partial_eval: tl.constexpr): @@ -324,7 +326,8 @@ def _forward_backward_kernel_3d(node_vals_ptr, element_vals_ptr, local_ids_ptr, tl.store(node_vals_ptr + offs_nvals, nvals, mask = mask_batch[:,None]) @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction def _forward_backward_kernel_2d(node_vals_ptr, element_vals_ptr, local_ids_ptr, nids_ptr, cids_ptr, tot_n_nodes, tot_n_eles, n_ngroups, num_edges: tl.constexpr, batch_size, BLOCK_M: tl.constexpr, BLOCK_B: tl.constexpr, group_size: tl.constexpr, accum: tl.constexpr, partial_eval: tl.constexpr): @@ -410,7 +413,7 @@ def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, return None - if version.parse(triton.__version__) > version.parse("2.0.0"): + if not triton.__version__ == "2.0.0": BLOCK_B = min(1024 // num_edges, triton.next_power_of_2(batch_size)) BLOCK_M = min(max(1024 // (BLOCK_B * num_edges), 1), self.group_size) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 1294d86a..77280456 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -11,6 +11,8 @@ from pyjuice.nodes import SumNodes from pyjuice.utils import BitSet +from pyjuice.utils.parameter_list import FastParamList +from pyjuice.utils.kernel_launcher import FastJITFunction from .layer import Layer from .backend.node_partition import partition_nodes_by_n_edges from .backend.index_set import batched_index_set, index_cum @@ -95,10 +97,10 @@ def __init__(self, nodes: Sequence[SumNodes], 
global_nid_start: int, ) # Store buffers for the forward pass - self.partitioned_nids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in nids]) - self.partitioned_cids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in cids]) - self.partitioned_pids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in pids]) - self.partitioned_pfids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in pfids]) + self.partitioned_nids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in nids]) + self.partitioned_cids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in cids]) + self.partitioned_pids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in pids]) + self.partitioned_pfids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in pfids]) # Store pre-compiled indices from `cids` and `pids` in the following buffer self._cached_fw_pcids = dict() @@ -174,9 +176,9 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, cs_group_sizes.extend([ch_gsize] * num_bk_partitions) # Store buffers for the forward pass - self.partitioned_chids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in chids]) - self.partitioned_parids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parids]) - self.partitioned_parpids = nn.ParameterList([nn.Parameter(tensor, requires_grad = False) for tensor in parpids]) + self.partitioned_chids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in chids]) + self.partitioned_parids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in parids]) + self.partitioned_parpids = FastParamList([nn.Parameter(tensor, requires_grad = False) for tensor in parpids]) self.cs_group_sizes = cs_group_sizes self.num_bk_partitions = len(chids) @@ -382,7 +384,8 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, raise ValueError(f"Unexpected mode `{mode}`.") @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction def _fw_triton_block_sparse_tlmm_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, @@ -469,7 +472,8 @@ def _fw_triton_block_sparse_tlmm_kernel(node_mars, element_mars, params, nids, c tl.store(node_mars + offs_nmars, acc, mask = mask_batch[None,:]) @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction def _fw_triton_block_sparse_csmm_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, @@ -681,7 +685,8 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten return None @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction def _fw_triton_sparse_kernel(node_mars, element_mars, params, nids, cids, pids, local_ids, batch_size, partial_eval: tl.constexpr, num_edges: tl.constexpr, BLOCK_B: tl.constexpr, BLOCK_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): @@ -892,7 +897,8 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, raise ValueError(f"Not supported mode `{mode}`.") @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction 
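# `FastJITFunction` (defined in src/pyjuice/utils/kernel_launcher.py, added by this
# patch) wraps `triton.JITFunction`: it keys a cache on the constexpr argument values,
# and on a hit relaunches the previously compiled kernel with only the non-constexpr
# arguments, skipping triton's Python-side dispatch that this patch aims to avoid.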
def _bk_triton_block_sparse_modify_flow_kernel(node_flows, node_mars, local_ids, nids, batch_size: tl.constexpr, partial_eval: tl.constexpr, BLOCK_B: tl.constexpr, BLOCK_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): @@ -996,7 +1002,8 @@ def _backward_block_sparse(self, node_flows: torch.Tensor, element_flows: torch. return None @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, element_mars, params, chids, parids_start, parids_increment, parpids_start, parpids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, @@ -1193,7 +1200,8 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo return None @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, batch_size: tl.constexpr, num_edges: tl.constexpr, allow_modify_flows: tl.constexpr, TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, @@ -1376,7 +1384,8 @@ def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor return None @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction def _bk_triton_sparse_ele_kernel(node_flows, element_flows, node_mars, element_mars, params, chids, parids, parpids, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, n_edge_groups: tl.constexpr, allow_modify_flows: tl.constexpr, BLOCK_B: tl.constexpr, @@ -1487,7 +1496,8 @@ def _backward_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: to return None @staticmethod - @triton.jit + # @triton.jit + @FastJITFunction def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, num_edges: tl.constexpr, batch_size: tl.constexpr, allow_modify_flows: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_K: tl.constexpr, BLOCK_B: tl.constexpr, diff --git a/src/pyjuice/utils/kernel_launcher.py b/src/pyjuice/utils/kernel_launcher.py new file mode 100644 index 00000000..159e6532 --- /dev/null +++ b/src/pyjuice/utils/kernel_launcher.py @@ -0,0 +1,66 @@ +import torch +import triton +from typing import Callable, Tuple + + +class FastJITFunction(): + def __init__(self, fn: Callable): + self.jit_fn = triton.JITFunction(fn) + + try: + self.constexpr_ids = [p.num for p in self.jit_fn.params if p.is_constexpr] + self.constexpr_names = {p.name: p.num for p in self.jit_fn.params if p.is_constexpr} + self.nonconstexpr_names = [p.name for p in self.jit_fn.params if not p.is_constexpr] + except AttributeError: + self.constexpr_ids = self.jit_fn.constexprs + self.constexpr_names = {self.jit_fn.arg_names[i]: i for i in self.jit_fn.constexprs} + self.nonconstexpr_names = [self.jit_fn.arg_names[i] for i in range(len(self.jit_fn.arg_names)) if i not in self.jit_fn.constexprs] + + self.constexpr_ids_set = set(self.constexpr_ids) + + self.cache = dict() + + def __getitem__(self, grid: Tuple): + + def wrapper(*args, **kwargs): + signature_list = list() + + device_id = torch.cuda.current_device() + signature_list.append(device_id) + + for i in self.constexpr_ids: + if i >= len(args): + break + signature_list.append(args[i]) + + for k, v in kwargs.items(): + if k in self.constexpr_names: + signature_list.append((self.constexpr_names[k], v)) + + grid0 = grid[0] + grid1 = grid[1] if len(grid) > 1 else 1 + grid2 = grid[2] if len(grid) > 2 else 1 + + signature = 
tuple(signature_list) + if signature in self.cache: + kernel = self.cache[signature] + + aligned_args = list() + for i, arg in enumerate(args): + if i not in self.constexpr_ids_set: + aligned_args.append(arg) + + for k in self.nonconstexpr_names: + if k in kwargs: + aligned_args.append(kwargs[k]) + + kernel[(grid0, grid1, grid2)](*aligned_args) + else: + kernel = self.jit_fn[grid](*args, **kwargs) + self.cache[signature] = kernel + + return wrapper + + +def triton_jit(fn: Callable): + return FastJITFunction(fn) diff --git a/src/pyjuice/utils/parameter_list.py b/src/pyjuice/utils/parameter_list.py new file mode 100644 index 00000000..60365a12 --- /dev/null +++ b/src/pyjuice/utils/parameter_list.py @@ -0,0 +1,8 @@ +import torch +import torch.nn as nn + + +class FastParamList(nn.ParameterList): + + def __getitem__(self, idx): + return getattr(self, str(idx)) \ No newline at end of file diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index 97818db0..e3a63881 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -108,12 +108,36 @@ def hclt_test(): milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] ) - for batch in train_loader: - x = batch[0].to(device) + # for batch in train_loader: + # x = batch[0].to(device) - lls = pc(x, record_cudagraph = True) - lls.mean().backward() - break + # lls = pc(x, record_cudagraph = True) + # lls.mean().backward() + # break + + # for i, batch in enumerate(train_loader): + # x = batch[0].to(device) + + # lls = pc(x, record_cudagraph = False) + # lls.mean().backward() + # if i > 5: + # break + + from torch.profiler import profile, record_function, ProfilerActivity + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: + for i, batch in enumerate(train_loader): + x = batch[0].to(device) + + lls = pc(x, record_cudagraph = False) + lls.mean().backward() + if i > 5: + break + + prof.export_chrome_trace("trace3.json") + # torch.autograd.profiler.tensorboard_trace_to_flame_graph('trace.json', 'flamegraph.svg') + # prof.export_stacks("trace.txt", "cpu_time_total") + import pdb; pdb.set_trace() + exit() mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) full_batch_em_epoch(pc, train_loader, test_loader, device) From d248e89ce4f9eba14eb0d7b9caac229e9f332236 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 19:42:44 +0800 Subject: [PATCH 101/162] fix block size allocator --- src/pyjuice/layer/sum_layer.py | 42 ++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 77280456..7baad0d2 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -1074,6 +1074,13 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc ) acc = tl.where(neginf_flag, -float("inf"), acc) + # acc = tl.where(log_n_fdm_max[None,:] == acc, + # acc + 0.69314718056, # log(2) + # tl.where(log_n_fdm_max[None,:] > acc, + # tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + log_n_fdm_max[None,:], + # tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc + # ) + # ) # Increment `epars_ptr` parpids_inc = tl.load(parpids_inc_ptr) @@ -1113,16 +1120,16 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo # Heuristic to set `TILE_SIZE_M`, `TILE_SIZE_K`, and 
`BLOCK_B` base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 64) - if base_size >= 32: - TILE_SIZE_K = base_size - TILE_SIZE_M = 1024 // base_size - BLOCK_B = 1024 // base_size + if base_size >= 64: + TILE_SIZE_K = min(2048 // 32, num_edges) + TILE_SIZE_M = min(2048 // TILE_SIZE_K, cs_group_size) + BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) else: - remainder = 1024 // (base_size ** 2) + remainder = 2048 // (base_size ** 2) - TILE_SIZE_K = min(1024 // remainder, base_size * remainder, num_edges) - TILE_SIZE_M = min(1024 // TILE_SIZE_K, cs_group_size) - BLOCK_B = min(1024 // TILE_SIZE_K, BATCH_SIZE_NP2) + TILE_SIZE_K = min(512, base_size * remainder, num_edges) + TILE_SIZE_M = min(2048 // TILE_SIZE_K, cs_group_size) + BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) K_NUM_TILES = num_edges // TILE_SIZE_K assert TILE_SIZE_K >= 4, f"`TILE_SIZE_K` should be greater than 4 (but got {TILE_SIZE_K}) in order to use the block-sparse kernel. " \ @@ -1194,7 +1201,9 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo K_NUM_TILES = K_NUM_TILES, TILE_SIZE_M = TILE_SIZE_M, GROUP_SIZE_M = GROUP_SIZE_M, - GROUP_SIZE_K = GROUP_SIZE_K + GROUP_SIZE_K = GROUP_SIZE_K, + num_warps = 2, # TODO: test for different devices + num_stages = 2 ) return None @@ -1301,17 +1310,14 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor BATCH_SIZE_NP2 = triton.next_power_of_2(batch_size) # Heuristic to set `TILE_SIZE_M`, `TILE_SIZE_K`, and `BLOCK_B` - base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 64) + base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2) if base_size >= 64: - TILE_SIZE_B = base_size - TILE_SIZE_M = 2048 // base_size - TILE_SIZE_K = 2048 // base_size + TILE_SIZE_B = min(2048 // 32, BATCH_SIZE_NP2) else: remainder = 2048 // (base_size ** 2) - TILE_SIZE_B = min(2048 // remainder, base_size * remainder, BATCH_SIZE_NP2) - TILE_SIZE_M = min(2048 // TILE_SIZE_B, self.group_size) - TILE_SIZE_K = min(2048 // TILE_SIZE_B, num_edges) + TILE_SIZE_M = min(2048 // TILE_SIZE_B, self.group_size) + TILE_SIZE_K = min(2048 // TILE_SIZE_B, num_edges) B_NUM_TILES = batch_size // TILE_SIZE_B allow_modify_flows = 1 if allow_modify_flows else 0 @@ -1339,7 +1345,9 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor B_NUM_TILES = B_NUM_TILES, TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = self.group_size + GROUP_SIZE_M = self.group_size, + num_warps = 4, # TODO: test for different devices + num_stages = 3 ) def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, From b88e192817cc08239015c73579593833d951ed37 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 19:43:00 +0800 Subject: [PATCH 102/162] speedup kernel launch --- src/pyjuice/utils/kernel_launcher.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/pyjuice/utils/kernel_launcher.py b/src/pyjuice/utils/kernel_launcher.py index 159e6532..5b313eb5 100644 --- a/src/pyjuice/utils/kernel_launcher.py +++ b/src/pyjuice/utils/kernel_launcher.py @@ -25,9 +25,6 @@ def __getitem__(self, grid: Tuple): def wrapper(*args, **kwargs): signature_list = list() - device_id = torch.cuda.current_device() - signature_list.append(device_id) - for i in self.constexpr_ids: if i >= len(args): break From 503aa55ccbd2e0fef8671a8cbdbb7ffcffb7b39b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 22:05:39 +0800 Subject: [PATCH 103/162] add debugging files to gitignore --- .gitignore | 3 +++ 1 
file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 5b1aa15d..3363da48 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ __pycache__/ temp.npz out.ncu-rep +trace.json +trace2.json +trace3.json # Distribution / packaging .Python From 9611137d50484fbefb6ba2bdc7b30f77b0d195b3 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 22:06:14 +0800 Subject: [PATCH 104/162] fix sum layer fw bug caused by 0 in `cids` --- src/pyjuice/layer/sum_layer.py | 82 ++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 7baad0d2..422196ea 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -23,6 +23,11 @@ class SumLayer(Layer, nn.Module): + BLOCK_SPARSE = 0 + SPARSE = 1 + PYTORCH = 2 + STR2MODE = {"block_sparse": 0, "sparse": 1, "pytorch": 2} + def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, global_pid_start: int, global_pfid_start: int, node2tiednodes: dict(), layer_sparsity_tol: Optional[float] = None, @@ -291,6 +296,7 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, element_mars, param_flows, chids = chids, parids = parids, parpids = parpids, cs_group_size = cs_group_size, + partition_id = partition_id, allow_modify_flows = allow_modify_flows ) @@ -308,6 +314,7 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, element_mars, param_flows, chids = chids, parids = parids, parpids = parpids, cs_group_size = cs_group_size, local_ids = local_ids, + partition_id = partition_id, allow_modify_flows = allow_modify_flows ) @@ -349,33 +356,34 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, batch_size = node_mars.size(1) if mode is not None: - assert mode in ["block_sparse", "sparse"] + assert mode in STR2MODE + mode = self.STR2MODE[mode] elif params.dim() == 1 and self.group_size >= 16 and num_edges >= 16 and batch_size >= 16: # In this case, we should definitely use the block-sparse implementation - mode = "block_sparse" + mode = self.BLOCK_SPARSE elif self.group_size == 1 and num_edges < 16384: # In this case, we should definitely use the sparse implementation - mode = "sparse" + mode = self.SPARSE elif num_edges < 4: # In this case, the block-sparse kernel will have compilation issues - mode = "sparse" + mode = self.SPARSE else: - mode = "block_sparse" + mode = self.BLOCK_SPARSE - if mode == "block_sparse": + if mode == self.BLOCK_SPARSE: self._forward_block_sparse( node_mars, element_mars, params, nids, cids, pids, local_ids, partition_id = partition_id ) - elif mode == "sparse": + elif mode == self.SPARSE: self._forward_sparse( node_mars, element_mars, params, nids, cids, pids, local_ids, partition_id = partition_id ) - elif mode == "pytorch": + elif mode == self.PYTORCH: self._forward_pytorch( node_mars, element_mars, params, nids, cids, pids, local_ids ) @@ -440,7 +448,7 @@ def _fw_triton_block_sparse_tlmm_kernel(node_mars, element_mars, params, nids, c emars = tl.load(emars_ptr, mask = mask_batch[None,:]) emars_max = tl.max(emars, axis = 0)[None,:] - emars_sub = tl.exp(emars - emars_max) + emars_sub = tl.where(emars_max != -float("inf"), tl.exp(emars - emars_max), 0.0) if use_fp16 == 1: # Built-in matmul kernel of triton + float16 @@ -528,7 +536,7 @@ def _fw_triton_block_sparse_csmm_kernel(node_mars, element_mars, params, nids, c emars = tl.load(emars_ptr, mask = mask_batch[:,None]) emars_max = tl.max(emars, axis = 1) - emars_sub = 
tl.exp(emars - emars_max[:,None]) + emars_sub = tl.where(emars_max != -float("inf"), tl.exp(emars - emars_max), 0.0) if use_fp16 == 1: # Simulated matmul kernel + float16 @@ -661,6 +669,35 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten use_fp16 = use_fp16 ) + # if node_mars.isnan().any(): + # import pdb; pdb.set_trace() + + # import numpy as np + + # np.savez("temp.npz", + # node_mars = node_mars.detach().cpu().numpy(), + # element_mars = element_mars.detach().cpu().numpy(), + # params = params.detach().cpu().numpy(), + # nids = nids.detach().cpu().numpy(), + # cids = cids.detach().cpu().numpy(), + # cids_start = cids_start.detach().cpu().numpy(), + # cids_increment = cids_increment.detach().cpu().numpy(), + # pids = pids.detach().cpu().numpy(), + # pids_start = pids_start.detach().cpu().numpy(), + # pids_increment = pids_increment.detach().cpu().numpy(), + # batch_size = batch_size, + # partial_eval = partial_eval, + # BLOCK_B = BLOCK_B, + # TILE_SIZE_K = TILE_SIZE_K, + # K_NUM_TILES = K_NUM_TILES, + # TILE_SIZE_M = TILE_SIZE_M, + # GROUP_SIZE_M = GROUP_SIZE_M, + # use_fp16 = use_fp16, + # layer_n_nodes = layer_n_nodes + # ) + + # import numpy as np + else: self._fw_triton_block_sparse_csmm_kernel[grid]( node_mars, @@ -858,34 +895,36 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, batch_size = node_flows.size(1) if mode is not None: - assert mode in ["block_sparse", "sparse", "pytorch"] + assert mode in STR2MODE + mode = self.STR2MODE[mode] + elif params.dim() == 1 and self.group_size >= 16 and num_edges >= 16 and batch_size >= 16: # In this case, we should definitely use the block-sparse implementation - mode = "block_sparse" + mode = self.BLOCK_SPARSE elif (cs_group_size == 1 or self.group_size == 1) and num_edges < 16384: # In this case, we should definitely use the sparse implementation - mode = "sparse" + mode = self.SPARSE elif num_edges < 4 or batch_size < 4: # In this case, the block-sparse kernel will have compilation issues - mode = "sparse" + mode = self.SPARSE else: - mode = "block_sparse" + mode = self.BLOCK_SPARSE - if mode == "block_sparse": + if mode == self.BLOCK_SPARSE: self._backward_block_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, partition_id = partition_id, allow_modify_flows = allow_modify_flows ) - elif mode == "sparse": + elif mode == self.SPARSE: self._backward_sparse( node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, chids, parids, parpids, cs_group_size, local_ids, partition_id = partition_id, allow_modify_flows = allow_modify_flows ) - elif mode == "pytorch": + elif mode == self.PYTORCH: assert not allow_modify_flows, "Please set `allow_modify_flows` to False when " \ "using the native PyTorch backward." 
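# The PyTorch fallback below is only reached when `mode` is passed explicitly;
# the heuristic above chooses between the block-sparse and sparse Triton kernels.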
self._backward_pytorch( @@ -1122,14 +1161,11 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 64) if base_size >= 64: TILE_SIZE_K = min(2048 // 32, num_edges) - TILE_SIZE_M = min(2048 // TILE_SIZE_K, cs_group_size) - BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) else: remainder = 2048 // (base_size ** 2) - TILE_SIZE_K = min(512, base_size * remainder, num_edges) - TILE_SIZE_M = min(2048 // TILE_SIZE_K, cs_group_size) - BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) + TILE_SIZE_M = min(2048 // TILE_SIZE_K, cs_group_size) + BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) K_NUM_TILES = num_edges // TILE_SIZE_K assert TILE_SIZE_K >= 4, f"`TILE_SIZE_K` should be greater than 4 (but got {TILE_SIZE_K}) in order to use the block-sparse kernel. " \ From ac29feb78ead877f9012438b56ca5ff8462da4ea Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 22:06:43 +0800 Subject: [PATCH 105/162] define group_size in PD --- src/pyjuice/structures/pd.py | 42 +++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/src/pyjuice/structures/pd.py b/src/pyjuice/structures/pd.py index 5691e7df..ded31630 100644 --- a/src/pyjuice/structures/pd.py +++ b/src/pyjuice/structures/pd.py @@ -7,10 +7,11 @@ import pyjuice.transformations as jtf from typing import Tuple, Sequence, Optional, Type, Dict -from pyjuice.nodes import multiply, summate, inputs +from pyjuice.nodes import multiply, summate, inputs, set_group_size from pyjuice.nodes.distributions import * from pyjuice.structures.hclt import HCLT from pyjuice.utils import BitSet +from pyjuice.utils.util import max_cdf_power_of_2 def PD(data_shape: Tuple, num_latents: int, @@ -21,7 +22,8 @@ def PD(data_shape: Tuple, num_latents: int, structure_type: str = "sum_dominated", input_layer_fn: Optional[Callable] = None, input_layer_type: Type[Distribution] = Categorical, - input_layer_params: Dict = {"num_cats": 256}): + input_layer_params: Dict = {"num_cats": 256}, + group_size: Optional[int] = None): """ The PD structure was proposed in Sum-Product Networks: A New Deep Architecture @@ -31,6 +33,15 @@ def PD(data_shape: Tuple, num_latents: int, """ assert structure_type in ["sum_dominated", "prod_dominated"] + # Specify group size + if group_size is None: + if num_latents <= 32: + group_size = min(16, max_cdf_power_of_2(num_latents)) + else: + group_size = min(32, max_cdf_power_of_2(num_latents)) + + num_node_groups = num_latents // group_size + num_axes = len(data_shape) # Construct split points @@ -85,11 +96,11 @@ def create_input_ns(hypercube): else: input_nodes = [] for var in scope: - ns = inputs(var, num_nodes = num_latents, dist = input_layer_type(**input_layer_params)) + ns = inputs(var, num_node_groups = num_node_groups, dist = input_layer_type(**input_layer_params)) input_nodes.append(ns) - edge_ids = torch.arange(0, num_latents)[None,:].repeat(2, 1) - return summate(multiply(*input_nodes), num_nodes = num_latents, edge_ids = edge_ids) + edge_ids = torch.arange(0, num_node_groups)[None,:].repeat(2, 1) + return summate(multiply(*input_nodes), num_node_groups = num_node_groups, edge_ids = edge_ids) def recursive_construct(hypercube, depth = 1): if hypercube in hypercube2ns: @@ -121,21 +132,22 @@ def recursive_construct(hypercube, depth = 1): # No split point found. 
Create input nodes instead ns = create_input_ns(hypercube) elif hypercube == root_hypercube: - ns = summate(*pns, num_nodes = 1) + ns = summate(*pns, num_node_groups = 1, group_size = 1) elif len(pns) <= max_prod_group_conns: - ns = summate(*pns, num_nodes = num_latents) + ns = summate(*pns, num_node_groups = num_node_groups) else: - group_ids = torch.topk(torch.rand([num_latents, len(pns)]), k = max_prod_group_conns, dim = 1).indices - par_ids = torch.arange(0, num_latents)[:,None,None].repeat(1, max_prod_group_conns, num_latents) - chs_ids = group_ids[:,:,None] * num_latents + torch.arange(0, num_latents)[None,None,:] + group_ids = torch.topk(torch.rand([num_node_groups, len(pns)]), k = max_prod_group_conns, dim = 1).indices + par_ids = torch.arange(0, num_node_groups)[:,None,None].repeat(1, max_prod_group_conns, num_node_groups) + chs_ids = group_ids[:,:,None] * num_node_groups + torch.arange(0, num_node_groups)[None,None,:] edge_ids = torch.stack((par_ids.reshape(-1), chs_ids.reshape(-1)), dim = 0) - ns = summate(*pns, num_nodes = num_latents, edge_ids = edge_ids) + ns = summate(*pns, num_node_groups = num_node_groups, edge_ids = edge_ids) hypercube2ns[hypercube] = ns return ns - root_hypercube = ((0,) * num_axes, deepcopy(data_shape)) - root_ns = recursive_construct(root_hypercube) + with set_group_size(group_size = group_size): + root_hypercube = ((0,) * num_axes, deepcopy(data_shape)) + root_ns = recursive_construct(root_hypercube) return root_ns @@ -176,7 +188,7 @@ def input_layer_fn(scope, num_latents): structure_type = structure_type, input_layer_fn = input_layer_fn, input_layer_type = input_layer_type, input_layer_params = input_layer_params) - if ns.num_nodes > 1: - ns = summate(*ns.chs, num_nodes = 1) + if ns.num_node_groups > 1: + ns = summate(*ns.chs, num_node_groups = 1, group_size = 1) return ns From 78db63db9c2376f16f0c4c047c665b48281fc254 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 22:08:04 +0800 Subject: [PATCH 106/162] update PDHCLT --- src/pyjuice/structures/pd.py | 8 ++- tests/structures/kernel_launch_test.py | 99 -------------------------- tests/structures/pd_test.py | 25 ++++++- 3 files changed, 29 insertions(+), 103 deletions(-) delete mode 100644 tests/structures/kernel_launch_test.py diff --git a/src/pyjuice/structures/pd.py b/src/pyjuice/structures/pd.py index ded31630..b4b897d6 100644 --- a/src/pyjuice/structures/pd.py +++ b/src/pyjuice/structures/pd.py @@ -92,7 +92,7 @@ def updated_hypercube(hypercube, axis, s = None, e = None): def create_input_ns(hypercube): scope = hypercube2scope(hypercube) if input_layer_fn is not None: - return input_layer_fn(scope, num_latents) + return input_layer_fn(scope, num_latents, group_size) else: input_nodes = [] for var in scope: @@ -160,7 +160,8 @@ def PDHCLT(data: torch.Tensor, data_shape: Tuple, num_latents: int, structure_type: str = "sum_dominated", input_layer_type: Type[Distribution] = Categorical, input_layer_params: Dict = {"num_cats": 256}, - hclt_kwargs: Dict = {"num_bins": 32, "sigma": 0.5 / 32, "chunk_size": 32}): + hclt_kwargs: Dict = {"num_bins": 32, "sigma": 0.5 / 32, "chunk_size": 32}, + group_size: Optional[int] = None): assert data.dim() == 2 assert data.size(1) == reduce(lambda x, y: x * y, data_shape) @@ -186,7 +187,8 @@ def input_layer_fn(scope, num_latents): split_intervals = split_intervals, split_points = split_points, max_split_depth = max_split_depth, max_prod_group_conns = max_prod_group_conns, structure_type = structure_type, input_layer_fn = input_layer_fn, - input_layer_type = 
input_layer_type, input_layer_params = input_layer_params) + input_layer_type = input_layer_type, input_layer_params = input_layer_params, + group_size = group_size) if ns.num_node_groups > 1: ns = summate(*ns.chs, num_node_groups = 1, group_size = 1) diff --git a/tests/structures/kernel_launch_test.py b/tests/structures/kernel_launch_test.py deleted file mode 100644 index bf3beb07..00000000 --- a/tests/structures/kernel_launch_test.py +++ /dev/null @@ -1,99 +0,0 @@ -import pyjuice as juice -import torch -import numpy as np -import time -import random - -import pyjuice.nodes.distributions as dists -from pyjuice.utils import BitSet -from pyjuice.nodes import multiply, summate, inputs -from pyjuice.model import TensorCircuit - -from pyjuice.layer import InputLayer, ProdLayer, SumLayer - -import pytest - - -import triton -import triton.language as tl - - -@triton.jit -def kernel1(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a).to(tl.float16) - - offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b).to(tl.float16) - - cc = tl.dot(aa, bb).to(tl.float32) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -@triton.jit -def kernel2(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a)#.to(tl.float16) - - offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b)#.to(tl.float16) - - cc = tl.sum(aa[:,:,None] * bb[None,:,:], axis = 1)#.to(tl.float32) - - # cc = tl.dot(aa, bb) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -@triton.jit -def kernel3(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a) - - offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b) - - aa = tl.view(tl.broadcast_to(aa[:,None,:], (M, 16 // M, N)), (16, N)) - cc = tl.dot(aa, bb) - cc = tl.max(tl.view(cc, (M, 16 // M, N)), axis = 1) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -if __name__ == "__main__": - import time - - M = 16 - N = 16 - K = 16 - - a = torch.rand([M, N]).cuda() - b = torch.rand([N, K]).cuda() - c = torch.zeros([M, K]).cuda() - - grid = (400,) - - kernel1[grid](a, b, c, M, N, K) - - aaa = [item for item in kernel1.cache[0].values()] - raw_kernel = aaa[0][(grid[0], 1, 1)] - - from torch.profiler import profile, record_function, ProfilerActivity - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: - for i in range(50): - # kernel1[grid](a, b, c, M, N, K) - raw_kernel(a, b, c) - - prof.export_chrome_trace("trace.json") - - import pdb; pdb.set_trace() \ No newline at end of file diff --git a/tests/structures/pd_test.py b/tests/structures/pd_test.py index 2b20f881..60c56001 100644 --- a/tests/structures/pd_test.py +++ b/tests/structures/pd_test.py @@ -92,7 +92,7 @@ def pd_test(): ns = juice.structures.PD( data_shape = (28, 28), - num_latents = 32, + num_latents = 128, split_intervals = (4, 4), structure_type = "sum_dominated" ) @@ -102,6 +102,29 @@ def pd_test(): optimizer = juice.optim.CircuitOptimizer(pc, lr = 0.1, 
pseudocount = 0.0001) + # for batch in train_loader: + # x = batch[0].to(device) + + # lls = pc(x, record_cudagraph = True) + # lls.mean().backward() + # break + + # from torch.profiler import profile, record_function, ProfilerActivity + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: + # for i, batch in enumerate(train_loader): + # x = batch[0].to(device) + + # lls = pc(x, record_cudagraph = False) + # lls.mean().backward() + # if i > 10: + # break + + # prof.export_chrome_trace("trace3.json") + # # torch.autograd.profiler.tensorboard_trace_to_flame_graph('trace.json', 'flamegraph.svg') + # # prof.export_stacks("trace.txt", "cpu_time_total") + # import pdb; pdb.set_trace() + # exit() + mini_batch_em_epoch(350, pc, optimizer, None, train_loader, test_loader, device) full_batch_em_epoch(pc, train_loader, test_loader, device) From f3c8460a8c052e286c4aeed4715f40567ddedb6c Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 22:08:18 +0800 Subject: [PATCH 107/162] iter fn for `FastParamList` --- src/pyjuice/utils/parameter_list.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/pyjuice/utils/parameter_list.py b/src/pyjuice/utils/parameter_list.py index 60365a12..79d8cd55 100644 --- a/src/pyjuice/utils/parameter_list.py +++ b/src/pyjuice/utils/parameter_list.py @@ -1,8 +1,12 @@ import torch import torch.nn as nn +from typing import Iterator, Any class FastParamList(nn.ParameterList): - def __getitem__(self, idx): - return getattr(self, str(idx)) \ No newline at end of file + def __getitem__(self, idx) -> Any: + return getattr(self, str(idx)) + + def __iter__(self) -> Iterator[Any]: + return iter(self[i] for i in range(len(self))) From 5abd7d9126e6bcf4b73e949ca9ceae2fb06063ba Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Wed, 27 Dec 2023 22:08:51 +0800 Subject: [PATCH 108/162] update reference time --- tests/layer/sum_layer_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 4efe88c0..702aa838 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -217,7 +217,7 @@ def speed_test(): backward_ms = (t1 - t0) / 100 * 1000 print(f"Backward pass on average takes {backward_ms:.3f}ms.") - print("Reference computation time on RTX 4090: 2.175ms.") + print("Reference computation time on RTX 4090: 1.544ms.") print("--------------------------------------------------------------") From c6e2bf9b60c9ef25f74b7924e20b01fbb11aef69 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 28 Dec 2023 01:56:41 +0800 Subject: [PATCH 109/162] fix `parflow_fusing` --- src/pyjuice/model/backend/parflow_fusing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/pyjuice/model/backend/parflow_fusing.py b/src/pyjuice/model/backend/parflow_fusing.py index e285ff8b..13c99411 100644 --- a/src/pyjuice/model/backend/parflow_fusing.py +++ b/src/pyjuice/model/backend/parflow_fusing.py @@ -5,6 +5,8 @@ import triton import triton.language as tl +from pyjuice.utils.kernel_launcher import FastJITFunction + def compile_cum_par_flows_fn(node2tiednodes, MAX_NGROUPS = 2048, BLOCK_SIZE = 2048): @@ -75,17 +77,18 @@ def cum_par_flows_to_device(kernels_args, device): return kernels_args -@triton.jit +# @triton.jit +@FastJITFunction def cum_par_flows_kernel(param_flows, target_pfids, block_sizes, ch_pfids, BLOCK_G: tl.constexpr, BLOCK_M: tl.constexpr): pid = tl.program_id(axis = 0) offs_g = tl.arange(0, BLOCK_G) + 
pid * BLOCK_G - offs_chblk = tl.load(ch_pfids + offs_chblk) + offs_chblk = tl.load(ch_pfids + offs_g) mask_chblk = offs_chblk >= 0 block_size = tl.load(block_sizes + pid) - offs_m = tl.arange(0, BLOCK_M)[None,:] + offs_m = tl.arange(0, BLOCK_M) mask_m = offs_m < block_size offs_chs = offs_chblk[:,None] + tl.arange(0, BLOCK_M)[None,:] From 09783e26bd66f4e2485fbf3f98069ed377fd93f6 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 28 Dec 2023 03:30:44 +0800 Subject: [PATCH 110/162] fix: prod layer fail occasionally due to triton errors --- src/pyjuice/layer/prod_layer.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index ab8bc87d..78d1dde2 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -355,30 +355,26 @@ def _forward_backward_kernel_2d(node_vals_ptr, element_vals_ptr, local_ids_ptr, offs_egstart = tl.load(cids_ptr + ngroup_id * num_edges + offs_edge) # [num_edges] # Base ptr for ch values - evals_ptr = element_vals_ptr + \ - (offs_egstart[:,None] + ntile_id * BLOCK_M) * batch_size + \ - offs_batch[None,:] # [num_edges, BLOCK_B] + offs_evals = (offs_egstart[:,None] + ntile_id * BLOCK_M) * batch_size + offs_batch[None,:] # [num_edges, BLOCK_B] # Base ptr for par values ngroup_start = tl.load(nids_ptr + ngroup_id) - nvals_ptr = node_vals_ptr + \ - (ngroup_start + ntile_id * BLOCK_M) * batch_size + \ - offs_batch + offs_nvals = (ngroup_start + ntile_id * BLOCK_M) * batch_size + offs_batch # [BLOCK_B] # Inner loop for i in range(0, BLOCK_M): - evals = tl.load(evals_ptr, mask = mask_batch[None,:], other = 0) + evals = tl.load(element_vals_ptr + offs_evals, mask = mask_batch[None,:], other = 0) nvals = tl.sum(evals, axis = 0) # Accumulate the `node_vals` if required if accum == 1: - node_vals = tl.load(nvals_ptr, mask = mask_batch) + node_vals = tl.load(node_vals_ptr + offs_nvals, mask = mask_batch) nvals += node_vals - tl.store(nvals_ptr, nvals, mask = mask_batch) + tl.store(node_vals_ptr + offs_nvals, nvals, mask = mask_batch) - nvals_ptr += batch_size - evals_ptr += batch_size + offs_nvals += batch_size + offs_evals += batch_size @staticmethod @torch.compile(mode = "reduce-overhead", fullgraph = True) @@ -415,8 +411,8 @@ def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, if not triton.__version__ == "2.0.0": - BLOCK_B = min(1024 // num_edges, triton.next_power_of_2(batch_size)) - BLOCK_M = min(max(1024 // (BLOCK_B * num_edges), 1), self.group_size) + BLOCK_B = min(2048 // num_edges, triton.next_power_of_2(batch_size)) + BLOCK_M = min(max(2048 // (BLOCK_B * num_edges), 1), self.group_size) grid = (triton.cdiv(n_ngroups * self.group_size, BLOCK_M), triton.cdiv(batch_size, BLOCK_B)) From 16d25b0218abe62f954b914402eb3fad3bcd92d2 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 28 Dec 2023 03:32:45 +0800 Subject: [PATCH 111/162] return self in `get_source_ns` if self is not tied --- src/pyjuice/nodes/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyjuice/nodes/nodes.py b/src/pyjuice/nodes/nodes.py index bae535f2..760520c9 100644 --- a/src/pyjuice/nodes/nodes.py +++ b/src/pyjuice/nodes/nodes.py @@ -164,7 +164,7 @@ def is_tied(self): return self._source_node is not None def get_source_ns(self): - return self._source_node + return self._source_node if self.is_tied() else self def set_source_ns(self, source_ns: CircuitNodes): assert type(source_ns) == type(self), f"Node type of the source ns 
({type(source_ns)}) does not match that of self ({type(self)})." From 502ce41f4b375c8bd008d3e25e206bca0bda1c83 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 28 Dec 2023 04:15:40 +0800 Subject: [PATCH 112/162] parameter tying tests --- tests/model/parameter_tying_test.py | 615 ++++++++++++++++++++++++++++ 1 file changed, 615 insertions(+) create mode 100644 tests/model/parameter_tying_test.py diff --git a/tests/model/parameter_tying_test.py b/tests/model/parameter_tying_test.py new file mode 100644 index 00000000..6199e969 --- /dev/null +++ b/tests/model/parameter_tying_test.py @@ -0,0 +1,615 @@ +import pyjuice as juice +import torch +import numpy as np + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs, set_group_size +from pyjuice.model import TensorCircuit +from pyjuice.model.backend import compute_cum_par_flows, em_par_update + +import pytest + + +def simple_structure_test_group1(): + + group_size = 1 + + with set_group_size(group_size = group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 5)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 5)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 5)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 5)) + + np01 = multiply(ni0, ni1) + np12 = multiply(ni1, ni2) + np23 = multiply(ni2, ni3) + + ns01 = summate(np01, num_node_groups = 2) + ns12 = ns01.duplicate(np12, tie_params = True) + ns23 = ns01.duplicate(np23, tie_params = True) + + np012_0 = multiply(ns01, ni2) + np012_1 = multiply(ns12, ni0) + ns012 = summate(np012_0, np012_1, num_node_groups = 2) + + np123_0 = multiply(ns12, ni3) + np123_1 = multiply(ns23, ni1) + ns123 = ns012.duplicate(np123_0, np123_1, tie_params = True) + + np0123_0 = multiply(ns012, ni3) + np0123_1 = multiply(ns123, ni0) + ns0123 = ns123.duplicate(np0123_0, np0123_1, tie_params = True) + + pc = TensorCircuit(ns0123, max_tied_ns_per_parflow_group = 2) + + device = torch.device("cuda:0") + + ## Compilation tests ## + + assert torch.all(pc.input_layer_group[0].vids == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3]).reshape(8, 1)) + assert torch.all(pc.input_layer_group[0].s_pids == torch.tensor([0, 5, 10, 15, 20, 25, 30, 35])) + assert torch.all(pc.input_layer_group[0].s_pfids == torch.tensor([0, 5, 10, 15, 20, 25, 30, 35])) + assert torch.all(pc.input_layer_group[0].metadata == torch.tensor([5.0, 5.0, 5.0, 5.0])) + assert torch.all(pc.input_layer_group[0].s_mids == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])) + assert torch.all(pc.input_layer_group[0].source_nids == torch.tensor([0, 1, 2, 3, 4, 5, 6, 7])) + assert pc.input_layer_group[0]._output_ind_range[0] == 1 + assert pc.input_layer_group[0]._output_ind_range[1] == 9 + + assert torch.all(pc.inner_layer_groups[0][0].partitioned_nids[0] == torch.tensor([1, 2, 3, 4, 5, 6])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][0,:] == torch.tensor([1, 3])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][1,:] == torch.tensor([2, 4])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][2,:] == torch.tensor([3, 5])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][3,:] == torch.tensor([4, 6])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][4,:] == torch.tensor([5, 7])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][5,:] == torch.tensor([6, 8])) + + assert 
torch.all(pc.inner_layer_groups[0][0].partitioned_u_cids[0] == torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][0,:] == torch.tensor([1, 0])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][1,:] == torch.tensor([2, 0])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][2,:] == torch.tensor([1, 3])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][3,:] == torch.tensor([2, 4])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][4,:] == torch.tensor([3, 5])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][5,:] == torch.tensor([4, 6])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][6,:] == torch.tensor([5, 0])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][7,:] == torch.tensor([6, 0])) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_nids[0] == torch.arange(9, 15)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_cids[0][0:2,:] == torch.tensor([1, 2]).reshape(1, 2)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_cids[0][2:4,:] == torch.tensor([3, 4]).reshape(1, 2)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_cids[0][4:6,:] == torch.tensor([5, 6]).reshape(1, 2)) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0][0:2,:] == torch.tensor([[1, 2], [3, 4]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0][2:4,:] == torch.tensor([[1, 2], [3, 4]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0][4:6,:] == torch.tensor([[1, 2], [3, 4]])) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0][0:2,:] == torch.tensor([[0, 1], [2, 3]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0][2:4,:] == torch.tensor([[0, 1], [2, 3]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0][4:6,:] == torch.tensor([[4, 5], [6, 7]])) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_chids[0] == torch.arange(1, 7)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parids[0][0:2,:] == torch.tensor([9, 10]).reshape(1, 2)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parids[0][2:4,:] == torch.tensor([11, 12]).reshape(1, 2)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parids[0][4:6,:] == torch.tensor([13, 14]).reshape(1, 2)) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parpids[0][0:2,:] == torch.tensor([[1, 3], [2, 4]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parpids[0][2:4,:] == torch.tensor([[1, 3], [2, 4]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parpids[0][4:6,:] == torch.tensor([[1, 3], [2, 4]])) + + assert torch.all(pc.inner_layer_groups[2][0].partitioned_nids[0] == torch.arange(1, 9)) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_cids[0][0:2,:] == torch.tensor([[9, 5], [10, 6]])) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_cids[0][2:4,:] == torch.tensor([[11, 1], [12, 2]])) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_cids[0][4:6,:] == torch.tensor([[11, 7], [12, 8]])) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_cids[0][6:8,:] == torch.tensor([[13, 3], [14, 4]])) + + assert torch.all(pc.inner_layer_groups[2][0].partitioned_u_cids[0] == torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14])) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_u_cids[1] == torch.tensor([11, 12])) + assert 
torch.all(pc.inner_layer_groups[2][0].partitioned_parids[0] == torch.tensor([3, 4, 7, 8, 1, 2, 5, 6, 1, 2, 7, 8]).reshape(12, 1)) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_parids[1] == torch.tensor([[3, 5], [4, 6]])) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_nids[0] == torch.arange(15, 19)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_cids[0][0:2,:] == torch.tensor([1, 2, 3, 4]).reshape(1, 4)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_cids[0][2:4,:] == torch.tensor([5, 6, 7, 8]).reshape(1, 4)) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pids[0][0:2,:] == torch.tensor([[5, 6, 7, 8], [9, 10, 11, 12]])) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pids[0][2:4,:] == torch.tensor([[5, 6, 7, 8], [9, 10, 11, 12]])) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pfids[0][0:2,:] == torch.tensor([[8, 9, 10, 11], [12, 13, 14, 15]])) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pfids[0][2:4,:] == torch.tensor([[8, 9, 10, 11], [12, 13, 14, 15]])) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_chids[0] == torch.arange(1, 9)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_parids[0][0:4,:] == torch.tensor([15, 16]).reshape(1, 2)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_parids[0][4:8,:] == torch.tensor([17, 18]).reshape(1, 2)) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_parpids[0][0:4,:] == torch.tensor([[5, 9], [6, 10], [7, 11], [8, 12]])) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_parpids[0][4:8,:] == torch.tensor([[5, 9], [6, 10], [7, 11], [8, 12]])) + + assert torch.all(pc.inner_layer_groups[4][0].partitioned_nids[0] == torch.tensor([1, 2, 3, 4])) + assert torch.all(pc.inner_layer_groups[4][0].partitioned_cids[0] == torch.tensor([[15, 7], [16, 8], [17, 1], [18, 2]])) + + assert torch.all(pc.inner_layer_groups[4][0].partitioned_u_cids[0] == torch.tensor([1, 2, 7, 8, 15, 16, 17, 18])) + assert torch.all(pc.inner_layer_groups[4][0].partitioned_parids[0] == torch.tensor([3, 4, 1, 2, 1, 2, 3, 4]).reshape(8, 1)) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_nids[0] == torch.arange(19, 21)) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_cids[0] == torch.tensor([1, 2, 3, 4]).reshape(1, 4)) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_pids[0] == torch.tensor([[5, 6, 7, 8], [9, 10, 11, 12]])) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_pfids[0] == torch.tensor([[16, 17, 18, 19], [20, 21, 22, 23]])) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_chids[0] == torch.arange(1, 5)) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_parids[0] == torch.tensor([19, 20]).reshape(1, 2)) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_parpids[0] == torch.tensor([[5, 9], [6, 10], [7, 11], [8, 12]])) + + pc.to(device) + + ## Forward tests ## + + data = torch.randint(0, 5, [16, 4]).to(device) + + lls = pc(data) + + node_mars = pc.node_mars.detach().cpu() + params = pc.params.detach().cpu() + + params0 = params[1:5].reshape(2, 2) + + np01_lls = node_mars[1:3,:] + node_mars[3:5,:] + ns01_lls = torch.matmul(params0, np01_lls.exp()).log() + assert torch.all(torch.abs(node_mars[9:11,:] - ns01_lls) < 1e-4) + + np12_lls = node_mars[3:5,:] + node_mars[5:7,:] + ns12_lls = torch.matmul(params0, np12_lls.exp()).log() + assert torch.all(torch.abs(node_mars[11:13,:] - ns12_lls) < 1e-4) + + np23_lls = node_mars[5:7,:] + node_mars[7:9,:] + ns23_lls 
= torch.matmul(params0, np23_lls.exp()).log() + assert torch.all(torch.abs(node_mars[13:15,:] - ns23_lls) < 1e-4) + + params1 = params[5:13].reshape(2, 4) + + np012_0_lls = ns01_lls + node_mars[5:7,:] + np012_1_lls = ns12_lls + node_mars[1:3,:] + np012_lls = torch.cat((np012_0_lls, np012_1_lls), dim = 0) + ns012_lls = torch.matmul(params1, np012_lls.exp()).log() + assert torch.all(torch.abs(node_mars[15:17,:] - ns012_lls) < 1e-4) + + np123_0_lls = ns12_lls + node_mars[7:9,:] + np123_1_lls = ns23_lls + node_mars[3:5,:] + np123_lls = torch.cat((np123_0_lls, np123_1_lls), dim = 0) + ns123_lls = torch.matmul(params1, np123_lls.exp()).log() + assert torch.all(torch.abs(node_mars[17:19,:] - ns123_lls) < 1e-4) + + np0123_0_lls = ns012_lls + node_mars[7:9,:] + np0123_1_lls = ns123_lls + node_mars[1:3,:] + np0123_lls = torch.cat((np0123_0_lls, np0123_1_lls), dim = 0) + ns0123_lls = torch.matmul(params1, np0123_lls.exp()).log() + assert torch.all(torch.abs(node_mars[19:21,:] - ns0123_lls) < 1e-4) + + ## Backward tests ## + + pc.backward(data.permute(1, 0), allow_modify_flows = False) + + node_flows = pc.node_flows.detach().cpu().clone() + param_flows = pc.param_flows.detach().cpu().clone() + + assert torch.all(torch.abs(node_flows[19:21,:] - 1.0) < 1e-4) + + pc.inner_layer_groups[4][0](pc.node_mars, pc.element_mars, _for_backward = True) + pc.inner_layer_groups[5][0].backward( + pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, + pc.params, pc.param_flows + ) + element_flows = pc.element_flows.detach().cpu() + + np0123_flows = torch.matmul(params1.permute(1, 0), 1.0 / ns0123_lls.exp()) * np0123_lls.exp() + assert torch.all(torch.abs(element_flows[1:5,:] - np0123_flows) < 1e-4) + + param_flows1 = torch.matmul(1.0 / ns0123_lls.exp(), np0123_lls.exp().permute(1, 0)) * params1 + + ns012_flows = element_flows[1:3,:] + assert torch.all(torch.abs(node_flows[15:17,:] - ns012_flows) < 1e-4) + + ns123_flows = element_flows[3:5,:] + assert torch.all(torch.abs(node_flows[17:19,:] - ns123_flows) < 1e-4) + + ni0_flows = element_flows[3:5,:].clone() + ni3_flows = element_flows[1:3,:].clone() + + pc.inner_layer_groups[2][0](pc.node_mars, pc.element_mars, _for_backward = True) + pc.inner_layer_groups[3][0].backward( + pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, + pc.params, pc.param_flows + ) + element_flows = pc.element_flows.detach().cpu() + + np012_flows = torch.matmul(params1.permute(1, 0), ns012_flows / ns012_lls.exp()) * np012_lls.exp() + assert torch.all(torch.abs(element_flows[1:5,:] - np012_flows) < 1e-4) + + param_flows1 += torch.matmul(ns012_flows / ns012_lls.exp(), np012_lls.exp().permute(1, 0)) * params1 + + np123_flows = torch.matmul(params1.permute(1, 0), ns123_flows / ns123_lls.exp()) * np123_lls.exp() + assert torch.all(torch.abs(element_flows[5:9,:] - np123_flows) < 1e-4) + + param_flows1 += torch.matmul(ns123_flows / ns123_lls.exp(), np123_lls.exp().permute(1, 0)) * params1 + + ns01_flows = np012_flows[0:2,:] + assert torch.all(torch.abs(node_flows[9:11,:] - ns01_flows) < 1e-4) + + ns12_flows = np012_flows[2:4,:] + np123_flows[0:2,:] + assert torch.all(torch.abs(node_flows[11:13,:] - ns12_flows) < 1e-4) + + ns23_flows = np123_flows[2:4,:] + assert torch.all(torch.abs(node_flows[13:15,:] - ns23_flows) < 1e-4) + + ni2_flows = np012_flows[0:2,:].clone() + ni0_flows += np012_flows[2:4,:].clone() + ni3_flows += np123_flows[0:2,:].clone() + ni1_flows = np123_flows[2:4,:].clone() + + pc.inner_layer_groups[0][0](pc.node_mars, pc.element_mars, _for_backward = True) + 
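# (Sketch of what the repeated pair of calls here is doing, inferred from how the reference
#  values are checked in this test: the product layer is re-run with `_for_backward = True`
#  so that `element_mars` again holds that layer's product outputs, and the following sum
#  layer `.backward(...)` then fills `element_flows` and accumulates `param_flows` from them.
#  The exact buffer semantics are an assumption based on this test, not a documented API.)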
pc.inner_layer_groups[1][0].backward( + pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, + pc.params, pc.param_flows + ) + element_flows = pc.element_flows.detach().cpu() + + np01_flows = torch.matmul(params0.permute(1, 0), ns01_flows / ns01_lls.exp()) * np01_lls.exp() + assert torch.all(torch.abs(element_flows[1:3,:] - np01_flows) < 1e-4) + + param_flows0 = torch.matmul(ns01_flows / ns01_lls.exp(), np01_lls.exp().permute(1, 0)) * params0 + + np12_flows = torch.matmul(params0.permute(1, 0), ns12_flows / ns12_lls.exp()) * np12_lls.exp() + assert torch.all(torch.abs(element_flows[3:5,:] - np12_flows) < 1e-4) + + param_flows0 += torch.matmul(ns12_flows / ns12_lls.exp(), np12_lls.exp().permute(1, 0)) * params0 + + np23_flows = torch.matmul(params0.permute(1, 0), ns23_flows / ns23_lls.exp()) * np23_lls.exp() + assert torch.all(torch.abs(element_flows[5:7,:] - np23_flows) < 1e-4) + + param_flows0 += torch.matmul(ns23_flows / ns23_lls.exp(), np23_lls.exp().permute(1, 0)) * params0 + + ni0_flows += np01_flows.clone() + ni1_flows += np01_flows.clone() + np12_flows.clone() + ni2_flows += np12_flows.clone() + np23_flows.clone() + ni3_flows += np23_flows.clone() + + assert torch.all(torch.abs(node_flows[1:3,:] - ni0_flows) < 1e-4) + assert torch.all(torch.abs(node_flows[3:5,:] - ni1_flows) < 1e-4) + assert torch.all(torch.abs(node_flows[5:7,:] - ni2_flows) < 1e-4) + assert torch.all(torch.abs(node_flows[7:9,:] - ni3_flows) < 1e-4) + + assert torch.all(torch.abs(param_flows0.reshape(-1) - (param_flows[0:4] + param_flows[4:8])) < 1e-4) + assert torch.all(torch.abs(param_flows1.reshape(-1) - (param_flows[8:16] + param_flows[16:24])) < 1e-4) + + ## Parameter learning & flow aggregation tests ## + + temp_param_flows = param_flows.clone().to(device) + + compute_cum_par_flows(temp_param_flows, pc.parflow_fusing_kwargs) + + assert torch.all(torch.abs(param_flows0.reshape(-1) - temp_param_flows[0:4].cpu()) < 1e-4) + assert torch.all(torch.abs(param_flows1.reshape(-1) - temp_param_flows[8:16].cpu()) < 1e-4) + + em_par_update(pc.params, temp_param_flows, pc.par_update_kwargs, step_size = 1.0, pseudocount = 0.0) + + param_flows0 /= param_flows0.sum(dim = 1, keepdim = True) + assert torch.all(torch.abs(param_flows0.reshape(-1) - pc.params[1:5].cpu()) < 1e-4) + + param_flows1 /= param_flows1.sum(dim = 1, keepdim = True) + assert torch.all(torch.abs(param_flows1.reshape(-1) - pc.params[5:13].cpu()) < 1e-4) + + +def simple_structure_test_group16(): + + group_size = 16 + + with set_group_size(group_size = group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 5)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 5)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 5)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 5)) + + np01 = multiply(ni0, ni1) + np12 = multiply(ni1, ni2) + np23 = multiply(ni2, ni3) + + ns01 = summate(np01, num_node_groups = 2) + ns12 = ns01.duplicate(np12, tie_params = True) + ns23 = ns01.duplicate(np23, tie_params = True) + + np012_0 = multiply(ns01, ni2) + np012_1 = multiply(ns12, ni0) + ns012 = summate(np012_0, np012_1, num_node_groups = 2) + + np123_0 = multiply(ns12, ni3) + np123_1 = multiply(ns23, ni1) + ns123 = ns012.duplicate(np123_0, np123_1, tie_params = True) + + np0123_0 = multiply(ns012, ni3) + np0123_1 = multiply(ns123, ni0) + ns0123 = ns123.duplicate(np0123_0, np0123_1, tie_params = True) + + ns0123.init_parameters() + pc = TensorCircuit(ns0123, 
max_tied_ns_per_parflow_group = 2) + + device = torch.device("cuda:0") + + ## Compilation tests ## + + assert pc.input_layer_group[0]._output_ind_range[0] == 16 + assert pc.input_layer_group[0]._output_ind_range[1] == 144 + + assert torch.all(pc.inner_layer_groups[0][0].partitioned_nids[0] == torch.tensor([16, 32, 48, 64, 80, 96])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][0,:] == torch.tensor([16, 48])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][1,:] == torch.tensor([32, 64])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][2,:] == torch.tensor([48, 80])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][3,:] == torch.tensor([64, 96])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][4,:] == torch.tensor([80, 112])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0][5,:] == torch.tensor([96, 128])) + + assert torch.all(pc.inner_layer_groups[0][0].partitioned_u_cids[0] == torch.tensor([16, 32, 48, 64, 80, 96, 112, 128])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][0,:] == torch.tensor([16, 0])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][1,:] == torch.tensor([32, 0])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][2,:] == torch.tensor([16, 48])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][3,:] == torch.tensor([32, 64])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][4,:] == torch.tensor([48, 80])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][5,:] == torch.tensor([64, 96])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][6,:] == torch.tensor([80, 0])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0][7,:] == torch.tensor([96, 0])) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_nids[0] == torch.arange(144, 240, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_cids[0][0:2,:] == torch.arange(16, 48).reshape(1, 32)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_cids[0][2:4,:] == torch.arange(48, 80).reshape(1, 32)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_cids[0][4:6,:] == torch.arange(80, 112).reshape(1, 32)) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0][0,:] == torch.arange(256, 768, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0][1,:] == torch.arange(768, 1280, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0][2,:] == torch.arange(256, 768, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0][3,:] == torch.arange(768, 1280, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0][4,:] == torch.arange(256, 768, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0][5,:] == torch.arange(768, 1280, 16)) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0][0,:] == torch.arange(0, 512, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0][1,:] == torch.arange(512, 1024, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0][2,:] == torch.arange(0, 512, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0][3,:] == torch.arange(512, 1024, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0][4,:] == torch.arange(1024, 1536, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0][5,:] == torch.arange(1536, 2048, 16)) + 
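# (Note on the expected index values in this group-size-16 test: compared with the
#  group-size-1 variant above, node / element ids show up as multiples of 16 because each
#  node group now spans 16 consecutive slots, and parameter ids as multiples of
#  256 = 16 * 16 because every edge between two node groups owns a 16x16 parameter block.
#  This is why the expected tensors below mirror the group-size-1 values scaled by 16 / 256.)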
+ assert torch.all(pc.inner_layer_groups[1][0].partitioned_chids[0] == torch.arange(16, 112, 16)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parids[0][0:2,:] == torch.tensor([144, 160]).reshape(1, 2)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parids[0][2:4,:] == torch.tensor([176, 192]).reshape(1, 2)) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parids[0][4:6,:] == torch.tensor([208, 224]).reshape(1, 2)) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parpids[0][0:2,:] == torch.tensor([[256, 768], [512, 1024]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parpids[0][2:4,:] == torch.tensor([[256, 768], [512, 1024]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parpids[0][4:6,:] == torch.tensor([[256, 768], [512, 1024]])) + + assert torch.all(pc.inner_layer_groups[2][0].partitioned_nids[0] == torch.arange(16, 144, 16)) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_cids[0][0:2,:] == torch.tensor([[144, 80], [160, 96]])) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_cids[0][2:4,:] == torch.tensor([[176, 16], [192, 32]])) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_cids[0][4:6,:] == torch.tensor([[176, 112], [192, 128]])) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_cids[0][6:8,:] == torch.tensor([[208, 48], [224, 64]])) + + assert torch.all(pc.inner_layer_groups[2][0].partitioned_u_cids[0] == torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14]) * 16) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_u_cids[1] == torch.tensor([11, 12]) * 16) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_parids[0] == torch.tensor([3, 4, 7, 8, 1, 2, 5, 6, 1, 2, 7, 8]).reshape(12, 1) * 16) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_parids[1] == torch.tensor([[3, 5], [4, 6]]) * 16) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_nids[0] == torch.arange(15, 19) * 16) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_cids[0][0:2,:] == torch.arange(16, 80).reshape(1, 64)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_cids[0][2:4,:] == torch.arange(80, 144).reshape(1, 64)) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pids[0][0,:] == torch.arange(1280, 2304, 16)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pids[0][1,:] == torch.arange(2304, 3328, 16)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pids[0][2,:] == torch.arange(1280, 2304, 16)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pids[0][3,:] == torch.arange(2304, 3328, 16)) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pfids[0][0,:] == torch.arange(2048, 3072, 16)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pfids[0][1,:] == torch.arange(3072, 4096, 16)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pfids[0][2,:] == torch.arange(2048, 3072, 16)) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pfids[0][3,:] == torch.arange(3072, 4096, 16)) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_chids[0] == torch.arange(1, 9) * 16) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_parids[0][0:4,:] == torch.tensor([15, 16]).reshape(1, 2) * 16) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_parids[0][4:8,:] == torch.tensor([17, 18]).reshape(1, 2) * 16) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_parpids[0][0:4,:] == torch.tensor([[5, 9], [6, 10], [7, 11], [8, 12]]) * 256) + assert 
torch.all(pc.inner_layer_groups[3][0].partitioned_parpids[0][4:8,:] == torch.tensor([[5, 9], [6, 10], [7, 11], [8, 12]]) * 256) + + assert torch.all(pc.inner_layer_groups[4][0].partitioned_nids[0] == torch.tensor([1, 2, 3, 4]) * 16) + assert torch.all(pc.inner_layer_groups[4][0].partitioned_cids[0] == torch.tensor([[15, 7], [16, 8], [17, 1], [18, 2]]) * 16) + + assert torch.all(pc.inner_layer_groups[4][0].partitioned_u_cids[0] == torch.tensor([1, 2, 7, 8, 15, 16, 17, 18]) * 16) + assert torch.all(pc.inner_layer_groups[4][0].partitioned_parids[0] == torch.tensor([3, 4, 1, 2, 1, 2, 3, 4]).reshape(8, 1) * 16) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_nids[0] == torch.arange(19, 21) * 16) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_cids[0] == torch.arange(16, 80).reshape(1, 64)) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_pids[0][0,:] == torch.arange(1280, 2304, 16)) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_pids[0][1,:] == torch.arange(2304, 3328, 16)) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_pfids[0][0,:] == torch.arange(4096, 5120, 16)) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_pfids[0][1,:] == torch.arange(5120, 6144, 16)) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_chids[0] == torch.arange(1, 5) * 16) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_parids[0] == torch.tensor([19, 20]).reshape(1, 2) * 16) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_parpids[0] == torch.tensor([[5, 9], [6, 10], [7, 11], [8, 12]]) * 256) + + pc.to(device) + + ## Forward tests ## + + data = torch.randint(0, 5, [16, 4]).to(device) + + lls = pc(data) + + node_mars = pc.node_mars.detach().cpu() + params = pc.params.detach().cpu() + + params0 = ns01.get_source_ns()._params.reshape(2, 2, 16, 16).permute(0, 2, 1, 3).reshape(32, 32) + + np01_lls = node_mars[16:48,:] + node_mars[48:80,:] + ns01_lls = torch.matmul(params0, np01_lls.exp()).log() + assert torch.all(torch.abs(node_mars[144:176,:] - ns01_lls) < 1e-3) + + np12_lls = node_mars[48:80,:] + node_mars[80:112,:] + ns12_lls = torch.matmul(params0, np12_lls.exp()).log() + assert torch.all(torch.abs(node_mars[176:208,:] - ns12_lls) < 1e-3) + + np23_lls = node_mars[80:112,:] + node_mars[112:144,:] + ns23_lls = torch.matmul(params0, np23_lls.exp()).log() + assert torch.all(torch.abs(node_mars[208:240,:] - ns23_lls) < 1e-3) + + params1 = ns0123.get_source_ns()._params.reshape(2, 4, 16, 16).permute(0, 2, 1, 3).reshape(32, 64) + + np012_0_lls = ns01_lls + node_mars[80:112,:] + np012_1_lls = ns12_lls + node_mars[16:48,:] + np012_lls = torch.cat((np012_0_lls, np012_1_lls), dim = 0) + ns012_lls = torch.matmul(params1, np012_lls.exp()).log() + assert torch.all(torch.abs(node_mars[240:272,:] - ns012_lls) < 1e-3) + + np123_0_lls = ns12_lls + node_mars[112:144,:] + np123_1_lls = ns23_lls + node_mars[48:80,:] + np123_lls = torch.cat((np123_0_lls, np123_1_lls), dim = 0) + ns123_lls = torch.matmul(params1, np123_lls.exp()).log() + assert torch.all(torch.abs(node_mars[272:304,:] - ns123_lls) < 1e-3) + + np0123_0_lls = ns012_lls + node_mars[112:144,:] + np0123_1_lls = ns123_lls + node_mars[16:48,:] + np0123_lls = torch.cat((np0123_0_lls, np0123_1_lls), dim = 0) + ns0123_lls = torch.matmul(params1, np0123_lls.exp()).log() + assert torch.all(torch.abs(node_mars[304:336,:] - ns0123_lls) < 1e-3) + + ## Backward tests ## + + pc.backward(data.permute(1, 0), allow_modify_flows = False) + + node_flows = pc.node_flows.detach().cpu().clone() + 
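# (The reference flows computed below follow the usual sum-node flow recursion: for a sum
#  node n with child element c and edge parameter theta[n, c],
#      flow(c)          += theta[n, c] * flow(n) * exp(mar(c) - mar(n))
#      param_flow(n, c) += theta[n, c] * flow(n) * exp(mar(c) - mar(n)), summed over the batch;
#  the matmul expressions used in the assertions are just the vectorized form of these updates.)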
param_flows = pc.param_flows.detach().cpu().clone() + + assert torch.all(torch.abs(node_flows[304:336,:] - 1.0) < 1e-4) + + pc.inner_layer_groups[4][0](pc.node_mars, pc.element_mars, _for_backward = True) + pc.inner_layer_groups[5][0].backward( + pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, + pc.params, pc.param_flows + ) + element_flows = pc.element_flows.detach().cpu() + + np0123_flows = torch.matmul(params1.permute(1, 0), 1.0 / ns0123_lls.exp()) * np0123_lls.exp() + assert torch.all(torch.abs(element_flows[16:80,:] - np0123_flows) < 4e-3) + + param_flows1 = torch.matmul(1.0 / ns0123_lls.exp(), np0123_lls.exp().permute(1, 0)) * params1 + + ns012_flows = element_flows[16:48,:] + assert torch.all(torch.abs(node_flows[240:272,:] - ns012_flows) < 4e-3) + + ns123_flows = element_flows[48:80,:] + assert torch.all(torch.abs(node_flows[272:304,:] - ns123_flows) < 4e-3) + + ni0_flows = element_flows[48:80,:].clone() + ni3_flows = element_flows[16:48,:].clone() + + pc.inner_layer_groups[2][0](pc.node_mars, pc.element_mars, _for_backward = True) + pc.inner_layer_groups[3][0].backward( + pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, + pc.params, pc.param_flows + ) + element_flows = pc.element_flows.detach().cpu() + + np012_flows = torch.matmul(params1.permute(1, 0), ns012_flows / ns012_lls.exp()) * np012_lls.exp() + assert torch.all(torch.abs(element_flows[16:80,:] - np012_flows) < 4e-3) + + param_flows1 += torch.matmul(ns012_flows / ns012_lls.exp(), np012_lls.exp().permute(1, 0)) * params1 + + np123_flows = torch.matmul(params1.permute(1, 0), ns123_flows / ns123_lls.exp()) * np123_lls.exp() + assert torch.all(torch.abs(element_flows[80:144,:] - np123_flows) < 4e-3) + + param_flows1 += torch.matmul(ns123_flows / ns123_lls.exp(), np123_lls.exp().permute(1, 0)) * params1 + + ns01_flows = np012_flows[0:32,:] + assert torch.all(torch.abs(node_flows[144:176,:] - ns01_flows) < 4e-3) + + ns12_flows = np012_flows[32:64,:] + np123_flows[0:32,:] + assert torch.all(torch.abs(node_flows[176:208,:] - ns12_flows) < 4e-3) + + ns23_flows = np123_flows[32:64,:] + assert torch.all(torch.abs(node_flows[208:240,:] - ns23_flows) < 1e-3) + + ni2_flows = np012_flows[0:32,:].clone() + ni0_flows += np012_flows[32:64,:].clone() + ni3_flows += np123_flows[0:32,:].clone() + ni1_flows = np123_flows[32:64,:].clone() + + pc.inner_layer_groups[0][0](pc.node_mars, pc.element_mars, _for_backward = True) + pc.inner_layer_groups[1][0].backward( + pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, + pc.params, pc.param_flows + ) + element_flows = pc.element_flows.detach().cpu() + + np01_flows = torch.matmul(params0.permute(1, 0), ns01_flows / ns01_lls.exp()) * np01_lls.exp() + assert torch.all(torch.abs(element_flows[16:48,:] - np01_flows) < 1e-2) + + param_flows0 = torch.matmul(ns01_flows / ns01_lls.exp(), np01_lls.exp().permute(1, 0)) * params0 + + np12_flows = torch.matmul(params0.permute(1, 0), ns12_flows / ns12_lls.exp()) * np12_lls.exp() + assert torch.all(torch.abs(element_flows[48:80,:] - np12_flows) < 1e-2) + + param_flows0 += torch.matmul(ns12_flows / ns12_lls.exp(), np12_lls.exp().permute(1, 0)) * params0 + + np23_flows = torch.matmul(params0.permute(1, 0), ns23_flows / ns23_lls.exp()) * np23_lls.exp() + assert torch.all(torch.abs(element_flows[80:112,:] - np23_flows) < 1e-2) + + param_flows0 += torch.matmul(ns23_flows / ns23_lls.exp(), np23_lls.exp().permute(1, 0)) * params0 + + ni0_flows += np01_flows.clone() + ni1_flows += np01_flows.clone() + np12_flows.clone() + 
ni2_flows += np12_flows.clone() + np23_flows.clone() + ni3_flows += np23_flows.clone() + + assert torch.all(torch.abs(node_flows[16:48,:] - ni0_flows) < 1e-2) + assert torch.all(torch.abs(node_flows[48:80,:] - ni1_flows) < 1e-2) + assert torch.all(torch.abs(node_flows[80:112,:] - ni2_flows) < 1e-2) + assert torch.all(torch.abs(node_flows[112:144,:] - ni3_flows) < 1e-2) + + ref_param_flows0 = (param_flows[0:1024] + param_flows[1024:2048]).reshape(2, 2, 16, 16).permute(0, 3, 1, 2).reshape(-1) + assert torch.all(torch.abs(param_flows0.reshape(-1) - ref_param_flows0) < 1e-2) + + ref_param_flows1 = (param_flows[2048:4096] + param_flows[4096:6144]).reshape(2, 4, 16, 16).permute(0, 3, 1, 2).reshape(-1) + assert torch.all(torch.abs(param_flows1.reshape(-1) - ref_param_flows1) < 1e-2) + + ## Parameter learning & flow aggregation tests ## + + temp_param_flows = param_flows.clone().to(device) + + compute_cum_par_flows(temp_param_flows, pc.parflow_fusing_kwargs) + + ref_param_flows0 = temp_param_flows[0:1024].reshape(2, 2, 16, 16).permute(0, 3, 1, 2).reshape(-1) + assert torch.all(torch.abs(param_flows0.reshape(-1) - ref_param_flows0.cpu()) < 1e-2) + + ref_param_flows1 = temp_param_flows[2048:4096].reshape(2, 4, 16, 16).permute(0, 3, 1, 2).reshape(-1) + assert torch.all(torch.abs(param_flows1.reshape(-1) - ref_param_flows1.cpu()) < 1e-2) + + em_par_update(pc.params, temp_param_flows, pc.par_update_kwargs, step_size = 1.0, pseudocount = 0.0) + + param_flows0 /= param_flows0.sum(dim = 1, keepdim = True) + ref_params0 = pc.params[256:1280].reshape(2, 2, 16, 16).permute(0, 3, 1, 2).reshape(-1) + assert torch.all(torch.abs(param_flows0.reshape(-1) - ref_params0.cpu()) < 1e-4) + + param_flows1 /= param_flows1.sum(dim = 1, keepdim = True) + ref_params1 = pc.params[1280:3328].reshape(2, 4, 16, 16).permute(0, 3, 1, 2).reshape(-1) + assert torch.all(torch.abs(param_flows1.reshape(-1) - ref_params1.cpu()) < 1e-4) + + +if __name__ == "__main__": + torch.manual_seed(2390) + simple_structure_test_group1() + simple_structure_test_group16() + From 10981b5b773f87c711502660b271d8f8400dfcdb Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 28 Dec 2023 04:17:28 +0800 Subject: [PATCH 113/162] remove unused file --- tests/structures/debug.py | 321 -------------------------------------- 1 file changed, 321 deletions(-) delete mode 100644 tests/structures/debug.py diff --git a/tests/structures/debug.py b/tests/structures/debug.py deleted file mode 100644 index d72ee3f3..00000000 --- a/tests/structures/debug.py +++ /dev/null @@ -1,321 +0,0 @@ -import numpy as np -import torch - -import triton -import triton.language as tl - - -@triton.jit -def ref_kernel(node_flows, element_flows, node_mars, element_mars, params, - chids, parids_start, parids_increment, parpids_start, parpids_increment, - local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, - BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): - - pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches - pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes - - # Get inferred node group id from `pid_m` - elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) - tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) - - # Get the real node group id in the case of partial evaluation - if partial_eval == 1: - elegroup_id = tl.load(local_ids + elegroup_id) - - # Initialize pointers to `params` - offs_ele = tl.arange(0, TILE_SIZE_M) + 
tile_id * TILE_SIZE_M - offs_edge = tl.arange(0, TILE_SIZE_K) - offs_edge_gid = offs_edge // GROUP_SIZE_K - offs_edge_nid = (offs_edge % GROUP_SIZE_K) - par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) - epars_ptr = params + \ - offs_ele[:,None] * GROUP_SIZE_K + \ - (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - - # Batch offsets and mask - offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B - mask_batch = offs_batch < batch_size - - # Initialize pointers to `node_mars` - edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) - nmars_ptr = node_mars + \ - (edge_start + offs_edge_nid)[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - nflows_ptr = node_flows + \ - (edge_start + offs_edge_nid)[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - - # Initialize pointers to `element_mars` - off_eleids = tl.load(chids + elegroup_id) - emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - # Batch increment pointers - parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid - parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid - - # Inner loop - acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - log_max = tl.zeros([BLOCK_B], dtype = tl.float32) - float("inf") - - for k in range(0, K_NUM_TILES): - epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - # log_n_fdm = tl.log(nflows) - nmars - # log_n_fdm_max = tl.max(log_n_fdm, axis = 0) - # n_fdm_sub = tl.exp(log_n_fdm - log_n_fdm_max[None,:]) - - # partial_flows = tl.dot(epars, n_fdm_sub) - - # acc = tl.where(log_max[None,:] > log_n_fdm_max[None,:], - # acc + tl.exp(log_n_fdm_max - log_max)[None,:] * partial_flows, - # partial_flows + tl.exp(log_max - log_n_fdm_max)[None,:] * acc) - # log_max = tl.maximum(log_max, log_n_fdm_max) - - eflows = tl.sum(epars[:,:,None] * tl.exp(emars[:,None,:] - nmars[None,:,:]) * nflows[None,:,:], axis = 1) - acc += eflows - - # Increment `epars_ptr` - parpids_inc = tl.load(parpids_inc_ptr) - epars_ptr += parpids_inc[None,:] - parpids_inc_ptr += ptr_inc_step - - # Increment `nmars_ptr` - parids_inc = tl.load(parids_inc_ptr) - nmars_ptr += parids_inc[:,None] * batch_size - nflows_ptr += parids_inc[:,None] * batch_size - parids_inc += ptr_inc_step - - # # Initialize pointers to `element_mars` - # off_eleids = tl.load(chids + elegroup_id) - # emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - # emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - # eflows = acc * tl.exp(emars + log_max[None,:]) - - # Write back - offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - tl.store(element_flows + offs_elemfs, acc, mask = mask_batch[None,:]) - - -@triton.jit -def my_kernel(aaa, bbb, ccc, node_flows, element_flows, node_mars, element_mars, params, - chids, parids_start, parids_increment, parpids_start, parpids_increment, - local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, - BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: 
tl.constexpr, GROUP_SIZE_K: tl.constexpr): - - pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches - pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes - - # Get inferred node group id from `pid_m` - elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) - tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) - - # Get the real node group id in the case of partial evaluation - if partial_eval == 1: - elegroup_id = tl.load(local_ids + elegroup_id) - - # Initialize pointers to `params` - offs_ele = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M - offs_edge = tl.arange(0, TILE_SIZE_K) - offs_edge_gid = offs_edge // GROUP_SIZE_K - offs_edge_nid = (offs_edge % GROUP_SIZE_K) - par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) - epars_ptr = params + \ - offs_ele[:,None] * GROUP_SIZE_K + \ - (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] - - # Batch offsets and mask - offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B - mask_batch = offs_batch < batch_size - - # Initialize pointers to `node_mars` - edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) - nmars_ptr = node_mars + \ - (edge_start + offs_edge_nid)[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - nflows_ptr = node_flows + \ - (edge_start + offs_edge_nid)[:,None] * batch_size + \ - offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] - - # Batch increment pointers - parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid - parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid - - # Inner loop - acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - float("inf") - - # for k in range(0, K_NUM_TILES): - for k in range(0, 1): - epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - log_n_fdm = tl.log(nflows) - nmars - log_n_fdm_max = tl.max(log_n_fdm, axis = 0) - n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) - - offs_aaa = pid_m * (TILE_SIZE_K * batch_size) + tl.arange(0, TILE_SIZE_K)[:,None] * batch_size + offs_batch[None,:] - tl.store(aaa + offs_aaa, n_fdm_sub, mask = mask_batch[None,:]) - - partial_flows = tl.dot(epars, n_fdm_sub) - # partial_flows = tl.sum(epars[:,:,None] * n_fdm_sub[None,:,:], axis = 1) - - offs_bbb = pid_m * (TILE_SIZE_M * batch_size) + tl.arange(0, TILE_SIZE_M)[:,None] * batch_size + offs_batch[None,:] - tl.store(bbb + offs_bbb, partial_flows, mask = mask_batch[None,:]) - - offs_ccc = pid_m * batch_size + offs_batch - tl.store(ccc + offs_ccc, log_n_fdm_max, mask = mask_batch) - - neginf_flag = (log_n_fdm_max[None,:] == -float("inf")) & (acc == -float("inf")) - acc = tl.where(log_n_fdm_max[None,:] > acc, - tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + log_n_fdm_max[None,:], - tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc - ) - acc = tl.where(neginf_flag, -float("inf"), acc) - - # Increment `epars_ptr` - parpids_inc = tl.load(parpids_inc_ptr) - epars_ptr += parpids_inc[None,:] - parpids_inc_ptr += ptr_inc_step - - # Increment `nmars_ptr` - parids_inc = tl.load(parids_inc_ptr) - nmars_ptr += parids_inc[:,None] * batch_size - nflows_ptr += parids_inc[:,None] * batch_size - parids_inc += ptr_inc_step - - # Initialize pointers to `element_mars` - off_eleids = tl.load(chids 
+ elegroup_id) - emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] - - eflows = tl.exp(acc + emars) - - # Write back - offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] - tl.store(element_flows + offs_elemfs, eflows, mask = mask_batch[None,:]) - - -def main(): - - device = torch.device("cuda:0") - - data = np.load("temp.npz") - - node_flows = torch.from_numpy(data["node_flows"]).to(device) - element_flows = torch.from_numpy(data["element_flows"]).to(device) - node_mars = torch.from_numpy(data["node_mars"]).to(device) - element_mars = torch.from_numpy(data["element_mars"]).to(device) - params = torch.from_numpy(data["params"]).to(device) - chids = torch.from_numpy(data["chids"]).to(device) - parids = torch.from_numpy(data["parids"]).to(device) - parids_start = torch.from_numpy(data["parids_start"]).to(device) - parids_increment = torch.from_numpy(data["parids_increment"]).to(device) - parpids = torch.from_numpy(data["parpids"]).to(device) - parpids_start = torch.from_numpy(data["parpids_start"]).to(device) - parpids_increment = torch.from_numpy(data["parpids_increment"]).to(device) - batch_size = int(data["batch_size"]) - ptr_inc_step = int(data["ptr_inc_step"]) - BLOCK_B = int(data["BLOCK_B"]) - TILE_SIZE_M = int(data["TILE_SIZE_M"]) - TILE_SIZE_K = int(data["TILE_SIZE_K"]) - K_NUM_TILES = int(data["K_NUM_TILES"]) - GROUP_SIZE_M = int(data["GROUP_SIZE_M"]) - GROUP_SIZE_K = int(data["GROUP_SIZE_K"]) - layer_n_nodes = int(data["layer_n_nodes"]) - - grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - - ref_kernel[grid]( - node_flows = node_flows, - element_flows = element_flows, - node_mars = node_mars, - element_mars = element_mars, - params = params, - chids = chids, - parids_start = parids_start, - parids_increment = parids_increment, - parpids_start = parpids_start, - parpids_increment = parpids_increment, - local_ids = None, - batch_size = batch_size, - partial_eval = 0, - ptr_inc_step = ptr_inc_step, - BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = K_NUM_TILES, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = GROUP_SIZE_M, - GROUP_SIZE_K = GROUP_SIZE_K - ) - - torch.cuda.synchronize() - - import pdb; pdb.set_trace() - - element_flows_ref = element_flows.clone() - - # aaa = torch.zeros([grid[1], TILE_SIZE_M, TILE_SIZE_K]).cuda() - # bbb = torch.zeros([grid[1], TILE_SIZE_M, TILE_SIZE_K], dtype = torch.long).cuda() - # ccc = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_K, batch_size]).cuda() - # ddd = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_K, batch_size]).cuda() - # eee = torch.zeros([grid[1], K_NUM_TILES, TILE_SIZE_M, batch_size]).cuda() - - aaa = torch.zeros([grid[1], TILE_SIZE_K, batch_size]).cuda() - bbb = torch.zeros([grid[1], TILE_SIZE_M, batch_size]).cuda() - ccc = torch.zeros([grid[1], batch_size]).cuda() - - my_kernel[grid]( - aaa = aaa, - bbb = bbb, - ccc = ccc, - node_flows = node_flows, - element_flows = element_flows, - node_mars = node_mars, - element_mars = element_mars, - params = params, - chids = chids, - parids_start = parids_start, - parids_increment = parids_increment, - parpids_start = parpids_start, - parpids_increment = parpids_increment, - local_ids = None, - batch_size = batch_size, - partial_eval = 0, - ptr_inc_step = ptr_inc_step, - BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = K_NUM_TILES, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = 
GROUP_SIZE_M, - GROUP_SIZE_K = GROUP_SIZE_K - ) - - # nflows = node_flows[parids[0,0]:parids[0,1],:] # ccc - # nmars = node_mars[parids[0,0]:parids[0,1],:] - # epars = params[bbb[0,:,:]] # aaa - # assert (epars - aaa[0,:,:]).abs().max() < 1e-4 - - # log_n_fdm = nflows.log() - nmars - # log_n_fdm_max = torch.max(log_n_fdm, dim = 0).values - # n_fdm_sub = torch.exp(log_n_fdm - log_n_fdm_max[None,:]) # ddd - # assert (n_fdm_sub[:,:BLOCK_B] - ddd[0,:,:BLOCK_B]).abs().max() < 1e-4 - - # partial_flows = torch.matmul(epars, n_fdm_sub) # eee - # # (partial_flows[:,:BLOCK_B].log() - eee[0,:,:BLOCK_B]).abs() - - # print((element_flows_ref[chids,:] - element_flows[chids,:]).abs().max()) - - element_flows_ref[chids,143] - element_flows[chids,143] - - import pdb; pdb.set_trace() - - -if __name__ == "__main__": - main() \ No newline at end of file From 3067da184b0372721b8118d99b48efd4fd8c40b6 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 28 Dec 2023 04:41:53 +0800 Subject: [PATCH 114/162] adjust `num_warps` --- src/pyjuice/layer/input_layer.py | 9 ++++++--- tests/model/parameter_tying_test.py | 1 - tests/structures/pd_test.py | 30 ++++++++++++++--------------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index adf97a06..63e21eaa 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -251,7 +251,8 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ nv_block_size = triton.next_power_of_2(self.num_vars_per_node), node_offset = node_offset, BLOCK_SIZE = 1024, - partial_eval = 1 if fw_local_ids is not None else 0 + partial_eval = 1 if fw_local_ids is not None else 0, + num_warps = 8 ) # Apply missing mask if required @@ -271,7 +272,8 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ node_offset = node_offset, BLOCK_SIZE = 1024, partial_eval = 1 if fw_local_ids is not None else 0, - mask_dim = mask_dim + mask_dim = mask_dim, + num_warps = 8 ) else: @@ -329,7 +331,8 @@ def backward(self, data: torch.Tensor, node_flows: torch.Tensor, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), node_offset = node_offset, BLOCK_SIZE = 1024, - partial_eval = 1 if bk_local_ids is not None else 0 + partial_eval = 1 if bk_local_ids is not None else 0, + num_warps = 8 ) else: diff --git a/tests/model/parameter_tying_test.py b/tests/model/parameter_tying_test.py index 6199e969..ee28d45f 100644 --- a/tests/model/parameter_tying_test.py +++ b/tests/model/parameter_tying_test.py @@ -612,4 +612,3 @@ def simple_structure_test_group16(): torch.manual_seed(2390) simple_structure_test_group1() simple_structure_test_group16() - diff --git a/tests/structures/pd_test.py b/tests/structures/pd_test.py index 60c56001..1ca1b9db 100644 --- a/tests/structures/pd_test.py +++ b/tests/structures/pd_test.py @@ -109,21 +109,21 @@ def pd_test(): # lls.mean().backward() # break - # from torch.profiler import profile, record_function, ProfilerActivity - # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: - # for i, batch in enumerate(train_loader): - # x = batch[0].to(device) - - # lls = pc(x, record_cudagraph = False) - # lls.mean().backward() - # if i > 10: - # break - - # prof.export_chrome_trace("trace3.json") - # # torch.autograd.profiler.tensorboard_trace_to_flame_graph('trace.json', 'flamegraph.svg') - # # prof.export_stacks("trace.txt", "cpu_time_total") - # import pdb; pdb.set_trace() - # exit() 
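# (The block enabled below is standard torch.profiler usage: it profiles the first ~10
#  training batches (forward + backward) with CPU and CUDA activities plus stack traces,
#  writes a Chrome trace to "trace3.json" that can be inspected in chrome://tracing or
#  Perfetto, and then drops into pdb and exits, so this code path appears intended only
#  for ad-hoc kernel-level performance debugging rather than normal training runs.)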
+ from torch.profiler import profile, record_function, ProfilerActivity + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: + for i, batch in enumerate(train_loader): + x = batch[0].to(device) + + lls = pc(x, record_cudagraph = False) + lls.mean().backward() + if i > 10: + break + + prof.export_chrome_trace("trace3.json") + # torch.autograd.profiler.tensorboard_trace_to_flame_graph('trace.json', 'flamegraph.svg') + # prof.export_stacks("trace.txt", "cpu_time_total") + import pdb; pdb.set_trace() + exit() mini_batch_em_epoch(350, pc, optimizer, None, train_loader, test_loader, device) full_batch_em_epoch(pc, train_loader, test_loader, device) From 9215fa77ea0a6fc7674ef908c8c720ae73b4739d Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 28 Dec 2023 16:24:45 +0800 Subject: [PATCH 115/162] restore broken examples --- examples/1_pc_training/train_mnist_hclt.py | 301 ---------- examples/2_hybrid_models/IDF/__init__.py | 0 examples/2_hybrid_models/IDF/backround.py | 148 ----- examples/2_hybrid_models/IDF/coupling.py | 145 ----- examples/2_hybrid_models/IDF/distributions.py | 209 ------- .../2_hybrid_models/IDF/generative_flows.py | 180 ------ examples/2_hybrid_models/IDF/idf.py | 171 ------ examples/2_hybrid_models/IDF/loss.py | 149 ----- examples/2_hybrid_models/IDF/networks.py | 193 ------ examples/2_hybrid_models/IDF/priors.py | 178 ------ examples/2_hybrid_models/IDF/utils.py | 34 -- examples/2_hybrid_models/VAE/model.py | 563 ------------------ examples/2_hybrid_models/VAE/modules.py | 277 --------- examples/2_hybrid_models/VAE/rand.py | 158 ----- .../2_hybrid_models/train_pc_flows_hybrid.py | 232 -------- .../2_hybrid_models/train_pc_vae_hybrid.py | 349 ----------- examples/train_mnist_hclt.py | 125 ++++ examples/train_mnist_pd.py | 110 ++++ tests/structures/pd_test.py | 31 +- 19 files changed, 251 insertions(+), 3302 deletions(-) delete mode 100644 examples/1_pc_training/train_mnist_hclt.py delete mode 100644 examples/2_hybrid_models/IDF/__init__.py delete mode 100644 examples/2_hybrid_models/IDF/backround.py delete mode 100644 examples/2_hybrid_models/IDF/coupling.py delete mode 100644 examples/2_hybrid_models/IDF/distributions.py delete mode 100644 examples/2_hybrid_models/IDF/generative_flows.py delete mode 100644 examples/2_hybrid_models/IDF/idf.py delete mode 100644 examples/2_hybrid_models/IDF/loss.py delete mode 100644 examples/2_hybrid_models/IDF/networks.py delete mode 100644 examples/2_hybrid_models/IDF/priors.py delete mode 100644 examples/2_hybrid_models/IDF/utils.py delete mode 100644 examples/2_hybrid_models/VAE/model.py delete mode 100644 examples/2_hybrid_models/VAE/modules.py delete mode 100644 examples/2_hybrid_models/VAE/rand.py delete mode 100644 examples/2_hybrid_models/train_pc_flows_hybrid.py delete mode 100644 examples/2_hybrid_models/train_pc_vae_hybrid.py create mode 100644 examples/train_mnist_hclt.py create mode 100644 examples/train_mnist_pd.py diff --git a/examples/1_pc_training/train_mnist_hclt.py b/examples/1_pc_training/train_mnist_hclt.py deleted file mode 100644 index 5073068a..00000000 --- a/examples/1_pc_training/train_mnist_hclt.py +++ /dev/null @@ -1,301 +0,0 @@ -import sys -import pyjuice as juice -import torch -import torch._dynamo as dynamo -import time -import torchvision -import numpy as np -import sys -import logging -import warnings -from torch.utils.data import TensorDataset, DataLoader -import argparse -from typing import Optional -from PIL import Image -from matplotlib import pyplot as plt - 
-warnings.filterwarnings("ignore") -logging.getLogger("torch._inductor.utils").setLevel(logging.ERROR) -logging.getLogger("torch._inductor.compile_fx").setLevel(logging.ERROR) -logging.getLogger("torch._inductor.lowering").setLevel(logging.ERROR) -logging.getLogger("torch._inductor.graph").setLevel(logging.ERROR) - -def process_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--batch_size', type=int, default=512, help='batch_size') - parser.add_argument('--cuda', type=int, default=0, help='cuda idx') - parser.add_argument('--num_latents', type=int, default=32, help='num_latents') - parser.add_argument("--mode", type=str, default="train", help="options: 'train', 'load'") - parser.add_argument("--dataset", type=str, default="mnist", help="mnist, fashion") - parser.add_argument("--input_circuit", type=str, default=None, help="load circuit from file instead of learning structure") - parser.add_argument("--output_dir", type=str, default="examples", help="output directory") - args = parser.parse_args() - return args - -def evaluate(pc: juice.ProbCircuit, loader: DataLoader, alphas: Optional[torch.Tensor]=None): - lls_total = 0.0 - for batch in loader: - x = batch[0].to(pc.device) - lls = pc(x, alphas=alphas) - lls_total += lls.mean().detach().cpu().numpy().item() - - lls_total /= len(loader) - return lls_total - -def evaluate_miss(pc: juice.ProbCircuit, loader: DataLoader, alphas: Optional[torch.Tensor]=None): - lls_total = 0.0 - for batch in loader: - x = batch[0].to(pc.device) - mask = batch[1].to(pc.device) - lls = pc(x, missing_mask=mask, alphas=alphas) - lls_total += lls.mean().detach().cpu().numpy().item() - - lls_total /= len(loader) - return lls_total - - -def mini_batch_em_epoch(num_epochs, pc, optimizer, scheduler, train_loader, test_loader, device): - for epoch in range(num_epochs): - t0 = time.time() - train_ll = 0.0 - for batch in train_loader: - x = batch[0].to(device) - - optimizer.zero_grad() - - lls = pc(x) - lls.mean().backward() - - train_ll += lls.mean().detach().cpu().numpy().item() - - optimizer.step() - scheduler.step() - - train_ll /= len(train_loader) - - t1 = time.time() - test_ll = evaluate(pc, loader=test_loader) - t2 = time.time() - - print(f"[Epoch {epoch}/{num_epochs}][train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") - - -def full_batch_em_epoch(pc, train_loader, test_loader, device): - with torch.no_grad(): - t0 = time.time() - train_ll = 0.0 - for batch in train_loader: - x = batch[0].to(device) - - lls = pc(x) - pc.backward(x, flows_memory = 1.0) - - train_ll += lls.mean().detach().cpu().numpy().item() - - pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1) - - train_ll /= len(train_loader) - - t1 = time.time() - test_ll = evaluate(pc, loader=test_loader) - t2 = time.time() - print(f"[train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") - -def load_circuit(filename, verbose=False, device=None): - t0 = time.time() - if verbose: - print(f"Loading circuit....{filename}.....", end="") - pc = juice.model.ProbCircuit.load(filename) - if device is not None: - print(f"...into device {device}...", end="") - pc.to(device) - t1 = time.time() - if verbose: - print(f"Took {t1-t0:.2f} (s)") - print("pc params size", pc.params.size()) - print("pc num nodes ", pc.num_nodes) - - return pc - -def save_circuit(pc, filename, verbose=False): - if verbose: - print(f"Saving pc into {filename}.....", end="") - t0_save = 
time.time() - torch.save(pc, filename) - t1_save = time.time() - if verbose: - print(f"took {t1_save - t0_save:.2f} (s)") - - -def main(args): - torch.cuda.set_device(args.cuda) - device = torch.device(f"cuda:{args.cuda}") - filename = f"{args.output_dir}/{args.dataset}_{args.num_latents}.torch" - - if args.dataset == "mnist": - train_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = True, download = True) - test_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = False, download = True) - elif args.dataset == "fashion": - train_dataset = torchvision.datasets.FashionMNIST(root = "./examples/data", train = True, download = True) - test_dataset = torchvision.datasets.FashionMNIST(root = "./examples/data", train = False, download = True) - else: - raise(f"Dataset {args.dataset} not supported.") - - train_data = train_dataset.data.reshape(60000, 28*28) - test_data = test_dataset.data.reshape(10000, 28*28) - - num_features = train_data.size(1) - - train_loader = DataLoader( - dataset = TensorDataset(train_data), - batch_size = args.batch_size, - shuffle = True, - drop_last = True - ) - test_loader = DataLoader( - dataset = TensorDataset(test_data), - batch_size = args.batch_size, - shuffle = False, - drop_last = True - ) - - if args.mode == "train": - print("===========================Train===============================") - if args.input_circuit is None: - pc = juice.structures.HCLT(train_data.float().to(device), num_bins = 32, - sigma = 0.5 / 32, - num_latents = args.num_latents, - chunk_size = 32) - pc.to(device) - else: - pc = load_circuit(args.input_circuit, verbose=True, device=device) - - optimizer = juice.optim.CircuitOptimizer(pc, lr = 0.1, pseudocount = 0.1) - scheduler = juice.optim.CircuitScheduler(optimizer, method = "multi_linear", - lrs = [0.9, 0.1, 0.05], - milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350]) - - mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) - full_batch_em_epoch(pc, train_loader, test_loader, device) - save_circuit(pc, filename, verbose=True) - - - elif args.mode == "load": - print("===========================LOAD===============================") - pc = load_circuit(filename, verbose=True, device=device) - - t_compile = time.time() - test_ll = evaluate(pc, loader=test_loader) # force compilation - - t0 = time.time() - train_ll = evaluate(pc, loader=train_loader) - t1 = time.time() - test_ll = evaluate(pc, loader=test_loader) - t2 = time.time() - - train_bpd = -train_ll / (num_features * np.log(2)) - test_bpd = -test_ll / (num_features * np.log(2)) - - print(f"Compilation+test took {t0-t_compile:.2f} (s); train_ll {t1-t0:.2f} (s); test_ll {t2-t1:.2f} (s)") - print(f"train_ll: {train_ll:.2f}, test_ll: {test_ll:.2f}") - print(f"train_bpd: {train_bpd:.2f}, test_bpd: {test_bpd:.2f}") - - elif args.mode == "miss": - print("===========================MISS===============================") - print(f"Loading {filename} into {device}.......") - pc = load_circuit(filename, verbose=True, device=device) - - # test_miss_mask = torch.zeros(test_data.size(), dtype=torch.bool) - # test_miss_mask[1:5000, 0:392] = 1 # for first half of images make first half missing - # test_miss_mask[5000:, 392:] = 1 # for second half of images make second half missing - test_miss_mask = torch.rand(test_data.size()) < 0.5 - - test_loader_miss = DataLoader( - dataset = TensorDataset(test_data, test_miss_mask), - batch_size = args.batch_size, - shuffle = False, - drop_last = True - ) - t_compile = 
time.time() - test_ll = evaluate(pc, loader=test_loader) - test_ll_miss = evaluate_miss(pc, loader=test_loader_miss) - - t0 = time.time() - train_ll = evaluate(pc, loader=train_loader) - t1 = time.time() - test_ll = evaluate(pc, loader=test_loader) - t2 = time.time() - test_ll_miss = evaluate_miss(pc, loader=test_loader_miss) - t3 = time.time() - - train_bpd = -train_ll / (num_features * np.log(2)) - test_bpd = -test_ll / (num_features * np.log(2)) - test_miss_bpd = -test_ll_miss / (num_features * np.log(2)) - - print(f"train_ll: {train_ll:.2f}, train_bpd: {train_bpd:.2f}; time = {t1-t0:.2f} (s)") - print(f"test_ll: {test_ll:.2f}, test_bpd: {test_bpd:.2f}; time = {t2-t1:.2f} (s)") - print(f"test_miss_ll: {test_ll_miss:.2f}, test_miss_bpd: {test_miss_bpd:.2f}; time = {t3-t2:.2f} (s)") - elif args.mode == "alphas": - print("===========================ALPHAS===============================") - pc = load_circuit(filename, verbose=True, device=device) - - alphas = 0.99 * torch.ones((args.batch_size, 28*28), device=device) - test_ll = evaluate(pc, loader=test_loader) - train_ll = evaluate(pc, loader=train_loader) - - t0 = time.time() - train_ll_alpha = evaluate(pc, loader=train_loader, alphas=alphas) - t1 = time.time() - test_ll_alpha = evaluate(pc, loader=test_loader, alphas=alphas) - t2 = time.time() - - print(f"train_ll: {train_ll:.2f}, test_ll: {test_ll:.2f}") - print(f"train_ll_alpha: {train_ll_alpha:.2f}, test_ll_alpha: {test_ll_alpha:.2f}") - print(f"train {t1-t0:.2f} (s); test {t2-t1:.2f} (s)") - - elif args.mode == "sample": - print("===========================SAMPLE===============================") - pc = load_circuit(filename, verbose=True, device=device) - pc.to(device) - - t0_sample = time.time() - - for batch_id, batch in enumerate(train_loader): - x = batch[0].to(device) # (B, num_vars) - miss_mask = torch.zeros(x.size(), dtype=torch.bool, device=device) # (B, num_vars) - - # Left Side of Pixels Missing - for row in range(28): - miss_mask[:, row*28:row*28+14] = 1 - - # 1. Run Forward Pass - lls = pc(x, missing_mask=miss_mask) - - # 2. Sample (for each item in batch returns a sample from p(. 
| x^o)) - samples = pc.sample(x, miss_mask) # (B, num_vars) - - if batch_id < 2: - # Plot first 8 Samples - plot_count = 8 - print("Saving Samples as images to file") - plt.figure() - f, axarr = plt.subplots(3, plot_count, figsize=(28, 10)) - plt.gray() - for i in range(plot_count): - axarr[0][i].imshow(x[i, :].reshape(28,28).cpu().numpy().astype(np.uint8)) - axarr[1][i].imshow(255*miss_mask[i, :].reshape(28,28).cpu().numpy().astype(np.uint8)) - axarr[2][i].imshow(samples[i, :].reshape(28,28).cpu().numpy().astype(np.uint8)) - plt.savefig(f"examples/1_pc_training/samples{batch_id}_test.png") - t1_sample = time.time() - print(f"Samples took {t1_sample - t0_sample:.2f} (s)") - - - print(f"Memory allocated: {torch.cuda.memory_allocated(device) / 1024 / 1024 / 1024:.1f}GB") - print(f"Memory reserved: {torch.cuda.memory_reserved(device) / 1024 / 1024 / 1024:.1f}GB") - print(f"Max memory reserved: {torch.cuda.max_memory_reserved(device) / 1024 / 1024 / 1024:.1f}GB") - - -if __name__ == "__main__": - args = process_args() - print(args) - main(args) \ No newline at end of file diff --git a/examples/2_hybrid_models/IDF/__init__.py b/examples/2_hybrid_models/IDF/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/2_hybrid_models/IDF/backround.py b/examples/2_hybrid_models/IDF/backround.py deleted file mode 100644 index 1a6a9ddc..00000000 --- a/examples/2_hybrid_models/IDF/backround.py +++ /dev/null @@ -1,148 +0,0 @@ -import torch -import torch.nn.functional as F -import numpy as np - -from .utils import Base - - -class RoundStraightThrough(torch.autograd.Function): - - def __init__(self): - super().__init__() - - @staticmethod - def forward(ctx, input): - rounded = torch.round(input, out=None) - return rounded - - @staticmethod - def backward(ctx, grad_output): - grad_input = grad_output.clone() - return grad_input - - -_round_straightthrough = RoundStraightThrough().apply - - -def _stacked_sigmoid(x, temperature, n_approx=3): - - x_ = x - 0.5 - rounded = torch.round(x_) - x_remainder = x_ - rounded - - size = x_.size() - x_remainder = x_remainder.view(size + (1,)) - - translation = torch.arange(n_approx) - n_approx // 2 - translation = translation.to(device=x.device, dtype=x.dtype) - translation = translation.view([1] * len(size) + [len(translation)]) - out = torch.sigmoid((x_remainder - translation) / temperature).sum(dim=-1) - - return out + rounded - (n_approx // 2) - - -class SmoothRound(Base): - def __init__(self): - self._temperature = None - self._n_approx = None - super().__init__() - self.hard_round = None - - @property - def temperature(self): - return self._temperature - - @temperature.setter - def temperature(self, value): - self._temperature = value - - if self._temperature <= 0.05: - self._n_approx = 1 - elif 0.05 < self._temperature < 0.13: - self._n_approx = 3 - else: - self._n_approx = 5 - - def forward(self, x): - assert self._temperature is not None - assert self._n_approx is not None - assert self.hard_round is not None - - if self.temperature <= 0.25: - h = _stacked_sigmoid(x, self.temperature, n_approx=self._n_approx) - else: - h = x - - if self.hard_round: - h = _round_straightthrough(h) - - return h - - -class StochasticRound(Base): - def __init__(self): - super().__init__() - self.hard_round = None - - def forward(self, x): - u = torch.rand_like(x) - - h = x + u - 0.5 - - if self.hard_round: - h = _round_straightthrough(h) - - return h - - -class BackRound(Base): - - def __init__(self, args, inverse_bin_width): - """ - BackRound is an 
approximation to Round that allows for Backpropagation. - Approximate the round function using a sum of translated sigmoids. - The temperature determines how well the round function is approximated, - i.e., a lower temperature corresponds to a better approximation, at - the cost of more vanishing gradients. - BackRound supports the following settings: - * By setting hard to True and temperature > 0.25, BackRound - reduces to a round function with a straight through gradient - estimator - * When using 0 < temperature <= 0.25 and hard = True, the - output in the forward pass is equivalent to a round function, but the - gradient is approximated by the gradient of a sum of sigmoids. - * When using hard = False, the output is not constrained to integers. - * When temperature > 0.25 and hard = False, BackRound reduces to - the identity function. - Arguments - --------- - temperature: float - Temperature used for stacked sigmoid approximated. If temperature - is greater than 0.25, the approximation reduces to the indentiy - function. - hard: bool - If hard is True, a (hard) round is applied before returning. The - gradient for this is approximated using the straight-through - estimator. - """ - super().__init__() - self.inverse_bin_width = inverse_bin_width - self.round_approx = args.round_approx - - if args.round_approx == 'smooth': - self.round = SmoothRound() - elif args.round_approx == 'stochastic': - self.round = StochasticRound() - else: - raise ValueError - - def forward(self, x): - if self.round_approx == 'smooth' or self.round_approx == 'stochastic': - h = x * self.inverse_bin_width - - h = self.round(h) - - return h / self.inverse_bin_width - - else: - raise ValueError \ No newline at end of file diff --git a/examples/2_hybrid_models/IDF/coupling.py b/examples/2_hybrid_models/IDF/coupling.py deleted file mode 100644 index fbb24d91..00000000 --- a/examples/2_hybrid_models/IDF/coupling.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -Collection of flow strategies -""" - -from __future__ import print_function - -import torch -import numpy as np - -from .utils import Base -from .backround import BackRound -from .networks import NN - - -UNIT_TESTING = False - - -class SplitFactorCoupling(Base): - def __init__(self, c_in, factor, height, width, args): - super().__init__() - self.n_channels = args.n_channels - self.kernel = 3 - self.input_channel = c_in - self.round_approx = args.round_approx - - if args.variable_type == 'discrete': - self.round = BackRound( - args, inverse_bin_width=2**args.n_bits) - else: - self.round = None - - self.split_idx = c_in - (c_in // factor) - - self.nn = NN( - args=args, - c_in=self.split_idx, - c_out=c_in - self.split_idx, - height=height, - width=width, - kernel=self.kernel, - nn_type=args.coupling_type) - - def forward(self, z, ldj, reverse=False): - z1 = z[:, :self.split_idx, :, :] - z2 = z[:, self.split_idx:, :, :] - - t = self.nn(z1) - - if self.round is not None: - # print("before rounding", t[0, :, 0, 0]) - t = self.round(t) - # print("after rounding", t[0, :, 0, 0]) - - if not reverse: - z2 = z2 + t - else: - z2 = z2 - t - - z = torch.cat([z1, z2], dim=1) - - return z, ldj - - -class Coupling(Base): - def __init__(self, c_in, height, width, args): - super().__init__() - - if args.split_quarter: - factor = 4 - elif args.splitfactor > 1: - factor = args.splitfactor - else: - factor = 2 - - self.coupling = SplitFactorCoupling( - c_in, factor, height, width, args=args) - - def forward(self, z, ldj, reverse=False): - return self.coupling(z, ldj, reverse) - - -def 
test_generative_flow(): - import models.networks as networks - global UNIT_TESTING - - networks.UNIT_TESTING = True - UNIT_TESTING = True - - batch_size = 17 - - input_size = [12, 16, 16] - - class Args(): - def __init__(self): - self.input_size = input_size - self.learn_split = False - self.variable_type = 'continuous' - self.distribution_type = 'logistic' - self.round_approx = 'smooth' - self.coupling_type = 'shallow' - self.conv_type = 'standard' - self.densenet_depth = 8 - self.bottleneck = False - self.n_channels = 512 - self.network1x1 = 'standard' - self.auxilary_freq = -1 - self.actnorm = False - self.LU = False - self.coupling_lifting_L = True - self.splitprior = True - self.split_quarter = True - self.n_levels = 2 - self.n_flows = 2 - self.cond_L = True - self.n_bits = True - - args = Args() - - x = (torch.randint(256, size=[batch_size] + input_size).float() - 128.) / 256. - ldj = torch.zeros_like(x[:, 0, 0, 0]) - - model = Coupling(c_in=12, height=16, width=16, args=args) - - print(model) - - model.set_temperature(1.) - model.enable_hard_round() - - model.eval() - - z, ldj = model(x, ldj, reverse=False) - - # Check if gradient computation works - loss = torch.sum(z**2) - loss.backward() - - recon, ldj = model(z, ldj, reverse=True) - - sse = torch.sum(torch.pow(x - recon, 2)).item() - ae = torch.abs(x - recon).sum() - print('Error in recon: sse {} ae {}'.format(sse / np.prod(input_size), ae)) - - -if __name__ == '__main__': - test_generative_flow() \ No newline at end of file diff --git a/examples/2_hybrid_models/IDF/distributions.py b/examples/2_hybrid_models/IDF/distributions.py deleted file mode 100644 index 54e864a2..00000000 --- a/examples/2_hybrid_models/IDF/distributions.py +++ /dev/null @@ -1,209 +0,0 @@ -from __future__ import print_function -import torch -import torch.utils.data -import torch.nn.functional as F - -import numpy as np -import math - -MIN_EPSILON = 1e-5 -MAX_EPSILON = 1.-1e-5 - - -PI = math.pi - - -def log_min_exp(a, b, epsilon=1e-8): - """ - Computes the log of exp(a) - exp(b) in a (more) numerically stable fashion. - Using: - log(exp(a) - exp(b)) - c + log(exp(a-c) - exp(b-c)) - a + log(1 - exp(b-a)) - And note that we assume b < a always. 
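The identity in this docstring is easy to check numerically; a minimal standalone sketch comparing the naive evaluation with the stabilized form (values chosen only for illustration):

import torch

def log_min_exp(a, b, epsilon=1e-8):
    # log(exp(a) - exp(b)) = a + log(1 - exp(b - a)), assuming b < a.
    return a + torch.log(1 - torch.exp(b - a) + epsilon)

a = torch.tensor([2.0, 100.0])
b = torch.tensor([1.0, 99.0])
naive = torch.log(torch.exp(a) - torch.exp(b))   # exp(100) overflows float32, giving inf - inf = nan
stable = log_min_exp(a, b)
print(naive)    # ~[1.5413, nan]
print(stable)   # ~[1.5413, 99.5413] -- finite even when exp() overflows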
- """ - y = a + torch.log(1 - torch.exp(b - a) + epsilon) - - return y - - -def log_normal(x, mean, logvar): - logp = -0.5 * logvar - logp += -0.5 * np.log(2 * PI) - logp += -0.5 * (x - mean) * (x - mean) / torch.exp(logvar) - return logp - - -def log_mixture_normal(x, mean, logvar, pi): - x = x.view(x.size(0), x.size(1), x.size(2), x.size(3), 1) - - logp_mixtures = log_normal(x, mean, logvar) - - logp = torch.log(torch.sum(pi * torch.exp(logp_mixtures), dim=-1) + 1e-8) - - return logp - - -def sample_normal(mean, logvar): - y = torch.randn_like(mean) - - x = torch.exp(0.5 * logvar) * y + mean - - return x - - -def sample_mixture_normal(mean, logvar, pi): - b, c, h, w, n_mixtures = tuple(map(int, pi.size())) - pi = pi.view(b * c * h * w, n_mixtures) - sampled_pi = torch.multinomial(pi, num_samples=1).view(-1) - - # Select mixture params - mean = mean.view(b * c * h * w, n_mixtures) - mean = mean[torch.arange(b*c*h*w), sampled_pi].view(b, c, h, w) - logvar = logvar.view(b * c * h * w, n_mixtures) - logvar = logvar[torch.arange(b*c*h*w), sampled_pi].view(b, c, h, w) - - y = sample_normal(mean, logvar) - - return y - - -def log_logistic(x, mean, logscale): - """ - pdf = sigma([x - mean] / scale) * [1 - sigma(...)] * 1/scale - """ - scale = torch.exp(logscale) - - u = (x - mean) / scale - - logp = F.logsigmoid(u) + F.logsigmoid(-u) - logscale - - return logp - - -def sample_logistic(mean, logscale): - y = torch.rand_like(mean) - - x = torch.exp(logscale) * torch.log(y / (1 - y)) + mean - - return x - - -def log_discretized_logistic(x, mean, logscale, inverse_bin_width): - scale = torch.exp(logscale) - - logp = log_min_exp( - F.logsigmoid((x + 0.5 / inverse_bin_width - mean) / scale), - F.logsigmoid((x - 0.5 / inverse_bin_width - mean) / scale)) - - return logp - - -def discretized_logistic_cdf(x, mean, logscale, inverse_bin_width): - scale = torch.exp(logscale) - - cdf = torch.sigmoid((x + 0.5 / inverse_bin_width - mean) / scale) - - return cdf - - -def sample_discretized_logistic(mean, logscale, inverse_bin_width): - x = sample_logistic(mean, logscale) - - x = torch.round(x * inverse_bin_width) / inverse_bin_width - return x - - -def normal_cdf(value, loc, std): - return 0.5 * (1 + torch.erf((value - loc) * std.reciprocal() / math.sqrt(2))) - - -def log_discretized_normal(x, mean, logvar, inverse_bin_width): - std = torch.exp(0.5 * logvar) - log_p = torch.log(normal_cdf(x + 0.5 / inverse_bin_width, mean, std) - normal_cdf(x - 0.5 / inverse_bin_width, mean, std) + 1e-7) - - return log_p - - -def log_mixture_discretized_normal(x, mean, logvar, pi, inverse_bin_width): - std = torch.exp(0.5 * logvar) - - x = x.view(x.size(0), x.size(1), x.size(2), x.size(3), 1) - - p = normal_cdf(x + 0.5 / inverse_bin_width, mean, std) - normal_cdf(x - 0.5 / inverse_bin_width, mean, std) - - p = torch.sum(p * pi, dim=-1) - - logp = torch.log(p + 1e-8) - - return logp - - -def sample_discretized_normal(mean, logvar, inverse_bin_width): - y = torch.randn_like(mean) - - x = torch.exp(0.5 * logvar) * y + mean - - x = torch.round(x * inverse_bin_width) / inverse_bin_width - - return x - - -def log_mixture_discretized_logistic(x, mean, logscale, pi, inverse_bin_width): - scale = torch.exp(logscale) - - x = x.view(x.size(0), x.size(1), x.size(2), x.size(3), 1) - - p = torch.sigmoid((x + 0.5 / inverse_bin_width - mean) / scale) \ - - torch.sigmoid((x - 0.5 / inverse_bin_width - mean) / scale) - - p = torch.sum(p * pi, dim=-1) - - logp = torch.log(p + 1e-8) - - return logp - - -def mixture_discretized_logistic_cdf(x, 
mean, logscale, pi, inverse_bin_width): - scale = torch.exp(logscale) - - x = x[..., None] - - cdfs = torch.sigmoid((x + 0.5 / inverse_bin_width - mean) / scale) - - cdf = torch.sum(cdfs * pi, dim=-1) - - return cdf - - -def sample_mixture_discretized_logistic(mean, logs, pi, inverse_bin_width): - # Sample mixtures - b, c, h, w, n_mixtures = tuple(map(int, pi.size())) - pi = pi.view(b * c * h * w, n_mixtures) - sampled_pi = torch.multinomial(pi, num_samples=1).view(-1) - - # Select mixture params - mean = mean.view(b * c * h * w, n_mixtures) - mean = mean[torch.arange(b*c*h*w), sampled_pi].view(b, c, h, w) - logs = logs.view(b * c * h * w, n_mixtures) - logs = logs[torch.arange(b*c*h*w), sampled_pi].view(b, c, h, w) - - y = torch.rand_like(mean) - x = torch.exp(logs) * torch.log(y / (1 - y)) + mean - - x = torch.round(x * inverse_bin_width) / inverse_bin_width - - return x - - -def log_multinomial(logits, targets): - return -F.cross_entropy(logits, targets, reduction='none') - - -def sample_multinomial(logits): - b, n_categories, c, h, w = logits.size() - logits = logits.permute(0, 2, 3, 4, 1) - p = F.softmax(logits, dim=-1) - p = p.view(b * c * h * w, n_categories) - x = torch.multinomial(p, num_samples=1).view(b, c, h, w) - return x \ No newline at end of file diff --git a/examples/2_hybrid_models/IDF/generative_flows.py b/examples/2_hybrid_models/IDF/generative_flows.py deleted file mode 100644 index c5dcbd48..00000000 --- a/examples/2_hybrid_models/IDF/generative_flows.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Collection of flow strategies -""" - -from __future__ import print_function - -import torch -import numpy as np - -from .utils import Base -from .priors import SplitPrior -from .coupling import Coupling - - -UNIT_TESTING = False - - -def space_to_depth(x): - xs = x.size() - # Pick off every second element - x = x.view(xs[0], xs[1], xs[2] // 2, 2, xs[3] // 2, 2) - # Transpose picked elements next to channels. - x = x.permute((0, 1, 3, 5, 2, 4)).contiguous() - # Combine with channels. - x = x.view(xs[0], xs[1] * 4, xs[2] // 2, xs[3] // 2) - return x - - -def depth_to_space(x): - xs = x.size() - # Pick off elements from channels - x = x.view(xs[0], xs[1] // 4, 2, 2, xs[2], xs[3]) - # Transpose picked elements next to HW dimensions. - x = x.permute((0, 1, 4, 2, 5, 3)).contiguous() - # Combine with HW dimensions. 
- x = x.view(xs[0], xs[1] // 4, xs[2] * 2, xs[3] * 2) - return x - - -def int_shape(x): - return list(map(int, x.size())) - - -class Flatten(Base): - def forward(self, x): - return x.view(x.size(0), -1) - - -class Reshape(Base): - def __init__(self, shape): - super().__init__() - self.shape = shape - - def forward(self, x): - return x.view(x.size(0), *self.shape) - - -class Reverse(Base): - def __init__(self): - super().__init__() - - def forward(self, z, reverse=False): - flip_idx = torch.arange(z.size(1) - 1, -1, -1).long() - z = z[:, flip_idx, :, :] - return z - - -class Permute(Base): - def __init__(self, n_channels): - super().__init__() - - permutation = np.arange(n_channels, dtype='int') - np.random.shuffle(permutation) - - permutation_inv = np.zeros(n_channels, dtype='int') - permutation_inv[permutation] = np.arange(n_channels, dtype='int') - - self.permutation = torch.from_numpy(permutation) - self.permutation_inv = torch.from_numpy(permutation_inv) - - def forward(self, z, ldj, reverse=False): - if not reverse: - z = z[:, self.permutation, :, :] - else: - z = z[:, self.permutation_inv, :, :] - - return z, ldj - - def InversePermute(self): - inv_permute = Permute(len(self.permutation)) - inv_permute.permutation = self.permutation_inv - inv_permute.permutation_inv = self.permutation - return inv_permute - - -class Squeeze(Base): - def __init__(self): - super().__init__() - - def forward(self, z, ldj, reverse=False): - if not reverse: - z = space_to_depth(z) - else: - z = depth_to_space(z) - return z, ldj - - -class GenerativeFlow(Base): - def __init__(self, n_channels, height, width, args): - super().__init__() - layers = [] - layers.append(Squeeze()) - n_channels *= 4 - height //= 2 - width //= 2 - - for level in range(args.n_levels): - - for i in range(args.n_flows): - perm_layer = Permute(n_channels) - layers.append(perm_layer) - - layers.append( - Coupling(n_channels, height, width, args)) - - ## IDF++ ## - inv_perm_layer = perm_layer.InversePermute() - layers.append(inv_perm_layer) - - if level < args.n_levels - 1: - if args.splitprior_type != 'none': - # Standard splitprior - factor_out = n_channels // 2 - layers.append(SplitPrior(n_channels, factor_out, height, width, args)) - n_channels = n_channels - factor_out - - layers.append(Squeeze()) - n_channels *= 4 - height //= 2 - width //= 2 - - self.layers = torch.nn.ModuleList(layers) - self.z_size = (n_channels, height, width) - - def forward(self, z, ldj, pys=(), ys=(), reverse=False): - if not reverse: - for l, layer in enumerate(self.layers): - if isinstance(layer, (SplitPrior)): - py, y, z, ldj = layer(z, ldj) - pys += (py,) - ys += (y,) - - else: - z, ldj = layer(z, ldj) - - else: - for l, layer in reversed(list(enumerate(self.layers))): - if isinstance(layer, (SplitPrior)): - if len(ys) > 0: - z, ldj = layer.inverse(z, ldj, y=ys[-1]) - # Pop last element - ys = ys[:-1] - else: - z, ldj = layer.inverse(z, ldj, y=None) - - else: - z, ldj = layer(z, ldj, reverse=True) - - return z, ldj, pys, ys - - def decode(self, z, ldj, state, decode_fn): - - for l, layer in reversed(list(enumerate(self.layers))): - if isinstance(layer, SplitPrior): - z, ldj, state = layer.decode(z, ldj, state, decode_fn) - - else: - z, ldj = layer(z, ldj, reverse=True) - - return z, ldj \ No newline at end of file diff --git a/examples/2_hybrid_models/IDF/idf.py b/examples/2_hybrid_models/IDF/idf.py deleted file mode 100644 index d95c780f..00000000 --- a/examples/2_hybrid_models/IDF/idf.py +++ /dev/null @@ -1,171 +0,0 @@ -import torch -import numpy as 
np - -from .generative_flows import GenerativeFlow -from .utils import Base -from .priors import Prior -from .loss import compute_loss_array - - -class Normalize(Base): - def __init__(self, args): - super().__init__() - self.n_bits = args.n_bits - self.variable_type = args.variable_type - self.input_size = args.input_size - - def forward(self, x, ldj, reverse=False): - domain = 2.**self.n_bits - - if self.variable_type == 'discrete': - # Discrete variables will be measured on intervals sized 1/domain. - # Hence, there is no need to change the log Jacobian determinant. - dldj = 0 - elif self.variable_type == 'continuous': - dldj = -np.log(domain) * np.prod(self.input_size) - else: - raise ValueError - - if not reverse: - x = (x - domain / 2) / domain - ldj += dldj - else: - x = x * domain + domain / 2 - ldj -= dldj - - return x, ldj - - -class Model(Base): - """ - The base VAE class containing gated convolutional encoder and decoder - architecture. Can be used as a base class for VAE's with normalizing flows. - """ - - def __init__(self, args): - super().__init__() - self.args = args - self.variable_type = args.variable_type - self.distribution_type = args.distribution_type - - n_channels, height, width = args.input_size - - self.normalize = Normalize(args) - - self.flow = GenerativeFlow( - n_channels, height, width, args) - - self.n_bits = args.n_bits - - self.z_size = self.flow.z_size - - self.prior = Prior(self.z_size, args) - - def dequantize(self, x): - if self.training: - x = x + torch.rand_like(x) - else: - # Required for stability. - alpha = 1e-3 - x = x + alpha + torch.rand_like(x) * (1 - 2 * alpha) - - return x - - def loss(self, pz, z, pys, ys, ldj): - batchsize = z.size(0) - loss, bpd, bpd_per_prior = \ - compute_loss_array(pz, z, pys, ys, ldj, self.args) - - for module in self.modules(): - if hasattr(module, 'auxillary_loss'): - loss += module.auxillary_loss() / batchsize - - return loss, bpd, bpd_per_prior - - def forward(self, x, debug = False, forward_only = False): - """ - Evaluates the model as a whole, encodes and decodes. Note that the log - det jacobian is zero for a plain VAE (without flows), and z_0 = z_k. - """ - if forward_only: - return self.forward_only(x) - - # Decode z to x. 
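A self-contained sketch of the dequantize-then-normalize preprocessing defined above, assuming 8-bit inputs (domain = 256); for continuous variables the rescaling contributes a constant -log(domain) per dimension to the log-det-Jacobian:

import numpy as np
import torch

def dequantize(x, training=True, alpha=1e-3):
    # Add uniform noise so integer pixel values fill their unit-width bins.
    if training:
        return x + torch.rand_like(x)
    return x + alpha + torch.rand_like(x) * (1 - 2 * alpha)

domain = 2.0 ** 8                                 # 8-bit images
x = torch.randint(0, 256, (4, 1, 28, 28)).float()
x = dequantize(x, training=True)
x = (x - domain / 2) / domain                     # roughly in [-0.5, 0.5]
ldj = -np.log(domain) * np.prod(x.shape[1:])      # per-sample log|det| of the rescaling
print(x.min().item(), x.max().item(), ldj)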
- - assert x.dtype == torch.uint8 - - x = x.float() - # print(x.size()) - if debug: - print("fp input x: ", x[0, :, 0, 0]) - - ldj = torch.zeros_like(x[:, 0, 0, 0]) - if self.variable_type == 'continuous': - x = self.dequantize(x) - elif self.variable_type == 'discrete': - pass - else: - raise ValueError - - x, ldj = self.normalize(x, ldj) - if debug: - print("after normalization: ", x[0, :, 0, 0]) - # print("after normalization: ", x[0, :, 0, 0] * 256) - - z, ldj, pys, ys = self.flow(x, ldj, pys=(), ys=()) - if debug: - print("after flow: ", x[0, :, 0, 0]) - print("pys", len(pys), "ys", len(ys)) - - pz, z, ldj = self.prior(z, ldj) - - # print("000000") - # print(z.size(), len(ys), ys[0].size(), ys[1].size()) - # print(z[0, 0, :, :] * 256) - loss, bpd, bpd_per_prior = self.loss(pz, z, pys, ys, ldj) - - return loss, bpd, bpd_per_prior, pz, z, pys, ys, ldj - - def forward_only(self, x): - assert x.dtype == torch.uint8 - - x = x.float() - ldj = torch.zeros_like(x[:, 0, 0, 0]) - if self.variable_type == 'continuous': - x = self.dequantize(x) - elif self.variable_type == 'discrete': - pass - else: - raise ValueError - - x, ldj = self.normalize(x, ldj) - z, ldj, pys, ys = self.flow(x, ldj, pys=(), ys=()) - pz, z, ldj = self.prior(z, ldj) - - return pz, z, pys, ys, ldj - - def inverse(self, z, ys): - ldj = torch.zeros_like(z[:, 0, 0, 0]) - x, ldj, pys, py = \ - self.flow(z, ldj, pys=[], ys=ys, reverse=True) - - x, ldj = self.normalize(x, ldj, reverse=True) - - x_uint8 = torch.clamp(x, min=0, max=255).to( - torch.uint8) - - return x_uint8 - - def sample(self, n): - z_sample = self.prior.sample(n) - - ldj = torch.zeros_like(z_sample[:, 0, 0, 0]) - x_sample, ldj, pys, py = \ - self.flow(z_sample, ldj, pys=[], ys=[], reverse=True) - - x_sample, ldj = self.normalize(x_sample, ldj, reverse=True) - - x_sample_uint8 = torch.clamp(x_sample, min=0, max=255).to( - torch.uint8) - - return x_sample_uint8 \ No newline at end of file diff --git a/examples/2_hybrid_models/IDF/loss.py b/examples/2_hybrid_models/IDF/loss.py deleted file mode 100644 index 45b2d6ab..00000000 --- a/examples/2_hybrid_models/IDF/loss.py +++ /dev/null @@ -1,149 +0,0 @@ -from __future__ import print_function - -import numpy as np -import torch - -from .distributions import log_discretized_logistic, \ - log_mixture_discretized_logistic, log_normal, log_discretized_normal, \ - log_logistic, log_mixture_normal -from .backround import _round_straightthrough - - -def compute_log_ps(pxs, xs, args): - # Add likelihoods of intermediate representations. 
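A simplified restatement of the discretized logistic likelihood these loss terms rely on: each observation's probability is the CDF difference across its bin, so the per-bin masses of one 8-bit pixel should sum to roughly one (the parameter values below are made up):

import torch

def discretized_logistic_prob(x, mean, logscale, inverse_bin_width):
    # P(bin containing x) = CDF(x + half bin width) - CDF(x - half bin width).
    scale = torch.exp(logscale)
    return (torch.sigmoid((x + 0.5 / inverse_bin_width - mean) / scale)
            - torch.sigmoid((x - 0.5 / inverse_bin_width - mean) / scale))

x = torch.arange(256).float() / 256.0             # all attainable values of one pixel
p = discretized_logistic_prob(x, mean=torch.tensor(0.5), logscale=torch.tensor(-3.0), inverse_bin_width=256)
print(p.sum().item())                             # ~1.0: the bins tile the support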
- inverse_bin_width = 2.**args.n_bits - - log_pxs = [] - for px, x in zip(pxs, xs): - - if args.variable_type == 'discrete': - if args.distribution_type == 'logistic': - log_px = log_discretized_logistic( - x, *px, inverse_bin_width=inverse_bin_width) - elif args.distribution_type == 'normal': - log_px = log_discretized_normal( - x, *px, inverse_bin_width=inverse_bin_width) - elif args.variable_type == 'continuous': - if args.distribution_type == 'logistic': - log_px = log_logistic(x, *px) - elif args.distribution_type == 'normal': - log_px = log_normal(x, *px) - elif args.distribution_type == 'steplogistic': - x = _round_straightthrough(x * inverse_bin_width) / inverse_bin_width - log_px = log_discretized_logistic( - x, *px, inverse_bin_width=inverse_bin_width) - - log_pxs.append( - torch.sum(log_px, dim=[1, 2, 3])) - - return log_pxs - - -def compute_log_pz(pz, z, args): - inverse_bin_width = 2.**args.n_bits - - if args.variable_type == 'discrete': - if args.distribution_type == 'logistic': - if args.n_mixtures == 1: - log_pz = log_discretized_logistic( - z, pz[0], pz[1], inverse_bin_width=inverse_bin_width) - else: - log_pz = log_mixture_discretized_logistic( - z, pz[0], pz[1], pz[2], - inverse_bin_width=inverse_bin_width) - elif args.distribution_type == 'normal': - log_pz = log_discretized_normal( - z, *pz, inverse_bin_width=inverse_bin_width) - - elif args.variable_type == 'continuous': - if args.distribution_type == 'logistic': - log_pz = log_logistic(z, *pz) - elif args.distribution_type == 'normal': - if args.n_mixtures == 1: - log_pz = log_normal(z, *pz) - else: - log_pz = log_mixture_normal(z, *pz) - elif args.distribution_type == 'steplogistic': - z = _round_straightthrough(z * 256.) / 256. - log_pz = log_discretized_logistic(z, *pz) - - log_pz = torch.sum( - log_pz, - dim=[1, 2, 3]) - - return log_pz - - -def compute_loss_function(pz, z, pys, ys, ldj, args): - """ - Computes the cross entropy loss function while summing over batch dimension, not averaged! - :param x_logit: shape: (batch_size, num_classes * num_channels, pixel_width, pixel_height), real valued logits - :param x: shape (batchsize, num_channels, pixel_width, pixel_height), pixel values rescaled between [0, 1]. - :param z_mu: mean of z_0 - :param z_var: variance of z_0 - :param z_0: first stochastic latent variable - :param z_k: last stochastic latent variable - :param ldj: log det jacobian - :param args: global parameter settings - :param beta: beta for kl loss - :return: loss, ce, kl - """ - batch_size = z.size(0) - - # Get array loss, sum over batch - loss_array, bpd_array, bpd_per_prior_array = \ - compute_loss_array(pz, z, pys, ys, ldj, args) - - loss = torch.mean(loss_array) - bpd = torch.mean(bpd_array).item() - bpd_per_prior = [torch.mean(x) for x in bpd_per_prior_array] - - return loss, bpd, bpd_per_prior - - -def convert_bpd(log_p, input_size): - return -log_p / (np.prod(input_size) * np.log(2.)) - - -def compute_loss_array(pz, z, pys, ys, ldj, args): - """ - Computes the cross entropy loss function while summing over batch dimension, not averaged! - :param x_logit: shape: (batch_size, num_classes * num_channels, pixel_width, pixel_height), real valued logits - :param x: shape (batchsize, num_channels, pixel_width, pixel_height), pixel values rescaled between [0, 1]. 
- :param z_mu: mean of z_0 - :param z_var: variance of z_0 - :param z_0: first stochastic latent variable - :param z_k: last stochastic latent variable - :param ldj: log det jacobian - :param args: global parameter settings - :param beta: beta for kl loss - :return: loss, ce, kl - """ - bpd_per_prior = [] - - # Likelihood of final representation. - log_pz = compute_log_pz(pz, z, args) - - bpd_per_prior.append(convert_bpd(log_pz.detach(), args.input_size)) - - log_p = log_pz - - # Add likelihoods of intermediate representations. - if ys: - log_pys = compute_log_ps(pys, ys, args) - - for log_py in log_pys: - log_p += log_py - - bpd_per_prior.append(convert_bpd(log_py.detach(), args.input_size)) - - log_p += ldj - - loss = -log_p - bpd = convert_bpd(log_p.detach(), args.input_size) - - return loss, bpd, bpd_per_prior - - -def calculate_loss(pz, z, pys, ys, ldj, loss_aux, args): - return compute_loss_function(pz, z, pys, ys, ldj, loss_aux, args) \ No newline at end of file diff --git a/examples/2_hybrid_models/IDF/networks.py b/examples/2_hybrid_models/IDF/networks.py deleted file mode 100644 index 796441fe..00000000 --- a/examples/2_hybrid_models/IDF/networks.py +++ /dev/null @@ -1,193 +0,0 @@ -""" -Collection of flow strategies -""" - -from __future__ import print_function - -import torch -import torch.nn as nn -import torch.nn.functional as F -from .utils import Base - - -UNIT_TESTING = False - - -class Conv2dReLU(Base): - def __init__( - self, n_inputs, n_outputs, kernel_size=3, stride=1, padding=0, - bias=True): - super().__init__() - - self.nn = nn.Conv2d(n_inputs, n_outputs, kernel_size, padding=padding) - - def forward(self, x): - h = self.nn(x) - - y = F.relu(h) - - return y - - -class Conv2dGnSwish(Base): - def __init__(self, n_inputs, n_outputs, kernel_size=3, stride=1, padding=0, bias=True): - super().__init__() - - self.nn = nn.Conv2d(n_inputs, n_outputs, kernel_size, padding=padding) - - if n_outputs % 3 == 0: - num_groups = 3 - elif n_outputs % 2 == 0: - num_groups = 2 - else: - num_groups = 1 - - self.groupnorm = nn.GroupNorm(num_groups, n_outputs) - self.swish = nn.SiLU() - - def forward(self, x): - h = self.nn(x) - h = self.groupnorm(h) - y = self.swish(h) - return y - - -class ResidualBlock(Base): - def __init__(self, n_channels, kernel, Conv2dAct): - super().__init__() - - self.nn = torch.nn.Sequential( - Conv2dAct(n_channels, n_channels, kernel, padding=1), - torch.nn.Conv2d(n_channels, n_channels, kernel, padding=1), - ) - - def forward(self, x): - h = self.nn(x) - h = F.relu(h + x) - return h - - -class DenseLayer(Base): - def __init__(self, args, n_inputs, growth, Conv2dAct): - super().__init__() - - conv1x1 = Conv2dAct( - n_inputs, n_inputs, kernel_size=1, stride=1, - padding=0, bias=True) - - self.nn = torch.nn.Sequential( - conv1x1, - Conv2dAct( - n_inputs, growth, kernel_size=3, stride=1, - padding=1, bias=True), - ) - - def forward(self, x): - h = self.nn(x) - - h = torch.cat([x, h], dim=1) - return h - - -class DenseBlock(Base): - def __init__( - self, args, n_inputs, n_outputs, kernel, Conv2dAct): - super().__init__() - depth = args.densenet_depth - - future_growth = n_outputs - n_inputs - - layers = [] - - for d in range(depth): - growth = future_growth // (depth - d) - - layers.append(DenseLayer(args, n_inputs, growth, Conv2dAct)) - n_inputs += growth - future_growth -= growth - - self.nn = torch.nn.Sequential(*layers) - - def forward(self, x): - y = self.nn(x) - return y - - -class Identity(Base): - def __init__(self): - super.__init__() - - def forward(self, 
x): - return x - - -class NN(Base): - def __init__( - self, args, c_in, c_out, height, width, nn_type, kernel=3): - super().__init__() - - Conv2dAct = Conv2dReLU - n_channels = args.n_channels - - if nn_type == 'shallow': - - if True or args.network1x1 == 'standard': - conv1x1 = Conv2dAct( - n_channels, n_channels, kernel_size=1, stride=1, - padding=0, bias=False) - - layers = [ - Conv2dAct(c_in, n_channels, kernel, padding=1), - conv1x1] - - layers += [torch.nn.Conv2d(n_channels, c_out, kernel, padding=1)] - - elif nn_type == 'resnet': - layers = [ - Conv2dAct(c_in, n_channels, kernel, padding=1), - ResidualBlock(n_channels, kernel, Conv2dAct), - ResidualBlock(n_channels, kernel, Conv2dAct)] - - layers += [ - torch.nn.Conv2d(n_channels, c_out, kernel, padding=1) - ] - - elif nn_type == 'densenet': - layers = [ - DenseBlock( - args=args, - n_inputs=c_in, - n_outputs=n_channels + c_in, - kernel=kernel, - Conv2dAct=Conv2dAct)] - - layers += [ - torch.nn.Conv2d(n_channels + c_in, c_out, kernel, padding=1) - ] - elif nn_type == 'densenet++': - Conv2dAct = Conv2dGnSwish - - layers = [ - DenseBlock( - args=args, - n_inputs=c_in, - n_outputs=n_channels + c_in, - kernel=kernel, - Conv2dAct=Conv2dAct)] - - layers += [ - torch.nn.Conv2d(n_channels + c_in, c_out, kernel, padding=1) - ] - else: - raise ValueError - - self.nn = torch.nn.Sequential(*layers) - - # Set parameters of last conv-layer to zero. - if not UNIT_TESTING: - self.nn[-1].weight.data.zero_() - self.nn[-1].bias.data.zero_() - - def forward(self, x): - y = self.nn(x) - return y \ No newline at end of file diff --git a/examples/2_hybrid_models/IDF/priors.py b/examples/2_hybrid_models/IDF/priors.py deleted file mode 100644 index 0e4d6b14..00000000 --- a/examples/2_hybrid_models/IDF/priors.py +++ /dev/null @@ -1,178 +0,0 @@ -""" -Collection of flow strategies -""" - -from __future__ import print_function - -import torch -import torch.nn.functional as F -from torch.nn import Parameter - -from .distributions import sample_discretized_logistic, \ - sample_mixture_discretized_logistic, sample_normal, sample_logistic, \ - sample_discretized_normal, sample_mixture_normal -from .utils import Base -from .networks import NN - - -def sample_prior(px, variable_type, distribution_type, inverse_bin_width): - if variable_type == 'discrete': - if distribution_type == 'logistic': - if len(px) == 2: - return sample_discretized_logistic( - *px, inverse_bin_width=inverse_bin_width) - elif len(px) == 3: - return sample_mixture_discretized_logistic( - *px, inverse_bin_width=inverse_bin_width) - - elif distribution_type == 'normal': - return sample_discretized_normal( - *px, inverse_bin_width=inverse_bin_width) - - elif variable_type == 'continuous': - if distribution_type == 'logistic': - return sample_logistic(*px) - elif distribution_type == 'normal': - if len(px) == 2: - return sample_normal(*px) - elif len(px) == 3: - return sample_mixture_normal(*px) - elif distribution_type == 'steplogistic': - return sample_logistic(*px) - - raise ValueError - - -class Prior(Base): - def __init__(self, size, args): - super().__init__() - c, h, w = size - - self.inverse_bin_width = 2**args.n_bits - self.variable_type = args.variable_type - self.distribution_type = args.distribution_type - self.n_mixtures = args.n_mixtures - - if hasattr(args, "num_prior_leaf_nodes"): - self.num_prior_leaf_nodes = args.num_prior_leaf_nodes - else: - self.num_prior_leaf_nodes = 1 - - if self.n_mixtures == 1: - self.mu = Parameter(torch.Tensor(c * self.num_prior_leaf_nodes, h, w)) - 
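Zeroing the final convolution of NN above is what lets each additive coupling start out as a (near-)identity map; a small sketch of the effect on a freshly constructed layer:

import torch

conv = torch.nn.Conv2d(16, 8, kernel_size=3, padding=1)
conv.weight.data.zero_()
conv.bias.data.zero_()

x = torch.randn(2, 16, 8, 8)
t = conv(x)
print(t.abs().max().item())   # 0.0 -> the predicted shift is zero, so z2 + t == z2 at initialization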
self.logs = Parameter(torch.Tensor(c * self.num_prior_leaf_nodes, h, w)) - elif self.n_mixtures > 1: - self.mu = Parameter(torch.Tensor(c, h, w, self.n_mixtures)) - self.logs = Parameter(torch.Tensor(c, h, w, self.n_mixtures)) - self.pi_logit = Parameter(torch.Tensor(c, h, w, self.n_mixtures)) - - self.reset_parameters() - - def reset_parameters(self): - self.mu.data.zero_() - - if self.n_mixtures > 1: - self.pi_logit.data.zero_() - for i in range(self.n_mixtures): - self.mu.data[..., i] += i - (self.n_mixtures - 1) / 2. - - self.logs.data.zero_() - - def get_pz(self, n): - if self.n_mixtures == 1: - mu = self.mu.repeat(n, 1, 1, 1) - logs = self.logs.repeat(n, 1, 1, 1) # scaling scale - return mu, logs - - elif self.n_mixtures > 1: - pi = F.softmax(self.pi_logit, dim=-1) - mu = self.mu.repeat(n, 1, 1, 1, 1) - logs = self.logs.repeat(n, 1, 1, 1, 1) - pi = pi.repeat(n, 1, 1, 1, 1) - return mu, logs, pi - - def forward(self, z, ldj): - pz = self.get_pz(z.size(0)) - # print("z", z.size(), z[0, ...]) - # print("pz[0]", pz[0].size(), pz[0]) - - return pz, z, ldj - - def sample(self, n): - pz = self.get_pz(n) - - z_sample = sample_prior(pz, self.variable_type, self.distribution_type, self.inverse_bin_width) - - return z_sample - - def decode(self, states, decode_fn): - pz = self.get_pz(n=len(states)) - - states, z = decode_fn(states, pz) - return states, z - - -class SplitPrior(Base): - def __init__(self, c_in, factor_out, height, width, args): - super().__init__() - - self.split_idx = c_in - factor_out - self.inverse_bin_width = 2**args.n_bits - self.variable_type = args.variable_type - self.distribution_type = args.distribution_type - self.input_channel = c_in - - self.factor_out = factor_out - if hasattr(args, "num_prior_leaf_nodes"): - self.num_prior_leaf_nodes = args.num_prior_leaf_nodes - else: - self.num_prior_leaf_nodes = 1 - - self.nn = NN( - args=args, - c_in=c_in - factor_out, - c_out=factor_out * self.num_prior_leaf_nodes * 2, - height=height, - width=width, - nn_type=args.splitprior_type) - - def get_py(self, z): - h = self.nn(z) - mu = h[:, ::2, :, :] - logs = h[:, 1::2, :, :] - - py = [mu, logs] - - return py - - def split(self, z): - z1 = z[:, :self.split_idx, :, :] - y = z[:, self.split_idx:, :, :] - return z1, y - - def combine(self, z, y): - result = torch.cat([z, y], dim=1) - - return result - - def forward(self, z, ldj): - z, y = self.split(z) - - py = self.get_py(z) - - return py, y, z, ldj - - def inverse(self, z, ldj, y): - # Sample if y is not given. - if y is None: - py = self.get_py(z) - y = sample_prior(py, self.variable_type, self.distribution_type, self.inverse_bin_width) - - z = self.combine(z, y) - - return z, ldj - - def decode(self, z, ldj, states, decode_fn): - py = self.get_py(z) - states, y = decode_fn(states, py) - return self.combine(z, y), ldj, states \ No newline at end of file diff --git a/examples/2_hybrid_models/IDF/utils.py b/examples/2_hybrid_models/IDF/utils.py deleted file mode 100644 index 458dca0c..00000000 --- a/examples/2_hybrid_models/IDF/utils.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch - - -class Base(torch.nn.Module): - """ - The base class for modules. That contains a disable round mode - """ - - def __init__(self): - super().__init__() - - def _set_child_attribute(self, attr, value): - r"""Sets the module in rounding mode. - This has any effect only on certain modules if variable type is - discrete. 
- Returns: - Module: self - """ - if hasattr(self, attr): - setattr(self, attr, value) - - for module in self.modules(): - if hasattr(module, attr): - setattr(module, attr, value) - return self - - def set_temperature(self, value): - self._set_child_attribute("temperature", value) - - def enable_hard_round(self, mode=True): - self._set_child_attribute("hard_round", mode) - - def disable_hard_round(self, mode=True): - self.enable_hard_round(not mode) \ No newline at end of file diff --git a/examples/2_hybrid_models/VAE/model.py b/examples/2_hybrid_models/VAE/model.py deleted file mode 100644 index 629f7b1c..00000000 --- a/examples/2_hybrid_models/VAE/model.py +++ /dev/null @@ -1,563 +0,0 @@ -import torch -import torch.utils.data -from torch import nn, optim -from torchvision import * -import socket -import os -import time -from datetime import datetime -import numpy as np -import sys -import os - -sys.path.append(os.path.dirname(__file__)) - -import modules as modules -import rand as random - - -class Model(nn.Module): - def __init__(self, xs=(3, 32, 32), nz=1, zchannels=16, nprocessing=1, kernel_size=3, resdepth=2, - reswidth=256, dropout_p=0., tag='', num_latents = 4): - super().__init__() - # default: disable compressing mode - # if activated, tensors will be flattened - self.compressing = False - - # hyperparameters - self.xs = xs - self.nz = nz - self.zchannels = zchannels - self.nprocessing = nprocessing - # latent height/width is always 16, - # the number of channels depends on the dataset - self.zdim = (self.zchannels, 16, 16) - self.resdepth = resdepth - self.reswidth = reswidth - self.kernel_size = kernel_size - - # apply these two factors (i.e. on the ELBO) in sequence and it results in "bits/dim" - # factor to convert "nats" to bits - self.bitsscale = np.log2(np.e) - # factor to divide by the data dimension - self.perdimsscale = 1. 
/ np.prod(self.xs) - - # calculate processing layers convolutions options - # kernel/filter is 5, so in order to ensure same-size outputs, we have to pad by 2 - padding_proc = (5 - 1) / 2 - assert padding_proc.is_integer() - padding_proc = int(padding_proc) - - # calculate other convolutions options - padding = (self.kernel_size - 1) / 2 - assert padding.is_integer() - padding = int(padding) - - # set-up current "best elbo" - self.best_elbo = np.inf - - # distribute ResNet blocks over latent layers - resdepth = [0] * (self.nz) - i = 0 - for _ in range(self.resdepth): - i = 0 if i == (self.nz) else i - resdepth[i] += 1 - i += 1 - - # reduce initial variance of distributions corresponding - # to latent layers if latent nz increases - scale = 1.0 / (self.nz ** 0.5) - - # store activations - self.softplus = nn.Softplus() - self.sigmoid = nn.Sigmoid() - self.act = nn.ELU() - self.actresnet = nn.ELU() - - # Below we build up the main model architecture of the inference- and generative-models - # All the architecure components are built up from different custom are existing PyTorch modules - - # <===== INFERENCE MODEL =====> - # the bottom (zi=1) inference model - self.infer_in = nn.Sequential( - # shape: [1,32,32] -> [4,16,16] - modules.Squeeze2d(factor=2), - - # shape: [4,16,16] -> [32,16,16] - modules.WnConv2d(4 * xs[0], - self.reswidth, - 5, - 1, - padding_proc, - init_scale=1.0, - loggain=True), - self.act - ) - self.infer_res0 = nn.Sequential( - # shape: [32,16,16] -> [32,16,16] - modules.ResNetBlock(self.reswidth, - self.reswidth, - 5, - 1, - padding_proc, - self.nprocessing, - dropout_p, - self.actresnet), - self.act - ) if self.nprocessing > 0 else modules.Pass() - - self.infer_res1 = nn.Sequential( - # shape: [32,16,16] -> [32,16,16] - modules.ResNetBlock(self.reswidth, - self.reswidth, - self.kernel_size, - 1, - padding, - resdepth[0], - dropout_p, - self.actresnet), - self.act - ) if resdepth[0] > 0 else modules.Pass() - - # shape: [32,16,16] -> [1,16,16] - self.infer_mu = modules.WnConv2d(self.reswidth, - self.zchannels, - self.kernel_size, - 1, - padding, - init_scale=scale if self.nz > 1 else 2 ** 0.5 * scale) - - # shape: [32,16,16] -> [1,16,16] - self.infer_std = modules.WnConv2d(self.reswidth, - self.zchannels, - self.kernel_size, - 1, - padding, - init_scale=scale if self.nz > 1 else 2 ** 0.5 * scale) - - # <===== DEEP INFERENCE MODEL =====> - # the deeper (zi > 1) inference models - self.deepinfer_in = nn.ModuleList([ - # shape: [1,16,16] -> [32,16,16] - nn.Sequential( - modules.WnConv2d(self.zchannels, - self.reswidth, - self.kernel_size, - 1, - padding, - init_scale=1.0, - loggain=True), - self.act - ) - for _ in range(self.nz - 1)]) - - self.deepinfer_res = nn.ModuleList([ - # shape: [32,16,16] -> [32,16,16] - nn.Sequential( - modules.ResNetBlock(self.reswidth, - self.reswidth, - self.kernel_size, - 1, - padding, - resdepth[i + 1], - dropout_p, - self.actresnet), - self.act - ) if resdepth[i + 1] > 0 else modules.Pass() - for i in range(self.nz - 1)]) - - self.deepinfer_mu = nn.ModuleList([ - # shape: [32,16,16] -> [1,16,16] - nn.Sequential( - modules.WnConv2d(self.reswidth, - self.zchannels, - self.kernel_size, - 1, - padding, - init_scale=scale if i < self.nz - 2 else 2 ** 0.5 * scale) - ) - for i in range(self.nz - 1)]) - - self.deepinfer_std = nn.ModuleList([ - # shape: [32,16,16] -> [1,16,16] - nn.Sequential( - modules.WnConv2d(self.reswidth, - self.zchannels, - self.kernel_size, - 1, - padding, - init_scale=scale if i < self.nz - 2 else 2 ** 0.5 * scale) - ) - for i 
in range(self.nz - 1)]) - - # <===== DEEP GENERATIVE MODEL =====> - # the deeper (zi > 1) generative models - self.deepgen_in = nn.ModuleList([ - # shape: [1,16,16] -> [32,16,16] - nn.Sequential( - modules.WnConv2d(self.zchannels, - self.reswidth, - self.kernel_size, - 1, - padding, - init_scale=1.0, - loggain=True), - self.act - ) - for _ in range(self.nz - 1)]) - - self.deepgen_res = nn.ModuleList([ - # shape: [32,16,16] -> [32,16,16] - nn.Sequential( - modules.ResNetBlock(self.reswidth, - self.reswidth, - self.kernel_size, - 1, - padding, - resdepth[i + 1], - dropout_p, - self.actresnet), - self.act - ) if resdepth[i + 1] > 0 else modules.Pass() - for i in range(self.nz - 1)]) - - self.deepgen_mu = nn.ModuleList([ - # shape: [32,16,16] -> [1,16,16] - nn.Sequential( - modules.WnConv2d(self.reswidth, - self.zchannels, - self.kernel_size, - 1, - padding, - init_scale=scale) - ) - for _ in range(self.nz - 1)]) - - self.deepgen_std = nn.ModuleList([ - # shape: [32,16,16] -> [1,16,16] - nn.Sequential( - modules.WnConv2d(self.reswidth, - self.zchannels, - self.kernel_size, - 1, - padding, init_scale=scale) - ) - for _ in range(self.nz - 1)]) - - # <===== GENERATIVE MODEL =====> - # the bottom (zi = 1) inference model - self.gen_in = nn.Sequential( - # shape: [1,16,16] -> [32,16,16] - modules.WnConv2d(self.zchannels, - self.reswidth, - self.kernel_size, - 1, - padding, - init_scale=1.0, - loggain=True), - self.act - ) - - self.gen_res1 = nn.Sequential( - # shape: [32,16,16] -> [32,16,16] - modules.ResNetBlock(self.reswidth, - self.reswidth, - self.kernel_size, - 1, - padding, - resdepth[0], - dropout_p, - self.actresnet), - self.act - ) if resdepth[0] > 0 else modules.Pass() - - self.gen_res0 = nn.Sequential( - # shape: [32,16,16] -> [32,16,16] - modules.ResNetBlock(self.reswidth, - self.reswidth, - 5, - 1, - padding_proc, - self.nprocessing, - dropout_p, - self.actresnet), - self.act - ) if self.nprocessing > 0 else modules.Pass() - - self.gen_mu = nn.Sequential( - # shape: [32,16,16] -> [4,16,16] - modules.WnConv2d(self.reswidth, - 4 * xs[0] * num_latents, - self.kernel_size, - 1, - padding, - init_scale=0.1), - # shape: [4,16,16] -> [1,32,23] - modules.UnSqueeze2d(factor=2) - ) - - # the scale parameter of the bottom (zi = 1) generative model is modelled unconditional - self.gen_std = nn.Parameter(torch.zeros([self.xs[0] * num_latents, self.xs[1], self.xs[2]])) - nn.init.zeros_(self.gen_std) - - self.num_latents = num_latents - - # function to set the model to compression mode - def compress(self, compress=True): - self.compressing = compress - - # function that only takes in the layer number and returns a distribution based on that - def infer(self, i): - # nested function that takes in the "given" value of the conditional Logistic distribution - # and returns the mu and scale parameters of that distribution - def distribution(given): - h = given - - # if compressing, the input might not be float32, so we'll have to convert it first - if self.compressing: - type = h.type() - h = h.float() - - # bottom latent layer - if i == 0: - # if compressing, the input is flattened, so we'll have to convert it back to a Tensor - if self.compressing: - h = h.view((-1,) + self.xs) - # also, when NOT compressing, the input is not scaled from [0,255] to [-1,1] - else: - h = (h - 127.5) / 127.5 - - # input convolution - h = self.infer_in(h) - - # processing ResNet blocks - h = self.infer_res0(h) - - # other ResNet blocks - h = self.infer_res1(h) - - # mu parameter of the conditional Logistic distribution - 
mu = self.infer_mu(h) - - # scale parameter of the conditional Logistic distribution - # clamp the output of the scale parameter between [0.1, 1.0] for stability - scale = 0.1 + 0.9 * self.sigmoid(self.infer_std(h) + 2.) - - # deeper latent layers - else: - # if compressing, the input is flattened, so we'll have to convert it back to a Tensor - if self.compressing: - h = h.view((-1,) + self.zdim) - - # input convolution - h = self.deepinfer_in[i - 1](h) - - # other ResNet blocks - h = self.deepinfer_res[i - 1](h) - - # mu parameter of the conditional Logistic distribution - mu = self.deepinfer_mu[i - 1](h) - - # scale parameter of the conditional Logistic distribution - # clamp the output of the scale parameter between [0.1, 1.0] for stability - scale = 0.1 + 0.9 * self.sigmoid(self.deepinfer_std[i - 1](h) + 2.) - - if self.compressing: - # if compressing, the "batch-size" can only be 1 - assert mu.shape[0] == 1 - - # flatten the Tensors back and convert back to the input datatype - mu = mu.view(np.prod(self.zdim)).type(type) - scale = scale.view(np.prod(self.zdim)).type(type) - return mu, scale - - return distribution - - # function that only takes in the layer number and returns a distribution based on that - def generate(self, i): - # nested function that takes in the "given" value of the conditional Logistic distribution - # and returns the mu and scale parameters of that distribution - def distribution(given): - h = given - - # if compressing, the input is flattened, so we'll have to convert it back to a Tensor - # also, the input might not be float32, so we'll have to convert it first - if self.compressing: - type = h.type() - h = h.float() - h = h.view((-1,) + self.zdim) - - # bottom latent layer - if i == 0: - # input convolution - h = self.gen_in(h) - - # processing ResNet blocks - h = self.gen_res1(h) - - # other ResNet blocks - h = self.gen_res0(h) - - # mu parameter of the conditional Logistic distribution - mu = self.gen_mu(h) - - # scale parameter of the conditional Logistic distribution - # set a minimal value for the scale parameter of the bottom generative model - scale = ((2. / 255.) / 8.) + modules.softplus(self.gen_std) - - # deeper latent layers - else: - # input convolution - h = self.deepgen_in[i - 1](h) - - # other ResNet blocks - h = self.deepgen_res[i - 1](h) - - # mu parameter of the conditional Logistic distribution - mu = self.deepgen_mu[i - 1](h) - - # scale parameter of the conditional Logistic distribution - # clamp the output of the scale parameter between [0.1, 1.0] for stability - scale = 0.1 + 0.9 * modules.softplus(self.deepgen_std[i - 1](h) + np.log(np.exp(1.) 
- 1.)) - - - if self.compressing: - # if compressing, the "batch-size" can only be 1 - assert mu.shape[0] == 1 - - # flatten the Tensors back and convert back to the input datatype - mu = mu.view(np.prod(self.xs if i == 0 else self.zdim)).type(type) - scale = scale.view(np.prod(self.xs if i == 0 else self.zdim)).type(type) - return mu, scale - - return distribution - - # function that takes as input the data and outputs all the components of the ELBO + the latent samples - def loss(self, x, pc = None): - # tensor to store inference model losses - logenc = torch.zeros((self.nz, x.shape[0], self.zdim[0]), device=x.device) - - # tensor to store the generative model losses - logdec = torch.zeros((self.nz, x.shape[0], self.zdim[0]), device=x.device) - - # tensor to store the latent samples - zsamples = torch.zeros((self.nz, x.shape[0], np.prod(self.zdim)), device=x.device) - - for i in range(self.nz): - # inference model - # get the parameters of inference distribution i given x (if i == 0) or z (otherwise) - mu, scale = self.infer(i)(given=x if i == 0 else z) - - # sample untransformed sample from Logistic distribution (mu=0, scale=1) - eps = random.logistic_eps(mu.shape, device=mu.device) - # reparameterization trick: transform using obtained parameters - z_next = random.transform(eps, mu, scale) - - # store the inference model loss - zsamples[i] = z_next.flatten(1) - logq = torch.sum(random.logistic_logp(mu, scale, z_next), dim=2) - logenc[i] += logq - - # generative model - # get the parameters of inference distribution i given z - mu, scale = self.generate(i)(given=z_next) - - import torch.nn.functional as F - def _log_min_exp(a: torch.Tensor, b: torch.Tensor, epsilon = 1e-8): - return a + torch.log(1 - torch.exp(b - a) + epsilon) - - def discrete_logistic_ll(x, mean, logscale): - scale = torch.exp(logscale) - logp = _log_min_exp( - F.logsigmoid((x + 0.5 / 256.0 - mean) / scale), - F.logsigmoid((x - 0.5 / 256.0 - mean) / scale)) - - return logp - - # store the generative model loss - if i == 0: - # if bottom (zi = 1) generative model, evaluate loss using discretized Logistic distribution - scale = scale.unsqueeze(0).expand_as(mu) - input_params = {"input_0": {"mus": mu.permute(0, 2, 3, 1).reshape(-1, 32 * 32 * self.num_latents), - "log_scales": scale.log().permute(0, 2, 3, 1).reshape(-1, 32 * 32 * self.num_latents)}} - - if pc is not None: - logp = pc(x.reshape(-1, 32 * 32) / 255.0, input_params = input_params) - else: - logp = torch.sum(random.discretized_logistic_logp(mu, scale, x), dim=1) - - logrecon = logp - - else: - logp = torch.sum(random.logistic_logp(mu, scale, x if i == 0 else z), dim=2) - logdec[i - 1] += logp - - z = z_next - - # store the prior loss - logp = torch.sum(random.logistic_logp(torch.zeros(1, device=x.device), torch.ones(1, device=x.device), z), dim=2) - logdec[self.nz - 1] += logp - - # convert from "nats" to bits - logenc = torch.mean(logenc, dim=1) * self.bitsscale - logdec = torch.mean(logdec, dim=1) * self.bitsscale - logrecon = torch.mean(logrecon) * self.bitsscale - return logrecon, logdec, logenc, zsamples - - # function to sample from the model (using the generative model) - def sample(self, device, epoch, num=64): - # sample "num" latent variables from the prior - z = random.logistic_eps(((num,) + self.zdim), device=device) - - # sample from the generative distribution(s) - for i in reversed(range(self.nz)): - mu, scale = self.generate(i)(given=z) - eps = random.logistic_eps(mu.shape, device=device) - z_prev = random.transform(eps, mu, scale) - z = z_prev 
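The reparameterized logistic sampling used in this loss is inverse-CDF sampling from uniform noise; a minimal standalone sketch with placeholder mu and scale:

import torch

def logistic_sample(mu, scale, bound=1e-5):
    # u ~ Uniform(0,1); eps = log(u) - log(1 - u) is Logistic(0,1); shift and scale
    # by the predicted parameters so gradients flow through mu and scale.
    u = torch.rand_like(mu).clamp(bound, 1 - bound)
    eps = torch.log(u) - torch.log1p(-u)
    return mu + scale * eps

mu, scale = torch.zeros(100_000), torch.ones(100_000)
z = logistic_sample(mu, scale)
print(z.mean().item(), z.std().item())   # mean ~0, std ~pi/sqrt(3) ~ 1.81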
- - # scale up from [-1,1] to [0,255] - x_cont = (z * 127.5) + 127.5 - - # ensure that [0,255] - x = torch.clamp(x_cont, 0, 255) - - # scale from [0,255] to [0,1] and convert to right shape - x_sample = x.float() / 255. - x_sample = x_sample.view((num,) + self.xs) - - # make grid out of "num" samples - x_grid = utils.make_grid(x_sample) - - # function to sample a reconstruction of input data - def reconstruct(self, x_orig, device, epoch): - # take only first 32 datapoints of the input - # otherwise the output image grid may be too big for visualization - x_orig = x_orig[:32, :, :, :].to(device) - - # sample from the bottom (zi = 1) inference model - mu, scale = self.infer(0)(given=x_orig) - eps = random.logistic_eps(mu.shape, device=device) - z = random.transform(eps, mu, scale) # sample zs - - # sample from the bottom (zi = 1) generative model - mu, scale = self.generate(0)(given=z) - x_eps = random.logistic_eps(mu.shape, device=device) - x_cont = random.transform(x_eps, mu, scale) - - # scale up from [-1.1] to [0,255] - x_cont = (x_cont * 127.5) + 127.5 - - # esnure that [0,255] - x_sample = torch.clamp(x_cont, 0, 255) - - # scale from [0,255] to [0,1] and convert to right shape - x_sample = x_sample.float() / 255. - x_orig = x_orig.float() / 255. - - # concatenate the input data and the sampled reconstructions for comparison - x_with_recon = torch.cat((x_orig, x_sample)) - - # make a grid out of the original data and the reconstruction samples - x_with_recon = x_with_recon.view((2 * x_orig.shape[0],) + self.xs) - x_grid = utils.make_grid(x_with_recon) \ No newline at end of file diff --git a/examples/2_hybrid_models/VAE/modules.py b/examples/2_hybrid_models/VAE/modules.py deleted file mode 100644 index 42e2a007..00000000 --- a/examples/2_hybrid_models/VAE/modules.py +++ /dev/null @@ -1,277 +0,0 @@ -from contextlib import contextmanager - -import torch.nn.functional as F -from torch.nn import Module, Parameter, Sequential, Dropout, ELU -from torch.nn import init -from PIL import Image -import os -import torch -import numpy as np -from torch.utils.data import Dataset - -_WN_INIT_STDV = 0.05 -_SMALL = 1e-10 - -_INIT_ENABLED = False - -@contextmanager -def init_mode(): - global _INIT_ENABLED - assert not _INIT_ENABLED - _INIT_ENABLED = True - yield - _INIT_ENABLED = False - -# PyTorch module that applies Data Dependent Initialization + Weight Normalization -class WnModule(Module): - """ - Module with data-dependent initialization - """ - - def __init__(self): - super().__init__() - - def _init(self, *args, **kwargs): - """ - Data-dependent initialization. Will be called on the first forward() - """ - raise NotImplementedError - - def _forward(self, *args, **kwargs): - """ - The standard forward pass - """ - raise NotImplementedError - - def forward(self, *args, **kwargs): - """ - Calls _init (with no_grad) if not initialized. - If initialized already, calls _forward. 
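A toy illustration of the data-dependent initialization pattern described here: a module-level flag switches the first forward pass into an init mode that sets parameters from the activation statistics (the names below are illustrative, not from the original file):

import torch

_INIT_ENABLED = False   # stand-in for the module-level flag

class ScaleLayer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.scale = torch.nn.Parameter(torch.ones(1))

    def forward(self, x):
        # In init mode, choose the scale so outputs start with unit variance.
        if _INIT_ENABLED:
            with torch.no_grad():
                self.scale.data.fill_(1.0 / (x.std() + 1e-10))
        return x * self.scale

layer = ScaleLayer()
x = 5.0 * torch.randn(1024)
_INIT_ENABLED = True
layer(x)                        # data-dependent init pass
_INIT_ENABLED = False
print(layer(x).std().item())    # ~1.0 afterwards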
- """ - if _INIT_ENABLED: - with torch.no_grad(): # no gradients for the init pass - return self._init(*args, **kwargs) - return self._forward(*args, **kwargs) - -# Data-Dependent Initialization + Weight Normalization extension of a "Conv2D" module of PyTorch -class WnConv2d(WnModule): - def __init__(self, in_dim, out_dim, kernel_size, stride, padding, init_scale=1.0, loggain=True, bias=True): - super().__init__() - self.in_dim, self.out_dim, self.kernel_size, self.stride, self.padding = in_dim, out_dim, kernel_size, stride, padding - self.bias = bias - self.init_scale = init_scale - self.loggain = loggain - self.v = Parameter(torch.Tensor(out_dim, in_dim, self.kernel_size, self.kernel_size)) - self.gain = Parameter(torch.Tensor(out_dim)) - self.b = Parameter(torch.Tensor(out_dim), requires_grad=True if self.bias else False) - - init.normal_(self.v, 0., _WN_INIT_STDV) - if self.loggain: - init.zeros_(self.gain) - else: - init.ones_(self.gain) - init.zeros_(self.b) - - def _init(self, x): - # calculate unnormalized activations - y_bchw = self._forward(x) - assert len(y_bchw.shape) == 4 and y_bchw.shape[:2] == (x.shape[0], self.out_dim) - - # set g and b so that activations are normalized - y_c = y_bchw.transpose(0, 1).reshape(self.out_dim, -1) - m = y_c.mean(dim=1) - s = self.init_scale / (y_c.std(dim=1) + _SMALL) - assert m.shape == s.shape == self.gain.shape == self.b.shape - - if self.loggain: - loggain = torch.clamp(torch.log(s), min=-10., max=None) - self.gain.data.copy_(loggain) - else: - self.gain.data.copy_(s) - - if self.bias: - self.b.data.sub_(m * s) - - # forward pass again, now normalized - return self._forward(x) - - def _forward(self, x): - if self.loggain: - g = softplus(self.gain) - else: - g = self.gain - vnorm = self.v.view(self.out_dim, -1).norm(p=2, dim=1) - assert vnorm.shape == self.gain.shape == self.b.shape - w = self.v * (g / (vnorm + _SMALL)).view(self.out_dim, 1, 1, 1) - return F.conv2d(x, w, self.b, stride=self.stride, padding=self.padding) - - def extra_repr(self): - return 'in_dim={}, out_dim={}, kernel_size={}, stride={}, padding={}, init_scale={}, loggain={}'.format(self.in_dim, self.out_dim, self.kernel_size, self.stride, self.padding, self.init_scale, self.loggain) - -# numerically stable version of the "softplus" function -def softplus(x): - ret = -F.logsigmoid(-x) - return ret - -# class used to store two sets of parameters -# 1. parameters that are the result of EMA (for evaluation) -# 2. parameters not affected by EMA (for training) -# and to apply EMA to (1.) 
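The exponential moving average kept by the EMA module that follows is just a repeated convex combination; a tiny sketch using the same update rule (mu is the mixing weight on the new value):

import torch

def ema_update(shadow, x, mu):
    # Same form as EMA.forward below: new = mu * x + (1 - mu) * shadow.
    return mu * x + (1.0 - mu) * shadow

shadow = torch.zeros(3)
target = torch.tensor([1.0, 2.0, 3.0])
for _ in range(50):
    shadow = ema_update(shadow, target, mu=0.1)
print(shadow)   # close to [1., 2., 3.] after repeated updates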
-class EMA(Module): - def __init__(self, mu): - super(EMA, self).__init__() - # decay parameter - self.mu = mu - - # parameters affected by EMA - self.shadow = {} - - # "default" parameters - self.default = {} - - # set parameters affected by EMA - def register_ema(self, name, val): - self.shadow[name] = val.clone() - - # set "default parameters - def register_default(self, name, val): - self.default[name] = val.clone() - - # return parameters affected by EMA - def get_ema(self, name): - assert name in self.shadow - return self.shadow[name].clone() - - # return "default" parameters - def get_default(self, name): - assert name in self.default - return self.default[name].clone() - - # apply exponential moving average on parameters stored in self.shadow - def forward(self, name, x): - assert name in self.shadow - new_average = self.mu * x + (1.0 - self.mu) * self.shadow[name] - self.shadow[name] = new_average.clone() - return new_average - -# PyTorch module that is used to only pass through values -class Pass(Module): - def __init__(self): - super(Pass, self).__init__() - - def forward(self, x): - return x - - def inverse(self, x): - return x - -# PyTorch module used to squeeze from [C, H, W] to [C * factor^2, H // factor, W // factor] -class Squeeze2d(Module): - def __init__(self, factor=2): - super(Squeeze2d, self).__init__() - assert factor >= 2 - self.factor = factor - - def forward(self, x): - if self.factor == 1: - return x - shape = x.shape - height = int(shape[2]) - width = int(shape[3]) - n_channels = int(shape[1]) - assert height % self.factor == 0 and width % self.factor == 0 - x = x.view(-1, n_channels, height//self.factor, self.factor, width//self.factor, self.factor) - x = x.permute(0, 1, 3, 5, 2, 4).contiguous() - x = x.view(-1, n_channels*self.factor*self.factor, height//self.factor, width // self.factor) - return x - - def extra_repr(self): - return 'factor={}'.format(self.factor) - -# PyTorch module used to squeeze from [C, H, W] to [C / factor^2, H * factor, W * factor] -class UnSqueeze2d(Module): - def __init__(self, factor=2): - super(UnSqueeze2d, self).__init__() - assert factor >= 2 - self.factor = factor - - def forward(self, x): - if self.factor == 1: - return x - shape = x.shape - height = int(shape[2]) - width = int(shape[3]) - n_channels = int(shape[1]) - x = x.view(-1, int(n_channels/self.factor**2), self.factor, self.factor, height, width) - x = x.permute(0, 1, 4, 2, 5, 3).contiguous() - x = x.view(-1, int(n_channels/self.factor**2), int(height*self.factor), int(width*self.factor)) - return x - - def extra_repr(self): - return 'factor={}'.format(self.factor) - -# PyTorch module used to build a ResNet layer -class ResNetLayer(Module): - def __init__(self, inchannels, outchannels, kernel_size=3, stride=1, padding=1, dropout_p=0., act=ELU()): - super(ResNetLayer, self).__init__() - self.inchannels = inchannels - self.outchannels = outchannels - self.dropout_p = dropout_p - self.stride = stride - self.act = act - - self.conv1 = WnConv2d(inchannels, outchannels, kernel_size=kernel_size, stride=1, - padding=padding, init_scale=1.0, loggain=True) - self.dropout = Dropout(dropout_p) - self.conv2 = WnConv2d(outchannels, outchannels, kernel_size=kernel_size, - stride=1, padding=padding, init_scale=0., loggain=False) - - def forward(self, x): - # first convolution preceded and followed by an activation - c1 = self.act(self.conv1(self.act(x))) - - # dropout layer - if self.dropout_p > 0.: - c1 = self.dropout(c1) - - # second convolution - c2 = self.conv2(c1) - - # residual 
connection - return x + c2 - -# PyTorch module used to build a sequence of ResNet layers -class ResNetBlock(Sequential): - def __init__(self, inchannels, outchannels, kernel_size=3, stride=1, padding=1, nlayers=1, dropout_p=0., - act=ELU()): - super(ResNetBlock, self).__init__() - for i in range(nlayers): - layer = ResNetLayer(inchannels, outchannels, kernel_size, stride, padding, dropout_p, act) - self.add_module('res{}layer{}'.format(inchannels, i + 1), layer) - -# PyTorch Dataset class custom built for the ImageNet dataset (including applying data pre-processing transforms) -class ImageNet(Dataset): - def __init__(self, root, file, transform=None): - self.transform = transform - self.dir = os.path.join(root, file) - self.dataset = np.load(self.dir) - - def __getitem__(self, index): - img = self.dataset[index] - - img = Image.fromarray(img) - if self.transform: - img = self.transform(img) - - return img - - def __len__(self): - return len(self.dataset) - - -def main(): - global _INIT_ENABLED - print('Outside:', _INIT_ENABLED) - with init_mode(): - print('Inside:', _INIT_ENABLED) - print('Outside:', _INIT_ENABLED) diff --git a/examples/2_hybrid_models/VAE/rand.py b/examples/2_hybrid_models/VAE/rand.py deleted file mode 100644 index c8b5628c..00000000 --- a/examples/2_hybrid_models/VAE/rand.py +++ /dev/null @@ -1,158 +0,0 @@ -import torch -import numpy as np -import sys -import os - -sys.path.append(os.path.dirname(__file__)) - -import modules - -# function to transform "noise" using a given mean and scale -def transform(eps, mu, scale): - sample = mu + scale * eps - return sample - -# function to sample from a Logistic distribution (mu=0, scale=1) -def logistic_eps(shape, device, bound=1e-5): - # sample from a Gaussian - u = torch.rand(shape, device=device) - - # clamp between two bounds to ensure numerical stability - u = torch.clamp(u, min=bound, max=1 - bound) - - # transform to a sample from the Logistic distribution - eps = torch.log(u) - torch.log1p(-u) - return eps - -# function to calculate the log-probability of x under a Logistic(mu, scale) distribution -def logistic_logp(mu, scale, x): - _y = -(x - mu) / scale - _logp = -_y - torch.log(scale) - 2 * modules.softplus(-_y) - logp = _logp.flatten(2) - return logp - -# function to calculate the log-probability of x under a discretized Logistic(mu, scale) distribution -# heavily based on discretized_mix_logistic_loss() in https://github.com/openai/pixel-cnn -def discretized_logistic_logp(mu, scale, x): - # [0,255] -> [-1.1] (this means bin sizes of 2./255.) - x_rescaled = (x - 127.5) / 127.5 - invscale = 1. / scale - - x_centered = x_rescaled - mu - - plus_in = invscale * (x_centered + 1. / 255.) - cdf_plus = torch.sigmoid(plus_in) - min_in = invscale * (x_centered - 1. / 255.) - cdf_min = torch.sigmoid(min_in) - - # log-probability for edge case of 0 - log_cdf_plus = plus_in - modules.softplus(plus_in) - - # log-probability for edge case of 255 - log_one_minus_cdf_min = - modules.softplus(min_in) - - # other cases - cdf_delta = cdf_plus - cdf_min - - mid_in = invscale * x_centered - - # log-probability in the center of the bin, to be used in extreme cases - log_pdf_mid = mid_in - torch.log(scale) - 2. 
* modules.softplus(mid_in) - - # now select the right output: left edge case, right edge case, normal case, extremely low-probability case - cond1 = torch.where(cdf_delta > 1e-5, torch.log(torch.clamp(cdf_delta, min=1e-12, max=None)), - log_pdf_mid - np.log(127.5)) - cond2 = torch.where(x_rescaled > .999, log_one_minus_cdf_min, cond1) - logps = torch.where(x_rescaled < -.999, log_cdf_plus, cond2) - - logp = logps.flatten(1) - return logp - -# function to calculate the CDF of the Logistic(mu, scale) distribution evaluated under x -def logistic_cdf(x, mu, scale): - return torch.sigmoid((x - mu) / scale) - -# function to calculate the inverse CDF (quantile function) of the Logistic(mu, scale) distribution evaluated under x -def logistic_icdf(p, mu, scale): - return mu + scale * torch.log(p / (1. - p)) - -# class that is used to determine endpoints and centers of discretization bins -# in which every bin has equal mass under some given Logistic(mu, scale) distribution. -# note: the first (-inf) and last (inf) endpoint are not created here, but rather -# accounted for in the compression/decompression loop -class Bins: - def __init__(self, mu, scale, precision): - # number of bits used - self.precision = precision - - # the resulting number of bins from the amount of bits used - self.nbins = 1 << precision - - # parameters of the Logistic distribution - self.mu, self.scale = mu, scale - - # datatype used - self.type = self.mu.dtype - - # device used (GPU/CPU) - self.device = self.mu.device - self.shape = list(self.mu.shape) - - def endpoints(self): - # first uniformly between [0,1] - # shape: [1 << bits] - endpoint_probs = torch.arange(1., self.nbins, dtype=self.type, device=self.device) / self.nbins - - # reshape - endpoint_probs = endpoint_probs[(None,) * len(self.shape)] # shape: [1, 1, 1< Epoch: {0} Average loss: {elbo:.4f}') - - -def train(model, pc, device, epoch, data_loader, optimizer, log_interval, schedule=True, decay=0.99995): - # convert model to train mode (activate Dropout etc.) - model.train() - - # get number of batches - nbatches = data_loader.batch_sampler.sampler.num_samples // data_loader.batch_size - - # setup training metrics - elbos = torch.zeros((nbatches), device=device) - logrecons = torch.zeros((nbatches), device=device) - logdecs = torch.zeros((nbatches, model.nz), device=device) - logencs = torch.zeros((nbatches, model.nz), device=device) - - start_time = time.time() - - # allocate memory for data - data = torch.zeros((data_loader.batch_size,) + model.xs, device=device) - - # enumerate over the batches - for batch_idx, (batch, _) in enumerate(data_loader): - # keep track of the global step - global_step = (epoch - 1) * len(data_loader) + (batch_idx + 1) - - # update the learning rate according to schedule - if schedule: - for param_group in optimizer.param_groups: - lr = param_group['lr'] - lr = lr_step(global_step, lr, decay=decay) - param_group['lr'] = lr - - # empty all the gradients stored - optimizer.zero_grad() - - # copy the mini-batch in the pre-allocated data-variable - data.copy_(batch) - - # evaluate the data under the model and calculate ELBO components - logrecon, logdec, logenc, zsamples = model.loss(data, pc) - - # free bits technique, in order to prevent posterior collapse - bits_pc = 1. 
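Note: the Bins class above builds discretization bins of equal probability mass by pushing evenly spaced probabilities through the Logistic quantile function (logistic_icdf). A minimal sketch of that construction with scalar parameters (the helper name is illustrative, not part of the patch):

# Hedged sketch: interior endpoints of 2**precision equal-mass bins under Logistic(mu, scale)
import torch

def equal_mass_endpoints(mu, scale, precision):
    nbins = 1 << precision
    p = torch.arange(1., nbins) / nbins          # interior probabilities 1/nbins, ..., (nbins-1)/nbins
    return mu + scale * torch.log(p / (1. - p))  # logistic_icdf applied elementwise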
- kl = torch.sum(torch.max(-logdec + logenc, bits_pc * torch.ones((model.nz, model.zdim[0]), device=device))) - - # compute the inference- and generative-model loss - logdec = torch.sum(logdec, dim=1) - logenc = torch.sum(logenc, dim=1) - - # construct ELBO - elbo = -logrecon + kl - - # scale by image dimensions to get "bits/dim" - elbo *= model.perdimsscale - logrecon *= model.perdimsscale - logdec *= model.perdimsscale - logenc *= model.perdimsscale - - # calculate gradients - elbo.backward() - - # take gradient step - total_norm = nn.utils.clip_grad_norm_(model.parameters(), 1., norm_type=2) - optimizer.step() - - # log - elbos[batch_idx] += elbo - logrecons[batch_idx] += logrecon - logdecs[batch_idx] += logdec - logencs[batch_idx] += logenc - - # log and save parameters - if batch_idx % log_interval == 0 and log_interval < nbatches: - # print metrics to console - print(f'Train Epoch: {epoch} [{batch_idx}/{nbatches} ({100. * batch_idx / len(data_loader):.0f}%)]\tLoss: {elbo.item():.6f}\tGnorm: {total_norm:.2f}\tSteps/sec: {(time.time() - start_time) / (batch_idx + 1):.3f}') - - entrecon = -logrecon - entdec = -logdec - entenc = -logenc - kl = entdec - entenc - - # print the average loss of the epoch to the console - elbo = torch.mean(elbos).detach().cpu().numpy() - print(f'====> Epoch: {epoch} Average loss: {elbo:.4f}') - - -def test(model, pc, device, epoch, data_loader, tag): - # convert model to evaluation mode (no Dropout etc.) - model.eval() - - # setup the reconstruction dataset - recon_dataset = None - nbatches = data_loader.batch_sampler.sampler.num_samples // data_loader.batch_size - recon_batch_idx = int(torch.Tensor(1).random_(0, nbatches - 1)) - - # setup testing metrics - logrecons = torch.zeros((nbatches), device=device) - logdecs = torch.zeros((nbatches, model.nz), device=device) - logencs = torch.zeros((nbatches, model.nz), device=device) - - elbos = [] - - # allocate memory for the input data - data = torch.zeros((data_loader.batch_size,) + model.xs, device=device) - - # enumerate over the batches - for batch_idx, (batch, _) in enumerate(data_loader): - # save batch for reconstruction - if batch_idx == recon_batch_idx: - recon_dataset = data - - # copy the mini-batch in the pre-allocated data-variable - data.copy_(batch) - - with torch.no_grad(): - # evaluate the data under the model and calculate ELBO components - logrecon, logdec, logenc, _ = model.loss(data, pc) - - # construct the ELBO - elbo = -logrecon + torch.sum(-logdec + logenc) - - # compute the inference- and generative-model loss - logdec = torch.sum(logdec, dim=1) - logenc = torch.sum(logenc, dim=1) - - # scale by image dimensions to get "bits/dim" - elbo *= model.perdimsscale - logrecon *= model.perdimsscale - logdec *= model.perdimsscale - logenc *= model.perdimsscale - - elbos.append(elbo.item()) - - # log - logrecons[batch_idx] += logrecon - logdecs[batch_idx] += logdec - logencs[batch_idx] += logenc - - elbo = np.mean(elbos) - - entrecon = -torch.mean(logrecons).detach().cpu().numpy() - entdec = -torch.mean(logdecs, dim=0).detach().cpu().numpy() - entenc = -torch.mean(logencs, dim=0).detach().cpu().numpy() - kl = entdec - entenc - - # print metrics to console and Tensorboard - print(f'\nEpoch: {epoch}\tTest loss: {elbo:.6f}') - -# learning rate schedule -def lr_step(step, curr_lr, decay=0.99995, min_lr=5e-4): - # only decay after certain point - # and decay down until minimal value - if curr_lr > min_lr: - curr_lr *= decay - return curr_lr - return curr_lr - - -if __name__ == '__main__': - # 
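Note: the free-bits clamp at the start of train() above keeps every latent dimension's KL contribution at or above bits_pc, which prevents the posterior from collapsing onto the prior. The torch.max expression there is equivalent to an elementwise clamp; a self-contained sketch with illustrative shapes:

# Hedged sketch: free bits as an elementwise lower bound on the per-latent KL
import torch

logdec = torch.randn(8, 1)   # illustrative [nz, zdim[0]] generative log-probs
logenc = torch.randn(8, 1)   # illustrative [nz, zdim[0]] inference log-probs
bits_pc = 1.
kl = torch.sum(torch.clamp(-logdec + logenc, min = bits_pc))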
hyperparameters, input from command line - parser = argparse.ArgumentParser() - parser.add_argument('--seed', default=99, type=int, help="seed for experiment reproducibility") - parser.add_argument('--nz', default=8, type=int, help="number of latent variables, greater or equal to 1") - parser.add_argument('--zchannels', default=1, type=int, help="number of channels for the latent variables") - parser.add_argument('--nprocessing', default=4, type=int, help='number of processing layers') - parser.add_argument('--gpu', default=0, type=int, help="number of gpu's to distribute optimization over") - parser.add_argument('--interval', default=100, type=int, help="interval for logging/printing of relevant values") - parser.add_argument('--epochs', default=10000000000, type=int, help="number of sweeps over the dataset (epochs)") - parser.add_argument('--blocks', default=8, type=int, help="number of ResNet blocks") - parser.add_argument('--width', default=64, type=int, help="number of channels in the convolutions in the ResNet blocks") - parser.add_argument('--dropout', default=0.2, type=float, help="dropout rate of the hidden units") - parser.add_argument('--kernel', default=3, type=int, help="size of the convolutional filter (kernel) in the ResNet blocks") - parser.add_argument('--batch', default=128, type=int, help="size of the mini-batch for gradient descent") - parser.add_argument('--lr', default=2e-3, type=float, help="learning rate gradient descent") - parser.add_argument('--schedule', default=1, type=float, help="learning rate schedule: yes (1) or no (0)") - parser.add_argument('--decay', default=0.9995, type=float, help="decay of the learning rate when using learning rate schedule") - parser.add_argument('--num-latents', default=1, type=int, help="") - - args = parser.parse_args() - print(args) # print all the hyperparameters - - # store hyperparameters in variables - seed = args.seed - epochs = args.epochs - batch_size = args.batch - nz = args.nz - zchannels = args.zchannels - nprocessing = args.nprocessing - gpu = args.gpu - blocks = args.blocks - width = args.width - log_interval = args.interval - dropout = args.dropout - kernel = args.kernel - lr = args.lr - schedule = True if args.schedule == 1 else False - decay = args.decay - assert nz > 0 - - # setup seeds to maintain experiment reproducibility - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - np.random.seed(seed) - torch.backends.cudnn.deterministic = True - - # set GPU/CPU options - use_cuda = torch.cuda.is_available() - cudastring = f"cuda:{gpu}" - device = torch.device(cudastring if use_cuda else "cpu") - - # set number of workers and pin the memory if we distribute over multiple gpu's - # (see Dataloader docs of PyTorch) - kwargs = {} - - # create class that scales up the data to [0,255] if called - class ToInt: - def __call__(self, pic): - return pic * 255 - - # set data pre-processing transforms - transform_ops = transforms.Compose([transforms.Pad(2), transforms.ToTensor(), ToInt()]) - - train_set = datasets.MNIST(root="examples/data", train=True, transform=transform_ops, download=True) - test_set = datasets.MNIST(root="examples/data", train=False, transform=transform_ops, download=True) - - # setup mini-batch enumerator for both train-set and test-set - train_loader = torch.utils.data.DataLoader( - dataset = train_set, - batch_size = batch_size, - shuffle = True, - drop_last = True, - ) - test_loader = torch.utils.data.DataLoader( - dataset = test_set, - batch_size = batch_size, - shuffle = True, - drop_last = True, 
- ) - - # store MNIST data shape - xs = (1, 32, 32) - - # build model from hyperparameters - model = VAE(xs=xs, - kernel_size=kernel, - nprocessing=nprocessing, - nz=nz, - zchannels=zchannels, - resdepth=blocks, - reswidth=width, - dropout_p=dropout, - tag="tag", - num_latents=args.num_latents).to(device) - - inputs = [juice.graph.InputRegionNode( - scope = [i], num_nodes = args.num_latents, node_type = DiscreteLogisticLayer, input_range = [0, 1], bin_count = 256 - ) for i in range(32*32)] - prods = [juice.graph.PartitionNode( - children = [inputs[i]], num_nodes = args.num_latents, edge_ids = torch.arange(0, args.num_latents).unsqueeze(1) - ) for i in range(32*32)] - sums = [juice.graph.InnerRegionNode( - children = [prods[i]], num_nodes = 1, edge_ids = torch.stack((torch.zeros([args.num_latents], dtype = torch.long), torch.arange(0, args.num_latents)), dim = 0) - ) for i in range(32*32)] - rnode = juice.graph.InnerRegionNode( - children = [juice.graph.PartitionNode( - children = sums, num_nodes = 1, edge_ids = torch.zeros([32*32]).unsqueeze(0) - )], - num_nodes = 1, edge_ids = torch.zeros([2, 1], dtype = torch.long) - ) - pc = juice.ProbCircuit(rnode) - pc.to(device) - - # set up Adam optimizer - nn_optim = optim.Adam(model.parameters(), lr = lr) - optimizer = juice.optim.CircuitOptimizer(pc, base_optimizer = nn_optim, lr = 0.1, pseudocount = 0.1) - - print("Data Dependent Initialization") - # data-dependent initialization - warmup(model, device, train_loader, 25) - - # do the training loop and run over the test-set 1/5 epochs. - print("Training") - for epoch in range(1, epochs + 1): - train(model, pc, device, epoch, train_loader, optimizer, log_interval, schedule, decay) - if epoch % 5 == 0: - test(model, pc, device, epoch, test_loader, tag) \ No newline at end of file diff --git a/examples/train_mnist_hclt.py b/examples/train_mnist_hclt.py new file mode 100644 index 00000000..e880864d --- /dev/null +++ b/examples/train_mnist_hclt.py @@ -0,0 +1,125 @@ +import pyjuice as juice +import torch +import torchvision +import time +from torch.utils.data import TensorDataset, DataLoader +import pyjuice.nodes.distributions as dists + + +def evaluate(pc, loader): + lls_total = 0.0 + for batch in loader: + x = batch[0].to(pc.device) + lls = pc(x) + lls_total += lls.mean().detach().cpu().numpy().item() + + lls_total /= len(loader) + return lls_total + + +def mini_batch_em_epoch(num_epochs, pc, optimizer, scheduler, train_loader, test_loader, device): + for epoch in range(num_epochs): + t0 = time.time() + train_ll = 0.0 + for batch in train_loader: + x = batch[0].to(device) + + optimizer.zero_grad() + + lls = pc(x) + lls.mean().backward() + + train_ll += lls.mean().detach().cpu().numpy().item() + + optimizer.step() + scheduler.step() + + train_ll /= len(train_loader) + + t1 = time.time() + test_ll = evaluate(pc, loader=test_loader) + t2 = time.time() + + print(f"[Epoch {epoch}/{num_epochs}][train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") + + +def full_batch_em_epoch(pc, train_loader, test_loader, device): + t0 = time.time() + train_ll = 0.0 + for batch in train_loader: + x = batch[0].to(device) + + lls = pc(x) + lls.mean().backward() + + train_ll += lls.mean().detach().cpu().numpy().item() + + pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1) + + train_ll /= len(train_loader) + + t1 = time.time() + test_ll = evaluate(pc, loader=test_loader) + t2 = time.time() + print(f"[train LL: {train_ll:.2f}; test LL: 
{test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") + + +def train_mnist_hclt(enable_cudagrph = True): + + device = torch.device("cuda:0") + + train_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = True, download = True) + test_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = False, download = True) + + train_data = train_dataset.data.reshape(60000, 28*28) + test_data = test_dataset.data.reshape(10000, 28*28) + + num_features = train_data.size(1) + + train_loader = DataLoader( + dataset = TensorDataset(train_data), + batch_size = 512, + shuffle = True, + drop_last = True + ) + test_loader = DataLoader( + dataset = TensorDataset(test_data), + batch_size = 512, + shuffle = False, + drop_last = True + ) + + ns = juice.structures.HCLT( + train_data.float().to(device), + num_bins = 32, + sigma = 0.5 / 32, + num_latents = 128, + chunk_size = 32 + ) + pc = juice.TensorCircuit(ns) + + pc.to(device) + + optimizer = juice.optim.CircuitOptimizer(pc, lr = 0.1, pseudocount = 0.1) + scheduler = juice.optim.CircuitScheduler( + optimizer, + method = "multi_linear", + lrs = [0.9, 0.1, 0.05], + milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] + ) + + if enable_cudagrph: + # Dry run to record CUDA graphs + for batch in train_loader: + x = batch[0].to(device) + + lls = pc(x, record_cudagraph = True) + lls.mean().backward() + break + + mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) + full_batch_em_epoch(pc, train_loader, test_loader, device) + + +if __name__ == "__main__": + train_mnist_hclt() diff --git a/examples/train_mnist_pd.py b/examples/train_mnist_pd.py new file mode 100644 index 00000000..a2639887 --- /dev/null +++ b/examples/train_mnist_pd.py @@ -0,0 +1,110 @@ +import pyjuice as juice +import torch +import torchvision +import time +from torch.utils.data import TensorDataset, DataLoader + + +def evaluate(pc, loader): + lls_total = 0.0 + for batch in loader: + x = batch[0].to(pc.device) + lls = pc(x) + lls_total += lls.mean().detach().cpu().numpy().item() + + lls_total /= len(loader) + return lls_total + + +def mini_batch_em_epoch(num_epochs, pc, optimizer, scheduler, train_loader, test_loader, device): + for epoch in range(num_epochs): + t0 = time.time() + train_ll = 0.0 + for batch in train_loader: + x = batch[0].to(device) + + optimizer.zero_grad() + + lls = pc(x) + lls.mean().backward() + + train_ll += lls.mean().detach().cpu().numpy().item() + + optimizer.step() + if scheduler is not None: + scheduler.step() + + train_ll /= len(train_loader) + + t1 = time.time() + test_ll = evaluate(pc, loader=test_loader) + t2 = time.time() + + print(f"[Epoch {epoch}/{num_epochs}][train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") + + +def full_batch_em_epoch(pc, train_loader, test_loader, device): + with torch.no_grad(): + t0 = time.time() + train_ll = 0.0 + for batch in train_loader: + x = batch[0].to(device) + + lls = pc(x) + pc.backward(x, flows_memory = 1.0) + + train_ll += lls.mean().detach().cpu().numpy().item() + + pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1) + + train_ll /= len(train_loader) + + t1 = time.time() + test_ll = evaluate(pc, loader=test_loader) + t2 = time.time() + print(f"[train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") + + +def train_mnist_pd(): + + device = torch.device("cuda:0") + + 
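Note: both example scripts above follow the same two-phase recipe, namely many epochs of mini-batch EM (forward pass, lls.mean().backward(), optimizer.step()) followed by one full-batch EM pass in which flows from the whole training set are accumulated before a single closed-form update. A condensed sketch of the full-batch phase, mirroring full_batch_em_epoch() (assumes a compiled TensorCircuit pc and a train_loader as defined above):

# Hedged sketch of one full-batch EM update
import torch

with torch.no_grad():
    for batch in train_loader:
        x = batch[0].to(pc.device)
        lls = pc(x)                             # per-sample log-likelihoods
        pc.backward(x, flows_memory = 1.0)      # accumulate EM flows over the full epoch
    pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1)  # single parameter update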
train_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = True, download = True) + test_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = False, download = True) + + train_data = train_dataset.data.reshape(60000, 28*28) + test_data = test_dataset.data.reshape(10000, 28*28) + + num_features = train_data.size(1) + + train_loader = DataLoader( + dataset = TensorDataset(train_data), + batch_size = 512, + shuffle = True, + drop_last = True + ) + test_loader = DataLoader( + dataset = TensorDataset(test_data), + batch_size = 512, + shuffle = False, + drop_last = True + ) + + ns = juice.structures.PD( + data_shape = (28, 28), + num_latents = 128, + split_intervals = (4, 4), + structure_type = "sum_dominated" + ) + pc = juice.TensorCircuit(ns) + + pc.to(device) + + optimizer = juice.optim.CircuitOptimizer(pc, lr = 0.1, pseudocount = 0.0001) + + mini_batch_em_epoch(350, pc, optimizer, None, train_loader, test_loader, device) + full_batch_em_epoch(pc, train_loader, test_loader, device) + + +if __name__ == "__main__": + train_mnist_pd() diff --git a/tests/structures/pd_test.py b/tests/structures/pd_test.py index 1ca1b9db..90164073 100644 --- a/tests/structures/pd_test.py +++ b/tests/structures/pd_test.py @@ -109,21 +109,22 @@ def pd_test(): # lls.mean().backward() # break - from torch.profiler import profile, record_function, ProfilerActivity - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: - for i, batch in enumerate(train_loader): - x = batch[0].to(device) - - lls = pc(x, record_cudagraph = False) - lls.mean().backward() - if i > 10: - break - - prof.export_chrome_trace("trace3.json") - # torch.autograd.profiler.tensorboard_trace_to_flame_graph('trace.json', 'flamegraph.svg') - # prof.export_stacks("trace.txt", "cpu_time_total") - import pdb; pdb.set_trace() - exit() + # from torch.profiler import profile, record_function, ProfilerActivity + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: + # for i, batch in enumerate(train_loader): + # x = batch[0].to(device) + + # lls = pc(x, record_cudagraph = False) + # lls.mean().backward() + # pc.mini_batch_em(step_size = 0.1, pseudocount = 0.01) + # if i > 10: + # break + + # prof.export_chrome_trace("trace3.json") + # # torch.autograd.profiler.tensorboard_trace_to_flame_graph('trace.json', 'flamegraph.svg') + # # prof.export_stacks("trace.txt", "cpu_time_total") + # import pdb; pdb.set_trace() + # exit() mini_batch_em_epoch(350, pc, optimizer, None, train_loader, test_loader, device) full_batch_em_epoch(pc, train_loader, test_loader, device) From 6d57b95e22b99dbfc87f3f9576d1ea97ed15dec0 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 28 Dec 2023 18:20:21 +0800 Subject: [PATCH 116/162] homogeneous hmm tests --- src/pyjuice/layer/input_layer.py | 5 +- tests/model/homogeneous_hmm_test.py | 220 ++++++++++++++++++++++++++++ 2 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 tests/model/homogeneous_hmm_test.py diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index 63e21eaa..d6d440a7 100644 --- a/src/pyjuice/layer/input_layer.py +++ b/src/pyjuice/layer/input_layer.py @@ -413,7 +413,7 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): num_coalesced_groups = num_coalesced_groups, num_par_flows = num_par_flows, BLOCK_M = BLOCK_M, - BLOCK_N = BLOCK_N, + BLOCK_N = BLOCK_N ) else: raise NotImplementedError("Unsupported number of coalesced parameter 
flows.") @@ -438,7 +438,8 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): source_nids_ptr = self.source_nids, constexprs_ptr = constexprs, layer_num_source_nodes = layer_num_source_nodes, - BLOCK_SIZE = 1024 + BLOCK_SIZE = 1024, + num_warps = 8 ) else: diff --git a/tests/model/homogeneous_hmm_test.py b/tests/model/homogeneous_hmm_test.py new file mode 100644 index 00000000..b07ffa01 --- /dev/null +++ b/tests/model/homogeneous_hmm_test.py @@ -0,0 +1,220 @@ +import pyjuice as juice +import torch +import numpy as np + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs, set_group_size +from pyjuice.model import TensorCircuit +from pyjuice.model.backend import compute_cum_par_flows, em_par_update + +import pytest + + +def homogeneous_hmm_test(): + + group_size = 1 + + with set_group_size(group_size = group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 5)) + ni1 = ni0.duplicate(1, tie_params = True) + ni2 = ni0.duplicate(2, tie_params = True) + ni3 = ni0.duplicate(3, tie_params = True) + + np01 = multiply(ni0, ni1) + ns01 = summate(np01, num_node_groups = 2) + + np012 = multiply(ns01, ni2) + ns012 = ns01.duplicate(np012, tie_params = True) + + np0123 = multiply(ns012, ni3) + ns0123 = ns012.duplicate(np0123, tie_params = True) + + ns0123.init_parameters() + + pc = TensorCircuit(ns0123, max_tied_ns_per_parflow_group = 2) + + device = torch.device("cuda:0") + + ## Compilation tests ## + + assert torch.all(pc.input_layer_group[0].vids == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3]).reshape(8, 1)) + assert torch.all(pc.input_layer_group[0].s_pids == torch.tensor([0, 5, 0, 5, 0, 5, 0, 5])) + assert torch.all(pc.input_layer_group[0].s_pfids == torch.tensor([0, 5, 0, 5, 10, 15, 10, 15])) + assert torch.all(pc.input_layer_group[0].metadata == torch.tensor([5.0, 5.0, 5.0, 5.0])) + assert torch.all(pc.input_layer_group[0].s_mids == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])) + assert torch.all(pc.input_layer_group[0].source_nids == torch.tensor([0, 1])) + assert pc.input_layer_group[0]._output_ind_range[0] == 1 + assert pc.input_layer_group[0]._output_ind_range[1] == 9 + + assert torch.all(pc.inner_layer_groups[0][0].partitioned_nids[0] == torch.tensor([1, 2])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_cids[0] == torch.tensor([[1, 3], [2, 4]])) + + assert torch.all(pc.inner_layer_groups[0][0].partitioned_u_cids[0] == torch.tensor([1, 2, 3, 4])) + assert torch.all(pc.inner_layer_groups[0][0].partitioned_parids[0] == torch.tensor([[1], [2], [1], [2]])) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_nids[0] == torch.tensor([9, 10])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_cids[0] == torch.tensor([[1, 2], [1, 2]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pids[0] == torch.tensor([[1, 2], [3, 4]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_pfids[0] == torch.tensor([[0, 1], [2, 3]])) + + assert torch.all(pc.inner_layer_groups[1][0].partitioned_chids[0] == torch.tensor([1, 2])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parids[0] == torch.tensor([[9, 10], [9, 10]])) + assert torch.all(pc.inner_layer_groups[1][0].partitioned_parpids[0] == torch.tensor([[1, 3], [2, 4]])) + + assert torch.all(pc.inner_layer_groups[2][0].partitioned_nids[0] == torch.tensor([1, 2])) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_cids[0] == torch.tensor([[9, 5], [10, 6]])) + + assert 
torch.all(pc.inner_layer_groups[2][0].partitioned_u_cids[0] == torch.tensor([5, 6, 9, 10])) + assert torch.all(pc.inner_layer_groups[2][0].partitioned_parids[0] == torch.tensor([[1], [2], [1], [2]])) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_nids[0] == torch.tensor([11, 12])) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_cids[0] == torch.tensor([[1, 2], [1, 2]])) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pids[0] == torch.tensor([[1, 2], [3, 4]])) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_pfids[0] == torch.tensor([[0, 1], [2, 3]])) + + assert torch.all(pc.inner_layer_groups[3][0].partitioned_chids[0] == torch.tensor([1, 2])) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_parids[0] == torch.tensor([[11, 12], [11, 12]])) + assert torch.all(pc.inner_layer_groups[3][0].partitioned_parpids[0] == torch.tensor([[1, 3], [2, 4]])) + + assert torch.all(pc.inner_layer_groups[4][0].partitioned_nids[0] == torch.tensor([1, 2])) + assert torch.all(pc.inner_layer_groups[4][0].partitioned_cids[0] == torch.tensor([[11, 7], [12, 8]])) + + assert torch.all(pc.inner_layer_groups[4][0].partitioned_u_cids[0] == torch.tensor([7, 8, 11, 12])) + assert torch.all(pc.inner_layer_groups[4][0].partitioned_parids[0] == torch.tensor([[1], [2], [1], [2]])) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_nids[0] == torch.tensor([13, 14])) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_cids[0] == torch.tensor([[1, 2], [1, 2]])) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_pids[0] == torch.tensor([[1, 2], [3, 4]])) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_pfids[0] == torch.tensor([[4, 5], [6, 7]])) + + assert torch.all(pc.inner_layer_groups[5][0].partitioned_chids[0] == torch.tensor([1, 2])) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_parids[0] == torch.tensor([[13, 14], [13, 14]])) + assert torch.all(pc.inner_layer_groups[5][0].partitioned_parpids[0] == torch.tensor([[1, 3], [2, 4]])) + + pc.to(device) + + ## Forward tests ## + + data = torch.randint(0, 5, [16, 4]).to(device) + data_cpu = data.cpu() + + lls = pc(data) + + node_mars = pc.node_mars.detach().cpu() + params = pc.params.detach().cpu() + + params0 = ni0._params.reshape(2, 5) + + assert torch.all(torch.abs(node_mars[1,:].exp() - params0[0, data_cpu[:,0]]) < 1e-4) + assert torch.all(torch.abs(node_mars[2,:].exp() - params0[1, data_cpu[:,0]]) < 1e-4) + + assert torch.all(torch.abs(node_mars[3,:].exp() - params0[0, data_cpu[:,1]]) < 1e-4) + assert torch.all(torch.abs(node_mars[4,:].exp() - params0[1, data_cpu[:,1]]) < 1e-4) + + assert torch.all(torch.abs(node_mars[5,:].exp() - params0[0, data_cpu[:,2]]) < 1e-4) + assert torch.all(torch.abs(node_mars[6,:].exp() - params0[1, data_cpu[:,2]]) < 1e-4) + + assert torch.all(torch.abs(node_mars[7,:].exp() - params0[0, data_cpu[:,3]]) < 1e-4) + assert torch.all(torch.abs(node_mars[8,:].exp() - params0[1, data_cpu[:,3]]) < 1e-4) + + params1 = ns01.get_source_ns()._params.reshape(2, 2) + + np01_lls = node_mars[1:3,:] + node_mars[3:5,:] + ns01_lls = torch.matmul(params1, np01_lls.exp()).log() + assert torch.all(torch.abs(node_mars[9:11,:] - ns01_lls) < 1e-4) + + np012_lls = node_mars[5:7,:] + node_mars[9:11,:] + ns012_lls = torch.matmul(params1, np012_lls.exp()).log() + assert torch.all(torch.abs(node_mars[11:13,:] - ns012_lls) < 1e-4) + + np0123_lls = node_mars[7:9,:] + node_mars[11:13,:] + ns0123_lls = torch.matmul(params1, np0123_lls.exp()).log() + assert 
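Note: the forward-pass assertions here verify the homogeneous-HMM forward recursion: each tied sum layer maps the previous marginals and the next emission to a matrix product in probability space, stored back in log space. A small reference sketch of one recursion step (illustrative function name):

# Hedged sketch: one log-space forward step, matching the torch.matmul(...).log() checks above
import torch

def hmm_forward_step(trans, log_alpha_prev, log_emission):
    # trans: [S, S]; log_alpha_prev, log_emission: [S, B] (per-state, per-sample)
    return torch.log(trans @ (log_alpha_prev + log_emission).exp())
    # the Triton kernels compute the same quantity with a running max subtracted for stability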
torch.all(torch.abs(node_mars[13:15,:] - ns0123_lls) < 1e-4) + + ## Backward tests ## + + pc.backward(data.permute(1, 0), allow_modify_flows = False) + + node_flows = pc.node_flows.detach().cpu().clone() + param_flows = pc.param_flows.detach().cpu().clone() + + assert torch.all(torch.abs(node_flows[13:15,:] - 1.0) < 1e-4) + + pc.inner_layer_groups[4][0](pc.node_mars, pc.element_mars, _for_backward = True) + pc.inner_layer_groups[5][0].backward( + pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, + pc.params, pc.param_flows + ) + element_flows = pc.element_flows.detach().cpu() + + np0123_flows = torch.matmul(params1.permute(1, 0), 1.0 / ns0123_lls.exp()) * np0123_lls.exp() + assert torch.all(torch.abs(element_flows[1:3,:] - np0123_flows) < 1e-4) + + param_flows1 = torch.matmul(1.0 / ns0123_lls.exp(), np0123_lls.exp().permute(1, 0)) * params1 + + ni3_flows = element_flows[1:3,:] + ns012_flows = element_flows[1:3,:] + assert torch.all(torch.abs(node_flows[11:13,:] - ns012_flows) < 1e-4) + + pc.inner_layer_groups[2][0](pc.node_mars, pc.element_mars, _for_backward = True) + pc.inner_layer_groups[3][0].backward( + pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, + pc.params, pc.param_flows + ) + element_flows = pc.element_flows.detach().cpu() + + np012_flows = torch.matmul(params1.permute(1, 0), ns012_flows / ns012_lls.exp()) * np012_lls.exp() + assert torch.all(torch.abs(element_flows[1:3,:] - np012_flows) < 1e-4) + + param_flows1 += torch.matmul(ns012_flows / ns012_lls.exp(), np012_lls.exp().permute(1, 0)) * params1 + + ni2_flows = element_flows[1:3,:] + ns01_flows = element_flows[1:3,:] + assert torch.all(torch.abs(node_flows[9:11,:] - ns01_flows) < 1e-4) + + pc.inner_layer_groups[0][0](pc.node_mars, pc.element_mars, _for_backward = True) + pc.inner_layer_groups[1][0].backward( + pc.node_flows, pc.element_flows, pc.node_mars, pc.element_mars, + pc.params, pc.param_flows + ) + element_flows = pc.element_flows.detach().cpu() + + np01_flows = torch.matmul(params1.permute(1, 0), ns01_flows / ns01_lls.exp()) * np01_lls.exp() + assert torch.all(torch.abs(element_flows[1:3,:] - np01_flows) < 1e-4) + + param_flows1 += torch.matmul(ns01_flows / ns01_lls.exp(), np01_lls.exp().permute(1, 0)) * params1 + + ni0_flows = element_flows[1:3,:] + ni1_flows = element_flows[1:3,:] + + assert torch.all(torch.abs(param_flows1.reshape(-1) - (param_flows[0:4] + param_flows[4:8])) < 1e-4) + + ## Parameter learning & flow aggregation tests ## + + temp_param_flows = param_flows.clone().to(device) + + compute_cum_par_flows(temp_param_flows, pc.parflow_fusing_kwargs) + + assert torch.all(torch.abs(param_flows1.reshape(-1) - temp_param_flows[0:4].cpu()) < 1e-4) + + gt_param_flows0 = pc.input_layer_group[0].param_flows[0:10] + pc.input_layer_group[0].param_flows[10:20] + param_flows0 = torch.zeros([2, 5]) + + for i in range(16): + param_flows0[0, data_cpu[i,0]] += ni0_flows[0,i] + param_flows0[1, data_cpu[i,0]] += ni0_flows[1,i] + param_flows0[0, data_cpu[i,1]] += ni1_flows[0,i] + param_flows0[1, data_cpu[i,1]] += ni1_flows[1,i] + param_flows0[0, data_cpu[i,2]] += ni2_flows[0,i] + param_flows0[1, data_cpu[i,2]] += ni2_flows[1,i] + param_flows0[0, data_cpu[i,3]] += ni3_flows[0,i] + param_flows0[1, data_cpu[i,3]] += ni3_flows[1,i] + + assert torch.all(torch.abs(param_flows0.reshape(-1) - gt_param_flows0.cpu()) < 1e-4) + + +if __name__ == "__main__": + torch.manual_seed(2390) + homogeneous_hmm_test() From d34db4b9936b48c0e29961ad20536e57f31a97e4 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 
28 Dec 2023 18:24:11 +0800 Subject: [PATCH 117/162] version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fd45b996..c8b9cd88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pyjuice" -version="0.0.1" +version="2.0.0" description = "Probabilistic Circuits Library" dependencies = [ "numpy", From 6cc50977b27285346aab4e17a7e83359816aba07 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Thu, 28 Dec 2023 22:08:34 +0800 Subject: [PATCH 118/162] fix compilation bug caused by triton-nightly --- src/pyjuice/layer/compilation.py | 16 ++-- src/pyjuice/layer/sum_layer.py | 130 +++++++++++++++---------------- 2 files changed, 70 insertions(+), 76 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 5a5e528b..eb06a39e 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -186,9 +186,9 @@ def _assign_chid_kernel(chs_offsets, ns_nchs, edge_ids): def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, target_cids_ptr, pcids_partition_start_ptr, target_pids_ptr, target_pfids_ptr, edge_ids_ptr, chs_offsets_ptr, n_partition_ids_ptr, n_id_in_partition_ptr, cs_ele_id_start_ptr, cs_node_cum_ids_ptr, fw_partition_max_chs_ptr, - cum_n_chs_ptr, ns_param_ids_ptr, ns_param_flow_ids_ptr, constexprs_ptr, num_chs: tl.constexpr, - num_chs_np2: tl.constexpr, add_params_flag: tl.constexpr, add_param_flows_flag: tl.constexpr, - BLOCK_SIZE: tl.constexpr): + cum_n_chs_ptr, ns_param_ids_ptr, ns_param_flow_ids_ptr, cid_node_id_ptr, constexprs_ptr, + num_chs: tl.constexpr, num_chs_np2: tl.constexpr, add_params_flag: tl.constexpr, + add_param_flows_flag: tl.constexpr, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) block_start = pid * BLOCK_SIZE @@ -217,9 +217,8 @@ def _assign_target_ncpids_kernel(target_nids_ptr, nids_partition_start_ptr, targ cs_offsets = tl.arange(0, num_chs_np2) cs_node_cum_ids = tl.load(cs_node_cum_ids_ptr + cs_offsets, mask = (cs_offsets < num_chs), other = 0) - cid_node_id = tl.sum(tl.broadcast_to(cid[:,None], (BLOCK_SIZE, num_chs_np2)) >= \ - tl.broadcast_to(cs_node_cum_ids[None,:], (BLOCK_SIZE, num_chs_np2)), axis = 1) - \ - (1 + num_chs_np2 - num_chs) + # Get the `cs` indices the edges belong to + cid_node_id = tl.load(cid_node_id_ptr + offsets, mask = mask, other = 0) cs_cum_num = tl.load(cs_node_cum_ids_ptr + cid_node_id, mask = mask, other = 0) cs_ele_ind = tl.load(cs_ele_id_start_ptr + cid_node_id, mask = mask, other = 0) @@ -438,6 +437,9 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, cum_n_chs = cum_n_chs.to(device) pcids_partition_start = pcids_partition_start.to(device) + # Which `cs` are the edges pointing to + cid_node_id = (edge_ids[1,:].unsqueeze(1) >= cs_node_cum_ids[None,:]).sum(dim = 1) - 1 + # We store these constants in a tensor and retrieve them in the kernel # This is to avoid `triton` from compiling separate kernels for every layer configuration # Saves 99.9% compilation time :) @@ -452,7 +454,7 @@ def sum_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, target_nids, nids_partition_start, target_cids, pcids_partition_start, target_pids, target_pfids, edge_ids, chs_offsets, n_partition_ids, n_id_in_partition, cs_ele_id_start, cs_node_cum_ids, fw_partition_max_chs, - cum_n_chs, ns_param_ids, ns_param_flow_ids, constexprs, ns.num_chs, num_chs_np2, + cum_n_chs, 
ns_param_ids, ns_param_flow_ids, cid_node_id, constexprs, ns.num_chs, num_chs_np2, add_params_flag, add_param_flows_flag, BLOCK_SIZE = min(2048, 2**20 // num_chs_np2) ) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index 422196ea..b60aba57 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -69,7 +69,7 @@ def __init__(self, nodes: Sequence[SumNodes], global_nid_start: int, self.num_fw_partitions = len(fw_partition_max_chs) # Number of groups - # fw_n_partition_ids: [num_ngroups] stores the partition id for each node node + # fw_n_partition_ids: [num_ngroups] stores the partition id for each node group # fw_n_id_in_partition: [num_ngroups] stores the index of the node groups in the partition # fw_num_ngs_in_partition: [num_fw_partitions] number of node groups in each partition fw_n_partition_ids = torch.zeros([layer_num_ngroups], dtype = torch.long) @@ -207,7 +207,8 @@ def num_parameters(self): def num_param_flows(self): return self._layer_pfid_range[1] - self._layer_pfid_range[0] - def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor) -> None: + def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, + force_use_fp16: bool = False, force_use_fp32: bool = False) -> None: """ Computes the forward pass of a sum layer. @@ -225,7 +226,9 @@ def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: t pids = self.partitioned_pids[partition_id] self._forward( - node_mars, element_mars, params, nids, cids, pids, partition_id = partition_id + node_mars, element_mars, params, nids, cids, pids, + partition_id = partition_id, force_use_fp16 = force_use_fp16, + force_use_fp32 = force_use_fp32 ) else: @@ -239,7 +242,8 @@ def forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: t self._forward( node_mars, element_mars, params, nids, cids, pids, local_ids = local_ids, - partition_id = partition_id + partition_id = partition_id, force_use_fp16 = force_use_fp16, + force_use_fp32 = force_use_fp32 ) return None @@ -339,7 +343,8 @@ def backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, - partition_id: int = -1, mode: Optional[str] = None) -> None: + partition_id: int = -1, mode: Optional[str] = None, + force_use_fp16: bool = False, force_use_fp32: bool = False) -> None: """ Forward pass of sum layers. 
@@ -374,7 +379,8 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, if mode == self.BLOCK_SPARSE: self._forward_block_sparse( node_mars, element_mars, params, nids, cids, pids, local_ids, - partition_id = partition_id + partition_id = partition_id, force_use_fp16 = force_use_fp16, + force_use_fp32 = force_use_fp32 ) elif mode == self.SPARSE: @@ -521,8 +527,8 @@ def _fw_triton_block_sparse_csmm_kernel(node_mars, element_mars, params, nids, c # Initialize pointers to `element_mars` edge_start = tl.load(cids_start + offs_estart) emars_ptr = element_mars + \ - edge_start[None,:] * batch_size + \ - offs_batch[:,None] # [BLOCK_B, TILE_SIZE_K] + edge_start[:,None] * batch_size + \ + offs_batch[None,:] # [TILE_SIZE_K, BLOCK_B] # Batch increment pointers pids_inc_ptr = pids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge @@ -533,23 +539,23 @@ def _fw_triton_block_sparse_csmm_kernel(node_mars, element_mars, params, nids, c for k in range(0, K_NUM_TILES): epars = tl.load(epars_ptr) - emars = tl.load(emars_ptr, mask = mask_batch[:,None]) + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) - emars_max = tl.max(emars, axis = 1) + emars_max = tl.max(emars, axis = 0)[None,:] emars_sub = tl.where(emars_max != -float("inf"), tl.exp(emars - emars_max), 0.0) if use_fp16 == 1: # Simulated matmul kernel + float16 epars = (epars * (2**12)).to(tl.float16) emars = emars.to(tl.float16) - nmars = tl.sum(epars[:,:,None] * tl.trans(emars_sub)[None,:,:], axis = 1).to(tl.float32) / (2**12) + nmars = tl.sum(epars[:,:,None] * emars_sub[None,:,:], axis = 1).to(tl.float32) / (2**12) else: # Simulated matmul kernel + float32 - nmars = tl.sum(epars[:,:,None] * tl.trans(emars_sub)[None,:,:], axis = 1) + nmars = tl.sum(epars[:,:,None] * emars_sub[None,:,:], axis = 1) - acc = tl.where(emars_max[None,:] > acc, - tl.log(nmars + tl.exp(acc - emars_max[None,:])) + emars_max[None,:], - tl.log(tl.exp(emars_max[None,:] - acc) * nmars + 1.0) + acc + acc = tl.where(emars_max > acc, + tl.log(nmars + tl.exp(acc - emars_max)) + emars_max, + tl.log(tl.exp(emars_max - acc) * nmars + 1.0) + acc ) # Increment `epars_ptr` @@ -559,7 +565,7 @@ def _fw_triton_block_sparse_csmm_kernel(node_mars, element_mars, params, nids, c # Increment `emars_ptr` cids_inc = tl.load(cids_inc_ptr) - emars_ptr += cids_inc[None,:] * batch_size + emars_ptr += cids_inc[:,None] * batch_size cids_inc_ptr += TILE_SIZE_K # Write back @@ -641,7 +647,7 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten elif force_use_fp32: use_fp16 = False else: - if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: + if TILE_SIZE_M >= 8 and TILE_SIZE_K >= 8 and BLOCK_B >= 8: use_fp16 = True else: use_fp16 = False @@ -668,36 +674,6 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten GROUP_SIZE_M = GROUP_SIZE_M, use_fp16 = use_fp16 ) - - # if node_mars.isnan().any(): - # import pdb; pdb.set_trace() - - # import numpy as np - - # np.savez("temp.npz", - # node_mars = node_mars.detach().cpu().numpy(), - # element_mars = element_mars.detach().cpu().numpy(), - # params = params.detach().cpu().numpy(), - # nids = nids.detach().cpu().numpy(), - # cids = cids.detach().cpu().numpy(), - # cids_start = cids_start.detach().cpu().numpy(), - # cids_increment = cids_increment.detach().cpu().numpy(), - # pids = pids.detach().cpu().numpy(), - # pids_start = pids_start.detach().cpu().numpy(), - # pids_increment = pids_increment.detach().cpu().numpy(), - # batch_size = batch_size, - # 
partial_eval = partial_eval, - # BLOCK_B = BLOCK_B, - # TILE_SIZE_K = TILE_SIZE_K, - # K_NUM_TILES = K_NUM_TILES, - # TILE_SIZE_M = TILE_SIZE_M, - # GROUP_SIZE_M = GROUP_SIZE_M, - # use_fp16 = use_fp16, - # layer_n_nodes = layer_n_nodes - # ) - - # import numpy as np - else: self._fw_triton_block_sparse_csmm_kernel[grid]( node_mars, @@ -1047,8 +1023,8 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele chids, parids_start, parids_increment, parpids_start, parpids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, allow_modify_flows: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, - K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, GROUP_SIZE_K: tl.constexpr): + K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, + GROUP_SIZE_K: tl.constexpr, TL_DOT: tl.constexpr): pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -1101,25 +1077,27 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] log_n_fdm = tl.log(nflows) - nmars - log_n_fdm_max = tl.max(log_n_fdm, axis = 0) - n_fdm_sub = tl.where(log_n_fdm_max[None,:] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[None,:]), 0.0) + log_n_fdm_max = tl.max(log_n_fdm, axis = 0)[None,:] + n_fdm_sub = tl.where(log_n_fdm_max != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max), 0.0) - partial_flows = tl.dot(epars, n_fdm_sub) - # partial_flows = tl.sum(epars[:,:,None] * n_fdm_sub[None,:,:], axis = 1) + if TL_DOT == 1: + partial_flows = tl.dot(epars, n_fdm_sub) + else: + partial_flows = tl.sum(epars[:,:,None] * n_fdm_sub[None,:,:], axis = 1) - neginf_flag = (log_n_fdm_max[None,:] == -float("inf")) & (acc == -float("inf")) - acc = tl.where(log_n_fdm_max[None,:] > acc, - tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + log_n_fdm_max[None,:], - tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc + acc = tl.where(log_n_fdm_max == acc, + acc + 0.69314718056, # log(2) + tl.where(log_n_fdm_max > acc, + tl.log(partial_flows + tl.exp(acc - log_n_fdm_max)) + log_n_fdm_max, + tl.log(tl.exp(log_n_fdm_max - acc) * partial_flows + 1.0) + acc + ) ) - acc = tl.where(neginf_flag, -float("inf"), acc) - # acc = tl.where(log_n_fdm_max[None,:] == acc, - # acc + 0.69314718056, # log(2) - # tl.where(log_n_fdm_max[None,:] > acc, - # tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + log_n_fdm_max[None,:], - # tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc - # ) + # neginf_flag = (log_n_fdm_max == -float("inf")) & (acc == -float("inf")) + # acc = tl.where(log_n_fdm_max > acc, + # tl.log(partial_flows + tl.exp(acc - log_n_fdm_max)) + log_n_fdm_max, + # tl.log(tl.exp(log_n_fdm_max - acc) * partial_flows + 1.0) + acc # ) + # acc = tl.where(neginf_flag, -float("inf"), acc) # Increment `epars_ptr` parpids_inc = tl.load(parpids_inc_ptr) @@ -1214,6 +1192,11 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo GROUP_SIZE_K = self.group_size allow_modify_flows = 1 if allow_modify_flows else 0 + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and BLOCK_B >= 16: + TL_DOT = 1 + else: + TL_DOT = 0 + grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) self._bk_triton_block_sparse_ele_kernel[grid]( @@ -1238,8 +1221,9 @@ def 
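Note: every `acc = tl.where(...)` update in these kernels is an instance of the same online log-sum-exp pattern: each tile contributes a partial sum computed relative to its own maximum, and the accumulator folds it in without leaving log space. An equivalent PyTorch sketch of a single update (illustrative function name; the kernels additionally special-case the all-negative-infinity situation, where the naive formula would produce NaNs):

# Hedged sketch: fold one tile's partial result into a running log-space accumulator
import torch

def logsumexp_accumulate(acc, tile_max, tile_partial):
    # acc: running log-space total (tensor), initialized to -inf before the first tile
    # tile_partial: sum_k w_k * exp(x_k - tile_max) for the current tile
    return torch.where(tile_max > acc,
                       torch.log(tile_partial + torch.exp(acc - tile_max)) + tile_max,
                       torch.log(torch.exp(tile_max - acc) * tile_partial + 1.0) + acc)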
_backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo TILE_SIZE_M = TILE_SIZE_M, GROUP_SIZE_M = GROUP_SIZE_M, GROUP_SIZE_K = GROUP_SIZE_K, + TL_DOT = TL_DOT, num_warps = 2, # TODO: test for different devices - num_stages = 2 + num_stages = 1 ) return None @@ -1250,7 +1234,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, batch_size: tl.constexpr, num_edges: tl.constexpr, allow_modify_flows: tl.constexpr, TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, - TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr): + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, TL_DOT: tl.constexpr): pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes @@ -1294,7 +1278,11 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para scaled_emars = tl.exp(emars + log_n_fdm_max[:,None]) - partial_flows = tl.dot(n_fdm_sub, scaled_emars) + if TL_DOT == 1: + partial_flows = tl.dot(n_fdm_sub, scaled_emars) + else: + partial_flows = tl.sum(n_fdm_sub[:,:,None] * scaled_emars[None,:,:], axis = 1) + acc += partial_flows # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` @@ -1362,6 +1350,11 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor "This is an internal error of PyJuice. Please consider checking the kernel dispatching criterions and use the " \ "corresponding sparse kernel instead." + if TILE_SIZE_M >= 16 and TILE_SIZE_K >= 16 and TILE_SIZE_B >= 16: + TL_DOT = 1 + else: + TL_DOT = 0 + grid = (triton.cdiv(num_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) self._bk_triton_block_sparse_par_kernel[grid]( @@ -1382,8 +1375,7 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor TILE_SIZE_K = TILE_SIZE_K, TILE_SIZE_M = TILE_SIZE_M, GROUP_SIZE_M = self.group_size, - num_warps = 4, # TODO: test for different devices - num_stages = 3 + TL_DOT = TL_DOT, ) def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, From 2ee7ed650fda523a9f8d8df6a04fcb904f17d133 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 00:31:15 +0800 Subject: [PATCH 119/162] stage matmul_kernel_test --- tests/layer/matmul_kernel_test.py | 136 ++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 tests/layer/matmul_kernel_test.py diff --git a/tests/layer/matmul_kernel_test.py b/tests/layer/matmul_kernel_test.py new file mode 100644 index 00000000..14877878 --- /dev/null +++ b/tests/layer/matmul_kernel_test.py @@ -0,0 +1,136 @@ +import pyjuice as juice +import torch +import numpy as np +import time +import random + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer, ProdLayer, SumLayer + +import pytest + + +import triton +import triton.language as tl + + +@triton.jit +def kernel1(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a).to(tl.float16) + + offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] + bb = tl.load(b + offs_b).to(tl.float16) + + cc = tl.dot(aa, bb).to(tl.float32) + + offs_c = tl.arange(0, 
M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +@triton.jit +def kernel2(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a) + + offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] + bb = tl.load(b + offs_b) + + bb_max = tl.max(bb, axis = 0)[None,:] + bb_sub = tl.where(bb_max != -float("inf"), tl.exp(bb - bb_max), 0.0) + + cc = tl.sum(aa[:,:,None] * bb_sub[None,:,:], axis = 1) + + offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +@triton.jit +def kernel2_fix(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a) + + offs_b = tl.arange(0, N)[None,:] * K + tl.arange(0, K)[:,None] + bb = tl.load(b + offs_b) + + bb_max = tl.max(bb, axis = 1)[:,None] + bb_sub = tl.where(bb_max != -float("inf"), tl.exp(bb - bb_max), 0.0) + + cc = tl.sum(aa[:,:,None] * tl.trans(bb_sub)[None,:,:], axis = 1) + + offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +@triton.jit +def kernel3(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): + pid = tl.program_id(axis = 0) + + offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] + aa = tl.load(a + offs_a) + + offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] + bb = tl.load(b + offs_b) + + aa = tl.view(tl.broadcast_to(aa[:,None,:], (M, 8 // M, N)), (8, N)) + # cc = tl.dot(aa, bb) + cc = tl.sum(aa[:,:,None] * bb[None,:,:], axis = 1) + cc = tl.max(tl.view(cc, (M, 8 // M, K)), axis = 1) + + offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] + tl.store(c + offs_c, cc) + + +if __name__ == "__main__": + import time + + M = 8 + N = 4 + K = 8 + + a = torch.rand([M, N]).cuda() + b = torch.rand([N, K]).log().cuda() + c = torch.zeros([M, K]).cuda() + + grid = (1,) + + # kernel1[grid](a, b, c, M, N, K) + + # torch.cuda.synchronize() + # t0 = time.time() + # for _ in range(100): + # kernel1[grid](a, b, c, M, N, K) + # torch.cuda.synchronize() + # t1 = time.time() + + # print((t1 - t0) / 100 * 1000) + + # kernel2[grid](a, b, c, M, N, K) + kernel2_fix[grid](a, b, c, M, N, K) + + # torch.cuda.synchronize() + # t0 = time.time() + # for _ in range(100): + # kernel2[grid](a, b, c, M, N, K) + # torch.cuda.synchronize() + # t1 = time.time() + + # print((t1 - t0) / 100 * 1000) + + cc = torch.matmul(a, (b - b.max(dim = 0, keepdim = True).values).exp()) + + print((c - cc).abs().max()) + + ccc = c + + import pdb; pdb.set_trace() \ No newline at end of file From 911cb2335b107d80d4d0903c77381718480a4356 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 00:31:38 +0800 Subject: [PATCH 120/162] make sum layer kernels work for small group size --- src/pyjuice/layer/sum_layer.py | 460 ++++++++++++++++++++++++++++----- tests/layer/sum_layer_test.py | 146 ++++++++++- tests/structures/hclt_test.py | 40 +-- 3 files changed, 558 insertions(+), 88 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index b60aba57..cf1f5c55 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -367,12 +367,12 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, elif params.dim() == 1 and self.group_size >= 16 and num_edges >= 16 and batch_size >= 16: # In this case, we should definitely use the 
block-sparse implementation mode = self.BLOCK_SPARSE - elif self.group_size == 1 and num_edges < 16384: + elif (self.group_size == 1 and num_edges < 16384) or num_edges < 4: # In this case, we should definitely use the sparse implementation mode = self.SPARSE - elif num_edges < 4: - # In this case, the block-sparse kernel will have compilation issues - mode = self.SPARSE + # elif self.group_size < 8: + # # TODO: remove this when `triton` has fixed its bug + # mode = self.SPARSE else: mode = self.BLOCK_SPARSE @@ -488,7 +488,7 @@ def _fw_triton_block_sparse_tlmm_kernel(node_mars, element_mars, params, nids, c @staticmethod # @triton.jit @FastJITFunction - def _fw_triton_block_sparse_csmm_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, + def _fw_triton_block_sparse_csmm1_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, use_fp16: tl.constexpr): @@ -546,9 +546,9 @@ def _fw_triton_block_sparse_csmm_kernel(node_mars, element_mars, params, nids, c if use_fp16 == 1: # Simulated matmul kernel + float16 - epars = (epars * (2**12)).to(tl.float16) - emars = emars.to(tl.float16) - nmars = tl.sum(epars[:,:,None] * emars_sub[None,:,:], axis = 1).to(tl.float32) / (2**12) + epars = (epars * (2**4)).to(tl.float16) + emars_sub = emars_sub.to(tl.float16) + nmars = tl.sum(epars[:,:,None] * emars_sub[None,:,:], axis = 1).to(tl.float32) / (2**4) else: # Simulated matmul kernel + float32 nmars = tl.sum(epars[:,:,None] * emars_sub[None,:,:], axis = 1) @@ -573,6 +573,88 @@ def _fw_triton_block_sparse_csmm_kernel(node_mars, element_mars, params, nids, c offs_nmars = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] tl.store(node_mars + offs_nmars, acc, mask = mask_batch[None,:]) + @staticmethod + # @triton.jit + @FastJITFunction + def _fw_triton_block_sparse_csmm2_kernel(node_mars, element_mars, params, nids, cids_start, cids_increment, + pids_start, pids_increment, local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, + BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, K_NUM_TILES: tl.constexpr, + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, use_fp16: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + ngroup_id = tl.load(local_ids + ngroup_id) + + # Node offsets + offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_node = tl.max_contiguous(offs_node, TILE_SIZE_M) + + # Edge offsets + offs_edge = tl.arange(0, TILE_SIZE_K) + + # Initialize pointers to `params` + offs_estart = ngroup_id * TILE_SIZE_K + offs_edge + offs_estart = tl.max_contiguous(offs_estart, TILE_SIZE_K) + par_start = tl.load(pids_start + offs_estart) + epars_ptr = params + \ + offs_node[:,None] + \ + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + offs_batch = tl.max_contiguous(offs_batch, BLOCK_B) + mask_batch = offs_batch < batch_size + + # Initialize pointers to `element_mars` + edge_start = tl.load(cids_start + 
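Note: the use_fp16 branch above multiplies the parameters by a constant before the float16 cast and divides the float32 result back down afterwards; presumably this keeps small edge weights better represented in half precision, though the patch does not state the rationale. A plain-PyTorch sketch of the pattern (the 2**4 factor mirrors the kernel; all names are illustrative):

# Hedged sketch: scaled float16 accumulation as in the use_fp16 branch
import torch

def scaled_fp16_accumulate(epars, emars_sub, scale = 2.0 ** 4):
    # epars: [M, K] edge weights; emars_sub: [K, B] max-subtracted child values
    ep = (epars * scale).half()
    em = emars_sub.half()
    return (ep[:, :, None] * em[None, :, :]).sum(dim = 1).float() / scale  # undo the scaling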
offs_estart) + emars_ptr = element_mars + \ + edge_start[None,:] * batch_size + \ + offs_batch[:,None] # [BLOCK_B, TILE_SIZE_K] + + # Batch increment pointers + pids_inc_ptr = pids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge + cids_inc_ptr = cids_increment + ngroup_id * (K_NUM_TILES * TILE_SIZE_K) + offs_edge + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - float("inf") + + for k in range(0, K_NUM_TILES): + epars = tl.load(epars_ptr) + emars = tl.load(emars_ptr, mask = mask_batch[:,None]) + + emars_max = tl.max(emars, axis = 1) + emars_sub = tl.where(emars_max[:,None] != -float("inf"), tl.exp(emars - emars_max[:,None]), 0.0) + + # Simulated matmul kernel + float32 + nmars = tl.sum(epars[:,:,None] * tl.trans(emars_sub)[None,:,:], axis = 1) + + acc = tl.where(emars_max[None,:] > acc, + tl.log(nmars + tl.exp(acc - emars_max[None,:])) + emars_max[None,:], + tl.log(tl.exp(emars_max[None,:] - acc) * nmars + 1.0) + acc + ) + + # Increment `epars_ptr` + pids_inc = tl.load(pids_inc_ptr) + epars_ptr += pids_inc[None,:] + pids_inc_ptr += TILE_SIZE_K + + # Increment `emars_ptr` + cids_inc = tl.load(cids_inc_ptr) + emars_ptr += cids_inc[None,:] * batch_size + cids_inc_ptr += TILE_SIZE_K + + # Write back + off_nids = tl.load(nids + ngroup_id) + offs_nmars = (off_nids + offs_node[:,None]) * batch_size + offs_batch[None,:] + tl.store(node_mars + offs_nmars, acc, mask = mask_batch[None,:]) + def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Tensor, params: torch.Tensor, nids: torch.Tensor, cids: torch.Tensor, pids: torch.Tensor, local_ids: Optional[torch.Tensor] = None, @@ -601,15 +683,12 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten # Heuristic to set `TILE_SIZE_M`, `TILE_SIZE_K`, and `BLOCK_B` base_size = min(self.group_size, num_edges, BATCH_SIZE_NP2, 128) if base_size >= 64: - TILE_SIZE_K = base_size - TILE_SIZE_M = 2048 // base_size - BLOCK_B = 2048 // base_size + TILE_SIZE_K = min(2048 // 32, num_edges) else: remainder = 2048 // (base_size ** 2) - TILE_SIZE_K = min(2048 // remainder, base_size * remainder, num_edges) - TILE_SIZE_M = min(2048 // TILE_SIZE_K, self.group_size) - BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) + TILE_SIZE_M = min(2048 // TILE_SIZE_K, self.group_size) + BLOCK_B = min(2048 // TILE_SIZE_K, BATCH_SIZE_NP2) K_NUM_TILES = num_edges // TILE_SIZE_K assert TILE_SIZE_K >= 4, f"`TILE_SIZE_K` should be greater than 4 (but got {TILE_SIZE_K}) in order to use the block-sparse kernel. 
" \ @@ -674,8 +753,28 @@ def _forward_block_sparse(self, node_mars: torch.Tensor, element_mars: torch.Ten GROUP_SIZE_M = GROUP_SIZE_M, use_fp16 = use_fp16 ) + elif TILE_SIZE_M >= 8 and TILE_SIZE_K >= 8 and BLOCK_B >= 8: + self._fw_triton_block_sparse_csmm1_kernel[grid]( + node_mars, + element_mars, + params, + nids, + cids_start, + cids_increment, + pids_start, + pids_increment, + local_ids, + batch_size, + partial_eval = partial_eval, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = GROUP_SIZE_M, + use_fp16 = use_fp16 + ) else: - self._fw_triton_block_sparse_csmm_kernel[grid]( + self._fw_triton_block_sparse_csmm2_kernel[grid]( node_mars, element_mars, params, @@ -1069,12 +1168,12 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele for k in range(0, K_NUM_TILES): epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] if allow_modify_flows == 1: log_n_fdm = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] else: nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] log_n_fdm = tl.log(nflows) - nmars log_n_fdm_max = tl.max(log_n_fdm, axis = 0)[None,:] @@ -1106,7 +1205,8 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele # Increment `nmars_ptr` parids_inc = tl.load(parids_inc_ptr) - nmars_ptr += parids_inc[:,None] * batch_size + if allow_modify_flows == 0: + nmars_ptr += parids_inc[:,None] * batch_size nflows_ptr += parids_inc[:,None] * batch_size parids_inc += ptr_inc_step @@ -1121,6 +1221,109 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] tl.store(element_flows + offs_elemfs, eflows, mask = mask_batch[None,:]) + @staticmethod + # @triton.jit + @FastJITFunction + def _bk_triton_block_sparse_ele_csmm2_kernel(node_flows, element_flows, node_mars, element_mars, params, + chids, parids_start, parids_increment, parpids_start, parpids_increment, + local_ids, batch_size: tl.constexpr, partial_eval: tl.constexpr, ptr_inc_step: tl.constexpr, + allow_modify_flows: tl.constexpr, BLOCK_B: tl.constexpr, TILE_SIZE_K: tl.constexpr, + K_NUM_TILES: tl.constexpr, TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, + GROUP_SIZE_K: tl.constexpr, TL_DOT: tl.constexpr): + + pid_b = tl.program_id(0) # ID of size-`BLOCK_B` batches + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + elegroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Get the real node group id in the case of partial evaluation + if partial_eval == 1: + elegroup_id = tl.load(local_ids + elegroup_id) + + # Initialize pointers to `params` + offs_ele = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + offs_edge = tl.arange(0, TILE_SIZE_K) + offs_edge_gid = offs_edge // GROUP_SIZE_K + offs_edge_nid = (offs_edge % GROUP_SIZE_K) + par_start = tl.load(parpids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + epars_ptr = params + \ + offs_ele[:,None] * GROUP_SIZE_K + \ + (par_start + offs_edge_nid)[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * BLOCK_B + mask_batch = offs_batch < batch_size + + # Initialize 
pointers to `node_mars` + edge_start = tl.load(parids_start + elegroup_id * ptr_inc_step + offs_edge_gid) + nmars_ptr = node_mars + \ + (edge_start + offs_edge_nid)[None,:] * batch_size + \ + offs_batch[:,None] # [BLOCK_B, TILE_SIZE_K] + nflows_ptr = node_flows + \ + (edge_start + offs_edge_nid)[None,:] * batch_size + \ + offs_batch[:,None] # [BLOCK_B, TILE_SIZE_K] + + # Batch increment pointers + parids_inc_ptr = parids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + parpids_inc_ptr = parpids_increment + elegroup_id * (K_NUM_TILES * ptr_inc_step) + offs_edge_gid + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, BLOCK_B], dtype = tl.float32) - float("inf") + + for k in range(0, K_NUM_TILES): + epars = tl.load(epars_ptr) # [TILE_SIZE_M, TILE_SIZE_K] + + if allow_modify_flows == 1: + log_n_fdm = tl.load(nflows_ptr, mask = mask_batch[:,None]) # [BLOCK_B, TILE_SIZE_K] + else: + nflows = tl.load(nflows_ptr, mask = mask_batch[:,None]) # [BLOCK_B, TILE_SIZE_K] + nmars = tl.load(nmars_ptr, mask = mask_batch[:,None]) # [BLOCK_B, TILE_SIZE_K] + log_n_fdm = tl.log(nflows) - nmars + + log_n_fdm_max = tl.max(log_n_fdm, axis = 1) + n_fdm_sub = tl.where(log_n_fdm_max[:,None] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[:,None]), 0.0) + + partial_flows = tl.sum(epars[:,:,None] * tl.trans(n_fdm_sub)[None,:,:], axis = 1) + + acc = tl.where(log_n_fdm_max[None,:] == acc, + acc + 0.69314718056, # log(2) + tl.where(log_n_fdm_max[None,:] > acc, + tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + log_n_fdm_max[None,:], + tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc + ) + ) + # neginf_flag = (log_n_fdm_max[None,:] == -float("inf")) & (acc == -float("inf")) + # acc = tl.where(log_n_fdm_max[None,:] > acc, + # tl.log(partial_flows + tl.exp(acc - log_n_fdm_max[None,:])) + log_n_fdm_max[None,:], + # tl.log(tl.exp(log_n_fdm_max[None,:] - acc) * partial_flows + 1.0) + acc + # ) + # acc = tl.where(neginf_flag, -float("inf"), acc) + + # Increment `epars_ptr` + parpids_inc = tl.load(parpids_inc_ptr) + epars_ptr += parpids_inc[None,:] + parpids_inc_ptr += ptr_inc_step + + # Increment `nmars_ptr` + parids_inc = tl.load(parids_inc_ptr) + if allow_modify_flows == 0: + nmars_ptr += parids_inc[None,:] * batch_size + nflows_ptr += parids_inc[None,:] * batch_size + parids_inc += ptr_inc_step + + # Initialize pointers to `element_mars` + off_eleids = tl.load(chids + elegroup_id) + emars_ptr = element_mars + (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_K, BLOCK_B] + + eflows = tl.exp(acc + emars) + + # Write back + offs_elemfs = (off_eleids + offs_ele[:,None]) * batch_size + offs_batch[None,:] + tl.store(element_flows + offs_elemfs, eflows, mask = mask_batch[None,:]) + def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, element_mars: torch.Tensor, chids: torch.Tensor, parids: torch.Tensor, @@ -1199,32 +1402,60 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo grid = (triton.cdiv(batch_size, BLOCK_B), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - self._bk_triton_block_sparse_ele_kernel[grid]( - node_flows = node_flows, - element_flows = element_flows, - node_mars = node_mars, - element_mars = element_mars, - params = params, - chids = chids, - parids_start = parids_start, - parids_increment = parids_increment, - parpids_start = parpids_start, - 
parpids_increment = parpids_increment, - local_ids = local_ids, - batch_size = batch_size, - partial_eval = partial_eval, - ptr_inc_step = ptr_inc_step, - allow_modify_flows = allow_modify_flows, - BLOCK_B = BLOCK_B, - TILE_SIZE_K = TILE_SIZE_K, - K_NUM_TILES = K_NUM_TILES, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = GROUP_SIZE_M, - GROUP_SIZE_K = GROUP_SIZE_K, - TL_DOT = TL_DOT, - num_warps = 2, # TODO: test for different devices - num_stages = 1 - ) + if TILE_SIZE_M >= 8 and TILE_SIZE_K >= 8 and BLOCK_B >= 8: + self._bk_triton_block_sparse_ele_kernel[grid]( + node_flows = node_flows, + element_flows = element_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + chids = chids, + parids_start = parids_start, + parids_increment = parids_increment, + parpids_start = parpids_start, + parpids_increment = parpids_increment, + local_ids = local_ids, + batch_size = batch_size, + partial_eval = partial_eval, + ptr_inc_step = ptr_inc_step, + allow_modify_flows = allow_modify_flows, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = GROUP_SIZE_M, + GROUP_SIZE_K = GROUP_SIZE_K, + TL_DOT = TL_DOT, + num_warps = 2, # TODO: test for different devices + num_stages = 1 + ) + else: + self._bk_triton_block_sparse_ele_csmm2_kernel[grid]( + node_flows = node_flows, + element_flows = element_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + chids = chids, + parids_start = parids_start, + parids_increment = parids_increment, + parpids_start = parpids_start, + parpids_increment = parpids_increment, + local_ids = local_ids, + batch_size = batch_size, + partial_eval = partial_eval, + ptr_inc_step = ptr_inc_step, + allow_modify_flows = allow_modify_flows, + BLOCK_B = BLOCK_B, + TILE_SIZE_K = TILE_SIZE_K, + K_NUM_TILES = K_NUM_TILES, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = GROUP_SIZE_M, + GROUP_SIZE_K = GROUP_SIZE_K, + TL_DOT = TL_DOT, + num_warps = 2, # TODO: test for different devices + num_stages = 1 + ) return None @@ -1265,12 +1496,12 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para for b in range(0, B_NUM_TILES): emars = tl.load(emars_ptr, mask = mask_batch[:,None]) # [TILE_SIZE_B, TILE_SIZE_K] - nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] if allow_modify_flows == 1: log_n_fdm = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] else: nflows = tl.load(nflows_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] + nmars = tl.load(nmars_ptr, mask = mask_batch[None,:]) # [TILE_SIZE_M, TILE_SIZE_B] log_n_fdm = tl.log(nflows) - nmars log_n_fdm_max = tl.max(log_n_fdm, axis = 0) @@ -1287,7 +1518,84 @@ def _bk_triton_block_sparse_par_kernel(node_flows, node_mars, element_mars, para # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` emars_ptr += TILE_SIZE_B - nmars_ptr += TILE_SIZE_B + if allow_modify_flows == 0: + nmars_ptr += TILE_SIZE_B + nflows_ptr += TILE_SIZE_B + + # Update batch mask + offs_batch += TILE_SIZE_B + mask_batch = offs_batch < batch_size + + # Initialize `params` + par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) + epars_offsets = offs_node[:,None] + par_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + epars = tl.load(params + epars_offsets) + + pflows = acc * epars + + parflow_start = tl.load(pfids + ngroup_id * num_edges + offs_edge) + eparflows_offsets = offs_node[:,None] + parflow_start[None,:] # [TILE_SIZE_M, TILE_SIZE_K] + + 
tl.atomic_add(param_flows + eparflows_offsets, pflows) + + @staticmethod + # @triton.jit + @FastJITFunction + def _bk_triton_block_sparse_par_csmm2_kernel(node_flows, node_mars, element_mars, params, param_flows, nids, cids, pids, pfids, + batch_size: tl.constexpr, num_edges: tl.constexpr, allow_modify_flows: tl.constexpr, + TILE_SIZE_B: tl.constexpr, B_NUM_TILES: tl.constexpr, TILE_SIZE_K: tl.constexpr, + TILE_SIZE_M: tl.constexpr, GROUP_SIZE_M: tl.constexpr, TL_DOT: tl.constexpr): + + pid_k = tl.program_id(0) # ID of size-`TILE_SIZE_K` edges + pid_m = tl.program_id(1) # ID of size-`TILE_SIZE_M` nodes + + # Get inferred node group id from `pid_m` + ngroup_id = pid_m // (GROUP_SIZE_M // TILE_SIZE_M) + tile_id = pid_m % (GROUP_SIZE_M // TILE_SIZE_M) + + # Batch offsets and mask + offs_batch = tl.arange(0, TILE_SIZE_B) + mask_batch = offs_batch < batch_size + + # Initialize pointers to `element_mars` + offs_edge = tl.arange(0, TILE_SIZE_K) + pid_k * TILE_SIZE_K + edge_start = tl.load(cids + ngroup_id * num_edges + offs_edge) + emars_ptr = element_mars + \ + edge_start[None,:] * batch_size + \ + offs_batch[:,None] # [TILE_SIZE_B, TILE_SIZE_K] + + # Initialize pointers to `node_flows` and `node_mars` + offs_node = tl.arange(0, TILE_SIZE_M) + tile_id * TILE_SIZE_M + off_nids = tl.load(nids + ngroup_id) + nmars_ptr = node_mars + (off_nids + offs_node[None,:]) * batch_size + offs_batch[:,None] + nflows_ptr = node_flows + (off_nids + offs_node[None,:]) * batch_size + offs_batch[:,None] + + # Inner loop + acc = tl.zeros([TILE_SIZE_M, TILE_SIZE_K], dtype = tl.float32) + + for b in range(0, B_NUM_TILES): + emars = tl.load(emars_ptr, mask = mask_batch[:,None]) # [TILE_SIZE_B, TILE_SIZE_K] + + if allow_modify_flows == 1: + log_n_fdm = tl.load(nflows_ptr, mask = mask_batch[:,None]) # [TILE_SIZE_B, TILE_SIZE_M] + else: + nflows = tl.load(nflows_ptr, mask = mask_batch[:,None]) # [TILE_SIZE_B, TILE_SIZE_M] + nmars = tl.load(nmars_ptr, mask = mask_batch[:,None]) # [TILE_SIZE_B, TILE_SIZE_M] + log_n_fdm = tl.log(nflows) - nmars + + log_n_fdm_max = tl.max(log_n_fdm, axis = 1) + n_fdm_sub = tl.where(log_n_fdm_max[:,None] != -float("inf"), tl.exp(log_n_fdm - log_n_fdm_max[:,None]), 0.0) + + scaled_emars = tl.exp(emars + log_n_fdm_max[:,None]) + + partial_flows = tl.sum(tl.trans(n_fdm_sub)[:,:,None] * scaled_emars[None,:,:], axis = 1) + + acc += partial_flows + + # Increment `emars_ptr`, `nmars_ptr`, and `nmars_ptr` + emars_ptr += TILE_SIZE_B + if allow_modify_flows == 0: + nmars_ptr += TILE_SIZE_B nflows_ptr += TILE_SIZE_B # Update batch mask @@ -1357,26 +1665,48 @@ def _backward_block_sparse_par_flows(self, node_flows: torch.Tensor, params: tor grid = (triton.cdiv(num_edges, TILE_SIZE_K), triton.cdiv(layer_n_nodes, TILE_SIZE_M)) - self._bk_triton_block_sparse_par_kernel[grid]( - node_flows = node_flows, - node_mars = node_mars, - element_mars = element_mars, - params = params, - param_flows = param_flows, - nids = nids, - cids = cids, - pids = pids, - pfids = pfids, - batch_size = batch_size, - num_edges = num_edges, - allow_modify_flows = allow_modify_flows, - TILE_SIZE_B = TILE_SIZE_B, - B_NUM_TILES = B_NUM_TILES, - TILE_SIZE_K = TILE_SIZE_K, - TILE_SIZE_M = TILE_SIZE_M, - GROUP_SIZE_M = self.group_size, - TL_DOT = TL_DOT, - ) + if TILE_SIZE_M >= 8 and TILE_SIZE_K >= 8 and TILE_SIZE_B >= 8: + self._bk_triton_block_sparse_par_kernel[grid]( + node_flows = node_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + param_flows = param_flows, + nids = nids, + cids = cids, + pids = 
pids, + pfids = pfids, + batch_size = batch_size, + num_edges = num_edges, + allow_modify_flows = allow_modify_flows, + TILE_SIZE_B = TILE_SIZE_B, + B_NUM_TILES = B_NUM_TILES, + TILE_SIZE_K = TILE_SIZE_K, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = self.group_size, + TL_DOT = TL_DOT + ) + else: + self._bk_triton_block_sparse_par_csmm2_kernel[grid]( + node_flows = node_flows, + node_mars = node_mars, + element_mars = element_mars, + params = params, + param_flows = param_flows, + nids = nids, + cids = cids, + pids = pids, + pfids = pfids, + batch_size = batch_size, + num_edges = num_edges, + allow_modify_flows = allow_modify_flows, + TILE_SIZE_B = TILE_SIZE_B, + B_NUM_TILES = B_NUM_TILES, + TILE_SIZE_K = TILE_SIZE_K, + TILE_SIZE_M = TILE_SIZE_M, + GROUP_SIZE_M = self.group_size, + TL_DOT = TL_DOT + ) def _backward_sparse(self, node_flows: torch.Tensor, element_flows: torch.Tensor, params: torch.Tensor, node_mars: torch.Tensor, diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 702aa838..33351a17 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -42,7 +42,7 @@ def sum_layer_test(): layer = SumLayer([ns0, ns1, ns2], global_nid_start = group_size, global_pid_start = group_size ** 2, - global_pfid_start = 0, node2tiednodes = dict(), ) + global_pfid_start = 0, node2tiednodes = dict()) assert torch.all(layer.partitioned_nids[0] == torch.arange(group_size, 7 * group_size, group_size)) assert torch.all(layer.partitioned_cids[0][0:2,0] == group_size) @@ -141,6 +141,145 @@ def sum_layer_test(): assert torch.all(torch.abs(my_pflows - param_flows) < 2e-3) +def corner_case_test(): + + device = torch.device("cuda:0") + + group_sizes = [2, 4, 4, 4, 8, 8, 16, 32, 32, 32] + batch_sizes = [4, 4, 8, 16, 8, 16, 8, 8, 16, 32] + + for group_size, batch_size in zip(group_sizes, batch_sizes): + for force_use_fp16, force_use_fp32 in ((False, False), (True, False), (False, True)): + + with juice.set_group_size(group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + + np0 = multiply(ni0, ni1) + np1 = multiply(ni2, ni3) + np2 = multiply(ni1, ni2) + + ns0 = summate(np0, num_node_groups = 2) + ns1 = summate(np1, num_node_groups = 2) + ns2 = summate(np2, num_node_groups = 2) + + input_layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = group_size) + + prod_layer = ProdLayer([np0, np1, np2]) + + layer = SumLayer([ns0, ns1, ns2], global_nid_start = group_size, + global_pid_start = group_size ** 2, + global_pfid_start = 0, node2tiednodes = dict()) + + ## Compilation tests ## + + assert torch.all(layer.partitioned_nids[0] == torch.arange(group_size, group_size * 7, group_size)) + + assert torch.all(layer.partitioned_cids[0][0,:] == torch.arange(group_size, group_size * 3)) + assert torch.all(layer.partitioned_cids[0][1,:] == torch.arange(group_size, group_size * 3)) + assert torch.all(layer.partitioned_cids[0][2,:] == torch.arange(group_size * 3, group_size * 5)) + assert torch.all(layer.partitioned_cids[0][3,:] == torch.arange(group_size * 3, group_size * 5)) + assert torch.all(layer.partitioned_cids[0][4,:] == torch.arange(group_size * 5, group_size * 7)) + assert torch.all(layer.partitioned_cids[0][5,:] == torch.arange(group_size * 5, group_size * 7)) + + assert 
torch.all(layer.partitioned_pids[0][0,:] == torch.arange(group_size**2, group_size**2 * 3, group_size)) + assert torch.all(layer.partitioned_pids[0][1,:] == torch.arange(group_size**2 * 3, group_size**2 * 5, group_size)) + assert torch.all(layer.partitioned_pids[0][2,:] == torch.arange(group_size**2 * 5, group_size**2 * 7, group_size)) + assert torch.all(layer.partitioned_pids[0][3,:] == torch.arange(group_size**2 * 7, group_size**2 * 9, group_size)) + assert torch.all(layer.partitioned_pids[0][4,:] == torch.arange(group_size**2 * 9, group_size**2 * 11, group_size)) + assert torch.all(layer.partitioned_pids[0][5,:] == torch.arange(group_size**2 * 11, group_size**2 * 13, group_size)) + + assert torch.all(layer.partitioned_pfids[0][0,:] == torch.arange(0, group_size**2 * 2, group_size)) + assert torch.all(layer.partitioned_pfids[0][1,:] == torch.arange(group_size**2 * 2, group_size**2 * 4, group_size)) + assert torch.all(layer.partitioned_pfids[0][2,:] == torch.arange(group_size**2 * 4, group_size**2 * 6, group_size)) + assert torch.all(layer.partitioned_pfids[0][3,:] == torch.arange(group_size**2 * 6, group_size**2 * 8, group_size)) + assert torch.all(layer.partitioned_pfids[0][4,:] == torch.arange(group_size**2 * 8, group_size**2 * 10, group_size)) + assert torch.all(layer.partitioned_pfids[0][5,:] == torch.arange(group_size**2 * 10, group_size**2 * 12, group_size)) + + assert torch.all(layer.partitioned_chids[0] == torch.arange(group_size, group_size * 7, group_size)) + + assert torch.all(layer.partitioned_parids[0][0,:] == torch.tensor([group_size, group_size * 2])) + assert torch.all(layer.partitioned_parids[0][1,:] == torch.tensor([group_size, group_size * 2])) + assert torch.all(layer.partitioned_parids[0][2,:] == torch.tensor([group_size * 3, group_size * 4])) + assert torch.all(layer.partitioned_parids[0][3,:] == torch.tensor([group_size * 3, group_size * 4])) + assert torch.all(layer.partitioned_parids[0][4,:] == torch.tensor([group_size * 5, group_size * 6])) + assert torch.all(layer.partitioned_parids[0][5,:] == torch.tensor([group_size * 5, group_size * 6])) + + assert torch.all(layer.partitioned_parpids[0][0,:] == torch.tensor([group_size**2 * 1, group_size**2 * 3])) + assert torch.all(layer.partitioned_parpids[0][1,:] == torch.tensor([group_size**2 * 2, group_size**2 * 4])) + assert torch.all(layer.partitioned_parpids[0][2,:] == torch.tensor([group_size**2 * 5, group_size**2 * 7])) + assert torch.all(layer.partitioned_parpids[0][3,:] == torch.tensor([group_size**2 * 6, group_size**2 * 8])) + assert torch.all(layer.partitioned_parpids[0][4,:] == torch.tensor([group_size**2 * 9, group_size**2 * 11])) + assert torch.all(layer.partitioned_parpids[0][5,:] == torch.tensor([group_size**2 * 10, group_size**2 * 12])) + + layer.to(device) + + ## Forward tests ## + + element_mars = torch.rand([group_size + 3 * 2 * 2 * group_size, batch_size]).log().to(device) + element_mars[:group_size,:] = -float("inf") + node_mars = torch.zeros([group_size + group_size * 2 * 3, batch_size]).to(device) + + params = torch.rand([group_size ** 2 + 3 * 4 * group_size * group_size]).to(device) + + layer(node_mars, element_mars, params, force_use_fp16 = force_use_fp16, force_use_fp32 = force_use_fp32) + + for i in range(group_size): + for j in range(6): + cmars = element_mars[layer.partitioned_cids[0][j,:]].exp() + epars = params[layer.partitioned_pids[0][j,:]+i] + assert torch.all(torch.abs(node_mars[(j+1)*group_size+i,:] - (epars[:,None] * cmars).sum(dim = 0).log()) < 2e-3) + + ## Backward tests ## + + 
node_flows = torch.rand([group_size + group_size * 2 * 3, batch_size]).to(device) + element_flows = torch.zeros([group_size + 3 * 2 * 2 * group_size, batch_size]).to(device) + + param_flows = torch.zeros([group_size ** 2 + 3 * 4 * group_size * group_size]).to(device) + + layer.backward(node_flows, element_flows, node_mars, element_mars, params, param_flows) + + chids = layer.partitioned_chids[0] + parids = layer.partitioned_parids[0] + parpids = layer.partitioned_parpids[0] + + num_ngroups = chids.size(0) + num_egroups = parids.size(1) + parids = (parids[:,:,None].repeat(1, 1, group_size) + torch.arange(0, group_size, device = parids.device)).reshape(num_ngroups, num_egroups * group_size) + parpids_start = (parpids[:,:,None] + torch.arange(0, group_size, device = parids.device)).reshape( + num_ngroups, num_egroups * group_size) + + for j in range(6): + parpids = parpids_start.clone() + for i in range(group_size): + nmars = node_mars[parids[j,:]].exp() + nflows = node_flows[parids[j,:]] + emars = element_mars[(j+1)*group_size+i,:].exp() + epars = params[parpids[j,:]] + eflows = (nflows * epars[:,None] * emars[None,:] / nmars).sum(dim = 0) + + assert torch.all(torch.abs(eflows - element_flows[(j+1)*group_size+i,:]) < 1e-2) + + parpids += group_size + + my_pflows = torch.zeros_like(param_flows) + + for i in range(group_size): + for j in range(6): + emars = element_mars[layer.partitioned_cids[0][j,:]].exp() + epars = params[layer.partitioned_pids[0][j,:]+i] + nmars = node_mars[(j+1)*group_size+i,:].exp() + nflows = node_flows[(j+1)*group_size+i,:] + pflows = epars * (nflows[None,:] * emars / nmars[None,:]).sum(dim = 1) + + my_pflows[layer.partitioned_pfids[0][j,:]+i] = pflows + + assert torch.all(torch.abs(my_pflows - param_flows) < 2e-3) + + def speed_test(): device = torch.device("cuda:0") @@ -223,5 +362,6 @@ def speed_test(): if __name__ == "__main__": torch.manual_seed(3890) - sum_layer_test() - speed_test() \ No newline at end of file + # sum_layer_test() + corner_case_test() + # speed_test() \ No newline at end of file diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index e3a63881..46362dbb 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -108,12 +108,12 @@ def hclt_test(): milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] ) - # for batch in train_loader: - # x = batch[0].to(device) + for batch in train_loader: + x = batch[0].to(device) - # lls = pc(x, record_cudagraph = True) - # lls.mean().backward() - # break + lls = pc(x, record_cudagraph = True) + lls.mean().backward() + break # for i, batch in enumerate(train_loader): # x = batch[0].to(device) @@ -123,21 +123,21 @@ def hclt_test(): # if i > 5: # break - from torch.profiler import profile, record_function, ProfilerActivity - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: - for i, batch in enumerate(train_loader): - x = batch[0].to(device) - - lls = pc(x, record_cudagraph = False) - lls.mean().backward() - if i > 5: - break - - prof.export_chrome_trace("trace3.json") - # torch.autograd.profiler.tensorboard_trace_to_flame_graph('trace.json', 'flamegraph.svg') - # prof.export_stacks("trace.txt", "cpu_time_total") - import pdb; pdb.set_trace() - exit() + # from torch.profiler import profile, record_function, ProfilerActivity + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack = True) as prof: + # for i, batch in enumerate(train_loader): + # x = batch[0].to(device) + + # 
lls = pc(x, record_cudagraph = False) + # lls.mean().backward() + # if i > 5: + # break + + # prof.export_chrome_trace("trace3.json") + # # torch.autograd.profiler.tensorboard_trace_to_flame_graph('trace.json', 'flamegraph.svg') + # # prof.export_stacks("trace.txt", "cpu_time_total") + # import pdb; pdb.set_trace() + # exit() mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) full_batch_em_epoch(pc, train_loader, test_loader, device) From 10a02f4a90fcf0a5608949bb219621ac7a685f36 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 00:32:03 +0800 Subject: [PATCH 121/162] remove `matmul_kernel_test` --- tests/layer/matmul_kernel_test.py | 136 ------------------------------ 1 file changed, 136 deletions(-) delete mode 100644 tests/layer/matmul_kernel_test.py diff --git a/tests/layer/matmul_kernel_test.py b/tests/layer/matmul_kernel_test.py deleted file mode 100644 index 14877878..00000000 --- a/tests/layer/matmul_kernel_test.py +++ /dev/null @@ -1,136 +0,0 @@ -import pyjuice as juice -import torch -import numpy as np -import time -import random - -import pyjuice.nodes.distributions as dists -from pyjuice.utils import BitSet -from pyjuice.nodes import multiply, summate, inputs -from pyjuice.model import TensorCircuit - -from pyjuice.layer import InputLayer, ProdLayer, SumLayer - -import pytest - - -import triton -import triton.language as tl - - -@triton.jit -def kernel1(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a).to(tl.float16) - - offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b).to(tl.float16) - - cc = tl.dot(aa, bb).to(tl.float32) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -@triton.jit -def kernel2(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a) - - offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b) - - bb_max = tl.max(bb, axis = 0)[None,:] - bb_sub = tl.where(bb_max != -float("inf"), tl.exp(bb - bb_max), 0.0) - - cc = tl.sum(aa[:,:,None] * bb_sub[None,:,:], axis = 1) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -@triton.jit -def kernel2_fix(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a) - - offs_b = tl.arange(0, N)[None,:] * K + tl.arange(0, K)[:,None] - bb = tl.load(b + offs_b) - - bb_max = tl.max(bb, axis = 1)[:,None] - bb_sub = tl.where(bb_max != -float("inf"), tl.exp(bb - bb_max), 0.0) - - cc = tl.sum(aa[:,:,None] * tl.trans(bb_sub)[None,:,:], axis = 1) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -@triton.jit -def kernel3(a, b, c, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): - pid = tl.program_id(axis = 0) - - offs_a = tl.arange(0, M)[:,None] * N + tl.arange(0, N)[None,:] - aa = tl.load(a + offs_a) - - offs_b = tl.arange(0, N)[:,None] * K + tl.arange(0, K)[None,:] - bb = tl.load(b + offs_b) - - aa = tl.view(tl.broadcast_to(aa[:,None,:], (M, 8 // M, N)), (8, N)) - # cc = tl.dot(aa, bb) - cc = tl.sum(aa[:,:,None] * bb[None,:,:], axis = 1) - cc = tl.max(tl.view(cc, (M, 8 // M, K)), axis 
= 1) - - offs_c = tl.arange(0, M)[:,None] * K + tl.arange(0, K)[None,:] - tl.store(c + offs_c, cc) - - -if __name__ == "__main__": - import time - - M = 8 - N = 4 - K = 8 - - a = torch.rand([M, N]).cuda() - b = torch.rand([N, K]).log().cuda() - c = torch.zeros([M, K]).cuda() - - grid = (1,) - - # kernel1[grid](a, b, c, M, N, K) - - # torch.cuda.synchronize() - # t0 = time.time() - # for _ in range(100): - # kernel1[grid](a, b, c, M, N, K) - # torch.cuda.synchronize() - # t1 = time.time() - - # print((t1 - t0) / 100 * 1000) - - # kernel2[grid](a, b, c, M, N, K) - kernel2_fix[grid](a, b, c, M, N, K) - - # torch.cuda.synchronize() - # t0 = time.time() - # for _ in range(100): - # kernel2[grid](a, b, c, M, N, K) - # torch.cuda.synchronize() - # t1 = time.time() - - # print((t1 - t0) / 100 * 1000) - - cc = torch.matmul(a, (b - b.max(dim = 0, keepdim = True).values).exp()) - - print((c - cc).abs().max()) - - ccc = c - - import pdb; pdb.set_trace() \ No newline at end of file From a8e52585cb0b700005396716af9d4d5531be15a9 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 00:34:29 +0800 Subject: [PATCH 122/162] update kernel selection heuristic --- src/pyjuice/layer/sum_layer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index cf1f5c55..d1089caa 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -370,9 +370,9 @@ def _forward(self, node_mars: torch.Tensor, element_mars: torch.Tensor, elif (self.group_size == 1 and num_edges < 16384) or num_edges < 4: # In this case, we should definitely use the sparse implementation mode = self.SPARSE - # elif self.group_size < 8: - # # TODO: remove this when `triton` has fixed its bug - # mode = self.SPARSE + elif self.group_size * batch_size < 32: + # Advantage of block-sparse processing is diminishing + mode = self.SPARSE else: mode = self.BLOCK_SPARSE @@ -979,8 +979,8 @@ def _backward(self, node_flows: torch.Tensor, element_flows: torch.Tensor, elif (cs_group_size == 1 or self.group_size == 1) and num_edges < 16384: # In this case, we should definitely use the sparse implementation mode = self.SPARSE - elif num_edges < 4 or batch_size < 4: - # In this case, the block-sparse kernel will have compilation issues + elif self.group_size * batch_size < 32: + # Advantage of block-sparse processing is diminishing mode = self.SPARSE else: mode = self.BLOCK_SPARSE From 2e155614014350466ae47958b9289d560a4bf15a Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 03:42:57 +0800 Subject: [PATCH 123/162] expose `force_use_fp16` and `force_use_fp32` --- src/pyjuice/model/tensorcircuit.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index 3a86641d..b0c07ae7 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -104,7 +104,7 @@ def to(self, device): def forward(self, inputs: torch.Tensor, input_layer_fn: Optional[Union[str,Callable]] = None, cache: Optional[dict] = None, return_cache: bool = False, record_cudagraph: bool = False, - apply_cudagraph: bool = True, **kwargs): + apply_cudagraph: bool = True, force_use_fp16: bool = False, force_use_fp32: bool = False, **kwargs): """ Forward the circuit. 
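(For reference, a minimal usage sketch of the two precision flags exposed here; `pc` and `x` are placeholders for a compiled TensorCircuit and a data batch, mirroring the call made later in `tests/model/block_sparse_pc_test.py` — illustrative only, not part of the patch.)

    # Hypothetical caller code; assumes `pc` is a compiled TensorCircuit and `x` a batch tensor.
    lls = pc(x, force_use_fp16 = True)   # request the simulated-fp16 sum-layer kernels
    lls = pc(x, force_use_fp32 = True)   # force the fp32 path, e.g. for numerical checks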
@@ -157,7 +157,9 @@ def _run_inner_layers(): elif layer_group.is_sum(): # Sum layer - layer_group(self.node_mars, self.element_mars, self.params) + layer_group(self.node_mars, self.element_mars, self.params, + force_use_fp16 = force_use_fp16, + force_use_fp32 = force_use_fp32) else: raise ValueError(f"Unknown layer type {type(layer)}.") From 72504bb68cb32acc25a9a730f9b4a0fecda4c0e2 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 03:43:11 +0800 Subject: [PATCH 124/162] add block-sparse tests --- tests/model/block_sparse_pc_test.py | 128 ++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 tests/model/block_sparse_pc_test.py diff --git a/tests/model/block_sparse_pc_test.py b/tests/model/block_sparse_pc_test.py new file mode 100644 index 00000000..05969f88 --- /dev/null +++ b/tests/model/block_sparse_pc_test.py @@ -0,0 +1,128 @@ +import pyjuice as juice +import torch +import numpy as np +import time +import random + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer, ProdLayer, SumLayer + +import pytest + + +def block_sparse_pc_test(): + + device = torch.device("cuda:0") + + num_node_groups = 4 + batch_size = 512 + + for group_size in [16, 8, 1]: + + with juice.set_group_size(group_size): + + ni00 = inputs(0, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 4)) + ni10 = inputs(1, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 4)) + np0 = multiply(ni00, ni10) + + ni01 = inputs(0, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 4)) + ni11 = inputs(1, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 4)) + np1 = multiply(ni01, ni11) + + ni02 = inputs(0, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 4)) + ni12 = inputs(1, num_node_groups = num_node_groups, dist = dists.Categorical(num_cats = 4)) + np2 = multiply(ni02, ni12) + + edge_indicators = torch.rand([num_node_groups, 3 * num_node_groups]) < 0.3 + edge_indicators[:,0] = True + edge_ids = torch.nonzero(edge_indicators, as_tuple = False).permute(1, 0) + + ns = summate(np0, np1, np2, edge_ids = edge_ids) + + ns.init_parameters() + + pc = TensorCircuit(ns, layer_sparsity_tol = 1.0) + + pc.to(device) + + data = torch.randint(0, 4, [batch_size, 2], device = device) + + ## Forward tests ## + + lls = pc(data, force_use_fp32 = True) + + node_mars = pc.node_mars.cpu() + element_mars = pc.element_mars.cpu() + + np0_vals = element_mars[group_size:group_size*(num_node_groups+1),:].exp().reshape(num_node_groups, group_size, batch_size) + np1_vals = element_mars[group_size*(num_node_groups+1):group_size*(num_node_groups*2+1),:].exp().reshape(num_node_groups, group_size, batch_size) + np2_vals = element_mars[group_size*(num_node_groups*2+1):group_size*(num_node_groups*3+1),:].exp().reshape(num_node_groups, group_size, batch_size) + + params = ns._params + + ns_vals = torch.zeros([num_node_groups, group_size, batch_size]) + + for i in range(edge_ids.size(1)): + ni, ci = edge_ids[0,i], edge_ids[1,i] + if ci < num_node_groups: + ns_vals[ni,:,:] += torch.matmul(params[i], np0_vals[ci]) + elif ci < num_node_groups * 2: + ns_vals[ni,:,:] += torch.matmul(params[i], np1_vals[ci-num_node_groups]) + else: + ns_vals[ni,:,:] += torch.matmul(params[i], np2_vals[ci-num_node_groups*2]) + + sid, eid = (num_node_groups * 6 + 1) * group_size, (num_node_groups 
* 7 + 1) * group_size + ref_ns_vals = node_mars[sid:eid,:].exp().reshape(num_node_groups, group_size, batch_size) + + assert torch.all(torch.abs(ns_vals - ref_ns_vals) < 1e-4) + + ## Backward tests ## + + pc.backward(data.permute(1, 0), allow_modify_flows = False) + + node_flows = pc.node_flows.cpu() + element_flows = pc.element_flows.cpu() + param_flows = pc.param_flows.cpu() + + np0_flows = torch.zeros([num_node_groups, group_size, batch_size]) + np1_flows = torch.zeros([num_node_groups, group_size, batch_size]) + np2_flows = torch.zeros([num_node_groups, group_size, batch_size]) + + for i in range(edge_ids.size(1)): + ni, ci = edge_ids[0,i], edge_ids[1,i] + if ci < num_node_groups: + np0_flows[ci] += torch.matmul(params[i].permute(1, 0), 1.0 / ns_vals[ni]) * np0_vals[ci] + elif ci < num_node_groups * 2: + np1_flows[ci-num_node_groups] += torch.matmul(params[i].permute(1, 0), 1.0 / ns_vals[ni]) * np1_vals[ci-num_node_groups] + else: + np2_flows[ci-num_node_groups*2] += torch.matmul(params[i].permute(1, 0), 1.0 / ns_vals[ni]) * np2_vals[ci-num_node_groups*2] + + ref_np0_flows = element_flows[group_size:group_size*(num_node_groups+1),:].reshape(num_node_groups, group_size, batch_size) + ref_np1_flows = element_flows[group_size*(num_node_groups+1):group_size*(num_node_groups*2+1),:].reshape(num_node_groups, group_size, batch_size) + ref_np2_flows = element_flows[group_size*(num_node_groups*2+1):group_size*(num_node_groups*3+1),:].reshape(num_node_groups, group_size, batch_size) + + assert torch.all(torch.abs(np0_flows - ref_np0_flows) < 1e-3) + assert torch.all(torch.abs(np1_flows - ref_np1_flows) < 1e-3) + assert torch.all(torch.abs(np2_flows - ref_np2_flows) < 1e-3) + + param_flows = param_flows.reshape(edge_ids.size(1), group_size, group_size).permute(0, 2, 1) + + for i in range(edge_ids.size(1)): + ni, ci = edge_ids[0,i], edge_ids[1,i] + if ci < num_node_groups: + curr_par_flows = torch.matmul(1.0 / ns_vals[ni], np0_vals[ci].permute(1, 0)) * params[i] + elif ci < num_node_groups * 2: + curr_par_flows = torch.matmul(1.0 / ns_vals[ni], np1_vals[ci-num_node_groups].permute(1, 0)) * params[i] + else: + curr_par_flows = torch.matmul(1.0 / ns_vals[ni], np2_vals[ci-num_node_groups*2].permute(1, 0)) * params[i] + + assert torch.all(torch.abs(param_flows[i] - curr_par_flows) < 1e-2) + + +if __name__ == "__main__": + torch.manual_seed(3890) + block_sparse_pc_test() From 24c6ad279adc604113a5a73709dcaf4e8ae0526e Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 04:02:37 +0800 Subject: [PATCH 125/162] fix multi-head pc construction --- src/pyjuice/structures/compilation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pyjuice/structures/compilation.py b/src/pyjuice/structures/compilation.py index 20c6f215..683d00f7 100644 --- a/src/pyjuice/structures/compilation.py +++ b/src/pyjuice/structures/compilation.py @@ -72,7 +72,10 @@ def children(n: int): rp = multiply(*ch_regions) if v == root: - r = summate(rp, num_node_groups = num_root_ns, group_size = 1) + if group_size == 1: + r = summate(rp, num_node_groups = num_root_ns, group_size = 1) + else: + r = summate(rp, num_node_groups = num_root_ns // group_size, group_size = group_size) else: r = summate(rp, num_node_groups = num_node_groups) From 51b56483e6e086dc52ebb522523f20b2b83a2a8b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 04:02:46 +0800 Subject: [PATCH 126/162] fix pdhclt --- src/pyjuice/structures/pd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/src/pyjuice/structures/pd.py b/src/pyjuice/structures/pd.py index b4b897d6..d87e79b4 100644 --- a/src/pyjuice/structures/pd.py +++ b/src/pyjuice/structures/pd.py @@ -166,7 +166,7 @@ def PDHCLT(data: torch.Tensor, data_shape: Tuple, num_latents: int, assert data.dim() == 2 assert data.size(1) == reduce(lambda x, y: x * y, data_shape) - def input_layer_fn(scope, num_latents): + def input_layer_fn(scope, num_latents, group_size): vars = torch.tensor(scope.to_list()).sort().values ns = HCLT( x = data[:,vars], @@ -174,6 +174,7 @@ def input_layer_fn(scope, num_latents): input_layer_type = input_layer_type, input_layer_params = input_layer_params, num_root_ns = num_latents, + group_size = group_size, **hclt_kwargs ) From afad1c97b138a97763c5031ef058a832c76d7be9 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 04:03:06 +0800 Subject: [PATCH 127/162] improve error message --- src/pyjuice/nodes/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index 5c3361a9..1a2aa28b 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -52,7 +52,7 @@ def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, **k for nodes in args: assert isinstance(nodes, SumNodes) or isinstance(nodes, InputNodes), f"Children of product nodes must be input or sum nodes, but found input of type {type(nodes)}." if edge_ids is None: - assert nodes.num_node_groups == num_node_groups, "Input nodes should have the same `num_node_groups`." + assert nodes.num_node_groups == num_node_groups, f"Input nodes should have the same `num_node_groups`, but got {nodes.num_node_groups} and {num_node_groups}." assert nodes.group_size == group_size, "Input nodes should have the same `num_node_groups`." assert len(nodes.scope & scope) == 0, "Children of a `ProdNodes` should have disjoint scopes." 
chs.append(nodes) From 9315bf6e18918a244dbf5d7ba9c631abb8ed1b30 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 04:15:07 +0800 Subject: [PATCH 128/162] adjust runtests --- tests/structures/pd_hclt_test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/structures/pd_hclt_test.py b/tests/structures/pd_hclt_test.py index ffc2fd65..5365321d 100644 --- a/tests/structures/pd_hclt_test.py +++ b/tests/structures/pd_hclt_test.py @@ -143,7 +143,7 @@ def pd_hclt_test(): ns = juice.structures.PDHCLT( train_data.cuda(), data_shape = (28, 28), - num_latents = 32, + num_latents = 128, split_intervals = (4, 4), structure_type = "sum_dominated" ) @@ -161,6 +161,13 @@ def pd_hclt_test(): pc.print_statistics() + # for batch in train_loader: + # x = batch[0].to(device) + + # lls = pc(x, record_cudagraph = True) + # lls.mean().backward() + # break + mini_batch_em_epoch(350, pc, optimizer, None, train_loader, test_loader, device) full_batch_em_epoch(pc, train_loader, test_loader, device) From a9129aacb8955b70d98b1ddadc0af6eb37ffbdde Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 18:36:50 +0800 Subject: [PATCH 129/162] fix compilation for multi-head PC --- src/pyjuice/structures/compilation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyjuice/structures/compilation.py b/src/pyjuice/structures/compilation.py index 683d00f7..e477f910 100644 --- a/src/pyjuice/structures/compilation.py +++ b/src/pyjuice/structures/compilation.py @@ -72,7 +72,7 @@ def children(n: int): rp = multiply(*ch_regions) if v == root: - if group_size == 1: + if num_root_ns == 1: r = summate(rp, num_node_groups = num_root_ns, group_size = 1) else: r = summate(rp, num_node_groups = num_root_ns // group_size, group_size = group_size) From 74beb4ad3be1c58d827807fe48188e1ece3e7185 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 20:38:10 +0800 Subject: [PATCH 130/162] fix io --- src/pyjuice/io/io.py | 2 +- src/pyjuice/io/serialization.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pyjuice/io/io.py b/src/pyjuice/io/io.py index f9387aee..edbe450e 100644 --- a/src/pyjuice/io/io.py +++ b/src/pyjuice/io/io.py @@ -13,7 +13,7 @@ def save(fname: str, model: Union[CircuitNodes,TensorCircuit]): if isinstance(model, TensorCircuit): model.update_parameters() - root_ns = model.root_nodes + root_ns = model.root_ns else: root_ns = model diff --git a/src/pyjuice/io/serialization.py b/src/pyjuice/io/serialization.py index 0f3a8c71..43cdfda5 100644 --- a/src/pyjuice/io/serialization.py +++ b/src/pyjuice/io/serialization.py @@ -59,10 +59,10 @@ def deserialize_nodes(nodes_list: Sequence): scope = ns_info["scope"] dist = pickle.loads(ns_info["dist"]) - ns = inputs(scope, num_node_groups, dist) + ns = inputs(scope, num_node_groups, dist, group_size = group_size) if "params" in ns_info: - ns._params = torch.from_numpy(ns_info["params"]) + ns.set_params(torch.from_numpy(ns_info["params"])) elif ns_info["type"] == "Product": chs = [id2ns[cid] for cid in chids] From b599a9619be6c655f368b39f00d3f481f93028b7 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 20:38:45 +0800 Subject: [PATCH 131/162] speedup input layer kernel launches --- src/pyjuice/layer/input_layer.py | 33 ++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/pyjuice/layer/input_layer.py b/src/pyjuice/layer/input_layer.py index d6d440a7..7f3e06dc 100644 --- a/src/pyjuice/layer/input_layer.py +++ 
b/src/pyjuice/layer/input_layer.py @@ -17,6 +17,7 @@ from pyjuice.utils.grad_fns import ReverseGrad from pyjuice.utils import BitSet from pyjuice.utils.source2fn import make_function_from_src +from pyjuice.utils.kernel_launcher import FastJITFunction from .layer import Layer @@ -235,7 +236,10 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ if not self.provided("_mars_kernel"): self._mars_kernel = self._compile_triton_kernel(self._mars_kernel_template, mar_fn = self.fw_mar_fn) - grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) + BLOCK_SIZE = 1024 + + grid = (triton.cdiv(layer_num_nodes * batch_size, BLOCK_SIZE),) + self._mars_kernel[grid]( params_ptr = self.params, node_mars_ptr = node_mars, @@ -250,7 +254,7 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ num_vars_per_node = self.num_vars_per_node, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), node_offset = node_offset, - BLOCK_SIZE = 1024, + BLOCK_SIZE = BLOCK_SIZE, partial_eval = 1 if fw_local_ids is not None else 0, num_warps = 8 ) @@ -261,7 +265,8 @@ def forward(self, data: torch.Tensor, node_mars: torch.Tensor, params: Optional[ mask_dim = missing_mask.dim() - grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) + grid = (triton.cdiv(layer_num_nodes * batch_size, BLOCK_SIZE),) + self._fw_missing_mask_kernel[grid]( missing_mask_ptr = missing_mask, node_mars_ptr = node_mars, @@ -312,7 +317,10 @@ def backward(self, data: torch.Tensor, node_flows: torch.Tensor, if not self.provided("_flows_kernel"): self._flows_kernel = self._compile_triton_kernel(self._flows_kernel_template, flow_fn = self.bk_flow_fn) - grid = lambda meta: (triton.cdiv(layer_num_nodes * batch_size, meta['BLOCK_SIZE']),) + BLOCK_SIZE = 1024 + + grid = (triton.cdiv(layer_num_nodes * batch_size, BLOCK_SIZE),) + self._flows_kernel[grid]( params_ptr = self.params, param_flows_ptr = self.param_flows, @@ -330,7 +338,7 @@ def backward(self, data: torch.Tensor, node_flows: torch.Tensor, num_vars_per_node = self.num_vars_per_node, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), node_offset = node_offset, - BLOCK_SIZE = 1024, + BLOCK_SIZE = BLOCK_SIZE, partial_eval = 1 if bk_local_ids is not None else 0, num_warps = 8 ) @@ -367,7 +375,10 @@ def sample(self, samples: torch.Tensor, node_flows: torch.Tensor, missing_mask: if not self.provided("_sample_kernel"): self._sample_kernel = self._compile_triton_kernel(self._sample_kernel_template, sample_fn = self.sample_fn) - grid = lambda meta: (triton.cdiv(num_activ_nodes, meta['BLOCK_SIZE']),) + BLOCK_SIZE = 1024 + + grid = (triton.cdiv(num_activ_nodes, BLOCK_SIZE),) + self._sample_kernel[grid]( samples_ptr = samples, params_ptr = params, @@ -381,7 +392,7 @@ def sample(self, samples: torch.Tensor, node_flows: torch.Tensor, missing_mask: num_vars_per_node = self.num_vars_per_node, nv_block_size = triton.next_power_of_2(self.num_vars_per_node), batch_size = batch_size, - BLOCK_SIZE = 2048, + BLOCK_SIZE = BLOCK_SIZE, seed = seed if seed is not None else random.randint(0, 1e8) ) @@ -426,7 +437,9 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): constexprs = torch.tensor([step_size, pseudocount], dtype = torch.float32, device = self.device) - grid = lambda meta: (triton.cdiv(layer_num_source_nodes, meta['BLOCK_SIZE']),) + BLOCK_SIZE = 1024 + + grid = (triton.cdiv(layer_num_source_nodes, BLOCK_SIZE),) self._em_kernel[grid]( params_ptr = self.params, @@ 
-438,7 +451,7 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): source_nids_ptr = self.source_nids, constexprs_ptr = constexprs, layer_num_source_nodes = layer_num_source_nodes, - BLOCK_SIZE = 1024, + BLOCK_SIZE = BLOCK_SIZE, num_warps = 8 ) @@ -863,4 +876,4 @@ def parse_source(src, get_signature = False): # Make a pseudo-function from the source code new_fn = make_function_from_src(new_src) - return JITFunction(new_fn) \ No newline at end of file + return FastJITFunction(new_fn) \ No newline at end of file From c9ef27faf5fa12bd3a945c0c97fc55b23182e5eb Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 20:39:21 +0800 Subject: [PATCH 132/162] add normalization option in `set_params` of input nodes --- src/pyjuice/nodes/distributions/categorical.py | 6 ++++++ src/pyjuice/nodes/distributions/distributions.py | 3 +++ src/pyjuice/nodes/input_nodes.py | 7 ++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/pyjuice/nodes/distributions/categorical.py b/src/pyjuice/nodes/distributions/categorical.py index a10342ac..ec5abda5 100644 --- a/src/pyjuice/nodes/distributions/categorical.py +++ b/src/pyjuice/nodes/distributions/categorical.py @@ -19,6 +19,12 @@ def get_signature(self): def get_metadata(self): return [self.num_cats] + def normalize_parameters(self, params: torch.Tensor): + params = params.reshape(-1, self.num_cats) + params /= params.sum(dim = 1, keepdim = True) + + return params.reshape(-1) + def num_parameters(self): return self.num_cats diff --git a/src/pyjuice/nodes/distributions/distributions.py b/src/pyjuice/nodes/distributions/distributions.py index c574b659..16c57075 100644 --- a/src/pyjuice/nodes/distributions/distributions.py +++ b/src/pyjuice/nodes/distributions/distributions.py @@ -13,6 +13,9 @@ def get_signature(self): def get_metadata(self): return [] # no metadata + def normalize_params(self, params: torch.Tensor): + return params + def num_parameters(self): """ The number of parameters per node. 
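(A standalone sketch of the per-node normalization that `Categorical.normalize_parameters` performs above; the flat-parameter layout of `num_cats` entries per node is assumed from the surrounding code, and this snippet is illustrative rather than part of the patch.)

    import torch

    def normalize_categorical(params: torch.Tensor, num_cats: int) -> torch.Tensor:
        # View the flat parameter vector as [num_nodes, num_cats], renormalize each row, flatten back.
        mat = params.reshape(-1, num_cats)
        mat = mat / mat.sum(dim = 1, keepdim = True)
        return mat.reshape(-1)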
diff --git a/src/pyjuice/nodes/input_nodes.py b/src/pyjuice/nodes/input_nodes.py index 6ec0eb12..3bc85fee 100644 --- a/src/pyjuice/nodes/input_nodes.py +++ b/src/pyjuice/nodes/input_nodes.py @@ -58,7 +58,12 @@ def get_params(self): def set_params(self, params: torch.Tensor, normalize: bool = True): assert params.numel() == self.num_nodes * self.dist.num_parameters() - self._params = params.reshape(-1) + + params = params.reshape(-1) + if normalize: + params = self.dist.normalize_params(params) + + self._params = params def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_root: bool = True, ret_params: bool = False, **kwargs): From e4ba4189b3879990e2d91663adba1b1e62ffd6dd Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 20:39:38 +0800 Subject: [PATCH 133/162] add io parameter tests --- tests/io/io_test.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/io/io_test.py b/tests/io/io_test.py index 0cf013a0..a06e0a88 100644 --- a/tests/io/io_test.py +++ b/tests/io/io_test.py @@ -46,5 +46,38 @@ def io_test(): assert n0.chs[1].chs[1].dist.num_cats == n0_dup.chs[1].chs[1].dist.num_cats +def io_param_test(): + num_node_groups = 2 + group_size = 4 + + with juice.set_group_size(group_size): + i00 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i01 = inputs(0, num_node_groups, dists.Categorical(num_cats = 5)) + i10 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + i11 = inputs(1, num_node_groups, dists.Categorical(num_cats = 5)) + + m00 = multiply(i00, i10) + m01 = multiply(i01, i11) + + n0 = summate(m00, m01, num_node_groups = num_node_groups) + + n0.init_parameters() + + pc = juice.TensorCircuit(n0) + + temp_file = tempfile.NamedTemporaryFile(suffix='.jpc') + temp_file_name = temp_file.name + save(temp_file_name, pc) + + n0_dup = load(temp_file_name) + + assert torch.all(torch.abs(n0._params - n0_dup._params) < 1e-4) + assert torch.all(torch.abs(n0.chs[0].chs[0]._params - n0_dup.chs[0].chs[0]._params) < 1e-4) + assert torch.all(torch.abs(n0.chs[0].chs[1]._params - n0_dup.chs[0].chs[1]._params) < 1e-4) + assert torch.all(torch.abs(n0.chs[1].chs[0]._params - n0_dup.chs[1].chs[0]._params) < 1e-4) + assert torch.all(torch.abs(n0.chs[1].chs[1]._params - n0_dup.chs[1].chs[1]._params) < 1e-4) + + if __name__ == "__main__": - io_test() \ No newline at end of file + io_test() + io_param_test() From f13bdfe4f10791cb3f45447a1d7b9a0ecd93e30e Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 20:44:36 +0800 Subject: [PATCH 134/162] export `load` and `save` in root --- src/pyjuice/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pyjuice/__init__.py b/src/pyjuice/__init__.py index fd280dc8..2513f456 100644 --- a/src/pyjuice/__init__.py +++ b/src/pyjuice/__init__.py @@ -18,4 +18,7 @@ from pyjuice.nodes.methods.lvd import LVDistiller # Commonly-used transformations -from .transformations import merge +from pyjuice.transformations import merge + +# IO +from pyjuice.io import load, save From 663ce794796fabadd827ed77372da2b18c31782b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 21:15:28 +0800 Subject: [PATCH 135/162] speedup par_update fn compilation --- src/pyjuice/model/backend/par_update.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pyjuice/model/backend/par_update.py b/src/pyjuice/model/backend/par_update.py index d21c1b4d..e3c9b3c1 100644 --- a/src/pyjuice/model/backend/par_update.py +++ 
b/src/pyjuice/model/backend/par_update.py @@ -58,7 +58,7 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in pid = 0 global_nid = 0 - for ns in root_ns: + for i, ns in enumerate(root_ns): if not ns.is_sum() or ns.is_tied(): continue @@ -98,8 +98,9 @@ def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_in nchs_new[:curr_size] = nchs[:curr_size] nchs = nchs_new - if use_numba: + buffer_inc_interval *= 2 + if use_numba: ns_num_node_groups = ns.num_node_groups ns_group_size = ns.group_size cs_group_size = ns.ch_group_size From 7216c1fe73b6577af30ec91027fc5054c6245705 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Fri, 29 Dec 2023 22:29:39 +0800 Subject: [PATCH 136/162] fix backward sum kernel increment bug --- src/pyjuice/layer/sum_layer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index d1089caa..b6e1243c 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -1208,7 +1208,7 @@ def _bk_triton_block_sparse_ele_kernel(node_flows, element_flows, node_mars, ele if allow_modify_flows == 0: nmars_ptr += parids_inc[:,None] * batch_size nflows_ptr += parids_inc[:,None] * batch_size - parids_inc += ptr_inc_step + parids_inc_ptr += ptr_inc_step # Initialize pointers to `element_mars` off_eleids = tl.load(chids + elegroup_id) @@ -1311,7 +1311,7 @@ def _bk_triton_block_sparse_ele_csmm2_kernel(node_flows, element_flows, node_mar if allow_modify_flows == 0: nmars_ptr += parids_inc[None,:] * batch_size nflows_ptr += parids_inc[None,:] * batch_size - parids_inc += ptr_inc_step + parids_inc_ptr += ptr_inc_step # Initialize pointers to `element_mars` off_eleids = tl.load(chids + elegroup_id) @@ -1382,7 +1382,7 @@ def _backward_block_sparse_ele_flows(self, node_flows: torch.Tensor, element_flo parpids_start = parpids[:,0,:].contiguous() parpids_increment = torch.cat( - (parpids[:,1:,:] - parpids[:,:-1], parpids[:,0:1,:] * 0), + (parpids[:,1:,:] - parpids[:,:-1,:], parpids[:,0:1,:] * 0), dim = 1 ).contiguous() From f0c629edcbc837d5d1a16d3b3327fc385f763651 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 03:05:57 +0800 Subject: [PATCH 137/162] add option to maintain zero parameters --- src/pyjuice/model/backend/par_update.py | 13 ++++++++++--- src/pyjuice/model/tensorcircuit.py | 6 ++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/pyjuice/model/backend/par_update.py b/src/pyjuice/model/backend/par_update.py index e3c9b3c1..e0809e95 100644 --- a/src/pyjuice/model/backend/par_update.py +++ b/src/pyjuice/model/backend/par_update.py @@ -195,7 +195,8 @@ def cum_pflow_kernel(cum_pflows, param_flows, pflow_start_ids, blk_sizes, blk_in @triton.jit def par_update_kernel(params, param_flows, cum_pflows, nchs, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, - global_nids, constexprs, num_blocks, BLOCK_ID: tl.constexpr, BLOCK_SIZE: tl.constexpr): + global_nids, constexprs, num_blocks, keep_zero_params: tl.constexpr, BLOCK_ID: tl.constexpr, + BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) @@ -227,11 +228,15 @@ def par_update_kernel(params, param_flows, cum_pflows, nchs, par_start_ids, pflo old_param = tl.load(params + offs_par, mask = mask_pflow, other = 0) updated_param = (1.0 - step_size) * old_param + step_size * new_param + + if keep_zero_params == 1: + updated_params = tl.where(old_param < 1e-12, 0.0, updated_params) + tl.store(params + offs_par, updated_param, mask = mask_pflow) def 
em_par_update(params: torch.Tensor, param_flows: torch.Tensor, par_update_kwargs: Sequence, - step_size: float, pseudocount: float = 0.0): + step_size: float, pseudocount: float = 0.0, keep_zero_params: bool = True): par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, nchs, cum_pflows, metadata = par_update_kwargs @@ -255,7 +260,9 @@ def em_par_update(params: torch.Tensor, param_flows: torch.Tensor, par_update_kw constexprs = torch.tensor([step_size, pseudocount]).to(params.device) + keep_zero_params = 1 if keep_zero_params else 0 + par_update_kernel[grid]( params, param_flows, cum_pflows, nchs, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, - global_nids, constexprs, num_blocks, BLOCK_ID, BLOCK_SIZE + global_nids, constexprs, num_blocks, keep_zero_params, BLOCK_ID, BLOCK_SIZE ) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index b0c07ae7..c8ec15a0 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -359,7 +359,7 @@ def _run_inner_layers(): else: return None - def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): + def mini_batch_em(self, step_size: float, pseudocount: float = 0.0, keep_zero_params: bool = False): # Update input layers for layer in self.input_layer_group: layer.mini_batch_em(step_size = step_size, pseudocount = pseudocount) @@ -368,7 +368,9 @@ def mini_batch_em(self, step_size: float, pseudocount: float = 0.0): compute_cum_par_flows(self.param_flows, self.parflow_fusing_kwargs) # Normalize and update parameters - em_par_update(self.params, self.param_flows, self.par_update_kwargs, step_size = step_size, pseudocount = pseudocount) + em_par_update(self.params, self.param_flows, self.par_update_kwargs, + step_size = step_size, pseudocount = pseudocount, + keep_zero_params = keep_zero_params) def cumulate_flows(self, inputs: torch.Tensor, params: Optional[torch.Tensor] = None): with torch.no_grad(): From cd25b7d4232ac957fe90d983feaaed80e633122b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 03:06:30 +0800 Subject: [PATCH 138/162] fix triton error on sparse backward kernel for large models --- src/pyjuice/layer/sum_layer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/pyjuice/layer/sum_layer.py b/src/pyjuice/layer/sum_layer.py index b6e1243c..0e8269a6 100644 --- a/src/pyjuice/layer/sum_layer.py +++ b/src/pyjuice/layer/sum_layer.py @@ -1897,8 +1897,9 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa acc = tl.zeros([BLOCK_K], dtype = tl.float32) for b in range(0, B_NUM_BLOCKS): - # Update batch mask - mask_batch = (offs_batch < batch_size) + # Batch offsets and mask + offs_batch = tl.arange(0, BLOCK_B) + pid_b * TILE_SIZE_B + b * BLOCK_B + mask_batch = offs_batch < batch_size emars = tl.load(emars_ptr, mask = mask_batch[None,:]) # [BLOCK_K, BLOCK_B] @@ -1917,9 +1918,6 @@ def _bk_triton_sparse_par_kernel(node_flows, node_mars, element_mars, params, pa nmars_ptr += BLOCK_B nflows_ptr += BLOCK_B - # Update batch offsets - offs_batch += BLOCK_B - par_start = tl.load(pids + ngroup_id * num_edges + offs_edge) epars_ptr = params + par_start + tile_id epars = tl.load(epars_ptr) # [BLOCK_K] @@ -2005,6 +2003,8 @@ def _backward_sparse_par_flows(self, node_flows: torch.Tensor, params: torch.Ten B_NUM_BLOCKS = B_NUM_BLOCKS ) + return None + def _backward_pytorch(self, node_flows, element_flows, params, node_mars, element_mars, param_flows, nids, cids, pids, pfids, chids, parids, 
parpids, cs_group_size): From 9ca14c6a0b80d9220b4ac01f6653175830aad824 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 04:06:37 +0800 Subject: [PATCH 139/162] a seemingly stable version --- tests/structures/hclt_test.py | 2 +- tests/structures/pd_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index 46362dbb..c3033833 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -93,7 +93,7 @@ def hclt_test(): train_data.float().to(device), num_bins = 32, sigma = 0.5 / 32, - num_latents = 128, + num_latents = 512, chunk_size = 32 ) pc = juice.TensorCircuit(ns) diff --git a/tests/structures/pd_test.py b/tests/structures/pd_test.py index 90164073..f6921b2d 100644 --- a/tests/structures/pd_test.py +++ b/tests/structures/pd_test.py @@ -92,7 +92,7 @@ def pd_test(): ns = juice.structures.PD( data_shape = (28, 28), - num_latents = 128, + num_latents = 256, split_intervals = (4, 4), structure_type = "sum_dominated" ) From 7cc924237cd8093f57e895130ef8dc3c73caaffc Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 04:06:54 +0800 Subject: [PATCH 140/162] add grouping function --- src/pyjuice/transformations/group.py | 108 +++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 src/pyjuice/transformations/group.py diff --git a/src/pyjuice/transformations/group.py b/src/pyjuice/transformations/group.py new file mode 100644 index 00000000..165574e9 --- /dev/null +++ b/src/pyjuice/transformations/group.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +from typing import Optional, Dict, Sequence + +from pyjuice.nodes import CircuitNodes, InputNodes, ProdNodes, SumNodes +from pyjuice.utils import BitSet +from pyjuice.utils.util import max_cdf_power_of_2 + + +def group(root_ns: CircuitNodes, sparsity_tolerance: float = 0.25, max_target_group_size: int = 32): + + ## Do an initial pass to compute the maximum group size of every `ns` ## + + ns2group_size = dict() + for ns in root_ns: + if ns.is_input(): + ns2group_size[ns] = min(max_cdf_power_of_2(ns.num_nodes), max_target_group_size) + + elif ns.is_prod(): + ns2group_size[ns] = min(max_cdf_power_of_2(ns.num_nodes), max_target_group_size) + + else: + assert ns.is_sum() + + old_group_size = ns.group_size + old_cs_group_size = ns.cs_group_size + edge_ids = ns.edge_ids + + old_ns_num_ngroups = ns.num_node_groups + old_cs_num_ngroups = sum([cs.num_node_groups for cs in ns.chs]) + + flag = False + plausible_combinations = list() + + group_size = min(max_cdf_power_of_2(ns.num_nodes), max_target_group_size) + while group_size > old_group_size: + group_mul_size = group_size // old_group_size + + ns_num_ngroups = old_ns_num_ngroups // group_mul_size + + cs_group_size = ns2group_size[ns.chs[0]] + while cs_group_size > old_cs_group_size: + cs_group_mul_size = cs_group_size // old_cs_group_size + + cs_num_ngroups = old_cs_num_ngroups // group_mul_size + + n_edge_ids = edge_ids[0,:] // group_mul_size + c_edge_ids = edge_ids[1,:] // cs_group_mul_size + _, counts = torch.unique(n_edge_ids * cs_num_ngroups + c_edge_ids, return_counts = True) + + if torch.all(counts >= (1.0 - sparsity_tolerance) * group_mul_size * cs_group_mul_size): + plausible_combinations.append((group_size, cs_group_size)) + + cs_group_size = cs_group_size // 2 + + group_size = group_size // 2 + + # Find the best group size combination + best_group_size = 0 + best_cs_group_size = 0 + for group_size, cs_group_size in 
plausible_combinations: + if group_size >= 16 and cs_group_size >= 16: + best_group_size = group_size + best_cs_group_size = cs_group_size + break + + if best_group_size == 0: + best_val = 0 + best_frac = 0 + for group_size, cs_group_size in plausible_combinations: + cond1 = group_size * cs_group_size > best_val + cond2 = (group_size * cs_group_size > best_val) and \ + (max(group_size, cs_group_size) // min(group_size, cs_group_size) < best_frac) + if cond1 or cond2: + best_group_size = group_size + best_cs_group_size = cs_group_size + best_val = group_size * cs_group_size + best_frac = max(group_size, cs_group_size) // min(group_size, cs_group_size) + + ns2group_size[ns] = best_group_size + for cs in ns.chs: + ns2group_size[cs] = best_cs_group_size + + ## Do a second pass to finalize the group sizes ## + + for ns in root_ns: + if ns.is_prod(): + group_size = ns2group_size[ns] + for cs in ns.chs: + group_size = min(group_size, ns2group_size[cs]) + + ns2group_size[ns] = group_size + for cs in ns.chs: + ns2group_size[cs] = group_size + + ## Apply the new group sizes ## + + def update_ns(ns: CircuitNodes, ns_chs: Sequence[CircuitNodes]): + if ns.isinput(): + pass + + elif ns.isprod(): + pass + + else: + assert ns.issum() + + return foldup_aggregate(update_ns, root_ns) From 38ce2bc7bd1d4479b1427937ab732fd49f71567b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 17:04:07 +0800 Subject: [PATCH 141/162] add "sparse edge" mode for product nodes --- src/pyjuice/nodes/prod_nodes.py | 68 ++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/src/pyjuice/nodes/prod_nodes.py b/src/pyjuice/nodes/prod_nodes.py index 88707ec3..b35e0b84 100644 --- a/src/pyjuice/nodes/prod_nodes.py +++ b/src/pyjuice/nodes/prod_nodes.py @@ -13,6 +13,10 @@ class ProdNodes(CircuitNodes): + + SPARSE = 0 + BLOCK_SPARSE = 1 + def __init__(self, num_node_groups: int, chs: Sequence[CircuitNodes], edge_ids: Optional[Tensor] = None, group_size: int = 0, **kwargs) -> None: rg_node = PartitionNode([ch.region_node for ch in chs]) @@ -27,28 +31,26 @@ def __init__(self, num_node_groups: int, chs: Sequence[CircuitNodes], edge_ids: # Callbacks self._run_init_callbacks(**kwargs) - def _construct_edges(self, edge_ids: Optional[Tensor]): - if edge_ids is None: - for c in self.chs: - assert self.num_node_groups == c.num_node_groups and self.group_size == c.group_size, \ - "Cannot create edges implicitly since # nodes do not match." - - edge_ids = torch.arange(self.num_node_groups).unsqueeze(1).repeat(1, self.num_chs) - - if isinstance(edge_ids, np.ndarray): - edge_ids = torch.from_numpy(edge_ids) + @property + def num_edges(self): + return self.num_nodes * self.num_chs - # Sanity checks - assert edge_ids.size(0) == self.num_node_groups and edge_ids.size(1) == self.num_chs, f"Expect edge_ids.size() == ({self.num_node_groups}, {self.num_chs})." - for cid in range(self.num_chs): - assert torch.all(edge_ids[:,cid] >= 0), "Edge index underflow." - assert torch.all(edge_ids[:,cid] < self.chs[cid].num_node_groups), "Edge index overflow." 
+ @property + def edge_type(self): + if self.edge_ids.size(0) == self.num_node_groups: + return self.BLOCK_SPARSE + elif self.edge_ids.size(0) == self.num_nodes: + return self.SPARSE + else: + raise RuntimeError(f"Unexpected shape of `edge_ids`: ({self.edge_ids.size(0)}, {self.edge_ids.size(1)})") - self.edge_ids = edge_ids + @property + def is_block_sparse(self): + return self.edge_type == self.BLOCK_SPARSE @property - def num_edges(self): - return self.edge_ids.size(0) * self.edge_ids.size(1) * self.group_size + def is_sparse(self): + return self.edge_type == self.SPARSE def duplicate(self, *args, tie_params: bool = False): chs = [] @@ -78,4 +80,32 @@ def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_ ) def __repr__(self): - return f"ProdNodes(num_node_groups={self.num_node_groups}, group_size={self.group_size}, num_chs={self.num_chs})" + edge_type = "sparse" if self.edge_type == self.SPARSE else "block_sparse" + return f"ProdNodes(num_node_groups={self.num_node_groups}, group_size={self.group_size}, num_chs={self.num_chs}, edge_type='{edge_type}')" + + def _construct_edges(self, edge_ids: Optional[Tensor]): + if edge_ids is None: + for c in self.chs: + assert self.num_node_groups == c.num_node_groups and self.group_size == c.group_size, \ + "Cannot create edges implicitly since # nodes do not match." + + edge_ids = torch.arange(self.num_node_groups).unsqueeze(1).repeat(1, self.num_chs) + + if isinstance(edge_ids, np.ndarray): + edge_ids = torch.from_numpy(edge_ids) + + # Sanity checks + if edge_ids.size(0) == self.num_node_groups: + assert edge_ids.size(0) == self.num_node_groups and edge_ids.size(1) == self.num_chs, f"Expect edge_ids.size() == ({self.num_node_groups}, {self.num_chs})." + for cid in range(self.num_chs): + assert torch.all(edge_ids[:,cid] >= 0), "Edge index underflow." + assert torch.all(edge_ids[:,cid] < self.chs[cid].num_node_groups), "Edge index overflow." + elif edge_ids.size(0) == self.num_nodes: + assert edge_ids.size(0) == self.num_nodes and edge_ids.size(1) == self.num_chs, f"Expect edge_ids.size() == ({self.num_nodes}, {self.num_chs})." + for cid in range(self.num_chs): + assert torch.all(edge_ids[:,cid] >= 0), "Edge index underflow." + assert torch.all(edge_ids[:,cid] < self.chs[cid].num_nodes), "Edge index overflow." 
+ else: + raise RuntimeError(f"Unexpected shape of `edge_ids`: ({self.edge_ids.size(0)}, {self.edge_ids.size(1)})") + + self.edge_ids = edge_ids From 59c9e818b92e5ee82dc389d239416ca5d3550448 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 17:15:19 +0800 Subject: [PATCH 142/162] `merge` supports sparse prod nodes --- src/pyjuice/transformations/merge.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pyjuice/transformations/merge.py b/src/pyjuice/transformations/merge.py index 5802f27c..3fdc17f9 100644 --- a/src/pyjuice/transformations/merge.py +++ b/src/pyjuice/transformations/merge.py @@ -104,8 +104,11 @@ def merge_prod_nodes(ns1: ProdNodes, ns2: ProdNodes, *args) -> ProdNodes: new_sum_chs.append(merge_sum_nodes(*sum_ns)) prod_edge_ids = [] + use_sparse_mode = any([ns.is_sparse for ns in all_ns]) for ns in all_ns: edge_ids = ns.edge_ids.clone() + if use_sparse_mode and ns.is_block_sparse: + edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) + torch.arange(0, ns.group_size)).flatten(0, 1) for scope_id in range(num_scopes): cs = ns.chs[scope_id] edge_ids[:,scope_id] += cs2start_id[cs] From cf3fe2098d57497d26c22aaec17ab3ecaaa30487 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 18:01:03 +0800 Subject: [PATCH 143/162] sparse product layer compilation --- src/pyjuice/layer/compilation.py | 79 ++++++++++++++++++++-------- src/pyjuice/layer/prod_layer.py | 16 ++++-- src/pyjuice/transformations/merge.py | 2 +- 3 files changed, 70 insertions(+), 27 deletions(-) diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index eb06a39e..985fbbfd 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -846,35 +846,55 @@ def sum_layer_backward_compilation(nodes, cs2parns, n_partition_ids, n_id_in_par ## Compilation for ProdLayer ## +def get_prod_layer_stats(nodes: Sequence[SumNodes], group_size: int, global_nid_start: int, use_block_sparse_edges: bool): + if use_block_sparse_edges: + layer_num_ngroup = sum(map(lambda ns: ns.num_node_groups, nodes)) + layer_num_edges = 0 -def get_prod_layer_stats(nodes: Sequence[SumNodes], group_size: int, global_nid_start: int): - layer_num_ngroup = sum(map(lambda ns: ns.num_node_groups, nodes)) - layer_num_edges = 0 + ng_sid = 0 + n_chgs = torch.zeros([layer_num_ngroup], dtype = torch.long) + for ns_idx, ns in enumerate(nodes): + ng_eid = ng_sid + ns.num_node_groups - ng_sid = 0 - n_chgs = torch.zeros([layer_num_ngroup], dtype = torch.long) - for ns_idx, ns in enumerate(nodes): - ng_eid = ng_sid + ns.num_node_groups + n_chgs[ng_sid:ng_eid] = ns.num_chs - n_chgs[ng_sid:ng_eid] = ns.num_chs + layer_num_edges += ns.num_nodes * ns.num_chs - layer_num_edges += ns.num_nodes * ns.num_chs + ns._output_ind_range = (global_nid_start, global_nid_start + ns.num_nodes) + global_nid_start += ns.num_nodes - ns._output_ind_range = (global_nid_start, global_nid_start + ns.num_nodes) - global_nid_start += ns.num_nodes + ng_sid = ng_eid + else: + layer_num_ngroup = sum(map(lambda ns: ns.num_nodes, nodes)) + layer_num_edges = 0 + + ng_sid = 0 + n_chgs = torch.zeros([layer_num_ngroup], dtype = torch.long) + for ns_idx, ns in enumerate(nodes): + ng_eid = ng_sid + ns.num_nodes + + n_chgs[ng_sid:ng_eid] = ns.num_chs - ng_sid = ng_eid + layer_num_edges += ns.num_nodes * ns.num_chs + + ns._output_ind_range = (global_nid_start, global_nid_start + ns.num_nodes) + global_nid_start += ns.num_nodes + + ng_sid = ng_eid return layer_num_ngroup, layer_num_edges, n_chgs @torch.no_grad() def 
prod_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, n_id_in_partition, num_ngs_in_partition, - group_size, use_cuda: bool = False): + group_size, use_block_sparse_edges: bool, use_cuda: bool = False): if use_cuda and not torch.cuda.is_available(): use_cuda = False + if not use_block_sparse_edges: + assert group_size == 1 + if use_cuda: device = torch.device("cuda:0") else: @@ -890,13 +910,29 @@ def prod_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, # `partition_nchs`: maximum number of child nodes in the current partition partition_id = n_partition_ids[ns_id] local_sid = n_id_in_partition[ns_id] - local_eid = local_sid + ns.num_node_groups + if use_block_sparse_edges: + local_eid = local_sid + ns.num_node_groups + else: + local_eid = local_sid + ns.num_nodes partition_nchs = fw_partition_max_chs[partition_id] - n_sid = ns._output_ind_range[0] - nids[partition_id][local_sid:local_eid] = torch.arange(0, ns.num_nodes, group_size, device = device) + n_sid - for cs_id, cs in enumerate(ns.chs): - cids[partition_id][local_sid:local_eid,cs_id] = ns.edge_ids[:,cs_id].to(device) * group_size + cs._output_ind_range[0] + if use_block_sparse_edges: + n_sid = ns._output_ind_range[0] + nids[partition_id][local_sid:local_eid] = torch.arange(0, ns.num_nodes, group_size, device = device) + n_sid + for cs_id, cs in enumerate(ns.chs): + cids[partition_id][local_sid:local_eid,cs_id] = ns.edge_ids[:,cs_id].to(device) * group_size + cs._output_ind_range[0] + else: + n_sid = ns._output_ind_range[0] + nids[partition_id][local_sid:local_eid] = torch.arange(0, ns.num_nodes, device = device) + n_sid + if ns.is_sparse: + for cs_id, cs in enumerate(ns.chs): + cids[partition_id][local_sid:local_eid,cs_id] = ns.edge_ids[:,cs_id].to(device) + cs._output_ind_range[0] + else: + assert ns.is_block_sparse + edge_ids = ns.edge_ids.clone() + edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) + torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) + for cs_id, cs in enumerate(ns.chs): + cids[partition_id][local_sid:local_eid,cs_id] = edge_ids[:,cs_id].to(device) + cs._output_ind_range[0] if use_cuda: nids = [tensor.cpu() for tensor in nids] @@ -906,7 +942,7 @@ def prod_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, @torch.no_grad() -def flatten_c_ids(nids: torch.Tensor, cids: torch.Tensor): +def flatten_c_ids(nids: Sequence[torch.Tensor], cids: Sequence[torch.Tensor]): num_cid_slots = sum(map(lambda x: x.size(0) * x.size(1), cids)) flat_cids = torch.zeros([num_cid_slots], dtype = torch.long) @@ -1026,9 +1062,8 @@ def _assign_prod_target_parids_kernel(target_parids_ptr, flat_cid2nid_ptr, flat_ @torch.no_grad() -def prod_layer_backward_compilation(flat_u_cids, flat_cids, flat_cid2nid, - bk_partition_max_pars, n_partition_ids, n_id_in_partition, num_ns_in_partition, - use_cuda: bool = False): +def prod_layer_backward_compilation(flat_u_cids, flat_cids, flat_cid2nid, bk_partition_max_pars, n_partition_ids, + n_id_in_partition, num_ns_in_partition, use_cuda: bool = False): if use_cuda and not torch.cuda.is_available(): use_cuda = False diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 78d1dde2..88db8b11 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -28,18 +28,25 @@ def __init__(self, nodes: Sequence[ProdNodes], global_nid_start: Optional[int] = assert len(nodes) > 0, "No input node." 
+ use_block_sparse_edges = True for nid in range(1, len(nodes)): - assert nodes[0].group_size == nodes[nid].group_size, f"`group_size` within a `ProdLayer` should be the same, but found {nodes[0].group_size} and {nodes[nid].group_size}." + if nodes[0].group_size != nodes[nid].group_size or nodes[nid].is_sparse: + use_block_sparse_edges = False + break + self.use_block_sparse_edges = use_block_sparse_edges self.nodes = nodes - self.group_size = nodes[0].group_size + self.group_size = nodes[0].group_size if self.use_block_sparse_edges else 1 if global_nid_start is None: global_nid_start = self.group_size ## Get layer statistics & prepare for compilation ## - layer_num_ngroups, layer_num_edges, n_chgs = get_prod_layer_stats(self.nodes, self.group_size, global_nid_start = global_nid_start) + layer_num_ngroups, layer_num_edges, n_chgs = get_prod_layer_stats( + self.nodes, self.group_size, global_nid_start = global_nid_start, + use_block_sparse_edges = self.use_block_sparse_edges + ) self.num_nodes = layer_num_ngroups * self.group_size self.num_edges = layer_num_edges @@ -76,7 +83,8 @@ def __init__(self, nodes: Sequence[ProdNodes], global_nid_start: Optional[int] = # nids: List[[partition_size]] stores node ids # cids: List[[partition_size, partition_max_n_chs]] stores indices of child nodes nids, cids = prod_layer_forward_compilation( - self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, fw_num_ngs_in_partition, self.group_size + self.nodes, fw_partition_max_chs, fw_n_partition_ids, fw_n_id_in_partition, fw_num_ngs_in_partition, + self.group_size, self.use_block_sparse_edges ) # Store buffers for the forward pass diff --git a/src/pyjuice/transformations/merge.py b/src/pyjuice/transformations/merge.py index 3fdc17f9..1677a2f7 100644 --- a/src/pyjuice/transformations/merge.py +++ b/src/pyjuice/transformations/merge.py @@ -108,7 +108,7 @@ def merge_prod_nodes(ns1: ProdNodes, ns2: ProdNodes, *args) -> ProdNodes: for ns in all_ns: edge_ids = ns.edge_ids.clone() if use_sparse_mode and ns.is_block_sparse: - edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) + torch.arange(0, ns.group_size)).flatten(0, 1) + edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) + torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) for scope_id in range(num_scopes): cs = ns.chs[scope_id] edge_ids[:,scope_id] += cs2start_id[cs] From dca4ebd27fc15097e38d4ad38a8ef40ee786cf4e Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 18:19:15 +0800 Subject: [PATCH 144/162] restore commented sum layer tests --- tests/layer/sum_layer_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/layer/sum_layer_test.py b/tests/layer/sum_layer_test.py index 33351a17..f19515be 100644 --- a/tests/layer/sum_layer_test.py +++ b/tests/layer/sum_layer_test.py @@ -362,6 +362,6 @@ def speed_test(): if __name__ == "__main__": torch.manual_seed(3890) - # sum_layer_test() + sum_layer_test() corner_case_test() - # speed_test() \ No newline at end of file + speed_test() \ No newline at end of file From 1aa439ad60f763d2f86f515a37b00ea1241f4a55 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 19:53:40 +0800 Subject: [PATCH 145/162] runtests for sparse product layers --- src/pyjuice/layer/compilation.py | 2 +- src/pyjuice/layer/prod_layer.py | 5 +- src/pyjuice/nodes/construction.py | 4 +- src/pyjuice/transformations/merge.py | 2 +- tests/layer/sparse_prod_layer_test.py | 107 ++++++++++++++++++++++++++ 5 files changed, 115 insertions(+), 5 deletions(-) create 
mode 100644 tests/layer/sparse_prod_layer_test.py diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py index 985fbbfd..afb187fd 100644 --- a/src/pyjuice/layer/compilation.py +++ b/src/pyjuice/layer/compilation.py @@ -930,7 +930,7 @@ def prod_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids, else: assert ns.is_block_sparse edge_ids = ns.edge_ids.clone() - edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) + torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) + edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) * ns.group_size + torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) for cs_id, cs in enumerate(ns.chs): cids[partition_id][local_sid:local_eid,cs_id] = edge_ids[:,cs_id].to(device) + cs._output_ind_range[0] diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 88db8b11..072390b2 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -76,7 +76,10 @@ def __init__(self, nodes: Sequence[ProdNodes], global_nid_start: Optional[int] = fw_n_partition_ids[ns_id] = partition_id fw_n_id_in_partition[ns_id] = fw_num_ngs_in_partition[partition_id] - fw_num_ngs_in_partition[partition_id] += ns.num_node_groups + if self.use_block_sparse_edges: + fw_num_ngs_in_partition[partition_id] += ns.num_node_groups + else: + fw_num_ngs_in_partition[partition_id] += ns.num_nodes ## Initialize forward pass ## diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index 1a2aa28b..fe1558e1 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -50,7 +50,7 @@ def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, **k scope = deepcopy(nodes1.scope) for nodes in args: - assert isinstance(nodes, SumNodes) or isinstance(nodes, InputNodes), f"Children of product nodes must be input or sum nodes, but found input of type {type(nodes)}." + assert nodes.is_input() or nodes.is_sum(), f"Children of product nodes must be input or sum nodes, but found input of type {type(nodes)}." if edge_ids is None: assert nodes.num_node_groups == num_node_groups, f"Input nodes should have the same `num_node_groups`, but got {nodes.num_node_groups} and {num_node_groups}." assert nodes.group_size == group_size, "Input nodes should have the same `num_node_groups`." 
@@ -59,7 +59,7 @@ def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, **k scope |= nodes.scope if edge_ids is not None: - num_node_groups = edge_ids.shape[0] + assert edge_ids.shape[0] == num_node_groups or edge_ids.shape[0] == num_node_groups * group_size return ProdNodes(num_node_groups, chs, edge_ids, group_size = group_size, **kwargs) diff --git a/src/pyjuice/transformations/merge.py b/src/pyjuice/transformations/merge.py index 1677a2f7..551db4ab 100644 --- a/src/pyjuice/transformations/merge.py +++ b/src/pyjuice/transformations/merge.py @@ -108,7 +108,7 @@ def merge_prod_nodes(ns1: ProdNodes, ns2: ProdNodes, *args) -> ProdNodes: for ns in all_ns: edge_ids = ns.edge_ids.clone() if use_sparse_mode and ns.is_block_sparse: - edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) + torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) + edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) * ns.group_size + torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) for scope_id in range(num_scopes): cs = ns.chs[scope_id] edge_ids[:,scope_id] += cs2start_id[cs] diff --git a/tests/layer/sparse_prod_layer_test.py b/tests/layer/sparse_prod_layer_test.py new file mode 100644 index 00000000..dcee47af --- /dev/null +++ b/tests/layer/sparse_prod_layer_test.py @@ -0,0 +1,107 @@ +import pyjuice as juice +import torch +import numpy as np +import time +import random + +import pyjuice.nodes.distributions as dists +from pyjuice.utils import BitSet +from pyjuice.nodes import multiply, summate, inputs +from pyjuice.model import TensorCircuit + +from pyjuice.layer import InputLayer, ProdLayer + +import pytest + + +def sparse_prod_layer_test(): + + device = torch.device("cuda:0") + + group_size = 16 + batch_size = 16 + + with juice.set_group_size(group_size): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + + np0 = multiply(ni0, ni1) + np1 = multiply(ni2, ni3, edge_ids = torch.arange(0, group_size * 2)[:,None].repeat(1, 2)) + np2 = multiply(ni1, ni2) + + input_layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = group_size) + + layer = ProdLayer([np0, np1, np2]) + + assert torch.all(layer.partitioned_nids[0] == torch.arange(1, 1+6*group_size)) + assert torch.all(layer.partitioned_cids[0][0,:] == torch.tensor([1 * group_size, 3 * group_size])) + assert torch.all(layer.partitioned_cids[0][1:16,:] - layer.partitioned_cids[0][0:15,:] == 1) + assert torch.all(layer.partitioned_cids[0][16,:] == torch.tensor([2 * group_size, 4 * group_size])) + assert torch.all(layer.partitioned_cids[0][17:32,:] - layer.partitioned_cids[0][16:31,:] == 1) + assert torch.all(layer.partitioned_cids[0][32,:] == torch.tensor([5 * group_size, 7 * group_size])) + assert torch.all(layer.partitioned_cids[0][33:48,:] - layer.partitioned_cids[0][32:47,:] == 1) + assert torch.all(layer.partitioned_cids[0][48,:] == torch.tensor([6 * group_size, 8 * group_size])) + assert torch.all(layer.partitioned_cids[0][49:64,:] - layer.partitioned_cids[0][48:63,:] == 1) + assert torch.all(layer.partitioned_cids[0][64,:] == torch.tensor([3 * group_size, 5 * group_size])) + assert torch.all(layer.partitioned_cids[0][65:80,:] - layer.partitioned_cids[0][64:79,:] == 1) + assert torch.all(layer.partitioned_cids[0][80,:] == torch.tensor([4 * group_size, 6 * 
group_size])) + assert torch.all(layer.partitioned_cids[0][81:96,:] - layer.partitioned_cids[0][80:95,:] == 1) + + assert torch.all(layer.partitioned_u_cids[0] == torch.arange(16, 144)) + assert torch.all(layer.partitioned_parids[0][0:32,0] == torch.arange(1, 32+1)) + assert torch.all(layer.partitioned_parids[0][0:32,1] == 0) + assert torch.all(layer.partitioned_parids[0][32:64,0] == torch.arange(1, 32+1)) + assert torch.all(layer.partitioned_parids[0][32:64,1] == torch.arange(64+1, 96+1)) + assert torch.all(layer.partitioned_parids[0][64:96,0] == torch.arange(32+1, 64+1)) + assert torch.all(layer.partitioned_parids[0][64:96,1] == torch.arange(64+1, 96+1)) + assert torch.all(layer.partitioned_parids[0][96:128,0] == torch.arange(32+1, 64+1)) + assert torch.all(layer.partitioned_parids[0][96:128,1] == 0) + + layer.to(device) + + node_mars = torch.rand([group_size + group_size * 2 * 4, batch_size]).log().to(device) + element_mars = torch.zeros([1 + 3 * 2 * 2 * group_size, batch_size]).to(device) + + ## Forward tests ## + + layer(node_mars, element_mars) + + for i in range(group_size): + assert torch.all(torch.abs(element_mars[1+i,:] - (node_mars[group_size+i,:] + node_mars[3*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(element_mars[1+1*group_size+i,:] - (node_mars[2*group_size+i,:] + node_mars[4*group_size+i,:])) < 1e-4) + + assert torch.all(torch.abs(element_mars[1+2*group_size+i,:] - (node_mars[5*group_size+i,:] + node_mars[7*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(element_mars[1+3*group_size+i,:] - (node_mars[6*group_size+i,:] + node_mars[8*group_size+i,:])) < 1e-4) + + assert torch.all(torch.abs(element_mars[1+4*group_size+i,:] - (node_mars[3*group_size+i,:] + node_mars[5*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(element_mars[1+5*group_size+i,:] - (node_mars[4*group_size+i,:] + node_mars[6*group_size+i,:])) < 1e-4) + + ## Backward tests ## + + element_flows = torch.rand([1 + 3 * 2 * 2 * group_size, batch_size]).to(device) + element_flows[0,:] = 0.0 + node_flows = torch.zeros([group_size + group_size * 2 * 4, batch_size]).to(device) + + layer(node_mars, element_mars) + layer.backward(node_flows, element_flows) + + for i in range(group_size): + assert torch.all(torch.abs(node_flows[group_size+i,:] - element_flows[1+i,:]) < 1e-4) + assert torch.all(torch.abs(node_flows[2*group_size+i,:] - element_flows[1+1*group_size+i,:]) < 1e-4) + + assert torch.all(torch.abs(node_flows[3*group_size+i,:] - (element_flows[1+i,:] + element_flows[1+4*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(node_flows[4*group_size+i,:] - (element_flows[1+1*group_size+i,:] + element_flows[1+5*group_size+i,:])) < 1e-4) + + assert torch.all(torch.abs(node_flows[5*group_size+i,:] - (element_flows[1+2*group_size+i,:] + element_flows[1+4*group_size+i,:])) < 1e-4) + assert torch.all(torch.abs(node_flows[6*group_size+i,:] - (element_flows[1+3*group_size+i,:] + element_flows[1+5*group_size+i,:])) < 1e-4) + + assert torch.all(torch.abs(node_flows[7*group_size+i,:] - element_flows[1+2*group_size+i,:]) < 1e-4) + assert torch.all(torch.abs(node_flows[8*group_size+i,:] - element_flows[1+3*group_size+i,:]) < 1e-4) + + +if __name__ == "__main__": + torch.manual_seed(2390) + sparse_prod_layer_test() From e35e4bc0b1821205acfaa86085723e21b9d4d331 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 20:52:01 +0800 Subject: [PATCH 146/162] add `zero_param_mask` in `SumNodes` --- src/pyjuice/nodes/sum_nodes.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 
deletion(-) diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index efcdabda..af08535b 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -16,7 +16,7 @@ class SumNodes(CircuitNodes): def __init__(self, num_node_groups: int, chs: Sequence[CircuitNodes], edge_ids: Optional[Union[Tensor,Sequence[Tensor]]] = None, - params: Optional[Tensor] = None, group_size: int = 0, **kwargs) -> None: + params: Optional[Tensor] = None, zero_param_mask: Optional[Tensor] = None, group_size: int = 0, **kwargs) -> None: assert len(chs) > 0, "`SumNodes` must have at least one child." for i in range(1, len(chs)): @@ -37,6 +37,10 @@ def __init__(self, num_node_groups: int, chs: Sequence[CircuitNodes], edge_ids: # Construct sum edges self._construct_edges(edge_ids) + # Set zero parameter mask + if zero_param_mask is not None: + self.set_zero_param_mask(zero_param_mask) + # Set parameters if params is not None: self.set_params(params, pseudocount = 0.0) @@ -105,10 +109,31 @@ def set_params(self, params: torch.Tensor, normalize: bool = True, pseudocount: else: raise ValueError("Unsupported parameter input.") + if self.zero_param_mask is not None: + self._params[self.zero_param_mask] = 0.0 + if normalize: normalize_ns_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, ch_group_size = self.ch_group_size, pseudocount = pseudocount) + def set_zero_param_mask(self, zero_param_mask: Optional[Tensor] = None): + if zero_param_mask is None: + return None + + if self._source_node is not None: + ns_source = self._source_node + ns_source.set_zero_param_mask(zero_param_mask) + + return None + + assert zero_param_mask.dim() == 3 + assert zero_param_mask.size(0) == self.edge_ids.size(1) + assert zero_param_mask.size(1) == self.group_size + assert zero_param_mask.size(2) == self.ch_group_size + assert zero_param_mask.dtype == torch.bool + + self.zero_param_mask = zero_param_mask + def set_edges(self, edge_ids: Union[Tensor,Sequence[Tensor]]): self._construct_edges(edge_ids) @@ -118,6 +143,9 @@ def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_ if self._source_node is None: self._params = torch.exp(torch.rand([self.edge_ids.size(1), self.group_size, self.ch_group_size]) * -perturbation) + if self.zero_param_mask is not None: + self._params[self.zero_param_mask] = 0.0 + normalize_ns_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, ch_group_size = self.ch_group_size, pseudocount = 0.0) From 545c8f3d0c734a3c7381951a8f28b15ed48532a1 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 21:38:25 +0800 Subject: [PATCH 147/162] always init parameters from `CircuitNodes` --- src/pyjuice/model/tensorcircuit.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pyjuice/model/tensorcircuit.py b/src/pyjuice/model/tensorcircuit.py index c8ec15a0..e887a1ae 100644 --- a/src/pyjuice/model/tensorcircuit.py +++ b/src/pyjuice/model/tensorcircuit.py @@ -689,6 +689,10 @@ def _init_layers(self, layer_sparsity_tol: Optional[float] = None, max_num_parti self._init_parameters() def _init_parameters(self, perturbation: float = 4.0, pseudocount: float = 0.0): + for ns in self.root_ns: + if not ns.is_tied() and (ns.is_sum() or ns.is_input()) and not ns.has_params(): + ns.init_parameters(perturbation = perturbation, recursive = False) + params = torch.exp(torch.rand([self.num_sum_params]) * -perturbation) params[:self.num_dummy_params] = 0.0 From 809080de47e7f4592eecc3410a1bba8796004317 Mon Sep 17 00:00:00 
2001 From: Anji Liu Date: Sat, 30 Dec 2023 21:38:50 +0800 Subject: [PATCH 148/162] finish `group` function --- src/pyjuice/transformations/group.py | 107 ++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 3 deletions(-) diff --git a/src/pyjuice/transformations/group.py b/src/pyjuice/transformations/group.py index 165574e9..41c08dc5 100644 --- a/src/pyjuice/transformations/group.py +++ b/src/pyjuice/transformations/group.py @@ -1,5 +1,6 @@ from __future__ import annotations +from copy import deepcopy as pydeepcopy from typing import Optional, Dict, Sequence from pyjuice.nodes import CircuitNodes, InputNodes, ProdNodes, SumNodes @@ -96,13 +97,113 @@ def group(root_ns: CircuitNodes, sparsity_tolerance: float = 0.25, max_target_gr ## Apply the new group sizes ## def update_ns(ns: CircuitNodes, ns_chs: Sequence[CircuitNodes]): + new_group_size = ns2group_size[ns] + group_mul_size = new_group_size // ns.group_size + + new_num_ngroups = ns.num_node_groups // group_mul_size + if ns.isinput(): - pass + new_ns = InputNodes( + num_node_groups = new_num_ngroups, + scope = pydeepcopy(scope), + dist = pydeepcopy(ns.dist), + group_size = new_group_size + ) + + if not ns.is_tied(): + params = ns.get_params() + if params is not None: + new_ns.set_params(params.clone(), normalize = False) elif ns.isprod(): - pass + edge_ids = ns.edge_ids.clone() + edge_ids = edge_ids.reshape(new_num_ngroups, group_mul_size, ns.num_chs) + if torch.all(edge_ids[:,1:,:] - edge_ids[:,:-1,:]) == 1: + # Block-sparse mode + edge_ids = edge_ids[:,0,:].contiguous() + mode = "block_sparse" + else: + # Sparse mode + edge_ids = (edge_ids.reshape(ns.group_size, ns.num_chs)[:,None,:] * ns.group_size + \ + torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) + mode = "sparse" + + new_ns = ProdNodes( + num_node_groups = new_num_ngroups, + chs = ns_chs, + edge_ids = edge_ids, + group_size = ns.group_size + ) + + if mode == "block_sparse": + assert new_ns.is_block_sparse + elif mode == "sparse": + assert new_ns.is_sparse else: assert ns.issum() - return foldup_aggregate(update_ns, root_ns) + old_num_ngroups = ns.num_node_groups + old_num_cgroups = sum([cs.num_node_groups for cs in ns.chs]) + + new_cs_group_size = ns2group_size[ns.chs[0]] + cs_group_mul_size = new_cs_group_size // ns.chs[0].group_size + + new_num_cgroups = old_num_cgroups // cs_group_mul_size + + edge_ids = ns.edge_ids.clone() + grid_edge_ids = torch.zeros([old_num_ngroups, old_num_cgroups], dtype = torch.bool) + grid_edge_ids[edge_ids[0,:],edge_ids[1,:]] = True + + grid_edge_ids = grid_edge_ids.reshape(new_num_ngroups, group_mul_size, new_num_cgroups, cs_group_mul_size) + new_edge_ids = torch.nonzero(grid_edge_ids.any(dim = 3).any(dim = 1), as_tuple = False).permute(1, 0) + + new_ns = SumNodes( + num_node_groups = new_num_ngroups, + chs = new_chs, + edge_ids = new_edge_ids, + group_size = ns.group_size + ) + + if not ns.is_tied(): + # Collect selected blocks + grid_edge_ids = grid_edge_ids.permute(0, 2, 1, 3).flatten(0, 1) + block_ids = new_edge_ids[0,:] * new_num_cgroups + new_edge_ids[1,:] + param_indicator = grid_edge_ids[block_ids,:,:] + param_indicator = param_indicator[:,:,None,:,None].repeat(1, 1, ns.group_size, 1, ns.chs[0].group_size) + param_indicator = param_indicator.flatten(3, 4).flatten(1, 2) + zero_param_mask = ~param_indicator + + new_ns.set_zero_param_mask(zero_param_mask) + + params = ns.get_params() + if params is not None: + # TODO: add a GPU implementation + new_params = torch.zeros([new_edge_ids.size(1), new_group_size, 
new_cs_group_size]) + for par_group_id in range(new_edge_ids.size(1)): + nsid = new_edge_ids[0,i] * group_mul_size + neid = nsid + group_mul_size + csid = new_edge_ids[1,i] * cs_group_mul_size + ceid = csid + cs_group_mul_size + + blk_ids = torch.where((nsid <= edge_ids[0,:] < neid) & (csid <= edge_ids[1,:] < ceid)) + for blk_id in blk_ids: + nid0, nid1 = (edge_ids[0,:] - nsid) * ns.group_size, (edge_ids[0,:] - nsid + 1) * ns.group_size + cid0, cid1 = (edge_ids[1,:] - csid) * ns.chs[0].group_size, (edge_ids[1,:] - csid + 1) * ns.chs[0].group_size + new_params[par_group_id,nid0:nid1,cid0:cid1] = params[blk_id,:,:] + + new_ns.set_params(params.clone(), normalize = False) + + return new_ns + + old2new = dict() + new_root_ns = foldup_aggregate(update_ns, root_ns, cache = old2new) + + # Re-link tied nodes to their source + for ns in root_ns: + if ns.is_tied(): + new_source_ns = old2new[ns.get_source_ns()] + new_ns = old2new[ns] + new_ns.set_source_ns(new_source_ns) + + return new_root_ns From 237ed883fe44e8559e34a21f33bae24d793f8939 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 22:08:02 +0800 Subject: [PATCH 149/162] fix `par_update` logic when `keep_zero_params` is turned on --- src/pyjuice/model/backend/par_update.py | 38 ++++++++++++++++++------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/pyjuice/model/backend/par_update.py b/src/pyjuice/model/backend/par_update.py index e0809e95..68388bdc 100644 --- a/src/pyjuice/model/backend/par_update.py +++ b/src/pyjuice/model/backend/par_update.py @@ -170,11 +170,15 @@ def par_update_to_device(par_update_kwargs, device): @triton.jit -def cum_pflow_kernel(cum_pflows, param_flows, pflow_start_ids, blk_sizes, blk_intervals, - global_nids, num_blocks, BLOCK_ID: tl.constexpr, BLOCK_SIZE: tl.constexpr): +def cum_pflow_kernel(cum_pflows, params, param_flows, nchs, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, + global_nids, constexprs, num_blocks, keep_zero_params: tl.constexpr, BLOCK_ID: tl.constexpr, + BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis = 0) + # Retrieve the constants + pseudocount = tl.load(constexprs + 1) + offs_m = pid * BLOCK_ID + tl.arange(0, BLOCK_ID) mask_m = offs_m < num_blocks @@ -188,7 +192,18 @@ def cum_pflow_kernel(cum_pflows, param_flows, pflow_start_ids, blk_sizes, blk_in offs_pflow = pflow_start[:,None] + offs_blk[None,:] * blk_interval[:,None] mask_pflow = mask_m[:,None] & (offs_blk[None,:] < blk_size[:,None]) pflows = tl.load(param_flows + offs_pflow, mask = mask_pflow, other = 0) - nflows = tl.sum(pflows, axis = 1) + + if keep_zero_params == 1: + par_start = tl.load(par_start_ids + offs_m, mask = mask_m, other = 0) + offs_par = par_start[:,None] + offs_blk[None,:] * blk_interval[:,None] + old_params = tl.load(params + offs_par, mask = mask_pflow, other = 0) + + nch = tl.load(nchs + global_nid, mask = mask_m, other = 1) + pflows += (pseudocount / nch[:,None]) + + nflows = tl.sum(tl.where(old_params < 1e-12, 0.0, pflows), axis = 1) + else: + nflows = tl.sum(pflows, axis = 1) tl.atomic_add(cum_pflows + global_nid, nflows, mask = mask_m) @@ -222,7 +237,10 @@ def par_update_kernel(params, param_flows, cum_pflows, nchs, par_start_ids, pflo nflows = tl.load(cum_pflows + global_nid, mask = mask_m, other = 1) nch = tl.load(nchs + global_nid, mask = mask_m, other = 1) - new_param = (pflows + pseudocount / nch[:,None]) / (nflows[:,None] + pseudocount) + if keep_zero_params == 1: + new_param = (pflows + pseudocount / nch[:,None]) / nflows[:,None] + else: + new_param = (pflows + 
pseudocount / nch[:,None]) / (nflows[:,None] + pseudocount) offs_par = par_start[:,None] + offs_blk[None,:] * blk_interval[:,None] old_param = tl.load(params + offs_par, mask = mask_pflow, other = 0) @@ -230,7 +248,7 @@ def par_update_kernel(params, param_flows, cum_pflows, nchs, par_start_ids, pflo updated_param = (1.0 - step_size) * old_param + step_size * new_param if keep_zero_params == 1: - updated_params = tl.where(old_param < 1e-12, 0.0, updated_params) + updated_params = tl.where(old_param < 1e-12, 0.0, updated_param) tl.store(params + offs_par, updated_param, mask = mask_pflow) @@ -253,15 +271,15 @@ def em_par_update(params: torch.Tensor, param_flows: torch.Tensor, par_update_kw grid = (triton.cdiv(num_blocks, BLOCK_ID),) - cum_pflow_kernel[grid]( - cum_pflows, param_flows, pflow_start_ids, blk_sizes, blk_intervals, - global_nids, num_blocks, BLOCK_ID, BLOCK_SIZE - ) - constexprs = torch.tensor([step_size, pseudocount]).to(params.device) keep_zero_params = 1 if keep_zero_params else 0 + cum_pflow_kernel[grid]( + cum_pflows, params, param_flows, nchs, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, + global_nids, constexprs, num_blocks, keep_zero_params, BLOCK_ID, BLOCK_SIZE + ) + par_update_kernel[grid]( params, param_flows, cum_pflows, nchs, par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, global_nids, constexprs, num_blocks, keep_zero_params, BLOCK_ID, BLOCK_SIZE From 4dc566bca283943e880e1d2211eafd00cf8c27db Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 22:14:58 +0800 Subject: [PATCH 150/162] add a flag for `multiply` to encode sparse prod nodes --- src/pyjuice/nodes/construction.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index fe1558e1..af66b9b4 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -40,7 +40,7 @@ def inputs(var: Union[int,Sequence[int]], num_node_groups: int = 0, dist: Distri ) -def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, **kwargs): +def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, sparse_edges: bool = False, **kwargs): assert isinstance(nodes1, SumNodes) or isinstance(nodes1, InputNodes), "Children of product nodes must be input or sum nodes." 
@@ -59,6 +59,11 @@ def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, **k scope |= nodes.scope if edge_ids is not None: + if sparse_edges: + assert edge_ids.shape[0] % group_size == 0 + num_node_groups = edge_ids.shape[0] // group_size + else: + num_node_groups = edge_ids.shape[0] assert edge_ids.shape[0] == num_node_groups or edge_ids.shape[0] == num_node_groups * group_size return ProdNodes(num_node_groups, chs, edge_ids, group_size = group_size, **kwargs) From bb0a47f4fa13a140b6ba7c63f9b6aa2bdc361664 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 22:15:24 +0800 Subject: [PATCH 151/162] fix `zero_param_mask` not defined error --- src/pyjuice/nodes/sum_nodes.py | 4 ++-- tests/layer/sparse_prod_layer_test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index af08535b..03610f83 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -109,7 +109,7 @@ def set_params(self, params: torch.Tensor, normalize: bool = True, pseudocount: else: raise ValueError("Unsupported parameter input.") - if self.zero_param_mask is not None: + if self.provided("zero_param_mask"): self._params[self.zero_param_mask] = 0.0 if normalize: @@ -143,7 +143,7 @@ def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_ if self._source_node is None: self._params = torch.exp(torch.rand([self.edge_ids.size(1), self.group_size, self.ch_group_size]) * -perturbation) - if self.zero_param_mask is not None: + if self.provided("zero_param_mask"): self._params[self.zero_param_mask] = 0.0 normalize_ns_parameters(self._params, self.edge_ids[0,:], group_size = self.group_size, diff --git a/tests/layer/sparse_prod_layer_test.py b/tests/layer/sparse_prod_layer_test.py index dcee47af..84a4a739 100644 --- a/tests/layer/sparse_prod_layer_test.py +++ b/tests/layer/sparse_prod_layer_test.py @@ -29,7 +29,7 @@ def sparse_prod_layer_test(): ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) np0 = multiply(ni0, ni1) - np1 = multiply(ni2, ni3, edge_ids = torch.arange(0, group_size * 2)[:,None].repeat(1, 2)) + np1 = multiply(ni2, ni3, edge_ids = torch.arange(0, group_size * 2)[:,None].repeat(1, 2), sparse_edges = True) np2 = multiply(ni1, ni2) input_layer = InputLayer([ni0, ni1, ni2, ni3], cum_nodes = group_size) From a6169fcbaa1e5df20be9c014a32496cff69b6893 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sat, 30 Dec 2023 22:22:21 +0800 Subject: [PATCH 152/162] fix pruning tests --- src/pyjuice/transformations/prune.py | 2 ++ tests/transformations/pruning_test.py | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/pyjuice/transformations/prune.py b/src/pyjuice/transformations/prune.py index ee5cb0d9..91414fb3 100644 --- a/src/pyjuice/transformations/prune.py +++ b/src/pyjuice/transformations/prune.py @@ -51,6 +51,8 @@ def _get_scores(ns: CircuitNodes): # Indices to keep flat_scores = torch.cat(flat_scores, dim = 0) + if flat_scores.dim() == 3: + flat_scores = flat_scores.sum(dim = 2).sum(dim = 1) if keep_frac is not None: assert score_threshold is None, "Only one of `keep_frac` and `score_threshold` should be set." 
         score_threshold = torch.quantile(flat_scores, 1.0 - keep_frac, dim = 0)
diff --git a/tests/transformations/pruning_test.py b/tests/transformations/pruning_test.py
index efedef4b..532eb3e0 100644
--- a/tests/transformations/pruning_test.py
+++ b/tests/transformations/pruning_test.py
@@ -121,11 +121,12 @@ def pruning_by_flow_test():
     # If there are more samples, just do this iteratively for
     # all batches. The flows will be accumulated automatically.
     lls = pc(data)
-    pc.backward(data)
+    pc.backward(data.permute(1, 0))
 
-    pc.update_parameters(update_flows = True) # Map the flows back to their corresponding nodes
+    pc.update_parameters() # Map the parameters back to their corresponding nodes
+    pc.update_param_flows() # Map the flows back to their corresponding nodes
 
-    new_n = prune_by_score(n, key = "_flows", score_threshold = 0.5) # Use `n._flows` for pruning
+    new_n = prune_by_score(n, key = "_param_flows", score_threshold = 0.5) # Use `n._param_flows` for pruning
 
 
 if __name__ == "__main__":

From d506ae072957d1a214628b167473c922f810ceca Mon Sep 17 00:00:00 2001
From: Anji Liu
Date: Sat, 30 Dec 2023 22:26:30 +0800
Subject: [PATCH 153/162] export `juice.group`

---
 src/pyjuice/__init__.py                 |  2 +-
 src/pyjuice/transformations/__init__.py |  3 ++-
 tests/transformations/group_test.py     | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)
 create mode 100644 tests/transformations/group_test.py

diff --git a/src/pyjuice/__init__.py b/src/pyjuice/__init__.py
index 2513f456..947211ef 100644
--- a/src/pyjuice/__init__.py
+++ b/src/pyjuice/__init__.py
@@ -18,7 +18,7 @@
 from pyjuice.nodes.methods.lvd import LVDistiller
 
 # Commonly-used transformations
-from pyjuice.transformations import merge
+from pyjuice.transformations import merge, group
 
 # IO
 from pyjuice.io import load, save
diff --git a/src/pyjuice/transformations/__init__.py b/src/pyjuice/transformations/__init__.py
index 829e6fb5..4d7a7629 100644
--- a/src/pyjuice/transformations/__init__.py
+++ b/src/pyjuice/transformations/__init__.py
@@ -1,3 +1,4 @@
 from .prune import prune_by_score
 from .merge import merge
-from .copy import deepcopy
\ No newline at end of file
+from .copy import deepcopy
+from .group import group
\ No newline at end of file
diff --git a/tests/transformations/group_test.py b/tests/transformations/group_test.py
new file mode 100644
index 00000000..3656210c
--- /dev/null
+++ b/tests/transformations/group_test.py
@@ -0,0 +1,18 @@
+import pyjuice as juice
+import torch
+import numpy as np
+
+import pyjuice.nodes.distributions as dists
+from pyjuice.utils import BitSet
+from pyjuice.nodes import multiply, summate, inputs
+from pyjuice.transformations import deepcopy
+
+import pytest
+
+
+def group_test():
+    pass
+
+
+if __name__ == "__main__":
+    group_test()

From 5b71b9a59594228dfb6b706fa7a00d553427f346 Mon Sep 17 00:00:00 2001
From: Anji Liu
Date: Sun, 31 Dec 2023 04:09:30 +0800
Subject: [PATCH 154/162] add tests for `group` and fix the fn

---
 src/pyjuice/layer/compilation.py     |   4 +-
 src/pyjuice/layer/prod_layer.py      |   2 +-
 src/pyjuice/nodes/prod_nodes.py      |   6 +-
 src/pyjuice/transformations/group.py | 100 +++++++++---------
 src/pyjuice/transformations/merge.py |   4 +-
 tests/transformations/group_test.py  | 147 ++++++++++++++++++++++++++-
 6 files changed, 204 insertions(+), 59 deletions(-)

diff --git a/src/pyjuice/layer/compilation.py b/src/pyjuice/layer/compilation.py
index afb187fd..9dd48164 100644
--- a/src/pyjuice/layer/compilation.py
+++ b/src/pyjuice/layer/compilation.py
@@ -924,11 +924,11 @@ def prod_layer_forward_compilation(nodes, fw_partition_max_chs, n_partition_ids,
         else:
n_sid = ns._output_ind_range[0] nids[partition_id][local_sid:local_eid] = torch.arange(0, ns.num_nodes, device = device) + n_sid - if ns.is_sparse: + if ns.is_sparse(): for cs_id, cs in enumerate(ns.chs): cids[partition_id][local_sid:local_eid,cs_id] = ns.edge_ids[:,cs_id].to(device) + cs._output_ind_range[0] else: - assert ns.is_block_sparse + assert ns.is_block_sparse() edge_ids = ns.edge_ids.clone() edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) * ns.group_size + torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) for cs_id, cs in enumerate(ns.chs): diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 072390b2..711588df 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -30,7 +30,7 @@ def __init__(self, nodes: Sequence[ProdNodes], global_nid_start: Optional[int] = use_block_sparse_edges = True for nid in range(1, len(nodes)): - if nodes[0].group_size != nodes[nid].group_size or nodes[nid].is_sparse: + if nodes[0].group_size != nodes[nid].group_size or nodes[nid].is_sparse(): use_block_sparse_edges = False break self.use_block_sparse_edges = use_block_sparse_edges diff --git a/src/pyjuice/nodes/prod_nodes.py b/src/pyjuice/nodes/prod_nodes.py index b35e0b84..7c4dace1 100644 --- a/src/pyjuice/nodes/prod_nodes.py +++ b/src/pyjuice/nodes/prod_nodes.py @@ -25,7 +25,7 @@ def __init__(self, num_node_groups: int, chs: Sequence[CircuitNodes], edge_ids: # Child layers self.chs = chs - # Construct sum edges + # Construct product edges self._construct_edges(edge_ids) # Callbacks @@ -44,11 +44,9 @@ def edge_type(self): else: raise RuntimeError(f"Unexpected shape of `edge_ids`: ({self.edge_ids.size(0)}, {self.edge_ids.size(1)})") - @property def is_block_sparse(self): return self.edge_type == self.BLOCK_SPARSE - @property def is_sparse(self): return self.edge_type == self.SPARSE @@ -106,6 +104,6 @@ def _construct_edges(self, edge_ids: Optional[Tensor]): assert torch.all(edge_ids[:,cid] >= 0), "Edge index underflow." assert torch.all(edge_ids[:,cid] < self.chs[cid].num_nodes), "Edge index overflow." 
else: - raise RuntimeError(f"Unexpected shape of `edge_ids`: ({self.edge_ids.size(0)}, {self.edge_ids.size(1)})") + raise RuntimeError(f"Unexpected shape of `edge_ids`: ({edge_ids.size(0)}, {edge_ids.size(1)})") self.edge_ids = edge_ids diff --git a/src/pyjuice/transformations/group.py b/src/pyjuice/transformations/group.py index 41c08dc5..9d2a9640 100644 --- a/src/pyjuice/transformations/group.py +++ b/src/pyjuice/transformations/group.py @@ -1,9 +1,10 @@ from __future__ import annotations +import torch from copy import deepcopy as pydeepcopy from typing import Optional, Dict, Sequence -from pyjuice.nodes import CircuitNodes, InputNodes, ProdNodes, SumNodes +from pyjuice.nodes import CircuitNodes, InputNodes, ProdNodes, SumNodes, foldup_aggregate from pyjuice.utils import BitSet from pyjuice.utils.util import max_cdf_power_of_2 @@ -24,7 +25,7 @@ def group(root_ns: CircuitNodes, sparsity_tolerance: float = 0.25, max_target_gr assert ns.is_sum() old_group_size = ns.group_size - old_cs_group_size = ns.cs_group_size + old_ch_group_size = ns.ch_group_size edge_ids = ns.edge_ids old_ns_num_ngroups = ns.num_node_groups @@ -34,53 +35,53 @@ def group(root_ns: CircuitNodes, sparsity_tolerance: float = 0.25, max_target_gr plausible_combinations = list() group_size = min(max_cdf_power_of_2(ns.num_nodes), max_target_group_size) - while group_size > old_group_size: + while group_size >= old_group_size: group_mul_size = group_size // old_group_size ns_num_ngroups = old_ns_num_ngroups // group_mul_size - cs_group_size = ns2group_size[ns.chs[0]] - while cs_group_size > old_cs_group_size: - cs_group_mul_size = cs_group_size // old_cs_group_size + ch_group_size = ns2group_size[ns.chs[0]] + while ch_group_size >= old_ch_group_size: + ch_group_mul_size = ch_group_size // old_ch_group_size cs_num_ngroups = old_cs_num_ngroups // group_mul_size n_edge_ids = edge_ids[0,:] // group_mul_size - c_edge_ids = edge_ids[1,:] // cs_group_mul_size + c_edge_ids = edge_ids[1,:] // ch_group_mul_size _, counts = torch.unique(n_edge_ids * cs_num_ngroups + c_edge_ids, return_counts = True) - if torch.all(counts >= (1.0 - sparsity_tolerance) * group_mul_size * cs_group_mul_size): - plausible_combinations.append((group_size, cs_group_size)) + if counts.float().mean() >= (1.0 - sparsity_tolerance) * group_mul_size * ch_group_mul_size: + plausible_combinations.append((group_size, ch_group_size)) - cs_group_size = cs_group_size // 2 + ch_group_size = ch_group_size // 2 group_size = group_size // 2 # Find the best group size combination best_group_size = 0 - best_cs_group_size = 0 - for group_size, cs_group_size in plausible_combinations: - if group_size >= 16 and cs_group_size >= 16: + best_ch_group_size = 0 + for group_size, ch_group_size in plausible_combinations: + if group_size >= 16 and ch_group_size >= 16: best_group_size = group_size - best_cs_group_size = cs_group_size + best_ch_group_size = ch_group_size break if best_group_size == 0: best_val = 0 best_frac = 0 - for group_size, cs_group_size in plausible_combinations: - cond1 = group_size * cs_group_size > best_val - cond2 = (group_size * cs_group_size > best_val) and \ - (max(group_size, cs_group_size) // min(group_size, cs_group_size) < best_frac) + for group_size, ch_group_size in plausible_combinations: + cond1 = group_size * ch_group_size > best_val + cond2 = (group_size * ch_group_size > best_val) and \ + (max(group_size, ch_group_size) // min(group_size, ch_group_size) < best_frac) if cond1 or cond2: best_group_size = group_size - best_cs_group_size = cs_group_size 
- best_val = group_size * cs_group_size - best_frac = max(group_size, cs_group_size) // min(group_size, cs_group_size) + best_ch_group_size = ch_group_size + best_val = group_size * ch_group_size + best_frac = max(group_size, ch_group_size) // min(group_size, ch_group_size) ns2group_size[ns] = best_group_size for cs in ns.chs: - ns2group_size[cs] = best_cs_group_size + ns2group_size[cs] = best_ch_group_size ## Do a second pass to finalize the group sizes ## @@ -102,10 +103,12 @@ def update_ns(ns: CircuitNodes, ns_chs: Sequence[CircuitNodes]): new_num_ngroups = ns.num_node_groups // group_mul_size - if ns.isinput(): + assert new_num_ngroups * new_group_size == ns.num_node_groups * ns.group_size + + if ns.is_input(): new_ns = InputNodes( num_node_groups = new_num_ngroups, - scope = pydeepcopy(scope), + scope = pydeepcopy(ns.scope), dist = pydeepcopy(ns.dist), group_size = new_group_size ) @@ -115,7 +118,7 @@ def update_ns(ns: CircuitNodes, ns_chs: Sequence[CircuitNodes]): if params is not None: new_ns.set_params(params.clone(), normalize = False) - elif ns.isprod(): + elif ns.is_prod(): edge_ids = ns.edge_ids.clone() edge_ids = edge_ids.reshape(new_num_ngroups, group_mul_size, ns.num_chs) if torch.all(edge_ids[:,1:,:] - edge_ids[:,:-1,:]) == 1: @@ -124,7 +127,7 @@ def update_ns(ns: CircuitNodes, ns_chs: Sequence[CircuitNodes]): mode = "block_sparse" else: # Sparse mode - edge_ids = (edge_ids.reshape(ns.group_size, ns.num_chs)[:,None,:] * ns.group_size + \ + edge_ids = (edge_ids.reshape(ns.num_node_groups, ns.num_chs)[:,None,:] * ns.group_size + \ torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) mode = "sparse" @@ -132,37 +135,37 @@ def update_ns(ns: CircuitNodes, ns_chs: Sequence[CircuitNodes]): num_node_groups = new_num_ngroups, chs = ns_chs, edge_ids = edge_ids, - group_size = ns.group_size + group_size = new_group_size ) if mode == "block_sparse": - assert new_ns.is_block_sparse + assert new_ns.is_block_sparse() elif mode == "sparse": - assert new_ns.is_sparse + assert new_ns.is_sparse() else: - assert ns.issum() + assert ns.is_sum() old_num_ngroups = ns.num_node_groups old_num_cgroups = sum([cs.num_node_groups for cs in ns.chs]) - new_cs_group_size = ns2group_size[ns.chs[0]] - cs_group_mul_size = new_cs_group_size // ns.chs[0].group_size + new_ch_group_size = ns2group_size[ns.chs[0]] + ch_group_mul_size = new_ch_group_size // ns.chs[0].group_size - new_num_cgroups = old_num_cgroups // cs_group_mul_size + new_num_cgroups = old_num_cgroups // ch_group_mul_size edge_ids = ns.edge_ids.clone() grid_edge_ids = torch.zeros([old_num_ngroups, old_num_cgroups], dtype = torch.bool) grid_edge_ids[edge_ids[0,:],edge_ids[1,:]] = True - grid_edge_ids = grid_edge_ids.reshape(new_num_ngroups, group_mul_size, new_num_cgroups, cs_group_mul_size) + grid_edge_ids = grid_edge_ids.reshape(new_num_ngroups, group_mul_size, new_num_cgroups, ch_group_mul_size) new_edge_ids = torch.nonzero(grid_edge_ids.any(dim = 3).any(dim = 1), as_tuple = False).permute(1, 0) new_ns = SumNodes( num_node_groups = new_num_ngroups, - chs = new_chs, + chs = ns_chs, edge_ids = new_edge_ids, - group_size = ns.group_size + group_size = new_group_size ) if not ns.is_tied(): @@ -170,29 +173,30 @@ def update_ns(ns: CircuitNodes, ns_chs: Sequence[CircuitNodes]): grid_edge_ids = grid_edge_ids.permute(0, 2, 1, 3).flatten(0, 1) block_ids = new_edge_ids[0,:] * new_num_cgroups + new_edge_ids[1,:] param_indicator = grid_edge_ids[block_ids,:,:] - param_indicator = param_indicator[:,:,None,:,None].repeat(1, 1, ns.group_size, 1, 
ns.chs[0].group_size) - param_indicator = param_indicator.flatten(3, 4).flatten(1, 2) - zero_param_mask = ~param_indicator + if not torch.all(param_indicator): + param_indicator = param_indicator[:,:,None,:,None].repeat(1, 1, ns.group_size, 1, ns.chs[0].group_size) + param_indicator = param_indicator.flatten(3, 4).flatten(1, 2) + zero_param_mask = ~param_indicator - new_ns.set_zero_param_mask(zero_param_mask) + new_ns.set_zero_param_mask(zero_param_mask) params = ns.get_params() if params is not None: # TODO: add a GPU implementation - new_params = torch.zeros([new_edge_ids.size(1), new_group_size, new_cs_group_size]) + new_params = torch.zeros([new_edge_ids.size(1), new_group_size, new_ch_group_size]) for par_group_id in range(new_edge_ids.size(1)): - nsid = new_edge_ids[0,i] * group_mul_size + nsid = new_edge_ids[0,par_group_id] * group_mul_size neid = nsid + group_mul_size - csid = new_edge_ids[1,i] * cs_group_mul_size - ceid = csid + cs_group_mul_size + csid = new_edge_ids[1,par_group_id] * ch_group_mul_size + ceid = csid + ch_group_mul_size - blk_ids = torch.where((nsid <= edge_ids[0,:] < neid) & (csid <= edge_ids[1,:] < ceid)) + blk_ids = torch.where((edge_ids[0,:] >= nsid) & (edge_ids[0,:] < neid) & (edge_ids[1,:] >= csid) & (edge_ids[1,:] < ceid))[0] for blk_id in blk_ids: - nid0, nid1 = (edge_ids[0,:] - nsid) * ns.group_size, (edge_ids[0,:] - nsid + 1) * ns.group_size - cid0, cid1 = (edge_ids[1,:] - csid) * ns.chs[0].group_size, (edge_ids[1,:] - csid + 1) * ns.chs[0].group_size + nid0, nid1 = (edge_ids[0,blk_id] - nsid) * ns.group_size, (edge_ids[0,blk_id] - nsid + 1) * ns.group_size + cid0, cid1 = (edge_ids[1,blk_id] - csid) * ns.chs[0].group_size, (edge_ids[1,blk_id] - csid + 1) * ns.chs[0].group_size new_params[par_group_id,nid0:nid1,cid0:cid1] = params[blk_id,:,:] - new_ns.set_params(params.clone(), normalize = False) + new_ns.set_params(new_params, normalize = False) return new_ns diff --git a/src/pyjuice/transformations/merge.py b/src/pyjuice/transformations/merge.py index 551db4ab..665bae30 100644 --- a/src/pyjuice/transformations/merge.py +++ b/src/pyjuice/transformations/merge.py @@ -104,10 +104,10 @@ def merge_prod_nodes(ns1: ProdNodes, ns2: ProdNodes, *args) -> ProdNodes: new_sum_chs.append(merge_sum_nodes(*sum_ns)) prod_edge_ids = [] - use_sparse_mode = any([ns.is_sparse for ns in all_ns]) + use_sparse_mode = any([ns.is_sparse() for ns in all_ns]) for ns in all_ns: edge_ids = ns.edge_ids.clone() - if use_sparse_mode and ns.is_block_sparse: + if use_sparse_mode and ns.is_block_sparse(): edge_ids = (edge_ids[:,None,:].repeat(1, ns.group_size, 1) * ns.group_size + torch.arange(0, ns.group_size)[None,:,None]).flatten(0, 1) for scope_id in range(num_scopes): cs = ns.chs[scope_id] diff --git a/tests/transformations/group_test.py b/tests/transformations/group_test.py index 3656210c..848dc14c 100644 --- a/tests/transformations/group_test.py +++ b/tests/transformations/group_test.py @@ -2,17 +2,160 @@ import torch import numpy as np +import pyjuice as juice import pyjuice.nodes.distributions as dists from pyjuice.utils import BitSet -from pyjuice.nodes import multiply, summate, inputs +from pyjuice.nodes import multiply, summate, inputs, set_group_size from pyjuice.transformations import deepcopy import pytest def group_test(): - pass + + with set_group_size(group_size = 2): + + ni0 = inputs(0, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(2, num_node_groups = 2, dist = 
dists.Categorical(num_cats = 2)) + ni3 = inputs(3, num_node_groups = 2, dist = dists.Categorical(num_cats = 2)) + + m1 = multiply(ni0, ni1, edge_ids = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype = torch.long)) + n1 = summate(m1, edge_ids = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]], dtype = torch.long)) + + m2 = multiply(ni2, ni3, edge_ids = torch.tensor([[0, 0], [1, 1]], dtype = torch.long)) + n2 = summate(m2, edge_ids = torch.tensor([[0, 0, 1, 1], [0, 1, 0, 1]], dtype = torch.long)) + + m = multiply(n1, n2, edge_ids = torch.tensor([[0, 0], [1, 1]], dtype = torch.long)) + n = summate(m, edge_ids = torch.tensor([[0, 0], [0, 1]], dtype = torch.long), group_size = 1) + + n.init_parameters() + + new_n = juice.group(n) + new_m = new_n.chs[0] + + new_n1 = new_m.chs[0] + new_n2 = new_m.chs[1] + + new_m1 = new_n1.chs[0] + new_m2 = new_n2.chs[0] + + new_ni0 = new_m1.chs[0] + new_ni1 = new_m1.chs[1] + new_ni2 = new_m2.chs[0] + new_ni3 = new_m2.chs[1] + + assert new_ni0.group_size == 4 and new_ni0.num_node_groups == 1 + assert new_ni1.group_size == 4 and new_ni1.num_node_groups == 1 + assert new_ni2.group_size == 4 and new_ni2.num_node_groups == 1 + assert new_ni3.group_size == 4 and new_ni3.num_node_groups == 1 + + assert new_m1.num_node_groups == 2 and new_m1.group_size == 4 + assert new_m1.is_sparse() + assert torch.all(new_m1.edge_ids == torch.tensor([[0, 0], [1, 1], [0, 2], [1, 3], [2, 0], [3, 1], [2, 2], [3, 3]])) + + assert new_m2.num_node_groups == 1 and new_m2.group_size == 4 + assert new_m2.is_block_sparse() + assert torch.all(new_m2.edge_ids == torch.tensor([[0, 0]])) + + assert new_n1.num_node_groups == 1 and new_n1.group_size == 4 + assert torch.all(new_n1.edge_ids == torch.tensor([[0, 0], [0, 1]])) + assert torch.all(new_n1._params[0][0:2,0:2] == n1._params[0]) + assert torch.all(new_n1._params[0][0:2,2:4] == n1._params[1]) + assert torch.all(new_n1._params[0][2:4,0:2] == n1._params[4]) + assert torch.all(new_n1._params[0][2:4,2:4] == n1._params[5]) + assert torch.all(new_n1._params[1][0:2,0:2] == n1._params[2]) + assert torch.all(new_n1._params[1][0:2,2:4] == n1._params[3]) + assert torch.all(new_n1._params[1][2:4,0:2] == n1._params[6]) + assert torch.all(new_n1._params[1][2:4,2:4] == n1._params[7]) + + assert new_n2.num_node_groups == 1 and new_n2.group_size == 4 + assert torch.all(new_n2.edge_ids == torch.tensor([[0], [0]])) + assert torch.all(new_n2._params[0][0:2,0:2] == n2._params[0]) + assert torch.all(new_n2._params[0][0:2,2:4] == n2._params[1]) + assert torch.all(new_n2._params[0][2:4,0:2] == n2._params[2]) + assert torch.all(new_n2._params[0][2:4,2:4] == n2._params[3]) + + +def block_sparse_group_test(): + + with set_group_size(group_size = 4): + + ni0 = inputs(0, num_node_groups = 4, dist = dists.Categorical(num_cats = 2)) + ni1 = inputs(1, num_node_groups = 4, dist = dists.Categorical(num_cats = 2)) + ni2 = inputs(0, num_node_groups = 4, dist = dists.Categorical(num_cats = 2)) + ni3 = inputs(1, num_node_groups = 4, dist = dists.Categorical(num_cats = 2)) + + np0 = multiply(ni0, ni1) + np1 = multiply(ni2, ni3) + + edge_ids = torch.tensor([ + [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3], + [0, 1, 3, 4, 5, 6, 7, 1, 2, 3, 4, 6, 7, 4, 5, 6, 7, 5, 6, 7] + ], dtype = torch.long) + ns = summate(np0, np1, edge_ids = edge_ids) + + ns.init_parameters() + + new_ns = juice.group(ns) + + new_np0 = new_ns.chs[0] + new_np1 = new_ns.chs[1] + + new_ni0 = new_np0.chs[0] + new_ni1 = new_np0.chs[1] + new_ni2 = new_np1.chs[0] + new_ni3 = 
new_np1.chs[1] + + assert new_ni0.group_size == 16 and new_ni0.num_node_groups == 1 + assert new_ni1.group_size == 16 and new_ni1.num_node_groups == 1 + assert new_ni2.group_size == 16 and new_ni2.num_node_groups == 1 + assert new_ni3.group_size == 16 and new_ni3.num_node_groups == 1 + + assert new_np0.num_node_groups == 1 and new_np0.group_size == 16 + assert new_np0.is_block_sparse() + assert torch.all(new_np0.edge_ids == torch.tensor([[0, 0]])) + + assert new_np1.num_node_groups == 1 and new_np1.group_size == 16 + assert new_np1.is_block_sparse() + assert torch.all(new_np1.edge_ids == torch.tensor([[0, 0]])) + + assert new_ns.num_node_groups == 2 and new_ns.group_size == 8 + assert new_ns.ch_group_size == 16 + assert torch.all(new_ns.edge_ids == torch.tensor([[0, 0, 1], [0, 1, 1]])) + assert torch.all(new_ns._params[0][0:4,0:4] == ns._params[0]) + assert torch.all(new_ns._params[0][0:4,4:8] == ns._params[1]) + assert torch.all(new_ns._params[0][0:4,8:12] == 0.0) + assert torch.all(new_ns._params[0][0:4,12:16] == ns._params[2]) + assert torch.all(new_ns._params[1][0:4,0:4] == ns._params[3]) + assert torch.all(new_ns._params[1][0:4,4:8] == ns._params[4]) + assert torch.all(new_ns._params[1][0:4,8:12] == ns._params[5]) + assert torch.all(new_ns._params[1][0:4,12:16] == ns._params[6]) + assert torch.all(new_ns._params[0][4:8,0:4] == 0.0) + assert torch.all(new_ns._params[0][4:8,4:8] == ns._params[7]) + assert torch.all(new_ns._params[0][4:8,8:12] == ns._params[8]) + assert torch.all(new_ns._params[0][4:8,12:16] == ns._params[9]) + assert torch.all(new_ns._params[1][4:8,0:4] == ns._params[10]) + assert torch.all(new_ns._params[1][4:8,4:8] == 0.0) + assert torch.all(new_ns._params[1][4:8,8:12] == ns._params[11]) + assert torch.all(new_ns._params[1][4:8,12:16] == ns._params[12]) + assert torch.all(new_ns._params[2][0:4,0:4] == ns._params[13]) + assert torch.all(new_ns._params[2][0:4,4:8] == ns._params[14]) + assert torch.all(new_ns._params[2][0:4,8:12] == ns._params[15]) + assert torch.all(new_ns._params[2][0:4,12:16] == ns._params[16]) + assert torch.all(new_ns._params[2][4:8,0:4] == 0.0) + assert torch.all(new_ns._params[2][4:8,4:8] == ns._params[17]) + assert torch.all(new_ns._params[2][4:8,8:12] == ns._params[18]) + assert torch.all(new_ns._params[2][4:8,12:16] == ns._params[19]) + + assert torch.all(new_ns.zero_param_mask[0][0:4,8:12]) + assert torch.all(new_ns.zero_param_mask[0][4:8,0:4]) + assert torch.all(new_ns.zero_param_mask[1][4:8,4:8]) + assert torch.all(new_ns.zero_param_mask[2][4:8,0:4]) + + assert new_ns.zero_param_mask.long().sum() == 4 * 4 * 4 if __name__ == "__main__": group_test() + block_sparse_group_test() From b8c404ca9da547e8da0d03c94609b68f831a04ed Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 31 Dec 2023 04:49:01 +0800 Subject: [PATCH 155/162] add GPU mode in `group` to improve the speed of parameter copy --- src/pyjuice/transformations/group.py | 129 +++++++++++++++++++++++---- 1 file changed, 113 insertions(+), 16 deletions(-) diff --git a/src/pyjuice/transformations/group.py b/src/pyjuice/transformations/group.py index 9d2a9640..45b32e65 100644 --- a/src/pyjuice/transformations/group.py +++ b/src/pyjuice/transformations/group.py @@ -1,7 +1,11 @@ from __future__ import annotations +import numpy as np import torch +import triton +import triton.language as tl from copy import deepcopy as pydeepcopy +from numba import njit from typing import Optional, Dict, Sequence from pyjuice.nodes import CircuitNodes, InputNodes, ProdNodes, SumNodes, foldup_aggregate @@ -9,7 
+13,63 @@ from pyjuice.utils.util import max_cdf_power_of_2 -def group(root_ns: CircuitNodes, sparsity_tolerance: float = 0.25, max_target_group_size: int = 32): +@njit +def _compute_param_target_ids_kernel(target_id0, target_id1, target_id2, edge_ids, new_edge_ids, + group_mul_size, ch_group_mul_size, group_size, ch_group_size): + for i in range(edge_ids.shape[1]): + old_ngid = edge_ids[0,i] + old_cgid = edge_ids[1,i] + + for j in range(new_edge_ids.shape[1]): + new_ngid = new_edge_ids[0,j] + new_cgid = new_edge_ids[1,j] + + ng_sid = new_ngid * group_mul_size + ng_eid = ng_sid + group_mul_size + cg_sid = new_cgid * ch_group_mul_size + cg_eid = cg_sid + ch_group_mul_size + + if (old_ngid >= ng_sid) and (old_ngid < ng_eid) and (old_cgid >= cg_sid) and (old_cgid < cg_eid): + target_id0[i] = j + + target_id1[0,i] = (old_ngid - ng_sid) * group_size + target_id1[1,i] = (old_ngid - ng_sid + 1) * group_size + + target_id2[0,i] = (old_cgid - cg_sid) * ch_group_size + target_id2[1,i] = (old_cgid - cg_sid + 1) * ch_group_size + break + + +@triton.jit +def _copy_params_kernel(new_params, params, target_id0, target_id1, target_id2, + old_group_size: tl.constexpr, old_ch_group_size: tl.constexpr, + new_group_size: tl.constexpr, new_ch_group_size: tl.constexpr, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): + + pid_n = tl.program_id(0) + pid_m = tl.program_id(1) + pid_b = tl.program_id(2) + + offs_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M + offs_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + + offs_pars = pid_b * (old_group_size * old_ch_group_size) + offs_m[:,None] * old_ch_group_size + offs_n[None,:] + pars = tl.load(params + offs_pars) + + id0 = tl.load(target_id0 + pid_b) + id1 = tl.load(target_id1 + pid_b) + id2 = tl.load(target_id2 + pid_b) + + offs_npars = id0 * (new_group_size * new_ch_group_size) + (id1 + offs_m)[:,None] * new_ch_group_size + (id2 + offs_n)[None,:] + tl.store(new_params + offs_npars, pars) + + +def group(root_ns: CircuitNodes, sparsity_tolerance: float = 0.25, max_target_group_size: int = 32, use_cuda: bool = True): + + if use_cuda: + device = torch.device("cuda:0") + else: + device = torch.device("cpu") ## Do an initial pass to compute the maximum group size of every `ns` ## @@ -182,21 +242,58 @@ def update_ns(ns: CircuitNodes, ns_chs: Sequence[CircuitNodes]): params = ns.get_params() if params is not None: - # TODO: add a GPU implementation - new_params = torch.zeros([new_edge_ids.size(1), new_group_size, new_ch_group_size]) - for par_group_id in range(new_edge_ids.size(1)): - nsid = new_edge_ids[0,par_group_id] * group_mul_size - neid = nsid + group_mul_size - csid = new_edge_ids[1,par_group_id] * ch_group_mul_size - ceid = csid + ch_group_mul_size - - blk_ids = torch.where((edge_ids[0,:] >= nsid) & (edge_ids[0,:] < neid) & (edge_ids[1,:] >= csid) & (edge_ids[1,:] < ceid))[0] - for blk_id in blk_ids: - nid0, nid1 = (edge_ids[0,blk_id] - nsid) * ns.group_size, (edge_ids[0,blk_id] - nsid + 1) * ns.group_size - cid0, cid1 = (edge_ids[1,blk_id] - csid) * ns.chs[0].group_size, (edge_ids[1,blk_id] - csid + 1) * ns.chs[0].group_size - new_params[par_group_id,nid0:nid1,cid0:cid1] = params[blk_id,:,:] - - new_ns.set_params(new_params, normalize = False) + new_params = torch.zeros([new_edge_ids.size(1), new_group_size, new_ch_group_size], device = device) + if use_cuda: + edge_ids_np = edge_ids.numpy() + new_edge_ids_np = new_edge_ids.numpy() + + old_group_size = ns.group_size + old_ch_group_size = ns.chs[0].group_size + + target_id0 = np.zeros([edge_ids.size(1)], dtype = np.int64) 
- 1 + target_id1 = np.zeros([2, edge_ids.size(1)], dtype = np.int64) - 1 + target_id2 = np.zeros([2, edge_ids.size(1)], dtype = np.int64) - 1 + + _compute_param_target_ids_kernel( + target_id0, target_id1, target_id2, edge_ids_np, new_edge_ids_np, + group_mul_size, ch_group_mul_size, old_group_size, old_ch_group_size + ) + + target_id0 = torch.from_numpy(target_id0).to(device) + target_id1 = torch.from_numpy(target_id1).to(device) + target_id2 = torch.from_numpy(target_id2).to(device) + + params = params.to(device) + + BLOCK_M = min(32, old_group_size) + BLOCK_N = min(32, old_ch_group_size) + + grid = (old_ch_group_size // BLOCK_N, old_group_size // BLOCK_M, edge_ids.size(1)) + + _copy_params_kernel[grid]( + new_params, params, target_id0, target_id1, target_id2, + old_group_size = old_group_size, + old_ch_group_size = old_ch_group_size, + new_group_size = new_group_size, + new_ch_group_size = new_ch_group_size, + BLOCK_M = BLOCK_M, + BLOCK_N = BLOCK_N + ) + + else: + for par_group_id in range(new_edge_ids.size(1)): + nsid = new_edge_ids[0,par_group_id] * group_mul_size + neid = nsid + group_mul_size + csid = new_edge_ids[1,par_group_id] * ch_group_mul_size + ceid = csid + ch_group_mul_size + + blk_ids = torch.where((edge_ids[0,:] >= nsid) & (edge_ids[0,:] < neid) & (edge_ids[1,:] >= csid) & (edge_ids[1,:] < ceid))[0] + for blk_id in blk_ids: + nid0, nid1 = (edge_ids[0,blk_id] - nsid) * ns.group_size, (edge_ids[0,blk_id] - nsid + 1) * ns.group_size + cid0, cid1 = (edge_ids[1,blk_id] - csid) * ns.chs[0].group_size, (edge_ids[1,blk_id] - csid + 1) * ns.chs[0].group_size + new_params[par_group_id,nid0:nid1,cid0:cid1] = params[blk_id,:,:] + + new_ns.set_params(new_params.cpu(), normalize = False) return new_ns From db25b261f45aadbc5af175a344f9458b242acfbe Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Sun, 31 Dec 2023 16:42:05 +0800 Subject: [PATCH 156/162] reorder edges --- src/pyjuice/nodes/sum_nodes.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index 03610f83..aa9e5eb4 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -223,7 +223,7 @@ def _standardize_chs(self, chs): return new_chs - def _construct_edges(self, edge_ids: Optional[Union[Tensor,Sequence[Tensor]]]): + def _construct_edges(self, edge_ids: Optional[Union[Tensor,Sequence[Tensor]]], reorder: bool = True): if edge_ids is None: edge_ids = torch.cat( (torch.arange(self.num_node_groups).unsqueeze(1).repeat(1, self.num_ch_node_groups).reshape(1, -1), @@ -245,6 +245,9 @@ def _construct_edges(self, edge_ids: Optional[Union[Tensor,Sequence[Tensor]]]): edge_ids = torch.cat(edge_ids, dim = 1) + if reorder: + edge_ids = self._reorder_edges(edge_ids) + if isinstance(edge_ids, np.ndarray): edge_ids = torch.from_numpy(edge_ids) @@ -262,5 +265,9 @@ def _construct_edges(self, edge_ids: Optional[Union[Tensor,Sequence[Tensor]]]): self.edge_ids = edge_ids + def _reorder_edges(self, edge_ids: Tensor): + ids = torch.argsort(edge_ids[0,:] * self.num_ch_node_groups + edge_ids[1,:]) + return edge_ids[:,ids].contiguous() + def __repr__(self): return f"SumNodes(num_node_groups={self.num_node_groups}, group_size={self.group_size}, num_chs={self.num_chs}, num_edges={self.num_edges})" From 5a0d7b7b42219f43349fa6d1d091514997aca476 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 2 Jan 2024 20:09:12 +0800 Subject: [PATCH 157/162] test both CPU and GPU compilation in group_test --- tests/transformations/group_test.py | 204 
++++++++++++++-------------- 1 file changed, 103 insertions(+), 101 deletions(-) diff --git a/tests/transformations/group_test.py b/tests/transformations/group_test.py index 848dc14c..d520ecce 100644 --- a/tests/transformations/group_test.py +++ b/tests/transformations/group_test.py @@ -31,50 +31,51 @@ def group_test(): n.init_parameters() - new_n = juice.group(n) - new_m = new_n.chs[0] - - new_n1 = new_m.chs[0] - new_n2 = new_m.chs[1] - - new_m1 = new_n1.chs[0] - new_m2 = new_n2.chs[0] - - new_ni0 = new_m1.chs[0] - new_ni1 = new_m1.chs[1] - new_ni2 = new_m2.chs[0] - new_ni3 = new_m2.chs[1] - - assert new_ni0.group_size == 4 and new_ni0.num_node_groups == 1 - assert new_ni1.group_size == 4 and new_ni1.num_node_groups == 1 - assert new_ni2.group_size == 4 and new_ni2.num_node_groups == 1 - assert new_ni3.group_size == 4 and new_ni3.num_node_groups == 1 - - assert new_m1.num_node_groups == 2 and new_m1.group_size == 4 - assert new_m1.is_sparse() - assert torch.all(new_m1.edge_ids == torch.tensor([[0, 0], [1, 1], [0, 2], [1, 3], [2, 0], [3, 1], [2, 2], [3, 3]])) - - assert new_m2.num_node_groups == 1 and new_m2.group_size == 4 - assert new_m2.is_block_sparse() - assert torch.all(new_m2.edge_ids == torch.tensor([[0, 0]])) - - assert new_n1.num_node_groups == 1 and new_n1.group_size == 4 - assert torch.all(new_n1.edge_ids == torch.tensor([[0, 0], [0, 1]])) - assert torch.all(new_n1._params[0][0:2,0:2] == n1._params[0]) - assert torch.all(new_n1._params[0][0:2,2:4] == n1._params[1]) - assert torch.all(new_n1._params[0][2:4,0:2] == n1._params[4]) - assert torch.all(new_n1._params[0][2:4,2:4] == n1._params[5]) - assert torch.all(new_n1._params[1][0:2,0:2] == n1._params[2]) - assert torch.all(new_n1._params[1][0:2,2:4] == n1._params[3]) - assert torch.all(new_n1._params[1][2:4,0:2] == n1._params[6]) - assert torch.all(new_n1._params[1][2:4,2:4] == n1._params[7]) - - assert new_n2.num_node_groups == 1 and new_n2.group_size == 4 - assert torch.all(new_n2.edge_ids == torch.tensor([[0], [0]])) - assert torch.all(new_n2._params[0][0:2,0:2] == n2._params[0]) - assert torch.all(new_n2._params[0][0:2,2:4] == n2._params[1]) - assert torch.all(new_n2._params[0][2:4,0:2] == n2._params[2]) - assert torch.all(new_n2._params[0][2:4,2:4] == n2._params[3]) + for use_cuda in [True, False]: + new_n = juice.group(n, use_cuda = use_cuda) + new_m = new_n.chs[0] + + new_n1 = new_m.chs[0] + new_n2 = new_m.chs[1] + + new_m1 = new_n1.chs[0] + new_m2 = new_n2.chs[0] + + new_ni0 = new_m1.chs[0] + new_ni1 = new_m1.chs[1] + new_ni2 = new_m2.chs[0] + new_ni3 = new_m2.chs[1] + + assert new_ni0.group_size == 4 and new_ni0.num_node_groups == 1 + assert new_ni1.group_size == 4 and new_ni1.num_node_groups == 1 + assert new_ni2.group_size == 4 and new_ni2.num_node_groups == 1 + assert new_ni3.group_size == 4 and new_ni3.num_node_groups == 1 + + assert new_m1.num_node_groups == 2 and new_m1.group_size == 4 + assert new_m1.is_sparse() + assert torch.all(new_m1.edge_ids == torch.tensor([[0, 0], [1, 1], [0, 2], [1, 3], [2, 0], [3, 1], [2, 2], [3, 3]])) + + assert new_m2.num_node_groups == 1 and new_m2.group_size == 4 + assert new_m2.is_block_sparse() + assert torch.all(new_m2.edge_ids == torch.tensor([[0, 0]])) + + assert new_n1.num_node_groups == 1 and new_n1.group_size == 4 + assert torch.all(new_n1.edge_ids == torch.tensor([[0, 0], [0, 1]])) + assert torch.all(new_n1._params[0][0:2,0:2] == n1._params[0]) + assert torch.all(new_n1._params[0][0:2,2:4] == n1._params[1]) + assert torch.all(new_n1._params[0][2:4,0:2] == n1._params[4]) + 
assert torch.all(new_n1._params[0][2:4,2:4] == n1._params[5]) + assert torch.all(new_n1._params[1][0:2,0:2] == n1._params[2]) + assert torch.all(new_n1._params[1][0:2,2:4] == n1._params[3]) + assert torch.all(new_n1._params[1][2:4,0:2] == n1._params[6]) + assert torch.all(new_n1._params[1][2:4,2:4] == n1._params[7]) + + assert new_n2.num_node_groups == 1 and new_n2.group_size == 4 + assert torch.all(new_n2.edge_ids == torch.tensor([[0], [0]])) + assert torch.all(new_n2._params[0][0:2,0:2] == n2._params[0]) + assert torch.all(new_n2._params[0][0:2,2:4] == n2._params[1]) + assert torch.all(new_n2._params[0][2:4,0:2] == n2._params[2]) + assert torch.all(new_n2._params[0][2:4,2:4] == n2._params[3]) def block_sparse_group_test(): @@ -97,63 +98,64 @@ def block_sparse_group_test(): ns.init_parameters() - new_ns = juice.group(ns) - - new_np0 = new_ns.chs[0] - new_np1 = new_ns.chs[1] - - new_ni0 = new_np0.chs[0] - new_ni1 = new_np0.chs[1] - new_ni2 = new_np1.chs[0] - new_ni3 = new_np1.chs[1] - - assert new_ni0.group_size == 16 and new_ni0.num_node_groups == 1 - assert new_ni1.group_size == 16 and new_ni1.num_node_groups == 1 - assert new_ni2.group_size == 16 and new_ni2.num_node_groups == 1 - assert new_ni3.group_size == 16 and new_ni3.num_node_groups == 1 - - assert new_np0.num_node_groups == 1 and new_np0.group_size == 16 - assert new_np0.is_block_sparse() - assert torch.all(new_np0.edge_ids == torch.tensor([[0, 0]])) - - assert new_np1.num_node_groups == 1 and new_np1.group_size == 16 - assert new_np1.is_block_sparse() - assert torch.all(new_np1.edge_ids == torch.tensor([[0, 0]])) - - assert new_ns.num_node_groups == 2 and new_ns.group_size == 8 - assert new_ns.ch_group_size == 16 - assert torch.all(new_ns.edge_ids == torch.tensor([[0, 0, 1], [0, 1, 1]])) - assert torch.all(new_ns._params[0][0:4,0:4] == ns._params[0]) - assert torch.all(new_ns._params[0][0:4,4:8] == ns._params[1]) - assert torch.all(new_ns._params[0][0:4,8:12] == 0.0) - assert torch.all(new_ns._params[0][0:4,12:16] == ns._params[2]) - assert torch.all(new_ns._params[1][0:4,0:4] == ns._params[3]) - assert torch.all(new_ns._params[1][0:4,4:8] == ns._params[4]) - assert torch.all(new_ns._params[1][0:4,8:12] == ns._params[5]) - assert torch.all(new_ns._params[1][0:4,12:16] == ns._params[6]) - assert torch.all(new_ns._params[0][4:8,0:4] == 0.0) - assert torch.all(new_ns._params[0][4:8,4:8] == ns._params[7]) - assert torch.all(new_ns._params[0][4:8,8:12] == ns._params[8]) - assert torch.all(new_ns._params[0][4:8,12:16] == ns._params[9]) - assert torch.all(new_ns._params[1][4:8,0:4] == ns._params[10]) - assert torch.all(new_ns._params[1][4:8,4:8] == 0.0) - assert torch.all(new_ns._params[1][4:8,8:12] == ns._params[11]) - assert torch.all(new_ns._params[1][4:8,12:16] == ns._params[12]) - assert torch.all(new_ns._params[2][0:4,0:4] == ns._params[13]) - assert torch.all(new_ns._params[2][0:4,4:8] == ns._params[14]) - assert torch.all(new_ns._params[2][0:4,8:12] == ns._params[15]) - assert torch.all(new_ns._params[2][0:4,12:16] == ns._params[16]) - assert torch.all(new_ns._params[2][4:8,0:4] == 0.0) - assert torch.all(new_ns._params[2][4:8,4:8] == ns._params[17]) - assert torch.all(new_ns._params[2][4:8,8:12] == ns._params[18]) - assert torch.all(new_ns._params[2][4:8,12:16] == ns._params[19]) - - assert torch.all(new_ns.zero_param_mask[0][0:4,8:12]) - assert torch.all(new_ns.zero_param_mask[0][4:8,0:4]) - assert torch.all(new_ns.zero_param_mask[1][4:8,4:8]) - assert torch.all(new_ns.zero_param_mask[2][4:8,0:4]) - - assert 
new_ns.zero_param_mask.long().sum() == 4 * 4 * 4 + for use_cuda in [True, False]: + new_ns = juice.group(ns, use_cuda = use_cuda) + + new_np0 = new_ns.chs[0] + new_np1 = new_ns.chs[1] + + new_ni0 = new_np0.chs[0] + new_ni1 = new_np0.chs[1] + new_ni2 = new_np1.chs[0] + new_ni3 = new_np1.chs[1] + + assert new_ni0.group_size == 16 and new_ni0.num_node_groups == 1 + assert new_ni1.group_size == 16 and new_ni1.num_node_groups == 1 + assert new_ni2.group_size == 16 and new_ni2.num_node_groups == 1 + assert new_ni3.group_size == 16 and new_ni3.num_node_groups == 1 + + assert new_np0.num_node_groups == 1 and new_np0.group_size == 16 + assert new_np0.is_block_sparse() + assert torch.all(new_np0.edge_ids == torch.tensor([[0, 0]])) + + assert new_np1.num_node_groups == 1 and new_np1.group_size == 16 + assert new_np1.is_block_sparse() + assert torch.all(new_np1.edge_ids == torch.tensor([[0, 0]])) + + assert new_ns.num_node_groups == 2 and new_ns.group_size == 8 + assert new_ns.ch_group_size == 16 + assert torch.all(new_ns.edge_ids == torch.tensor([[0, 0, 1], [0, 1, 1]])) + assert torch.all(new_ns._params[0][0:4,0:4] == ns._params[0]) + assert torch.all(new_ns._params[0][0:4,4:8] == ns._params[1]) + assert torch.all(new_ns._params[0][0:4,8:12] == 0.0) + assert torch.all(new_ns._params[0][0:4,12:16] == ns._params[2]) + assert torch.all(new_ns._params[1][0:4,0:4] == ns._params[3]) + assert torch.all(new_ns._params[1][0:4,4:8] == ns._params[4]) + assert torch.all(new_ns._params[1][0:4,8:12] == ns._params[5]) + assert torch.all(new_ns._params[1][0:4,12:16] == ns._params[6]) + assert torch.all(new_ns._params[0][4:8,0:4] == 0.0) + assert torch.all(new_ns._params[0][4:8,4:8] == ns._params[7]) + assert torch.all(new_ns._params[0][4:8,8:12] == ns._params[8]) + assert torch.all(new_ns._params[0][4:8,12:16] == ns._params[9]) + assert torch.all(new_ns._params[1][4:8,0:4] == ns._params[10]) + assert torch.all(new_ns._params[1][4:8,4:8] == 0.0) + assert torch.all(new_ns._params[1][4:8,8:12] == ns._params[11]) + assert torch.all(new_ns._params[1][4:8,12:16] == ns._params[12]) + assert torch.all(new_ns._params[2][0:4,0:4] == ns._params[13]) + assert torch.all(new_ns._params[2][0:4,4:8] == ns._params[14]) + assert torch.all(new_ns._params[2][0:4,8:12] == ns._params[15]) + assert torch.all(new_ns._params[2][0:4,12:16] == ns._params[16]) + assert torch.all(new_ns._params[2][4:8,0:4] == 0.0) + assert torch.all(new_ns._params[2][4:8,4:8] == ns._params[17]) + assert torch.all(new_ns._params[2][4:8,8:12] == ns._params[18]) + assert torch.all(new_ns._params[2][4:8,12:16] == ns._params[19]) + + assert torch.all(new_ns.zero_param_mask[0][0:4,8:12]) + assert torch.all(new_ns.zero_param_mask[0][4:8,0:4]) + assert torch.all(new_ns.zero_param_mask[1][4:8,4:8]) + assert torch.all(new_ns.zero_param_mask[2][4:8,0:4]) + + assert new_ns.zero_param_mask.long().sum() == 4 * 4 * 4 if __name__ == "__main__": From 7db65efdb30d0e76f000478be7cf3f3ceb25b2e1 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 2 Jan 2024 20:10:39 +0800 Subject: [PATCH 158/162] change control flow for parameter initialization of input nodes --- .../nodes/distributions/distributions.py | 20 ++++- .../nodes/distributions/masked_categorical.py | 82 +++++++++++++------ src/pyjuice/nodes/input_nodes.py | 27 ++++-- src/pyjuice/nodes/nodes.py | 3 + src/pyjuice/nodes/sum_nodes.py | 2 +- 5 files changed, 98 insertions(+), 36 deletions(-) diff --git a/src/pyjuice/nodes/distributions/distributions.py b/src/pyjuice/nodes/distributions/distributions.py index 8c9ff929..5967dc66 
100644 --- a/src/pyjuice/nodes/distributions/distributions.py +++ b/src/pyjuice/nodes/distributions/distributions.py @@ -15,9 +15,16 @@ def get_signature(self): def get_metadata(self): return [] # no metadata - def normalize_params(self, params: torch.Tensor): + def normalize_parameters(self, params: torch.Tensor, **kwargs): return params + def set_meta_parameters(self, **kwargs): + """ + Assign meta-parameters to `self._params`. + Note: the actual parameters are not initialized after this function call. + """ + raise NotImplementedError() + def num_parameters(self): """ The number of parameters per node. @@ -37,10 +44,17 @@ def init_parameters(self, num_nodes: int, perturbation: float = 2.0, params: Opt """ raise NotImplementedError() + def init_meta_parameters(self, num_nodes: int, params: Any, **kwargs): + """ + Initialize meta-parameters for `num_nodes` nodes. + Return shape should be the same with `init_parameters`. + """ + raise NotImplementedError() + @property - def need_external_params(self): + def need_meta_parameters(self): """ - A flag indicating whether users need to pass in `params` to the + A flag indicating whether users need to pass in meta-parameters to the constructor of InputNodes. """ return False diff --git a/src/pyjuice/nodes/distributions/masked_categorical.py b/src/pyjuice/nodes/distributions/masked_categorical.py index 5c12fa8c..dfdff461 100644 --- a/src/pyjuice/nodes/distributions/masked_categorical.py +++ b/src/pyjuice/nodes/distributions/masked_categorical.py @@ -57,20 +57,53 @@ def num_parameters(self): def num_param_flows(self): return self.num_cats - def init_parameters(self, num_nodes: int, perturbation: float = 2.0, params: Optional[Any] = None, **kwargs): + def init_parameters(self, num_nodes: int, perturbation: float = 2.0, params: Optional[torch.Tensor] = None, **kwargs): """ Initialize parameters for `num_nodes` nodes. Returned parameters should be flattened into a vector. """ - assert params is not None, "Musk info should be provided." + assert params is not None, "Parameters should be provided to get meta-parameters." + params = params.reshape(-1, self.num_parameters()) + assert params.size(0) == num_nodes - if isinstance(params, dict): - mask_tensor = params["masks"].float() - elif isinstance(params, torch.Tensor): - mask_tensor = params.float() - else: - raise ValueError() + num_nodes = params.size(0) + + if self.mask_mode == "range": + mask_tensor = params[:,self.num_cats:self.num_cats+2] + elif self.mask_mode == "full_mask": + mask_tensor = params[:,self.num_cats:self.num_cats*2] + elif self.mask_mode == "rev_range": + mask_tensor = params[:,self.num_cats:self.num_cats+2] + + cat_params = torch.exp(torch.rand([num_nodes, self.num_cats]) * -perturbation) + + # Apply mask + self._apply_mask(cat_params, num_nodes, mask_tensor) + + cat_params /= cat_params.sum(dim = 1, keepdim = True) + + params = params.clone() + params[:,:self.num_cats] = cat_params + return params.reshape(-1) + + def normalize_parameters(self, params: torch.Tensor): + params = params.reshape(-1, self.num_parameters()) + num_nodes = params.size(0) + + cat_params = params[:,:self.num_cats] + + # Apply mask + self._apply_mask(cat_params, num_nodes, mask_tensor) + + cat_params /= cat_params.sum(dim = 1, keepdim = True) + params[:,:self.num_cats] = cat_params + + return params.reshape(-1) + + def set_meta_parameters(self, num_nodes: int, **kwargs): + assert "mask" in kwargs, "`MaskedCategorical` requires an input argument `mask`." 
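At the call site, the mask is passed directly as a keyword argument to `inputs` and stored as a meta-parameter; the actual categorical probabilities are only filled in later by `init_parameters`. A minimal usage sketch, mirroring the updated tests later in this series:

    import torch
    import pyjuice.nodes.distributions as dists
    from pyjuice.nodes import inputs

    # Two nodes over variable 0 whose admissible categories are restricted to
    # the ranges [2, 4) and [3, 5); probabilities are assigned afterwards.
    ni0 = inputs(0, num_nodes = 2,
                 dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "range"),
                 mask = torch.tensor([[2, 4], [3, 5]]))
    ni0.init_parameters(perturbation = 2.0)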
+ mask_tensor = kwargs["mask"] assert mask_tensor.size(0) == num_nodes if self.mask_mode == "range": @@ -83,19 +116,7 @@ def init_parameters(self, num_nodes: int, perturbation: float = 2.0, params: Opt assert mask_tensor.size(1) == 2 num_free_cats = self.num_cats - (mask_tensor[:,1:2] - mask_tensor[:,0:1]) - cat_params = torch.exp(torch.rand([num_nodes, self.num_cats]) * -perturbation) - - # Apply mask - if self.mask_mode == "range": - mask = torch.arange(self.num_cats).unsqueeze(0).expand(num_nodes, -1) - cat_params[(mask < mask_tensor[:,:1]) | (mask >= mask_tensor[:,1:])] = 0.0 - elif self.mask_mode == "full_mask": - cat_params[(mask_tensor < 0.5)] = 0.0 - elif self.mask_mode == "rev_range": - mask = torch.arange(self.num_cats).unsqueeze(0).expand(num_nodes, -1) - cat_params[(mask >= mask_tensor[:,:1]) & (mask < mask_tensor[:,1:])] = 0.0 - - cat_params /= cat_params.sum(dim = 1, keepdim = True) + cat_params = torch.zeros([num_nodes, self.num_cats]) params = torch.cat( (cat_params, mask_tensor, num_free_cats), @@ -105,13 +126,26 @@ def init_parameters(self, num_nodes: int, perturbation: float = 2.0, params: Opt return params.reshape(-1) @property - def need_external_params(self): + def need_meta_parameters(self): """ - A flag indicating whether users need to pass in `params` to the - constructor of InputNodes. + A flag indicating whether users need to pass in meta-parameters to the + constructor of InputNodes. In this case, we need to provide information + regarding the categorical mask. """ return True + def _apply_mask(self, cat_params: torch.Tensor, num_nodes: int, mask_tensor: torch.Tensor): + if self.mask_mode == "range": + mask = torch.arange(self.num_cats).unsqueeze(0).expand(num_nodes, -1) + cat_params[(mask < mask_tensor[:,:1]) | (mask >= mask_tensor[:,1:])] = 0.0 + elif self.mask_mode == "full_mask": + cat_params[(mask_tensor < 0.5)] = 0.0 + elif self.mask_mode == "rev_range": + mask = torch.arange(self.num_cats).unsqueeze(0).expand(num_nodes, -1) + cat_params[(mask >= mask_tensor[:,:1]) & (mask < mask_tensor[:,1:])] = 0.0 + else: + raise ValueError(f"Unknown mask mode {self.mask_mode}.") + @staticmethod def fw_mar_fn_range(local_offsets, data, params_ptr, s_pids, metadata_ptr, s_mids_ptr, mask, num_vars_per_node, BLOCK_SIZE): # Get `num_cats` from `metadata` diff --git a/src/pyjuice/nodes/input_nodes.py b/src/pyjuice/nodes/input_nodes.py index 69f952a5..876d6d1f 100644 --- a/src/pyjuice/nodes/input_nodes.py +++ b/src/pyjuice/nodes/input_nodes.py @@ -2,7 +2,7 @@ import numpy as np import torch -from typing import Sequence, Union, Type, Optional +from typing import Sequence, Union, Type, Optional, Dict from copy import deepcopy from pyjuice.graph import InputRegionNode @@ -21,15 +21,18 @@ def __init__(self, num_node_groups: int, scope: Union[Sequence,BitSet], dist: Di self.dist = dist - # Init parameters - if self.dist.need_external_params and params is None: - raise RuntimeError(f"Distribution `{self.dist}` requires `params` to be set.") + # Init parameters and meta-parameters + if self.dist.need_meta_parameters: + self.set_meta_params(**kwargs) if params is not None: self.set_params(params) # Callbacks self._run_init_callbacks(**kwargs) + # Parameter initialization flag + self._param_initialized = False + @property def num_edges(self): return 0 @@ -53,26 +56,34 @@ def duplicate(self, scope: Optional[Union[int,Sequence,BitSet]] = None, tie_para return ns def get_params(self): - if self._params is None: + if not self.provided("_params"): return None else: return self._params - 
def set_params(self, params: torch.Tensor, normalize: bool = True): + def set_params(self, params: Union[torch.Tensor,Dict], normalize: bool = True): assert params.numel() == self.num_nodes * self.dist.num_parameters() params = params.reshape(-1) if normalize: - params = self.dist.normalize_params(params) + params = self.dist.normalize_parameters(params) + + self._param_initialized = True + self._params = params + + def set_meta_params(self, **kwargs): + params = self.dist.set_meta_parameters(self.num_nodes, **kwargs) + self._param_initialized = False self._params = params def init_parameters(self, perturbation: float = 2.0, recursive: bool = True, is_root: bool = True, ret_params: bool = False, **kwargs): - if not self.is_tied() and (not hasattr(self, "_params") or self._params is None): + if not self.is_tied() and not self.has_params(): self._params = self.dist.init_parameters( num_nodes = self.num_nodes, perturbation = perturbation, + params = self.get_params(), **kwargs ) diff --git a/src/pyjuice/nodes/nodes.py b/src/pyjuice/nodes/nodes.py index 760520c9..2f8f734f 100644 --- a/src/pyjuice/nodes/nodes.py +++ b/src/pyjuice/nodes/nodes.py @@ -176,6 +176,9 @@ def set_source_ns(self, source_ns: CircuitNodes): self._source_node = source_ns def has_params(self): + if self.is_input(): + return self._param_initialized + if not self.is_tied(): return hasattr(self, "_params") and self._params is not None else: diff --git a/src/pyjuice/nodes/sum_nodes.py b/src/pyjuice/nodes/sum_nodes.py index aa9e5eb4..c3662e4b 100644 --- a/src/pyjuice/nodes/sum_nodes.py +++ b/src/pyjuice/nodes/sum_nodes.py @@ -223,7 +223,7 @@ def _standardize_chs(self, chs): return new_chs - def _construct_edges(self, edge_ids: Optional[Union[Tensor,Sequence[Tensor]]], reorder: bool = True): + def _construct_edges(self, edge_ids: Optional[Union[Tensor,Sequence[Tensor]]], reorder: bool = False): if edge_ids is None: edge_ids = torch.cat( (torch.arange(self.num_node_groups).unsqueeze(1).repeat(1, self.num_ch_node_groups).reshape(1, -1), From fc59047cccc5945ca8256955a9abb03df7ceb61f Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 2 Jan 2024 20:10:51 +0800 Subject: [PATCH 159/162] update input distribution runtests --- tests/nodes/input_dists_test.py | 102 ++++++++++++++++---------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/tests/nodes/input_dists_test.py b/tests/nodes/input_dists_test.py index 0eb96f1c..a3040d60 100644 --- a/tests/nodes/input_dists_test.py +++ b/tests/nodes/input_dists_test.py @@ -375,10 +375,10 @@ def discrete_logistic_nodes_behavior_test(): def masked_categorical_nodes_range_test(): - ni0 = inputs(0, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "range"), params = {"masks": torch.tensor([[2, 4], [3, 5]])}) - ni1 = inputs(1, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "range"), params = {"masks": torch.tensor([[2, 4], [3, 5]])}) - ni2 = inputs(2, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "range"), params = {"masks": torch.tensor([[0, 3], [1, 4]])}) - ni3 = inputs(3, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "range"), params = {"masks": torch.tensor([[0, 5], [2, 5]])}) + ni0 = inputs(0, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "range"), mask = torch.tensor([[2, 4], [3, 5]])) + ni1 = inputs(1, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "range"), mask = torch.tensor([[2, 4], [3, 5]])) + ni2 = inputs(2, num_nodes = 2, 
dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "range"), mask = torch.tensor([[0, 3], [1, 4]])) + ni3 = inputs(3, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "range"), mask = torch.tensor([[0, 5], [2, 5]])) m1 = multiply(ni0, ni1, edge_ids = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype = torch.long)) n1 = summate(m1, edge_ids = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]], dtype = torch.long)) @@ -405,41 +405,41 @@ def masked_categorical_nodes_range_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0)) ## Input node forward tests ## for i in range(16): if 2 <= data[i,0] < 4: - assert torch.abs(pc.node_mars[1,i] - torch.log(pc.input_layers[0].params[data[i,0]])) < 1e-4 + assert torch.abs(pc.node_mars[1,i] - torch.log(pc.input_layer_group[0].params[data[i,0]])) < 1e-4 else: assert torch.abs(pc.node_mars[1,i] - math.log(1e-10)) < 1e-4 if 3 <= data[i,0] < 5: - assert torch.abs(pc.node_mars[2,i] - torch.log(pc.input_layers[0].params[8+data[i,0]])) < 1e-4 + assert torch.abs(pc.node_mars[2,i] - torch.log(pc.input_layer_group[0].params[8+data[i,0]])) < 1e-4 else: assert torch.abs(pc.node_mars[2,i] - math.log(1e-10)) < 1e-4 if 2 <= data[i,1] < 4: - assert torch.abs(pc.node_mars[3,i] - torch.log(pc.input_layers[0].params[16+data[i,1]])) < 1e-4 + assert torch.abs(pc.node_mars[3,i] - torch.log(pc.input_layer_group[0].params[16+data[i,1]])) < 1e-4 else: assert torch.abs(pc.node_mars[3,i] - math.log(1e-10)) < 1e-4 if 3 <= data[i,1] < 5: - assert torch.abs(pc.node_mars[4,i] - torch.log(pc.input_layers[0].params[24+data[i,1]])) < 1e-4 + assert torch.abs(pc.node_mars[4,i] - torch.log(pc.input_layer_group[0].params[24+data[i,1]])) < 1e-4 else: assert torch.abs(pc.node_mars[4,i] - math.log(1e-10)) < 1e-4 if 0 <= data[i,2] < 3: - assert torch.abs(pc.node_mars[5,i] - torch.log(pc.input_layers[0].params[32+data[i,2]])) < 1e-4 + assert torch.abs(pc.node_mars[5,i] - torch.log(pc.input_layer_group[0].params[32+data[i,2]])) < 1e-4 else: assert torch.abs(pc.node_mars[5,i] - math.log(1e-10)) < 1e-4 if 1 <= data[i,2] < 4: - assert torch.abs(pc.node_mars[6,i] - torch.log(pc.input_layers[0].params[40+data[i,2]])) < 1e-4 + assert torch.abs(pc.node_mars[6,i] - torch.log(pc.input_layer_group[0].params[40+data[i,2]])) < 1e-4 else: assert torch.abs(pc.node_mars[6,i] - math.log(1e-10)) < 1e-4 if 0 <= data[i,3] < 5: - assert torch.abs(pc.node_mars[7,i] - torch.log(pc.input_layers[0].params[48+data[i,3]])) < 1e-4 + assert torch.abs(pc.node_mars[7,i] - torch.log(pc.input_layer_group[0].params[48+data[i,3]])) < 1e-4 else: assert torch.abs(pc.node_mars[7,i] - math.log(1e-10)) < 1e-4 if 2 <= data[i,3] < 5: - assert torch.abs(pc.node_mars[8,i] - torch.log(pc.input_layers[0].params[56+data[i,3]])) < 1e-4 + assert torch.abs(pc.node_mars[8,i] - torch.log(pc.input_layer_group[0].params[56+data[i,3]])) < 1e-4 else: assert torch.abs(pc.node_mars[8,i] - math.log(1e-10)) < 1e-4 @@ -475,16 +475,16 @@ def masked_categorical_nodes_range_test(): gt_param_flows[7,:2] = 0.0 gt_param_flows[7,5:] = 0.0 - assert torch.all(torch.abs(gt_param_flows - pc.input_layers[0].param_flows.reshape(-1, 5)) < 1e-4) + assert torch.all(torch.abs(gt_param_flows - pc.input_layer_group[0].param_flows.reshape(-1, 5)) < 1e-4) ## EM tests ## - original_params = pc.input_layers[0].params.clone().reshape(8, 8) + original_params = pc.input_layer_group[0].params.clone().reshape(8, 8) step_size = 0.3 pseudocount = 0.1 - par_flows = pc.input_layers[0].param_flows.clone().reshape(8, 5) + 
par_flows = pc.input_layer_group[0].param_flows.clone().reshape(8, 5) pseudocounts = pseudocount / torch.tensor([2, 2, 2, 2, 3, 3, 5, 3]).unsqueeze(1).to(device) new_params = (1.0 - step_size) * original_params[:,:5] + step_size * ((par_flows + pseudocounts) / (par_flows.sum(dim = 1, keepdim = True) + pseudocount)) updated_params = original_params @@ -492,15 +492,15 @@ def masked_categorical_nodes_range_test(): pc.mini_batch_em(step_size = step_size, pseudocount = pseudocount) - assert torch.all(torch.abs(updated_params - pc.input_layers[0].params.reshape(8, 8)) < 1e-4) + assert torch.all(torch.abs(updated_params - pc.input_layer_group[0].params.reshape(8, 8)) < 1e-4) def masked_categorical_nodes_full_mask_test(): - ni0 = inputs(0, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "full_mask"), params = {"masks": torch.tensor([[0, 0, 1, 1, 0], [0, 0, 0, 1, 1]])}) - ni1 = inputs(1, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "full_mask"), params = {"masks": torch.tensor([[0, 0, 1, 1, 0], [0, 0, 0, 1, 1]])}) - ni2 = inputs(2, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "full_mask"), params = {"masks": torch.tensor([[1, 1, 1, 0, 0], [0, 1, 1, 1, 0]])}) - ni3 = inputs(3, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "full_mask"), params = {"masks": torch.tensor([[1, 1, 1, 1, 1], [0, 0, 1, 1, 1]])}) + ni0 = inputs(0, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "full_mask"), mask = torch.tensor([[0, 0, 1, 1, 0], [0, 0, 0, 1, 1]])) + ni1 = inputs(1, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "full_mask"), mask = torch.tensor([[0, 0, 1, 1, 0], [0, 0, 0, 1, 1]])) + ni2 = inputs(2, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "full_mask"), mask = torch.tensor([[1, 1, 1, 0, 0], [0, 1, 1, 1, 0]])) + ni3 = inputs(3, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "full_mask"), mask = torch.tensor([[1, 1, 1, 1, 1], [0, 0, 1, 1, 1]])) m1 = multiply(ni0, ni1, edge_ids = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype = torch.long)) n1 = summate(m1, edge_ids = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]], dtype = torch.long)) @@ -527,41 +527,41 @@ def masked_categorical_nodes_full_mask_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0)) ## Input node forward tests ## for i in range(16): if 2 <= data[i,0] < 4: - assert torch.abs(pc.node_mars[1,i] - torch.log(pc.input_layers[0].params[data[i,0]])) < 1e-4 + assert torch.abs(pc.node_mars[1,i] - torch.log(pc.input_layer_group[0].params[data[i,0]])) < 1e-4 else: assert torch.abs(pc.node_mars[1,i] - math.log(1e-10)) < 1e-4 if 3 <= data[i,0] < 5: - assert torch.abs(pc.node_mars[2,i] - torch.log(pc.input_layers[0].params[11+data[i,0]])) < 1e-4 + assert torch.abs(pc.node_mars[2,i] - torch.log(pc.input_layer_group[0].params[11+data[i,0]])) < 1e-4 else: assert torch.abs(pc.node_mars[2,i] - math.log(1e-10)) < 1e-4 if 2 <= data[i,1] < 4: - assert torch.abs(pc.node_mars[3,i] - torch.log(pc.input_layers[0].params[22+data[i,1]])) < 1e-4 + assert torch.abs(pc.node_mars[3,i] - torch.log(pc.input_layer_group[0].params[22+data[i,1]])) < 1e-4 else: assert torch.abs(pc.node_mars[3,i] - math.log(1e-10)) < 1e-4 if 3 <= data[i,1] < 5: - assert torch.abs(pc.node_mars[4,i] - torch.log(pc.input_layers[0].params[33+data[i,1]])) < 1e-4 + assert torch.abs(pc.node_mars[4,i] - 
torch.log(pc.input_layer_group[0].params[33+data[i,1]])) < 1e-4 else: assert torch.abs(pc.node_mars[4,i] - math.log(1e-10)) < 1e-4 if 0 <= data[i,2] < 3: - assert torch.abs(pc.node_mars[5,i] - torch.log(pc.input_layers[0].params[44+data[i,2]])) < 1e-4 + assert torch.abs(pc.node_mars[5,i] - torch.log(pc.input_layer_group[0].params[44+data[i,2]])) < 1e-4 else: assert torch.abs(pc.node_mars[5,i] - math.log(1e-10)) < 1e-4 if 1 <= data[i,2] < 4: - assert torch.abs(pc.node_mars[6,i] - torch.log(pc.input_layers[0].params[55+data[i,2]])) < 1e-4 + assert torch.abs(pc.node_mars[6,i] - torch.log(pc.input_layer_group[0].params[55+data[i,2]])) < 1e-4 else: assert torch.abs(pc.node_mars[6,i] - math.log(1e-10)) < 1e-4 if 0 <= data[i,3] < 5: - assert torch.abs(pc.node_mars[7,i] - torch.log(pc.input_layers[0].params[66+data[i,3]])) < 1e-4 + assert torch.abs(pc.node_mars[7,i] - torch.log(pc.input_layer_group[0].params[66+data[i,3]])) < 1e-4 else: assert torch.abs(pc.node_mars[7,i] - math.log(1e-10)) < 1e-4 if 2 <= data[i,3] < 5: - assert torch.abs(pc.node_mars[8,i] - torch.log(pc.input_layers[0].params[77+data[i,3]])) < 1e-4 + assert torch.abs(pc.node_mars[8,i] - torch.log(pc.input_layer_group[0].params[77+data[i,3]])) < 1e-4 else: assert torch.abs(pc.node_mars[8,i] - math.log(1e-10)) < 1e-4 @@ -597,16 +597,16 @@ def masked_categorical_nodes_full_mask_test(): gt_param_flows[7,:2] = 0.0 gt_param_flows[7,5:] = 0.0 - assert torch.all(torch.abs(gt_param_flows - pc.input_layers[0].param_flows.reshape(-1, 5)) < 1e-4) + assert torch.all(torch.abs(gt_param_flows - pc.input_layer_group[0].param_flows.reshape(-1, 5)) < 1e-4) ## EM tests ## - original_params = pc.input_layers[0].params.clone().reshape(8, 11) + original_params = pc.input_layer_group[0].params.clone().reshape(8, 11) step_size = 0.3 pseudocount = 0.1 - par_flows = pc.input_layers[0].param_flows.clone().reshape(8, 5) + par_flows = pc.input_layer_group[0].param_flows.clone().reshape(8, 5) pseudocounts = pseudocount / torch.tensor([2, 2, 2, 2, 3, 3, 5, 3]).unsqueeze(1).to(device) new_params = (1.0 - step_size) * original_params[:,:5] + step_size * ((par_flows + pseudocounts) / (par_flows.sum(dim = 1, keepdim = True) + pseudocount)) updated_params = original_params @@ -614,15 +614,15 @@ def masked_categorical_nodes_full_mask_test(): pc.mini_batch_em(step_size = step_size, pseudocount = pseudocount) - assert torch.all(torch.abs(updated_params - pc.input_layers[0].params.reshape(8, 11)) < 1e-4) + assert torch.all(torch.abs(updated_params - pc.input_layer_group[0].params.reshape(8, 11)) < 1e-4) def masked_categorical_nodes_rev_range_test(): - ni0 = inputs(0, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "rev_range"), params = {"masks": torch.tensor([[2, 4], [3, 5]])}) - ni1 = inputs(1, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "rev_range"), params = {"masks": torch.tensor([[2, 4], [3, 5]])}) - ni2 = inputs(2, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "rev_range"), params = {"masks": torch.tensor([[0, 3], [1, 4]])}) - ni3 = inputs(3, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "rev_range"), params = {"masks": torch.tensor([[4, 5], [2, 5]])}) + ni0 = inputs(0, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "rev_range"), mask = torch.tensor([[2, 4], [3, 5]])) + ni1 = inputs(1, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "rev_range"), mask = torch.tensor([[2, 4], [3, 5]])) + ni2 = inputs(2, num_nodes 
= 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "rev_range"), mask = torch.tensor([[0, 3], [1, 4]])) + ni3 = inputs(3, num_nodes = 2, dist = dists.MaskedCategorical(num_cats = 5, mask_mode = "rev_range"), mask = torch.tensor([[4, 5], [2, 5]])) m1 = multiply(ni0, ni1, edge_ids = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype = torch.long)) n1 = summate(m1, edge_ids = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]], dtype = torch.long)) @@ -649,41 +649,41 @@ def masked_categorical_nodes_rev_range_test(): lls = pc(data) - pc.backward(data) + pc.backward(data.permute(1, 0)) ## Input node forward tests ## for i in range(16): if not 2 <= data[i,0] < 4: - assert torch.abs(pc.node_mars[1,i] - torch.log(pc.input_layers[0].params[data[i,0]])) < 1e-4 + assert torch.abs(pc.node_mars[1,i] - torch.log(pc.input_layer_group[0].params[data[i,0]])) < 1e-4 else: assert torch.abs(pc.node_mars[1,i] - math.log(1e-10)) < 1e-4 if not 3 <= data[i,0] < 5: - assert torch.abs(pc.node_mars[2,i] - torch.log(pc.input_layers[0].params[8+data[i,0]])) < 1e-4 + assert torch.abs(pc.node_mars[2,i] - torch.log(pc.input_layer_group[0].params[8+data[i,0]])) < 1e-4 else: assert torch.abs(pc.node_mars[2,i] - math.log(1e-10)) < 1e-4 if not 2 <= data[i,1] < 4: - assert torch.abs(pc.node_mars[3,i] - torch.log(pc.input_layers[0].params[16+data[i,1]])) < 1e-4 + assert torch.abs(pc.node_mars[3,i] - torch.log(pc.input_layer_group[0].params[16+data[i,1]])) < 1e-4 else: assert torch.abs(pc.node_mars[3,i] - math.log(1e-10)) < 1e-4 if not 3 <= data[i,1] < 5: - assert torch.abs(pc.node_mars[4,i] - torch.log(pc.input_layers[0].params[24+data[i,1]])) < 1e-4 + assert torch.abs(pc.node_mars[4,i] - torch.log(pc.input_layer_group[0].params[24+data[i,1]])) < 1e-4 else: assert torch.abs(pc.node_mars[4,i] - math.log(1e-10)) < 1e-4 if not 0 <= data[i,2] < 3: - assert torch.abs(pc.node_mars[5,i] - torch.log(pc.input_layers[0].params[32+data[i,2]])) < 1e-4 + assert torch.abs(pc.node_mars[5,i] - torch.log(pc.input_layer_group[0].params[32+data[i,2]])) < 1e-4 else: assert torch.abs(pc.node_mars[5,i] - math.log(1e-10)) < 1e-4 if not 1 <= data[i,2] < 4: - assert torch.abs(pc.node_mars[6,i] - torch.log(pc.input_layers[0].params[40+data[i,2]])) < 1e-4 + assert torch.abs(pc.node_mars[6,i] - torch.log(pc.input_layer_group[0].params[40+data[i,2]])) < 1e-4 else: assert torch.abs(pc.node_mars[6,i] - math.log(1e-10)) < 1e-4 if not 4 <= data[i,3] < 5: - assert torch.abs(pc.node_mars[7,i] - torch.log(pc.input_layers[0].params[48+data[i,3]])) < 1e-4 + assert torch.abs(pc.node_mars[7,i] - torch.log(pc.input_layer_group[0].params[48+data[i,3]])) < 1e-4 else: assert torch.abs(pc.node_mars[7,i] - math.log(1e-10)) < 1e-4 if not 2 <= data[i,3] < 5: - assert torch.abs(pc.node_mars[8,i] - torch.log(pc.input_layers[0].params[56+data[i,3]])) < 1e-4 + assert torch.abs(pc.node_mars[8,i] - torch.log(pc.input_layer_group[0].params[56+data[i,3]])) < 1e-4 else: assert torch.abs(pc.node_mars[8,i] - math.log(1e-10)) < 1e-4 @@ -711,16 +711,16 @@ def masked_categorical_nodes_rev_range_test(): gt_param_flows[6,4:5] = 0.0 gt_param_flows[7,2:5] = 0.0 - assert torch.all(torch.abs(gt_param_flows - pc.input_layers[0].param_flows.reshape(-1, 5)) < 1e-4) + assert torch.all(torch.abs(gt_param_flows - pc.input_layer_group[0].param_flows.reshape(-1, 5)) < 1e-4) ## EM tests ## - original_params = pc.input_layers[0].params.clone().reshape(8, 8) + original_params = pc.input_layer_group[0].params.clone().reshape(8, 8) step_size = 0.3 pseudocount = 0.1 - par_flows = 
pc.input_layers[0].param_flows.clone().reshape(8, 5) + par_flows = pc.input_layer_group[0].param_flows.clone().reshape(8, 5) pseudocounts = pseudocount / torch.tensor([3, 3, 3, 3, 2, 2, 4, 2]).unsqueeze(1).to(device) new_params = (1.0 - step_size) * original_params[:,:5] + step_size * ((par_flows + pseudocounts) / (par_flows.sum(dim = 1, keepdim = True) + pseudocount)) updated_params = original_params @@ -728,7 +728,7 @@ def masked_categorical_nodes_rev_range_test(): pc.mini_batch_em(step_size = step_size, pseudocount = pseudocount) - assert torch.all(torch.abs(updated_params - pc.input_layers[0].params.reshape(8, 8)) < 1e-4) + assert torch.all(torch.abs(updated_params - pc.input_layer_group[0].params.reshape(8, 8)) < 1e-4) if __name__ == "__main__": From 3e02e8a3219bb11dc20e4668afe029456331591a Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 2 Jan 2024 21:38:31 +0800 Subject: [PATCH 160/162] speedup runtests for structures --- examples/train_mnist_pdhclt.py | 120 +++++++++++++++++++++++++++++++ tests/structures/hclt_test.py | 18 +++-- tests/structures/pd_hclt_test.py | 16 +++-- tests/structures/pd_test.py | 7 +- 4 files changed, 148 insertions(+), 13 deletions(-) create mode 100644 examples/train_mnist_pdhclt.py diff --git a/examples/train_mnist_pdhclt.py b/examples/train_mnist_pdhclt.py new file mode 100644 index 00000000..e47806cb --- /dev/null +++ b/examples/train_mnist_pdhclt.py @@ -0,0 +1,120 @@ +import pyjuice as juice +import torch +import torchvision +import time +from torch.utils.data import TensorDataset, DataLoader + + +def evaluate(pc, loader): + lls_total = 0.0 + for batch in loader: + x = batch[0].to(pc.device) + lls = pc(x) + lls_total += lls.mean().detach().cpu().numpy().item() + + lls_total /= len(loader) + return lls_total + + +def mini_batch_em_epoch(num_epochs, pc, optimizer, scheduler, train_loader, test_loader, device): + for epoch in range(num_epochs): + t0 = time.time() + train_ll = 0.0 + for batch in train_loader: + x = batch[0].to(device) + + optimizer.zero_grad() + + lls = pc(x) + lls.mean().backward() + + train_ll += lls.mean().detach().cpu().numpy().item() + + optimizer.step() + if scheduler is not None: + scheduler.step() + + train_ll /= len(train_loader) + + t1 = time.time() + test_ll = evaluate(pc, loader=test_loader) + t2 = time.time() + + print(f"[Epoch {epoch}/{num_epochs}][train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") + + +def full_batch_em_epoch(pc, train_loader, test_loader, device): + with torch.no_grad(): + t0 = time.time() + train_ll = 0.0 + for batch in train_loader: + x = batch[0].to(device) + + lls = pc(x) + pc.backward(x, flows_memory = 1.0) + + train_ll += lls.mean().detach().cpu().numpy().item() + + pc.mini_batch_em(step_size = 1.0, pseudocount = 0.1) + + train_ll /= len(train_loader) + + t1 = time.time() + test_ll = evaluate(pc, loader=test_loader) + t2 = time.time() + print(f"[train LL: {train_ll:.2f}; test LL: {test_ll:.2f}].....[train forward+backward+step {t1-t0:.2f}; test forward {t2-t1:.2f}] ") + + +def pd_hclt_test(): + + device = torch.device("cuda:0") + + train_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = True, download = True) + test_dataset = torchvision.datasets.MNIST(root = "./examples/data", train = False, download = True) + + train_data = train_dataset.data.reshape(60000, 28*28) + test_data = test_dataset.data.reshape(10000, 28*28) + + num_features = train_data.size(1) + + train_loader = DataLoader( + dataset = 
TensorDataset(train_data), + batch_size = 512, + shuffle = True, + drop_last = True + ) + test_loader = DataLoader( + dataset = TensorDataset(test_data), + batch_size = 512, + shuffle = False, + drop_last = True + ) + + ns = juice.structures.PDHCLT( + train_data.cuda(), + data_shape = (28, 28), + num_latents = 128, + split_intervals = (4, 4), + structure_type = "sum_dominated" + ) + pc = juice.TensorCircuit(ns) + + pc.to(device) + + optimizer = juice.optim.CircuitOptimizer(pc, lr = 0.1, pseudocount = 0.1) + scheduler = juice.optim.CircuitScheduler( + optimizer, + method = "multi_linear", + lrs = [0.9, 0.1, 0.05], + milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] + ) + + pc.print_statistics() + + mini_batch_em_epoch(350, pc, optimizer, None, train_loader, test_loader, device) + full_batch_em_epoch(pc, train_loader, test_loader, device) + + +if __name__ == "__main__": + torch.manual_seed(2391) + pd_hclt_test() diff --git a/tests/structures/hclt_test.py b/tests/structures/hclt_test.py index c3033833..85bffe34 100644 --- a/tests/structures/hclt_test.py +++ b/tests/structures/hclt_test.py @@ -93,7 +93,7 @@ def hclt_test(): train_data.float().to(device), num_bins = 32, sigma = 0.5 / 32, - num_latents = 512, + num_latents = 256, chunk_size = 32 ) pc = juice.TensorCircuit(ns) @@ -139,8 +139,11 @@ def hclt_test(): # import pdb; pdb.set_trace() # exit() - mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) - full_batch_em_epoch(pc, train_loader, test_loader, device) + mini_batch_em_epoch(5, pc, optimizer, scheduler, train_loader, test_loader, device) + + test_ll = evaluate(pc, test_loader) + + assert test_ll > -770 def hclt_logistic_test(): @@ -191,10 +194,13 @@ def hclt_logistic_test(): milestone_steps = [0, len(train_loader) * 100, len(train_loader) * 350] ) - mini_batch_em_epoch(350, pc, optimizer, scheduler, train_loader, test_loader, device) - full_batch_em_epoch(pc, train_loader, test_loader, device) + mini_batch_em_epoch(5, pc, optimizer, scheduler, train_loader, test_loader, device) + + test_ll = evaluate(pc, test_loader) + + assert test_ll > -990.0 if __name__ == "__main__": hclt_test() - # hclt_logistic_test() + hclt_logistic_test() diff --git a/tests/structures/pd_hclt_test.py b/tests/structures/pd_hclt_test.py index 5365321d..e9fcc11a 100644 --- a/tests/structures/pd_hclt_test.py +++ b/tests/structures/pd_hclt_test.py @@ -111,8 +111,11 @@ def pd_hclt_degenerative_case_test(): pc.print_statistics() - mini_batch_em_epoch(350, pc, optimizer, None, train_loader, test_loader, device) - full_batch_em_epoch(pc, train_loader, test_loader, device) + mini_batch_em_epoch(5, pc, optimizer, None, train_loader, test_loader, device) + + test_ll = evaluate(pc, test_loader) + + assert test_ll > -690.0 def pd_hclt_test(): @@ -168,11 +171,14 @@ def pd_hclt_test(): # lls.mean().backward() # break - mini_batch_em_epoch(350, pc, optimizer, None, train_loader, test_loader, device) - full_batch_em_epoch(pc, train_loader, test_loader, device) + mini_batch_em_epoch(5, pc, optimizer, None, train_loader, test_loader, device) + + test_ll = evaluate(pc, test_loader) + + assert test_ll > -680.0 if __name__ == "__main__": torch.manual_seed(2391) - # pd_hclt_degenerative_case_test() + pd_hclt_degenerative_case_test() pd_hclt_test() diff --git a/tests/structures/pd_test.py b/tests/structures/pd_test.py index f6921b2d..001046ac 100644 --- a/tests/structures/pd_test.py +++ b/tests/structures/pd_test.py @@ -126,8 +126,11 @@ def pd_test(): # import pdb; 
pdb.set_trace() # exit() - mini_batch_em_epoch(350, pc, optimizer, None, train_loader, test_loader, device) - full_batch_em_epoch(pc, train_loader, test_loader, device) + mini_batch_em_epoch(5, pc, optimizer, None, train_loader, test_loader, device) + + test_ll = evaluate(pc, test_loader) + + assert test_ll > -755.0 if __name__ == "__main__": From 26cd5f39c324491fb4ad976449d795999ac94f2b Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 2 Jan 2024 22:40:38 +0800 Subject: [PATCH 161/162] shorten long lines --- src/pyjuice/nodes/construction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index af66b9b4..12eb8850 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -69,7 +69,8 @@ def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, spa return ProdNodes(num_node_groups, chs, edge_ids, group_size = group_size, **kwargs) -def summate(nodes1: SumNodesChs, *args, num_nodes: int = 0, num_node_groups: int = 0, edge_ids: Optional[Tensor] = None, group_size: int = 0, **kwargs): +def summate(nodes1: SumNodesChs, *args, num_nodes: int = 0, num_node_groups: int = 0, + edge_ids: Optional[Tensor] = None, group_size: int = 0, **kwargs): if num_nodes > 0: assert num_node_groups == 0, "Only one of `num_nodes` and `num_node_groups` can be set at the same time." From b7b13f93532dc3a18da5488410a35897d4a79241 Mon Sep 17 00:00:00 2001 From: Anji Liu Date: Tue, 2 Jan 2024 22:45:06 +0800 Subject: [PATCH 162/162] add group size assertions --- src/pyjuice/layer/prod_layer.py | 2 +- src/pyjuice/model/backend/par_update.py | 2 +- src/pyjuice/nodes/construction.py | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/pyjuice/layer/prod_layer.py b/src/pyjuice/layer/prod_layer.py index 711588df..37a1827d 100644 --- a/src/pyjuice/layer/prod_layer.py +++ b/src/pyjuice/layer/prod_layer.py @@ -412,7 +412,7 @@ def _forward_backward(self, node_vals: torch.Tensor, element_vals: torch.Tensor, accum = 1 if accum else 0 partial_eval = 1 if local_ids is not None else 0 - assert num_edges & (num_edges - 1) == 0, "`num_edges` must be power of 2." + assert num_edges & (num_edges - 1) == 0, "`num_edges` must be a power of 2." # Fall back to the `torch.compile` kernel in the case where we cannot store child edges within a single block if num_edges > 1024: diff --git a/src/pyjuice/model/backend/par_update.py b/src/pyjuice/model/backend/par_update.py index 68388bdc..fa867172 100644 --- a/src/pyjuice/model/backend/par_update.py +++ b/src/pyjuice/model/backend/par_update.py @@ -47,7 +47,7 @@ def _record_par_blks(par_start_ids, pflow_start_ids, blk_sizes, blk_intervals, g @torch.no_grad() def compile_par_update_fn(root_ns: CircuitNodes, BLOCK_SIZE: int = 32, buffer_inc_interval: int = 10000, use_numba: bool = True): - assert BLOCK_SIZE & (BLOCK_SIZE - 1) == 0, "`BLOCK_SIZE` must be power of 2." + assert BLOCK_SIZE & (BLOCK_SIZE - 1) == 0, "`BLOCK_SIZE` must be a power of 2." 
par_start_ids = np.zeros([buffer_inc_interval], dtype = np.int64) pflow_start_ids = np.zeros([buffer_inc_interval], dtype = np.int64) diff --git a/src/pyjuice/nodes/construction.py b/src/pyjuice/nodes/construction.py index 12eb8850..f690173a 100644 --- a/src/pyjuice/nodes/construction.py +++ b/src/pyjuice/nodes/construction.py @@ -21,6 +21,8 @@ def inputs(var: Union[int,Sequence[int]], num_node_groups: int = 0, dist: Distribution = Distribution(), params: Optional[Tensor] = None, num_nodes: int = 0, group_size: int = 0, **kwargs): + assert group_size == 0 or group_size & (group_size - 1) == 0, "`group_size` must be a power of 2." + if num_nodes > 0: assert num_node_groups == 0, "Only one of `num_nodes` and `num_node_groups` can be set at the same time." if group_size == 0: @@ -72,6 +74,8 @@ def multiply(nodes1: ProdNodesChs, *args, edge_ids: Optional[Tensor] = None, spa def summate(nodes1: SumNodesChs, *args, num_nodes: int = 0, num_node_groups: int = 0, edge_ids: Optional[Tensor] = None, group_size: int = 0, **kwargs): + assert group_size == 0 or group_size & (group_size - 1) == 0, "`group_size` must be a power of 2." + if num_nodes > 0: assert num_node_groups == 0, "Only one of `num_nodes` and `num_node_groups` can be set at the same time." if group_size == 0: @@ -101,6 +105,8 @@ def summate(nodes1: SumNodesChs, *args, num_nodes: int = 0, num_node_groups: int class set_group_size(_DecoratorContextManager): def __init__(self, group_size: int = 1): + assert group_size & (group_size - 1) == 0, "`group_size` must be a power of 2." + self.group_size = group_size self.original_group_size = None
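
Note on the power-of-two assertions added in the last patch: the test `x & (x - 1) == 0` is the usual bitwise check that `x` has at most one bit set. It is also satisfied by `x == 0`, which is why `inputs` and `summate` guard it with `group_size == 0 or ...` (0 meaning "use the default"), while `set_group_size` applies the bare check to its already-defaulted argument. A minimal, self-contained sketch of the idiom (illustrative only; the helper name below is hypothetical and not part of the patch):

    def is_power_of_two(x: int) -> bool:
        # x & (x - 1) clears the lowest set bit; the result is zero
        # exactly when x had a single bit set, i.e. x is a positive
        # power of two. Zero must be excluded explicitly, since 0 & -1 == 0.
        return x > 0 and (x & (x - 1)) == 0

    assert is_power_of_two(1) and is_power_of_two(32)
    assert not is_power_of_two(0) and not is_power_of_two(48)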