632 | 632 | "source": [
633 | 633 | "H = 128\n",
634 | 634 | "W = 128\n",
635 | | - "K_H = 7\n", |
636 | | - "K_W = 7\n", |
| 635 | + "K_H = 13\n", |
| 636 | + "K_W = 13\n", |
637 | 637 | "\n",
638 | 638 | "\n",
639 | 639 | "def get_x_y(idx):\n",
657 | 657 | "    return hori_mask & vert_mask\n",
658 | 658 | "\n",
659 | 659 | "\n",
660 | | - "test_mask(mask_mod=natten_mask)" |
| 660 | + "test_mask(mask_mod=natten_mask, S=H * W)" |
| 661 | + ] |
| 662 | + }, |
| 663 | + { |
| 664 | + "cell_type": "markdown", |
| 665 | + "metadata": {}, |
| 666 | + "source": [ |
| 667 | + "### Tiled NATTEN layout\n", |
| 668 | + "The solution above unrolls 2-D Q and KV into 1-D attention problem in a naive column major way. This breaks the locality of the very sparse Q K V layout: While the density of the MATTEN mask is `(13 * 13) / (128 * 128) = 1.0%`, the density of our block mask becomes 10.16% with 128x128 blocks. Q K V layouts with that retains their 2-D spatial locality could improve the block sparsity and make flexattention implementation more efficient. \n", |
| 669 | + "\n", |
| 670 | + "Static tiling as proposed in the [faster NATTEN](https://arxiv.org/abs/2403.04690) maps static tiles of $ T_h \\times T_w $ in the 2-D space in contiguous region in 1-D Q K V. " |
| 671 | + ] |
| 672 | + }, |
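A quick way to sanity-check the density numbers quoted above is to build the block mask for the naive unrolling and read off its sparsity. The snippet below is a minimal sketch, not part of the notebook diff: it re-derives the naive `get_x_y` / `natten_mask` helpers to mirror the earlier cells, and assumes a torch version (>= 2.5) where `create_block_mask` and `BlockMask.sparsity()` are available on a CUDA device.

```python
# Hedged sketch: estimate element vs. block density for the naive 1-D unrolling.
import torch
from torch.nn.attention.flex_attention import create_block_mask

H = W = 128
K_H = K_W = 13

def get_x_y(idx):
    # Naive unrolling used earlier in the notebook: 1-D index -> (row, col).
    return idx // W, idx % W

def natten_mask(b, h, q_idx, kv_idx):
    q_x, q_y = get_x_y(q_idx)
    kv_x, kv_y = get_x_y(kv_idx)
    kernel_x = q_x.clamp(K_W // 2, (W - 1) - K_W // 2)
    kernel_y = q_y.clamp(K_H // 2, (H - 1) - K_H // 2)
    return ((kernel_x - kv_x).abs() <= K_W // 2) & ((kernel_y - kv_y).abs() <= K_H // 2)

S = H * W
bm = create_block_mask(natten_mask, B=None, H=None, Q_LEN=S, KV_LEN=S, device="cuda")
print(f"element density ~= {K_H * K_W / (H * W):.2%}")              # about 1.0%
print(f"block density   ~= {100 - bm.sparsity():.2f}% (128x128 blocks)")
```

With the row-contiguous unrolling, each 128-wide query block covers one image row, and it needs roughly 13 of the 128 kv blocks (the 13 neighboring rows), which is where the ~10% block density comes from.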
| 673 | + { |
| 674 | + "cell_type": "code", |
| 675 | + "execution_count": null, |
| 676 | + "metadata": {}, |
| 677 | + "outputs": [], |
| 678 | + "source": [ |
| 679 | + "H = 128\n", |
| 680 | + "W = 128\n", |
| 681 | + "K_H = 13\n", |
| 682 | + "K_W = 13\n", |
| 683 | + "T_H, T_W = 8, 8\n", |
| 684 | + "\n", |
| 685 | + "def gen_tiled_natten(W, H, K_W, K_H, T_W, T_H):\n", |
| 686 | + " def get_idx_tiled(x, y):\n", |
| 687 | + " \"\"\"\n", |
| 688 | + " Map 2-D coordinates to 1-D index for static tiles of T_H x T_W.\n", |
| 689 | + " \"\"\"\n", |
| 690 | + " t_x, t_y = x // T_W, y // T_H\n", |
| 691 | + " t_id = t_x * (W // T_W) + t_y\n", |
| 692 | + " i_x, i_y = x % T_W, y % T_H\n", |
| 693 | + " t_offset = i_x * T_W + i_y\n", |
| 694 | + " return t_id * (T_H * T_W) + t_offset\n", |
| 695 | + "\n", |
| 696 | + " def get_x_y_tiled(idx):\n", |
| 697 | + " \"\"\"\n", |
| 698 | + " Map 1-D index to 2-D coordinates for static tiles of T_H x T_W.\n", |
| 699 | + " \"\"\"\n", |
| 700 | + " t_id = idx // (T_H * T_W)\n", |
| 701 | + " t_x, t_y = t_id // (W // T_W), t_id % (W // T_W)\n", |
| 702 | + " t_offset = idx % (T_H * T_W)\n", |
| 703 | + " i_x, i_y = t_offset // T_W, t_offset % T_W\n", |
| 704 | + " return t_x*T_W + i_x, t_y*T_H + i_y\n", |
| 705 | + "\n", |
| 706 | + " def tiled_natten_mask(b, h, q, kv):\n", |
| 707 | + " q_x, q_y = get_x_y_tiled(q)\n", |
| 708 | + " kv_x, kv_y = get_x_y_tiled(kv)\n", |
| 709 | + " kernel_x = q_x.clamp(K_W // 2, (W - 1) - K_W // 2)\n", |
| 710 | + " kernel_y = q_y.clamp(K_H // 2, (H - 1) - K_H // 2)\n", |
| 711 | + " hori_mask = (kernel_x - kv_x).abs() <= K_W // 2\n", |
| 712 | + " vert_mask = (kernel_y - kv_y).abs() <= K_H // 2\n", |
| 713 | + " return hori_mask & vert_mask\n", |
| 714 | + " return tiled_natten_mask\n", |
| 715 | + "\n", |
| 716 | + "# tiled_natten_mask = gen_tiled_natten(W, H, K_W, K_H, T_W, T_H)\n", |
| 717 | + "from attn_gym.masks.natten import generate_tiled_natten\n", |
| 718 | + "tiled_natten_mask_mod = generate_tiled_natten(W, H, K_W, K_H, T_W, T_H)\n", |
| 719 | + "\n", |
| 720 | + "test_mask(mask_mod=tiled_natten_mask_mod, S=H * W)" |
| 721 | + ] |
| 722 | + }, |
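One quick sanity check, sketched below under the assumption that the cell above has been run (so `H`, `W`, `get_idx_tiled`, and `get_x_y_tiled` are in scope): the tiled encode and decode should be exact inverses over all `H * W` positions.

```python
# Hedged sketch: verify the tiled layout round-trips (names assume the cell above has run).
import torch

idx = torch.arange(H * W)
x, y = get_x_y_tiled(idx)                     # tiled 1-D index -> 2-D coordinates
assert torch.equal(get_idx_tiled(x, y), idx)  # encoding the coordinates recovers the index
print("tiled encode/decode round-trip OK")
```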
| 723 | + { |
| 724 | + "cell_type": "markdown", |
| 725 | + "metadata": {}, |
| 726 | + "source": [ |
| 727 | + "Verify that Naive NATTEN Mask and tiled NATTEN generate the same output" |
| 728 | + ] |
| 729 | + }, |
| 730 | + { |
| 731 | + "cell_type": "code", |
| 732 | + "execution_count": null, |
| 733 | + "metadata": {}, |
| 734 | + "outputs": [], |
| 735 | + "source": [ |
| 736 | + "def run_natten(\n", |
| 737 | + " mask = None,\n", |
| 738 | + " encoder = None, \n", |
| 739 | + " decoder = None,\n", |
| 740 | + " query = None, \n", |
| 741 | + " key = None,\n", |
| 742 | + " value = None, \n", |
| 743 | + " gradOut = None,\n", |
| 744 | + " B=16,\n", |
| 745 | + " H=16,\n", |
| 746 | + " W=128,\n", |
| 747 | + " D=64,\n", |
| 748 | + " print_mask=True,\n", |
| 749 | + "):\n", |
| 750 | + " if decoder:\n", |
| 751 | + " permuter_x, permuter_y = decoder(torch.arange(W*W))\n", |
| 752 | + " permuter_index = permuter_x * W + permuter_y\n", |
| 753 | + " q = query[:, :, permuter_x, permuter_y, :].clone().detach().requires_grad_(query.requires_grad)\n", |
| 754 | + " k = key[:, :, permuter_x, permuter_y, :].clone().detach().requires_grad_(key.requires_grad)\n", |
| 755 | + " v = value[:, :, permuter_x, permuter_y, :].clone().detach().requires_grad_(value.requires_grad)\n", |
| 756 | + " dO = gradOut[:, :, permuter_x, permuter_y, :]\n", |
| 757 | + " else: \n", |
| 758 | + " q = query.flatten(2, 3).clone().detach().requires_grad_(query.requires_grad)\n", |
| 759 | + " k = key.flatten(2, 3).clone().detach().requires_grad_(key.requires_grad)\n", |
| 760 | + " v = value.flatten(2, 3).clone().detach().requires_grad_(value.requires_grad)\n", |
| 761 | + " dO = gradOut.flatten(2, 3)\n", |
| 762 | + " block_mask = create_block_mask_cached(mask, 1, 1, W*W, W*W, device=query.device)\n", |
| 763 | + " if print_mask:\n", |
| 764 | + " print(f\"\\nBlock Mask:\\n{block_mask}\")\n", |
| 765 | + " \n", |
| 766 | + " out = flex_attention(q, k, v, block_mask=block_mask)\n", |
| 767 | + " \n", |
| 768 | + " out.backward(dO)\n", |
| 769 | + " \n", |
| 770 | + " if encoder: \n", |
| 771 | + " i_x = torch.arange(W)[:, None].broadcast_to(W, W).flatten() \n", |
| 772 | + " i_y = torch.arange(W)[None, :].broadcast_to(W, W).flatten() \n", |
| 773 | + " depermuter = encoder(i_x, i_y)\n", |
| 774 | + " out = out[:, :, depermuter, :].reshape(B, H, W, W, D)\n", |
| 775 | + " q_grad = q.grad[:, :, depermuter, :].reshape(B, H, W, W, D)\n", |
| 776 | + " k_grad = k.grad[:, :, depermuter, :].reshape(B, H, W, W, D)\n", |
| 777 | + " v_grad = v.grad[:, :, depermuter, :].reshape(B, H, W, W, D)\n", |
| 778 | + " results = [out, q_grad, k_grad, v_grad]\n", |
| 779 | + " else:\n", |
| 780 | + " out= out.reshape(B, H, W, W, D)\n", |
| 781 | + " q_grad = q.grad.reshape(B, H, W, W, D)\n", |
| 782 | + " k_grad = k.grad.reshape(B, H, W, W, D)\n", |
| 783 | + " v_grad = v.grad.reshape(B, H, W, W, D)\n", |
| 784 | + " results = [out, q_grad, k_grad, v_grad]\n", |
| 785 | + " \n", |
| 786 | + " del q, k, v, dO\n", |
| 787 | + " \n", |
| 788 | + " return results\n", |
| 789 | + "\n", |
| 790 | + "\n", |
| 791 | + "def test_natten_masks(\n", |
| 792 | + " naive,\n", |
| 793 | + " tiled,\n", |
| 794 | + " B=16,\n", |
| 795 | + " H=16,\n", |
| 796 | + " W=128,\n", |
| 797 | + " D=64,\n", |
| 798 | + " skip_correctness=False,\n", |
| 799 | + " print_mask=True,\n", |
| 800 | + "): \n", |
| 801 | + " query = torch.randn(\n", |
| 802 | + " B, H, W, W, D, device=\"cuda\", dtype=torch.float16, requires_grad=True\n", |
| 803 | + " )\n", |
| 804 | + " key = torch.randn(\n", |
| 805 | + " B, H, W, W, D, device=\"cuda\", dtype=torch.float16, requires_grad=True\n", |
| 806 | + " )\n", |
| 807 | + " value = torch.randn(\n", |
| 808 | + " B, H, W, W, D, device=\"cuda\", dtype=torch.float16, requires_grad=True\n", |
| 809 | + " )\n", |
| 810 | + " gradOut = torch.randn(B, H, W, W, D, device=\"cuda\", dtype=torch.float16)\n", |
| 811 | + " \n", |
| 812 | + " naive_results = run_natten(mask=naive[0], encoder=naive[1], decoder=naive[2], query=query, key=key, value=value, gradOut=gradOut, print_mask=print_mask)\n", |
| 813 | + " tiled_results = run_natten(mask=tiled[0], encoder=tiled[1], decoder=tiled[2], query=query, key=key, value=value, gradOut=gradOut, print_mask=print_mask)\n", |
| 814 | + " \n", |
| 815 | + " if not skip_correctness:\n", |
| 816 | + " for naive, tiled in zip(naive_results, tiled_results):\n", |
| 817 | + " torch.testing.assert_close(naive, tiled, atol=1e-1, rtol=1e-2)\n", |
| 818 | + "\n", |
| 819 | + " print(\"Correctness check passed ✅\")\n", |
| 820 | + "\n", |
| 821 | + " # Clean up to save memory\n", |
| 822 | + " del query, key, value, gradOut, naive_results, tiled_results\n", |
| 823 | + " torch.cuda.empty_cache()\n", |
| 824 | + "\n", |
| 825 | + "test_natten_masks(\n", |
| 826 | + " naive=[natten_mask, None, None],\n", |
| 827 | + " tiled=[tiled_natten_mask, get_idx_tiled, get_x_y_tiled],\n", |
| 828 | + ")" |
661 | 829 | ]
662 | 830 | },
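The gather/scatter bookkeeping in `run_natten` is the easy part to get wrong, so a small self-check may help. The sketch below is illustrative only; it assumes `get_idx_tiled` and `get_x_y_tiled` from the tiled-layout cell are in scope and shows that permuting a row-major tensor into tiled order and then applying the `depermuter` recovers the original layout, mirroring how outputs and gradients are re-ordered above.

```python
# Hedged sketch: the permute / de-permute pair used by run_natten is a round trip.
import torch

W = 128
perm_x, perm_y = get_x_y_tiled(torch.arange(W * W))          # tiled order -> 2-D coords
i_x = torch.arange(W)[:, None].broadcast_to(W, W).flatten()  # row-major row indices
i_y = torch.arange(W)[None, :].broadcast_to(W, W).flatten()  # row-major col indices
depermuter = get_idx_tiled(i_x, i_y)                         # 2-D coords -> tiled order

t = torch.randn(W, W)
tiled_order = t[perm_x, perm_y]                  # gather into the tiled layout
restored = tiled_order[depermuter].reshape(W, W) # scatter back to row-major
assert torch.equal(restored, t)
print("permute/de-permute round-trip OK")
```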
663 | 831 | {