From 17f1f8b816efb1085f4a64e5a4820a6e13a8f88a Mon Sep 17 00:00:00 2001
From: Jakub Kuderski
Date: Wed, 2 Oct 2024 17:19:10 -0400
Subject: [PATCH] Drop mlir files (#6)

These files are generated by the benchmark frameworks, so there is no need to keep them stored in git. (A sketch of the corresponding .gitignore change follows the file list below.)
---
 .gitignore | 6 +- .../attention_128x1024x128x128x1024xf16.mlir | 26 ---- ...tion_128x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_128x1024x64x64x1024xf16.mlir | 26 ---- ...ention_128x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- ...attention_128x16384x128x128x16384xf16.mlir | 26 ---- ...on_128x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_128x16384x64x64x16384xf16.mlir | 26 ---- ...tion_128x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_128x2048x128x128x2048xf16.mlir | 26 ---- ...tion_128x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_128x2048x64x64x2048xf16.mlir | 26 ---- ...ention_128x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_128x4096x128x128x4096xf16.mlir | 26 ---- ...tion_128x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_128x4096x64x64x4096xf16.mlir | 26 ---- ...ention_128x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_128x8192x128x128x8192xf16.mlir | 26 ---- ...tion_128x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_128x8192x64x64x8192xf16.mlir | 26 ---- ...ention_128x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_12x384x64x64x384xf16.mlir | 26 ---- ...attention_12x384x64x64x384xf8E4M3FNUZ.mlir | 25 --- .../attention_16x1024x128x128x1024xf16.mlir | 26 ---- ...ntion_16x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_16x1024x64x64x1024xf16.mlir | 26 ---- ...tention_16x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_16x16384x128x128x16384xf16.mlir | 26 ---- ...ion_16x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_16x16384x64x64x16384xf16.mlir | 26 ---- ...ntion_16x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_16x2048x128x128x2048xf16.mlir | 26 ---- ...ntion_16x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_16x2048x64x64x2048xf16.mlir | 26 ---- ...tention_16x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_16x4096x128x128x4096xf16.mlir | 26 ---- ...ntion_16x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_16x4096x64x64x4096xf16.mlir | 26 ---- ...tention_16x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_16x8192x128x128x8192xf16.mlir | 26 ---- ...ntion_16x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_16x8192x64x64x8192xf16.mlir | 26 ---- ...tention_16x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_192x1024x128x128x1024xf16.mlir | 26 ---- ...tion_192x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_192x1024x64x64x1024xf16.mlir | 26 ---- ...ention_192x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- ...attention_192x16384x128x128x16384xf16.mlir | 26 ---- ...on_192x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_192x16384x64x64x16384xf16.mlir | 26 ---- ...tion_192x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_192x2048x128x128x2048xf16.mlir | 26 ---- ...tion_192x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_192x2048x64x64x2048xf16.mlir | 26 ---- ...ention_192x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_192x4096x128x128x4096xf16.mlir | 26 ---- ...tion_192x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_192x4096x64x64x4096xf16.mlir | 26 ---- ...ention_192x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_192x8192x128x128x8192xf16.mlir | 26 ---- ...tion_192x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 ---
.../attention_192x8192x64x64x8192xf16.mlir | 26 ---- ...ention_192x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_1x1024x128x128x1024xf16.mlir | 26 ---- ...ention_1x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_1x1024x64x64x1024xf16.mlir | 26 ---- ...ttention_1x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_1x16384x128x128x16384xf16.mlir | 26 ---- ...tion_1x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_1x16384x64x64x16384xf16.mlir | 26 ---- ...ention_1x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_1x2048x128x128x2048xf16.mlir | 26 ---- ...ention_1x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_1x2048x64x64x2048xf16.mlir | 26 ---- ...ttention_1x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_1x4096x128x128x4096xf16.mlir | 26 ---- ...ention_1x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_1x4096x64x64x4096xf16.mlir | 26 ---- ...ttention_1x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_1x4096x64x64x64xf16.mlir | 26 ---- .../attention_1x4096x64x64x64xf8E4M3FNUZ.mlir | 25 --- .../attention_1x8192x128x128x8192xf16.mlir | 26 ---- ...ention_1x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_1x8192x64x64x8192xf16.mlir | 26 ---- ...ttention_1x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_20x4096x64x64x4096xf16.mlir | 26 ---- ...tention_20x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_20x4096x64x64x64xf16.mlir | 26 ---- ...attention_20x4096x64x64x64xf8E4M3FNUZ.mlir | 25 --- .../attention_2x1024x128x128x1024xf16.mlir | 26 ---- ...ention_2x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_2x1024x64x64x1024xf16.mlir | 26 ---- ...ttention_2x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_2x1024x64x64x64xf16.mlir | 26 ---- .../attention_2x1024x64x64x64xf8E4M3FNUZ.mlir | 25 --- .../attention_2x16384x128x128x16384xf16.mlir | 26 ---- ...tion_2x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_2x16384x64x64x16384xf16.mlir | 26 ---- ...ention_2x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_2x2048x128x128x2048xf16.mlir | 26 ---- ...ention_2x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_2x2048x64x64x2048xf16.mlir | 26 ---- ...ttention_2x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_2x4096x128x128x4096xf16.mlir | 26 ---- ...ention_2x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_2x4096x64x64x4096xf16.mlir | 26 ---- ...ttention_2x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_2x8192x128x128x8192xf16.mlir | 26 ---- ...ention_2x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_2x8192x64x64x8192xf16.mlir | 26 ---- ...ttention_2x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_32x1024x128x128x1024xf16.mlir | 26 ---- ...ntion_32x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_32x1024x64x64x1024xf16.mlir | 26 ---- ...tention_32x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_32x16384x128x128x16384xf16.mlir | 26 ---- ...ion_32x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_32x16384x64x64x16384xf16.mlir | 26 ---- ...ntion_32x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_32x2048x128x128x2048xf16.mlir | 26 ---- ...ntion_32x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_32x2048x64x64x2048xf16.mlir | 26 ---- ...tention_32x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_32x4096x128x128x4096xf16.mlir | 26 ---- ...ntion_32x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_32x4096x64x64x4096xf16.mlir | 26 ---- 
...tention_32x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_32x8192x128x128x8192xf16.mlir | 26 ---- ...ntion_32x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_32x8192x64x64x8192xf16.mlir | 26 ---- ...tention_32x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_40x1024x64x64x1024xf16.mlir | 26 ---- ...tention_40x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_40x1024x64x64x64xf16.mlir | 26 ---- ...attention_40x1024x64x64x64xf8E4M3FNUZ.mlir | 25 --- .../attention_48x1024x128x128x1024xf16.mlir | 26 ---- ...ntion_48x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_48x1024x64x64x1024xf16.mlir | 26 ---- ...tention_48x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_48x16384x128x128x16384xf16.mlir | 26 ---- ...ion_48x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_48x16384x64x64x16384xf16.mlir | 26 ---- ...ntion_48x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_48x2048x128x128x2048xf16.mlir | 26 ---- ...ntion_48x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_48x2048x64x64x2048xf16.mlir | 26 ---- ...tention_48x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_48x4096x128x128x4096xf16.mlir | 26 ---- ...ntion_48x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_48x4096x64x64x4096xf16.mlir | 26 ---- ...tention_48x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_48x8192x128x128x8192xf16.mlir | 26 ---- ...ntion_48x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_48x8192x64x64x8192xf16.mlir | 26 ---- ...tention_48x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_4x1024x128x128x1024xf16.mlir | 26 ---- ...ention_4x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_4x1024x64x64x1024xf16.mlir | 26 ---- ...ttention_4x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_4x16384x128x128x16384xf16.mlir | 26 ---- ...tion_4x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_4x16384x64x64x16384xf16.mlir | 26 ---- ...ention_4x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_4x2048x128x128x2048xf16.mlir | 26 ---- ...ention_4x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_4x2048x64x64x2048xf16.mlir | 26 ---- ...ttention_4x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_4x4096x128x128x4096xf16.mlir | 26 ---- ...ention_4x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_4x4096x64x64x4096xf16.mlir | 26 ---- ...ttention_4x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_4x4096x64x64x64xf16.mlir | 26 ---- .../attention_4x4096x64x64x64xf8E4M3FNUZ.mlir | 25 --- .../attention_4x8192x128x128x8192xf16.mlir | 26 ---- ...ention_4x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_4x8192x64x64x8192xf16.mlir | 26 ---- ...ttention_4x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_64x1024x128x128x1024xf16.mlir | 26 ---- ...ntion_64x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_64x1024x64x64x1024xf16.mlir | 26 ---- ...tention_64x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_64x16384x128x128x16384xf16.mlir | 26 ---- ...ion_64x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_64x16384x64x64x16384xf16.mlir | 26 ---- ...ntion_64x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_64x2048x128x128x2048xf16.mlir | 26 ---- ...ntion_64x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_64x2048x64x64x2048xf16.mlir | 26 ---- ...tention_64x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_64x4096x128x128x4096xf16.mlir | 26 ---- ...ntion_64x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- 
.../attention_64x4096x64x64x4096xf16.mlir | 26 ---- ...tention_64x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_64x8192x128x128x8192xf16.mlir | 26 ---- ...ntion_64x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_64x8192x64x64x8192xf16.mlir | 26 ---- ...tention_64x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_768x4096x64x64x64xf16.mlir | 26 ---- ...ttention_768x4096x64x64x64xf8E4M3FNUZ.mlir | 25 --- .../attention_8x1024x128x128x1024xf16.mlir | 26 ---- ...ention_8x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_8x1024x64x64x1024xf16.mlir | 26 ---- ...ttention_8x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_8x1024x64x64x64xf16.mlir | 26 ---- .../attention_8x1024x64x64x64xf8E4M3FNUZ.mlir | 25 --- .../attention_8x16384x128x128x16384xf16.mlir | 26 ---- ...tion_8x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_8x16384x64x64x16384xf16.mlir | 26 ---- ...ention_8x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_8x2048x128x128x2048xf16.mlir | 26 ---- ...ention_8x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_8x2048x64x64x2048xf16.mlir | 26 ---- ...ttention_8x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_8x4096x128x128x4096xf16.mlir | 26 ---- ...ention_8x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_8x4096x64x64x4096xf16.mlir | 26 ---- ...ttention_8x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_8x8192x128x128x8192xf16.mlir | 26 ---- ...ention_8x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../mlir/attention_8x8192x64x64x8192xf16.mlir | 26 ---- ...ttention_8x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_96x1024x128x128x1024xf16.mlir | 26 ---- ...ntion_96x1024x128x128x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_96x1024x64x64x1024xf16.mlir | 26 ---- ...tention_96x1024x64x64x1024xf8E4M3FNUZ.mlir | 25 --- .../attention_96x16384x128x128x16384xf16.mlir | 26 ---- ...ion_96x16384x128x128x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_96x16384x64x64x16384xf16.mlir | 26 ---- ...ntion_96x16384x64x64x16384xf8E4M3FNUZ.mlir | 25 --- .../attention_96x2048x128x128x2048xf16.mlir | 26 ---- ...ntion_96x2048x128x128x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_96x2048x64x64x2048xf16.mlir | 26 ---- ...tention_96x2048x64x64x2048xf8E4M3FNUZ.mlir | 25 --- .../attention_96x4096x128x128x4096xf16.mlir | 26 ---- ...ntion_96x4096x128x128x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_96x4096x64x64x4096xf16.mlir | 26 ---- ...tention_96x4096x64x64x4096xf8E4M3FNUZ.mlir | 25 --- .../attention_96x8192x128x128x8192xf16.mlir | 26 ---- ...ntion_96x8192x128x128x8192xf8E4M3FNUZ.mlir | 25 --- .../attention_96x8192x64x64x8192xf16.mlir | 26 ---- ...tention_96x8192x64x64x8192xf8E4M3FNUZ.mlir | 25 --- ...x112x112x64x7x7x3_f32xf32xf32_stride2.mlir | 7 - ...4x14x1024x1x1x512_f32xf32xf32_stride2.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride1.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride2.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride1.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride2.mlir | 7 - ...28x28x512x1x1x256_f32xf32xf32_stride2.mlir | 7 - ...6x56x56x64x3x3x64_f32xf32xf32_stride1.mlir | 7 - ...7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir | 7 - ...6x7x7x512x3x3x512_f32xf32xf32_stride1.mlir | 7 - ...6x7x7x512x3x3x512_f32xf32xf32_stride2.mlir | 7 - ...x112x112x64x7x7x3_f32xf32xf32_stride2.mlir | 7 - ...4x14x1024x1x1x512_f32xf32xf32_stride2.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride1.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride2.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride1.mlir | 7 - 
...28x28x128x3x3x128_f32xf32xf32_stride2.mlir | 7 - ...28x28x512x1x1x256_f32xf32xf32_stride2.mlir | 7 - ...1x56x56x64x3x3x64_f32xf32xf32_stride1.mlir | 7 - ..._1x56x56x64x7x7x3_f32xf32xf32_stride1.mlir | 8 - ...7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir | 7 - ...1x7x7x512x3x3x512_f32xf32xf32_stride1.mlir | 7 - ...1x7x7x512x3x3x512_f32xf32xf32_stride2.mlir | 7 - ...x112x112x64x7x7x3_f32xf32xf32_stride2.mlir | 7 - ...4x14x1024x1x1x512_f32xf32xf32_stride2.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride1.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride2.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride1.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride2.mlir | 7 - ...28x28x512x1x1x256_f32xf32xf32_stride2.mlir | 7 - ...2x56x56x64x3x3x64_f32xf32xf32_stride1.mlir | 7 - ...7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir | 7 - ...2x7x7x512x3x3x512_f32xf32xf32_stride1.mlir | 7 - ...2x7x7x512x3x3x512_f32xf32xf32_stride2.mlir | 7 - ...x112x112x64x7x7x3_f32xf32xf32_stride2.mlir | 7 - ...4x14x1024x1x1x512_f32xf32xf32_stride2.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride1.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride2.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride1.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride2.mlir | 7 - ...28x28x512x1x1x256_f32xf32xf32_stride2.mlir | 7 - ...2x56x56x64x3x3x64_f32xf32xf32_stride1.mlir | 7 - ...7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir | 7 - ...2x7x7x512x3x3x512_f32xf32xf32_stride1.mlir | 7 - ...2x7x7x512x3x3x512_f32xf32xf32_stride2.mlir | 7 - ...x112x112x64x7x7x3_f32xf32xf32_stride2.mlir | 7 - ...4x14x1024x1x1x512_f32xf32xf32_stride2.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride1.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride2.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride1.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride2.mlir | 7 - ...28x28x512x1x1x256_f32xf32xf32_stride2.mlir | 7 - ...8x56x56x64x3x3x64_f32xf32xf32_stride1.mlir | 7 - ...7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir | 7 - ...8x7x7x512x3x3x512_f32xf32xf32_stride1.mlir | 7 - ...8x7x7x512x3x3x512_f32xf32xf32_stride2.mlir | 7 - ...x112x112x64x7x7x3_f32xf32xf32_stride2.mlir | 7 - ...4x14x1024x1x1x512_f32xf32xf32_stride2.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride1.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride2.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride1.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride2.mlir | 7 - ...28x28x512x1x1x256_f32xf32xf32_stride2.mlir | 7 - ...4x56x56x64x3x3x64_f32xf32xf32_stride1.mlir | 7 - ...7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir | 7 - ...4x7x7x512x3x3x512_f32xf32xf32_stride1.mlir | 7 - ...4x7x7x512x3x3x512_f32xf32xf32_stride2.mlir | 7 - ...x112x112x64x7x7x3_f32xf32xf32_stride2.mlir | 7 - ...4x14x1024x1x1x512_f32xf32xf32_stride2.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride1.mlir | 7 - ...14x14x256x3x3x256_f32xf32xf32_stride2.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride1.mlir | 7 - ...28x28x128x3x3x128_f32xf32xf32_stride2.mlir | 7 - ...28x28x512x1x1x256_f32xf32xf32_stride2.mlir | 7 - ...8x56x56x64x3x3x64_f32xf32xf32_stride1.mlir | 7 - ...7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir | 7 - ...8x7x7x512x3x3x512_f32xf32xf32_stride1.mlir | 7 - ...8x7x7x512x3x3x512_f32xf32xf32_stride2.mlir | 7 - ...16x112x112x64x7x7x3_i8xi8xi32_stride2.mlir | 8 - ...x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir | 8 - ...6x14x14x256x3x3x256_i8xi8xi32_stride1.mlir | 8 - ...6x14x14x256x3x3x256_i8xi8xi32_stride2.mlir | 8 - ...6x28x28x128x3x3x128_i8xi8xi32_stride1.mlir | 8 - ...6x28x28x128x3x3x128_i8xi8xi32_stride2.mlir | 8 - 
...6x28x28x512x1x1x256_i8xi8xi32_stride2.mlir | 8 - ..._16x56x56x64x3x3x64_i8xi8xi32_stride1.mlir | 8 - ...6x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir | 8 - ..._16x7x7x512x3x3x512_i8xi8xi32_stride1.mlir | 8 - ..._16x7x7x512x3x3x512_i8xi8xi32_stride2.mlir | 8 - ..._1x112x112x64x7x7x3_i8xi8xi32_stride2.mlir | 8 - ...x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir | 8 - ...1x14x14x256x3x3x256_i8xi8xi32_stride1.mlir | 8 - ...1x14x14x256x3x3x256_i8xi8xi32_stride2.mlir | 8 - ...1x28x28x128x3x3x128_i8xi8xi32_stride1.mlir | 8 - ...1x28x28x128x3x3x128_i8xi8xi32_stride2.mlir | 8 - ...1x28x28x512x1x1x256_i8xi8xi32_stride2.mlir | 8 - ...q_1x56x56x64x3x3x64_i8xi8xi32_stride1.mlir | 8 - ...1x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir | 8 - ...q_1x7x7x512x3x3x512_i8xi8xi32_stride1.mlir | 8 - ...q_1x7x7x512x3x3x512_i8xi8xi32_stride2.mlir | 8 - ..._2x112x112x64x7x7x3_i8xi8xi32_stride2.mlir | 8 - ...x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir | 8 - ...2x14x14x256x3x3x256_i8xi8xi32_stride1.mlir | 8 - ...2x14x14x256x3x3x256_i8xi8xi32_stride2.mlir | 8 - ...2x28x28x128x3x3x128_i8xi8xi32_stride1.mlir | 8 - ...2x28x28x128x3x3x128_i8xi8xi32_stride2.mlir | 8 - ...2x28x28x512x1x1x256_i8xi8xi32_stride2.mlir | 8 - ...q_2x56x56x64x3x3x64_i8xi8xi32_stride1.mlir | 8 - ...2x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir | 8 - ...q_2x7x7x512x3x3x512_i8xi8xi32_stride1.mlir | 8 - ...q_2x7x7x512x3x3x512_i8xi8xi32_stride2.mlir | 8 - ...32x112x112x64x7x7x3_i8xi8xi32_stride2.mlir | 8 - ...x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir | 8 - ...2x14x14x256x3x3x256_i8xi8xi32_stride1.mlir | 8 - ...2x14x14x256x3x3x256_i8xi8xi32_stride2.mlir | 8 - ...2x28x28x128x3x3x128_i8xi8xi32_stride1.mlir | 8 - ...2x28x28x128x3x3x128_i8xi8xi32_stride2.mlir | 8 - ...2x28x28x512x1x1x256_i8xi8xi32_stride2.mlir | 8 - ..._32x56x56x64x3x3x64_i8xi8xi32_stride1.mlir | 8 - ...2x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir | 8 - ..._32x7x7x512x3x3x512_i8xi8xi32_stride1.mlir | 8 - ..._32x7x7x512x3x3x512_i8xi8xi32_stride2.mlir | 8 - ...48x112x112x64x7x7x3_i8xi8xi32_stride2.mlir | 8 - ...x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir | 8 - ...8x14x14x256x3x3x256_i8xi8xi32_stride1.mlir | 8 - ...8x14x14x256x3x3x256_i8xi8xi32_stride2.mlir | 8 - ...8x28x28x128x3x3x128_i8xi8xi32_stride1.mlir | 8 - ...8x28x28x128x3x3x128_i8xi8xi32_stride2.mlir | 8 - ...8x28x28x512x1x1x256_i8xi8xi32_stride2.mlir | 8 - ..._48x56x56x64x3x3x64_i8xi8xi32_stride1.mlir | 8 - ...8x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir | 8 - ..._48x7x7x512x3x3x512_i8xi8xi32_stride1.mlir | 8 - ..._48x7x7x512x3x3x512_i8xi8xi32_stride2.mlir | 8 - ..._4x112x112x64x7x7x3_i8xi8xi32_stride2.mlir | 8 - ...x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir | 8 - ...4x14x14x256x3x3x256_i8xi8xi32_stride1.mlir | 8 - ...4x14x14x256x3x3x256_i8xi8xi32_stride2.mlir | 8 - ...4x28x28x128x3x3x128_i8xi8xi32_stride1.mlir | 8 - ...4x28x28x128x3x3x128_i8xi8xi32_stride2.mlir | 8 - ...4x28x28x512x1x1x256_i8xi8xi32_stride2.mlir | 8 - ...q_4x56x56x64x3x3x64_i8xi8xi32_stride1.mlir | 8 - ...4x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir | 8 - ...q_4x7x7x512x3x3x512_i8xi8xi32_stride1.mlir | 8 - ...q_4x7x7x512x3x3x512_i8xi8xi32_stride2.mlir | 8 - ..._8x112x112x64x7x7x3_i8xi8xi32_stride2.mlir | 8 - ...x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir | 8 - ...8x14x14x256x3x3x256_i8xi8xi32_stride1.mlir | 8 - ...8x14x14x256x3x3x256_i8xi8xi32_stride2.mlir | 8 - ...8x28x28x128x3x3x128_i8xi8xi32_stride1.mlir | 8 - ...8x28x28x128x3x3x128_i8xi8xi32_stride2.mlir | 8 - ...8x28x28x512x1x1x256_i8xi8xi32_stride2.mlir | 8 - ...q_8x56x56x64x3x3x64_i8xi8xi32_stride1.mlir | 8 - 
...8x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir | 8 - ...q_8x7x7x512x3x3x512_i8xi8xi32_stride1.mlir | 8 - ...q_8x7x7x512x3x3x512_i8xi8xi32_stride2.mlir | 8 - gemm/mlir/gemm_10240_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_10240_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_10240_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_10240_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_10240_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_10240_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_10240_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_10240_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_10240_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_10240_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_10240_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_10240_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_1024_5120_640_f16_tB.mlir | 145 ------------------ gemm/mlir/gemm_1280_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1280_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_1280_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1280_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_1280_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1280_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_1280_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1280_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_1280_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1280_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_1280_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1280_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_128_1280_2048_bf16.mlir | 9 -- gemm/mlir/gemm_128_1280_2048_bf16_tA.mlir | 10 -- gemm/mlir/gemm_128_1280_2048_bf16_tB.mlir | 10 -- gemm/mlir/gemm_128_1280_2048_f16.mlir | 9 -- gemm/mlir/gemm_128_1280_2048_f16_tA.mlir | 10 -- gemm/mlir/gemm_128_1280_2048_f16_tB.mlir | 144 ----------------- gemm/mlir/gemm_13824_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_13824_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_13824_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_13824_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_13824_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_13824_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_13824_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_13824_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_13824_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_13824_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_13824_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_13824_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_14336_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_14336_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_14336_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_14336_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_14336_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_14336_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_14336_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_14336_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_14336_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_14336_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_14336_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_14336_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_15360_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_15360_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_15360_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_15360_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_15360_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_15360_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_15360_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_15360_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_15360_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_15360_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_15360_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_15360_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_16_5120_f16_tA.mlir | 10 -- 
gemm/mlir/gemm_16000_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_16000_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_16000_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_1920_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1920_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_1920_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1920_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_1920_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1920_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_1920_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1920_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_1920_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1920_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_1920_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_1920_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_2048_10240_1280_bf16.mlir | 9 -- gemm/mlir/gemm_2048_10240_1280_bf16_tA.mlir | 10 -- gemm/mlir/gemm_2048_10240_1280_bf16_tB.mlir | 10 -- gemm/mlir/gemm_2048_10240_1280_f16.mlir | 9 -- gemm/mlir/gemm_2048_10240_1280_f16_tA.mlir | 10 -- gemm/mlir/gemm_2048_10240_1280_f16_tB.mlir | 145 ------------------ gemm/mlir/gemm_2048_1280_1280_bf16.mlir | 9 -- gemm/mlir/gemm_2048_1280_1280_bf16_tA.mlir | 10 -- gemm/mlir/gemm_2048_1280_1280_bf16_tB.mlir | 10 -- gemm/mlir/gemm_2048_1280_1280_f16.mlir | 9 -- gemm/mlir/gemm_2048_1280_1280_f16_tA.mlir | 10 -- gemm/mlir/gemm_2048_1280_1280_f16_tB.mlir | 145 ------------------ gemm/mlir/gemm_2048_1280_5120_bf16.mlir | 9 -- gemm/mlir/gemm_2048_1280_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_2048_1280_5120_bf16_tB.mlir | 10 -- gemm/mlir/gemm_2048_1280_5120_f16.mlir | 9 -- gemm/mlir/gemm_2048_1280_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_2048_1280_5120_f16_tB.mlir | 145 ------------------ gemm/mlir/gemm_2048_2048_1024_f16.mlir | 9 -- gemm/mlir/gemm_2048_2048_65536_f16.mlir | 9 -- gemm/mlir/gemm_2048_2048_8192_f16.mlir | 9 -- gemm/mlir/gemm_2048_8192_1024_f16.mlir | 9 -- gemm/mlir/gemm_2048_8192_65536_f16.mlir | 9 -- gemm/mlir/gemm_2048_8192_8192_f16.mlir | 9 -- gemm/mlir/gemm_2560_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_2560_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_2560_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_2560_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_2560_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_2560_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_2560_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_2560_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_2560_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_2560_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_2560_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_2560_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_27648_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_27648_16_5120_f16_tA.mlir | 10 -- 
gemm/mlir/gemm_27648_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_27648_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_27648_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_27648_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_27648_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_27648_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_27648_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_27648_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_27648_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_27648_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_28672_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_28672_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_28672_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_28672_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_28672_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_28672_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_28672_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_28672_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_28672_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_28672_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_28672_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_28672_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_2_1280_8192_bf16_tB.mlir | 10 -- gemm/mlir/gemm_2_3584_8192_bf16_tB.mlir | 10 -- gemm/mlir/gemm_2_7168_8192_bf16_tB.mlir | 10 -- gemm/mlir/gemm_32000_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_32000_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_32000_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_3456_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3456_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3456_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3456_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3456_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3456_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3456_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3456_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3456_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3456_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3456_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3456_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3840_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3840_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3840_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3840_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3840_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3840_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3840_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3840_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3840_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3840_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_3840_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_3840_8_5120_f16_tA.mlir | 10 -- 
gemm/mlir/gemm_4000_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_4000_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4000_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_4096_20480_2560_f16_tB.mlir | 145 ------------------ gemm/mlir/gemm_4096_4096_8192_bf16.mlir | 9 -- gemm/mlir/gemm_4096_4096_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_4096_4096_8192_bf16_tB.mlir | 10 -- gemm/mlir/gemm_4096_4096_8192_f16.mlir | 9 -- gemm/mlir/gemm_4096_4096_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_4096_4096_8192_f16_tB.mlir | 10 -- gemm/mlir/gemm_5120_16_1280_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_1280_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_13824_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_13824_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_1728_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_1728_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_2560_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_2560_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_3456_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_3456_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_640_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_640_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_6912_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_6912_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_1280_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_1280_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_13824_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_13824_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_1728_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_1728_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_2560_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_2560_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_3456_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_3456_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_640_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_640_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_6912_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_6912_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_1280_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_1280_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_13824_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_13824_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_1728_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_1728_f16_tA.mlir | 10 -- 
gemm/mlir/gemm_5120_2_2560_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_2560_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_3456_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_3456_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_640_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_640_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_6912_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_6912_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_1280_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_1280_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_13824_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_13824_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_1728_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_1728_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_2560_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_2560_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_3456_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_3456_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_640_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_640_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_6912_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_6912_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_1280_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_1280_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_13824_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_13824_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_1728_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_1728_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_2560_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_2560_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_3456_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_3456_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_640_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_640_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_6912_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_6912_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_1280_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_1280_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_13824_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_13824_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_1728_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_1728_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_2560_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_2560_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_3456_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_3456_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_640_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_640_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_6912_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_6912_f16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_5120_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_57344_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_57344_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_57344_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_57344_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_57344_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_57344_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_57344_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_57344_32_8192_f16_tA.mlir | 10 -- 
gemm/mlir/gemm_57344_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_57344_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_57344_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_57344_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_6912_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_6912_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_6912_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_6912_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_6912_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_6912_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_6912_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_6912_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_6912_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_6912_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_6912_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_6912_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_7168_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7168_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_7168_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7168_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_7168_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7168_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_7168_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7168_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_7168_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7168_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_7168_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7168_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_7680_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7680_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_7680_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7680_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_7680_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7680_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_7680_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7680_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_7680_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7680_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_7680_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_7680_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_16_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_16_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_1_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_1_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_2_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_2_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_32_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_32_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_4_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_4_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_8_5120_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_8_5120_f16_tA.mlir | 10 -- gemm/mlir/gemm_8000_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8000_8_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_1024_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_1024_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_14336_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_14336_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_2048_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_2048_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_28672_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_28672_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_3584_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_3584_f16_tA.mlir | 10 -- 
gemm/mlir/gemm_8192_16_4096_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_4096_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_7168_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_7168_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_16_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_1024_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_1024_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_14336_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_14336_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_2048_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_2048_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_28672_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_28672_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_3584_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_3584_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_4096_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_4096_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_7168_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_7168_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_1_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2048_1024_f16.mlir | 9 -- gemm/mlir/gemm_8192_2048_65536_f16.mlir | 9 -- gemm/mlir/gemm_8192_2048_8192_f16.mlir | 9 -- gemm/mlir/gemm_8192_2_1024_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_1024_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_14336_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_14336_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_2048_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_2048_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_28672_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_28672_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_3584_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_3584_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_4096_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_4096_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_7168_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_7168_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_2_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_1024_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_1024_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_14336_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_14336_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_2048_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_2048_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_28672_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_28672_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_3584_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_3584_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_4096_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_4096_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_7168_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_7168_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_32_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_1024_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_1024_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_14336_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_14336_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_2048_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_2048_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_28672_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_28672_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_3584_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_3584_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_4096_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_4096_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_7168_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_7168_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_4_8192_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_5120_640_bf16.mlir | 9 -- 
gemm/mlir/gemm_8192_5120_640_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_5120_640_bf16_tB.mlir | 10 -- gemm/mlir/gemm_8192_5120_640_f16.mlir | 9 -- gemm/mlir/gemm_8192_5120_640_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_5120_640_f16_tB.mlir | 145 ------------------ gemm/mlir/gemm_8192_8192_1024_f16.mlir | 9 -- gemm/mlir/gemm_8192_8192_65536_f16.mlir | 9 -- gemm/mlir/gemm_8192_8192_8192_f16.mlir | 9 -- gemm/mlir/gemm_8192_8_1024_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_1024_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_14336_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_14336_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_2048_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_2048_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_28672_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_28672_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_3584_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_3584_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_4096_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_4096_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_7168_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_7168_f16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_8192_bf16_tA.mlir | 10 -- gemm/mlir/gemm_8192_8_8192_f16_tA.mlir | 10 -- 929 files changed, 3 insertions(+), 13536 deletions(-) delete mode 100644 attention/mlir/attention_128x1024x128x128x1024xf16.mlir delete mode 100644 attention/mlir/attention_128x1024x128x128x1024xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_128x1024x64x64x1024xf16.mlir delete mode 100644 attention/mlir/attention_128x1024x64x64x1024xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_128x16384x128x128x16384xf16.mlir delete mode 100644 attention/mlir/attention_128x16384x128x128x16384xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_128x16384x64x64x16384xf16.mlir delete mode 100644 attention/mlir/attention_128x16384x64x64x16384xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_128x2048x128x128x2048xf16.mlir delete mode 100644 attention/mlir/attention_128x2048x128x128x2048xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_128x2048x64x64x2048xf16.mlir delete mode 100644 attention/mlir/attention_128x2048x64x64x2048xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_128x4096x128x128x4096xf16.mlir delete mode 100644 attention/mlir/attention_128x4096x128x128x4096xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_128x4096x64x64x4096xf16.mlir delete mode 100644 attention/mlir/attention_128x4096x64x64x4096xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_128x8192x128x128x8192xf16.mlir delete mode 100644 attention/mlir/attention_128x8192x128x128x8192xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_128x8192x64x64x8192xf16.mlir delete mode 100644 attention/mlir/attention_128x8192x64x64x8192xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_12x384x64x64x384xf16.mlir delete mode 100644 attention/mlir/attention_12x384x64x64x384xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_16x1024x128x128x1024xf16.mlir delete mode 100644 attention/mlir/attention_16x1024x128x128x1024xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_16x1024x64x64x1024xf16.mlir delete mode 100644 attention/mlir/attention_16x1024x64x64x1024xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_16x16384x128x128x16384xf16.mlir delete mode 100644 attention/mlir/attention_16x16384x128x128x16384xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_16x16384x64x64x16384xf16.mlir delete mode 100644 attention/mlir/attention_16x16384x64x64x16384xf8E4M3FNUZ.mlir delete mode 
100644 attention/mlir/attention_16x2048x128x128x2048xf16.mlir delete mode 100644 attention/mlir/attention_16x2048x128x128x2048xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_16x2048x64x64x2048xf16.mlir delete mode 100644 attention/mlir/attention_16x2048x64x64x2048xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_16x4096x128x128x4096xf16.mlir delete mode 100644 attention/mlir/attention_16x4096x128x128x4096xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_16x4096x64x64x4096xf16.mlir delete mode 100644 attention/mlir/attention_16x4096x64x64x4096xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_16x8192x128x128x8192xf16.mlir delete mode 100644 attention/mlir/attention_16x8192x128x128x8192xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_16x8192x64x64x8192xf16.mlir delete mode 100644 attention/mlir/attention_16x8192x64x64x8192xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x1024x128x128x1024xf16.mlir delete mode 100644 attention/mlir/attention_192x1024x128x128x1024xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x1024x64x64x1024xf16.mlir delete mode 100644 attention/mlir/attention_192x1024x64x64x1024xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x16384x128x128x16384xf16.mlir delete mode 100644 attention/mlir/attention_192x16384x128x128x16384xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x16384x64x64x16384xf16.mlir delete mode 100644 attention/mlir/attention_192x16384x64x64x16384xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x2048x128x128x2048xf16.mlir delete mode 100644 attention/mlir/attention_192x2048x128x128x2048xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x2048x64x64x2048xf16.mlir delete mode 100644 attention/mlir/attention_192x2048x64x64x2048xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x4096x128x128x4096xf16.mlir delete mode 100644 attention/mlir/attention_192x4096x128x128x4096xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x4096x64x64x4096xf16.mlir delete mode 100644 attention/mlir/attention_192x4096x64x64x4096xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x8192x128x128x8192xf16.mlir delete mode 100644 attention/mlir/attention_192x8192x128x128x8192xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_192x8192x64x64x8192xf16.mlir delete mode 100644 attention/mlir/attention_192x8192x64x64x8192xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_1x1024x128x128x1024xf16.mlir delete mode 100644 attention/mlir/attention_1x1024x128x128x1024xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_1x1024x64x64x1024xf16.mlir delete mode 100644 attention/mlir/attention_1x1024x64x64x1024xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_1x16384x128x128x16384xf16.mlir delete mode 100644 attention/mlir/attention_1x16384x128x128x16384xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_1x16384x64x64x16384xf16.mlir delete mode 100644 attention/mlir/attention_1x16384x64x64x16384xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_1x2048x128x128x2048xf16.mlir delete mode 100644 attention/mlir/attention_1x2048x128x128x2048xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_1x2048x64x64x2048xf16.mlir delete mode 100644 attention/mlir/attention_1x2048x64x64x2048xf8E4M3FNUZ.mlir delete mode 100644 attention/mlir/attention_1x4096x128x128x4096xf16.mlir delete mode 100644 attention/mlir/attention_1x4096x128x128x4096xf8E4M3FNUZ.mlir delete mode 100644 
attention/mlir/attention_1x4096x64x64x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_1x4096x64x64x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_1x4096x64x64x64xf16.mlir
 delete mode 100644 attention/mlir/attention_1x4096x64x64x64xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_1x8192x128x128x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_1x8192x128x128x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_1x8192x64x64x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_1x8192x64x64x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_20x4096x64x64x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_20x4096x64x64x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_20x4096x64x64x64xf16.mlir
 delete mode 100644 attention/mlir/attention_20x4096x64x64x64xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x1024x128x128x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_2x1024x128x128x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x1024x64x64x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_2x1024x64x64x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x1024x64x64x64xf16.mlir
 delete mode 100644 attention/mlir/attention_2x1024x64x64x64xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x16384x128x128x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_2x16384x128x128x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x16384x64x64x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_2x16384x64x64x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x2048x128x128x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_2x2048x128x128x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x2048x64x64x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_2x2048x64x64x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x4096x128x128x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_2x4096x128x128x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x4096x64x64x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_2x4096x64x64x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x8192x128x128x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_2x8192x128x128x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_2x8192x64x64x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_2x8192x64x64x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x1024x128x128x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_32x1024x128x128x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x1024x64x64x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_32x1024x64x64x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x16384x128x128x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_32x16384x128x128x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x16384x64x64x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_32x16384x64x64x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x2048x128x128x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_32x2048x128x128x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x2048x64x64x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_32x2048x64x64x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x4096x128x128x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_32x4096x128x128x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x4096x64x64x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_32x4096x64x64x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x8192x128x128x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_32x8192x128x128x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_32x8192x64x64x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_32x8192x64x64x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_40x1024x64x64x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_40x1024x64x64x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_40x1024x64x64x64xf16.mlir
 delete mode 100644 attention/mlir/attention_40x1024x64x64x64xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x1024x128x128x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_48x1024x128x128x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x1024x64x64x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_48x1024x64x64x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x16384x128x128x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_48x16384x128x128x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x16384x64x64x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_48x16384x64x64x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x2048x128x128x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_48x2048x128x128x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x2048x64x64x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_48x2048x64x64x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x4096x128x128x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_48x4096x128x128x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x4096x64x64x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_48x4096x64x64x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x8192x128x128x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_48x8192x128x128x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_48x8192x64x64x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_48x8192x64x64x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x1024x128x128x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_4x1024x128x128x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x1024x64x64x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_4x1024x64x64x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x16384x128x128x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_4x16384x128x128x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x16384x64x64x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_4x16384x64x64x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x2048x128x128x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_4x2048x128x128x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x2048x64x64x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_4x2048x64x64x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x4096x128x128x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_4x4096x128x128x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x4096x64x64x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_4x4096x64x64x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x4096x64x64x64xf16.mlir
 delete mode 100644 attention/mlir/attention_4x4096x64x64x64xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x8192x128x128x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_4x8192x128x128x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_4x8192x64x64x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_4x8192x64x64x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x1024x128x128x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_64x1024x128x128x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x1024x64x64x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_64x1024x64x64x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x16384x128x128x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_64x16384x128x128x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x16384x64x64x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_64x16384x64x64x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x2048x128x128x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_64x2048x128x128x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x2048x64x64x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_64x2048x64x64x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x4096x128x128x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_64x4096x128x128x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x4096x64x64x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_64x4096x64x64x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x8192x128x128x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_64x8192x128x128x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_64x8192x64x64x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_64x8192x64x64x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_768x4096x64x64x64xf16.mlir
 delete mode 100644 attention/mlir/attention_768x4096x64x64x64xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x1024x128x128x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_8x1024x128x128x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x1024x64x64x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_8x1024x64x64x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x1024x64x64x64xf16.mlir
 delete mode 100644 attention/mlir/attention_8x1024x64x64x64xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x16384x128x128x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_8x16384x128x128x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x16384x64x64x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_8x16384x64x64x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x2048x128x128x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_8x2048x128x128x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x2048x64x64x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_8x2048x64x64x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x4096x128x128x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_8x4096x128x128x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x4096x64x64x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_8x4096x64x64x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x8192x128x128x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_8x8192x128x128x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_8x8192x64x64x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_8x8192x64x64x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x1024x128x128x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_96x1024x128x128x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x1024x64x64x1024xf16.mlir
 delete mode 100644 attention/mlir/attention_96x1024x64x64x1024xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x16384x128x128x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_96x16384x128x128x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x16384x64x64x16384xf16.mlir
 delete mode 100644 attention/mlir/attention_96x16384x64x64x16384xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x2048x128x128x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_96x2048x128x128x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x2048x64x64x2048xf16.mlir
 delete mode 100644 attention/mlir/attention_96x2048x64x64x2048xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x4096x128x128x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_96x4096x128x128x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x4096x64x64x4096xf16.mlir
 delete mode 100644 attention/mlir/attention_96x4096x64x64x4096xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x8192x128x128x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_96x8192x128x128x8192xf8E4M3FNUZ.mlir
 delete mode 100644 attention/mlir/attention_96x8192x64x64x8192xf16.mlir
 delete mode 100644 attention/mlir/attention_96x8192x64x64x8192xf8E4M3FNUZ.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x56x56x64x7x7x3_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
 delete mode 100644 conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
 delete mode 100644 gemm/mlir/gemm_10240_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_10240_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1024_5120_640_f16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_1280_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1280_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_128_1280_2048_bf16.mlir
 delete mode 100644 gemm/mlir/gemm_128_1280_2048_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_128_1280_2048_bf16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_128_1280_2048_f16.mlir
 delete mode 100644 gemm/mlir/gemm_128_1280_2048_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_128_1280_2048_f16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_13824_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_13824_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_14336_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_15360_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_16000_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_1920_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2048_10240_1280_bf16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_10240_1280_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2048_10240_1280_bf16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_2048_10240_1280_f16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_10240_1280_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2048_10240_1280_f16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_1280_bf16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_1280_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_1280_bf16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_1280_f16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_1280_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_1280_f16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_5120_bf16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_5120_bf16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_5120_f16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2048_1280_5120_f16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_2048_2048_1024_f16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_2048_65536_f16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_2048_8192_f16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_8192_1024_f16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_8192_65536_f16.mlir
 delete mode 100644 gemm/mlir/gemm_2048_8192_8192_f16.mlir
 delete mode 100644 gemm/mlir/gemm_2560_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2560_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_27648_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_28672_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_2_1280_8192_bf16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_2_3584_8192_bf16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_2_7168_8192_bf16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_32000_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_32000_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3456_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_3840_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4000_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4096_20480_2560_f16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_4096_4096_8192_bf16.mlir
 delete mode 100644 gemm/mlir/gemm_4096_4096_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4096_4096_8192_bf16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_4096_4096_8192_f16.mlir
 delete mode 100644 gemm/mlir/gemm_4096_4096_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_4096_4096_8192_f16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_1280_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_1280_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_13824_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_13824_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_1728_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_1728_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_2560_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_2560_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_3456_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_3456_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_640_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_640_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_6912_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_6912_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_1280_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_1280_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_13824_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_13824_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_1728_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_1728_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_2560_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_2560_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_3456_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_3456_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_640_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_640_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_6912_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_6912_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_1280_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_1280_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_13824_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_13824_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_1728_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_1728_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_2560_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_2560_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_3456_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_3456_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_640_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_640_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_6912_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_6912_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_1280_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_1280_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_13824_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_13824_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_1728_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_1728_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_2560_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_2560_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_3456_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_3456_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_640_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_640_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_6912_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_6912_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_1280_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_1280_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_13824_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_13824_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_1728_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_1728_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_2560_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_2560_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_3456_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_3456_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_640_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_640_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_6912_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_6912_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_1280_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_1280_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_13824_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_13824_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_1728_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_1728_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_2560_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_2560_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_3456_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_3456_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_640_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_640_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_6912_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_6912_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_5120_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_57344_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_6912_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7168_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_7680_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_16_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_16_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_1_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_1_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_2_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_2_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_32_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_32_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_4_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_4_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_8_5120_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_8_5120_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8000_8_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_1024_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_1024_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_14336_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_14336_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_2048_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_2048_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_28672_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_28672_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_3584_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_3584_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_4096_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_4096_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_7168_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_7168_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_16_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_1024_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_1024_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_14336_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_14336_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_2048_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_2048_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_28672_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_28672_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_3584_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_3584_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_4096_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_4096_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_7168_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_7168_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_1_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2048_1024_f16.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2048_65536_f16.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2048_8192_f16.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_1024_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_1024_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_14336_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_14336_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_2048_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_2048_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_28672_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_28672_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_3584_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_3584_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_4096_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_4096_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_7168_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_7168_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_2_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_1024_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_1024_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_14336_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_14336_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_2048_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_2048_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_28672_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_28672_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_3584_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_3584_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_4096_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_4096_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_7168_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_7168_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_32_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_1024_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_1024_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_14336_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_14336_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_2048_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_2048_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_28672_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_28672_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_3584_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_3584_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_4096_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_4096_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_7168_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_7168_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_4_8192_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_5120_640_bf16.mlir
 delete mode 100644 gemm/mlir/gemm_8192_5120_640_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_5120_640_bf16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_8192_5120_640_f16.mlir
 delete mode 100644 gemm/mlir/gemm_8192_5120_640_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_5120_640_f16_tB.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8192_1024_f16.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8192_65536_f16.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8192_8192_f16.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_1024_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_1024_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_14336_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_14336_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_2048_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_2048_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_28672_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_28672_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_3584_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_3584_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_4096_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_4096_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_7168_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_7168_f16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_8192_bf16_tA.mlir
 delete mode 100644 gemm/mlir/gemm_8192_8_8192_f16_tA.mlir

diff --git a/.gitignore b/.gitignore
index b42c298..5c0612d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,7 @@ wheelhouse
 bench_venv/
 
 # Bench Artifacts
-gemm/vmfb/
-attention/vmfb/
-conv/vmfb/
+attention/
+conv/
+gemm/
 results/
diff --git a/attention/mlir/attention_128x1024x128x128x1024xf16.mlir b/attention/mlir/attention_128x1024x128x128x1024xf16.mlir
deleted file mode 100644
index f4cf768..0000000
--- a/attention/mlir/attention_128x1024x128x128x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x1024x128xf16>
-!K = tensor<128x1024x128xf16>
-!V = tensor<128x1024x128xf16>
-!O = tensor<128x1024x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x1024x128x128x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index 1e07e22..0000000
--- a/attention/mlir/attention_128x1024x128x128x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x1024x128xf8E4M3FNUZ>
-!K = tensor<128x1024x128xf8E4M3FNUZ>
-!V = tensor<128x1024x128xf8E4M3FNUZ>
-!O = tensor<128x1024x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x1024x64x64x1024xf16.mlir b/attention/mlir/attention_128x1024x64x64x1024xf16.mlir
deleted file mode 100644
index 7b8f32e..0000000
--- a/attention/mlir/attention_128x1024x64x64x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x1024x64xf16>
-!K = tensor<128x1024x64xf16>
-!V = tensor<128x1024x64xf16>
-!O = tensor<128x1024x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x1024x64x64x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index 743e8bc..0000000
--- a/attention/mlir/attention_128x1024x64x64x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x1024x64xf8E4M3FNUZ>
-!K = tensor<128x1024x64xf8E4M3FNUZ>
-!V = tensor<128x1024x64xf8E4M3FNUZ>
-!O = tensor<128x1024x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x16384x128x128x16384xf16.mlir b/attention/mlir/attention_128x16384x128x128x16384xf16.mlir
deleted file mode 100644
index 9b26573..0000000
--- a/attention/mlir/attention_128x16384x128x128x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x16384x128xf16>
-!K = tensor<128x16384x128xf16>
-!V = tensor<128x16384x128xf16>
-!O = tensor<128x16384x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x16384x128x128x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index cf96125..0000000
--- a/attention/mlir/attention_128x16384x128x128x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x16384x128xf8E4M3FNUZ>
-!K = tensor<128x16384x128xf8E4M3FNUZ>
-!V = tensor<128x16384x128xf8E4M3FNUZ>
-!O = tensor<128x16384x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x16384x64x64x16384xf16.mlir b/attention/mlir/attention_128x16384x64x64x16384xf16.mlir
deleted file mode 100644
index 1961900..0000000
--- a/attention/mlir/attention_128x16384x64x64x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x16384x64xf16>
-!K = tensor<128x16384x64xf16>
-!V = tensor<128x16384x64xf16>
-!O = tensor<128x16384x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x16384x64x64x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index be49c2e..0000000
--- a/attention/mlir/attention_128x16384x64x64x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x16384x64xf8E4M3FNUZ>
-!K = tensor<128x16384x64xf8E4M3FNUZ>
-!V = tensor<128x16384x64xf8E4M3FNUZ>
-!O = tensor<128x16384x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x2048x128x128x2048xf16.mlir b/attention/mlir/attention_128x2048x128x128x2048xf16.mlir
deleted file mode 100644
index 0d60d01..0000000
--- a/attention/mlir/attention_128x2048x128x128x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x2048x128xf16>
-!K = tensor<128x2048x128xf16>
-!V = tensor<128x2048x128xf16>
-!O = tensor<128x2048x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x2048x128x128x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index 68d7e5f..0000000
--- a/attention/mlir/attention_128x2048x128x128x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x2048x128xf8E4M3FNUZ>
-!K = tensor<128x2048x128xf8E4M3FNUZ>
-!V = tensor<128x2048x128xf8E4M3FNUZ>
-!O = tensor<128x2048x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x2048x64x64x2048xf16.mlir b/attention/mlir/attention_128x2048x64x64x2048xf16.mlir
deleted file mode 100644
index e4ac23f..0000000
--- a/attention/mlir/attention_128x2048x64x64x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x2048x64xf16>
-!K = tensor<128x2048x64xf16>
-!V = tensor<128x2048x64xf16>
-!O = tensor<128x2048x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x2048x64x64x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index 57c6543..0000000
--- a/attention/mlir/attention_128x2048x64x64x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x2048x64xf8E4M3FNUZ>
-!K = tensor<128x2048x64xf8E4M3FNUZ>
-!V = tensor<128x2048x64xf8E4M3FNUZ>
-!O = tensor<128x2048x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x4096x128x128x4096xf16.mlir b/attention/mlir/attention_128x4096x128x128x4096xf16.mlir
deleted file mode 100644
index a3af00a..0000000
--- a/attention/mlir/attention_128x4096x128x128x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x4096x128xf16>
-!K = tensor<128x4096x128xf16>
-!V = tensor<128x4096x128xf16>
-!O = tensor<128x4096x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x4096x128x128x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index c4a680d..0000000
--- a/attention/mlir/attention_128x4096x128x128x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x4096x128xf8E4M3FNUZ>
-!K = tensor<128x4096x128xf8E4M3FNUZ>
-!V = tensor<128x4096x128xf8E4M3FNUZ>
-!O = tensor<128x4096x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x4096x64x64x4096xf16.mlir b/attention/mlir/attention_128x4096x64x64x4096xf16.mlir
deleted file mode 100644
index 4c7aa7b..0000000
--- a/attention/mlir/attention_128x4096x64x64x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x4096x64xf16>
-!K = tensor<128x4096x64xf16>
-!V = tensor<128x4096x64xf16>
-!O = tensor<128x4096x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x4096x64x64x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index 2e612dc..0000000
--- a/attention/mlir/attention_128x4096x64x64x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x4096x64xf8E4M3FNUZ>
-!K = tensor<128x4096x64xf8E4M3FNUZ>
-!V = tensor<128x4096x64xf8E4M3FNUZ>
-!O = tensor<128x4096x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x8192x128x128x8192xf16.mlir b/attention/mlir/attention_128x8192x128x128x8192xf16.mlir
deleted file mode 100644
index 1dda46d..0000000
--- a/attention/mlir/attention_128x8192x128x128x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x8192x128xf16>
-!K = tensor<128x8192x128xf16>
-!V = tensor<128x8192x128xf16>
-!O = tensor<128x8192x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x8192x128x128x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index fa1b3e0..0000000
--- a/attention/mlir/attention_128x8192x128x128x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x8192x128xf8E4M3FNUZ>
-!K = tensor<128x8192x128xf8E4M3FNUZ>
-!V = tensor<128x8192x128xf8E4M3FNUZ>
-!O = tensor<128x8192x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x8192x64x64x8192xf16.mlir b/attention/mlir/attention_128x8192x64x64x8192xf16.mlir
deleted file mode 100644
index 2158d02..0000000
--- a/attention/mlir/attention_128x8192x64x64x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<128x8192x64xf16>
-!K = tensor<128x8192x64xf16>
-!V = tensor<128x8192x64xf16>
-!O = tensor<128x8192x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_128x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_128x8192x64x64x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index 54339bd..0000000
--- a/attention/mlir/attention_128x8192x64x64x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<128x8192x64xf8E4M3FNUZ>
-!K = tensor<128x8192x64xf8E4M3FNUZ>
-!V = tensor<128x8192x64xf8E4M3FNUZ>
-!O = tensor<128x8192x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_12x384x64x64x384xf16.mlir b/attention/mlir/attention_12x384x64x64x384xf16.mlir
deleted file mode 100644
index b005a07..0000000
--- a/attention/mlir/attention_12x384x64x64x384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<12x384x64xf16>
-!K = tensor<12x384x64xf16>
-!V = tensor<12x384x64xf16>
-!O = tensor<12x384x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_12x384x64x64x384xf8E4M3FNUZ.mlir b/attention/mlir/attention_12x384x64x64x384xf8E4M3FNUZ.mlir
deleted file mode 100644
index 377da3e..0000000
--- a/attention/mlir/attention_12x384x64x64x384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<12x384x64xf8E4M3FNUZ>
-!K = tensor<12x384x64xf8E4M3FNUZ>
-!V = tensor<12x384x64xf8E4M3FNUZ>
-!O = tensor<12x384x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x1024x128x128x1024xf16.mlir b/attention/mlir/attention_16x1024x128x128x1024xf16.mlir
deleted file mode 100644
index 4623470..0000000
--- a/attention/mlir/attention_16x1024x128x128x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x1024x128xf16>
-!K = tensor<16x1024x128xf16>
-!V = tensor<16x1024x128xf16>
-!O = tensor<16x1024x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x1024x128x128x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index 0b6bded..0000000
--- a/attention/mlir/attention_16x1024x128x128x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x1024x128xf8E4M3FNUZ>
-!K = tensor<16x1024x128xf8E4M3FNUZ>
-!V = tensor<16x1024x128xf8E4M3FNUZ>
-!O = tensor<16x1024x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x1024x64x64x1024xf16.mlir b/attention/mlir/attention_16x1024x64x64x1024xf16.mlir
deleted file mode 100644
index 50c9a78..0000000
--- a/attention/mlir/attention_16x1024x64x64x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x1024x64xf16>
-!K = tensor<16x1024x64xf16>
-!V = tensor<16x1024x64xf16>
-!O = tensor<16x1024x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x1024x64x64x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index 86ef7f7..0000000
--- a/attention/mlir/attention_16x1024x64x64x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x1024x64xf8E4M3FNUZ>
-!K = tensor<16x1024x64xf8E4M3FNUZ>
-!V = tensor<16x1024x64xf8E4M3FNUZ>
-!O = tensor<16x1024x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x16384x128x128x16384xf16.mlir b/attention/mlir/attention_16x16384x128x128x16384xf16.mlir
deleted file mode 100644
index f8ade10..0000000
--- a/attention/mlir/attention_16x16384x128x128x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x16384x128xf16>
-!K = tensor<16x16384x128xf16>
-!V = tensor<16x16384x128xf16>
-!O = tensor<16x16384x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x16384x128x128x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index 9e46f03..0000000
--- a/attention/mlir/attention_16x16384x128x128x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x16384x128xf8E4M3FNUZ>
-!K = tensor<16x16384x128xf8E4M3FNUZ>
-!V = tensor<16x16384x128xf8E4M3FNUZ>
-!O = tensor<16x16384x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x16384x64x64x16384xf16.mlir b/attention/mlir/attention_16x16384x64x64x16384xf16.mlir
deleted file mode 100644
index 047fb68..0000000
--- a/attention/mlir/attention_16x16384x64x64x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x16384x64xf16>
-!K = tensor<16x16384x64xf16>
-!V = tensor<16x16384x64xf16>
-!O = tensor<16x16384x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x16384x64x64x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index 6c876c0..0000000
--- a/attention/mlir/attention_16x16384x64x64x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x16384x64xf8E4M3FNUZ>
-!K = tensor<16x16384x64xf8E4M3FNUZ>
-!V = tensor<16x16384x64xf8E4M3FNUZ>
-!O = tensor<16x16384x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x2048x128x128x2048xf16.mlir b/attention/mlir/attention_16x2048x128x128x2048xf16.mlir
deleted file mode 100644
index ea249c7..0000000
--- a/attention/mlir/attention_16x2048x128x128x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x2048x128xf16>
-!K = tensor<16x2048x128xf16>
-!V = tensor<16x2048x128xf16>
-!O = tensor<16x2048x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x2048x128x128x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index 12d772f..0000000
--- a/attention/mlir/attention_16x2048x128x128x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x2048x128xf8E4M3FNUZ>
-!K = tensor<16x2048x128xf8E4M3FNUZ>
-!V = tensor<16x2048x128xf8E4M3FNUZ>
-!O = tensor<16x2048x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x2048x64x64x2048xf16.mlir b/attention/mlir/attention_16x2048x64x64x2048xf16.mlir
deleted file mode 100644
index b73d4a8..0000000
--- a/attention/mlir/attention_16x2048x64x64x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x2048x64xf16>
-!K = tensor<16x2048x64xf16>
-!V = tensor<16x2048x64xf16>
-!O = tensor<16x2048x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x2048x64x64x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index a965fae..0000000
--- a/attention/mlir/attention_16x2048x64x64x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x2048x64xf8E4M3FNUZ>
-!K = tensor<16x2048x64xf8E4M3FNUZ>
-!V = tensor<16x2048x64xf8E4M3FNUZ>
-!O = tensor<16x2048x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x4096x128x128x4096xf16.mlir b/attention/mlir/attention_16x4096x128x128x4096xf16.mlir
deleted file mode 100644
index 754a68d..0000000
--- a/attention/mlir/attention_16x4096x128x128x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x4096x128xf16>
-!K = tensor<16x4096x128xf16>
-!V = tensor<16x4096x128xf16>
-!O = tensor<16x4096x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x4096x128x128x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index 2fc1642..0000000
--- a/attention/mlir/attention_16x4096x128x128x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x4096x128xf8E4M3FNUZ>
-!K = tensor<16x4096x128xf8E4M3FNUZ>
-!V = tensor<16x4096x128xf8E4M3FNUZ>
-!O = tensor<16x4096x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x4096x64x64x4096xf16.mlir b/attention/mlir/attention_16x4096x64x64x4096xf16.mlir
deleted file mode 100644
index c673c2c..0000000
--- a/attention/mlir/attention_16x4096x64x64x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x4096x64xf16>
-!K = tensor<16x4096x64xf16>
-!V = tensor<16x4096x64xf16>
-!O = tensor<16x4096x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x4096x64x64x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index 7a71898..0000000
--- a/attention/mlir/attention_16x4096x64x64x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x4096x64xf8E4M3FNUZ>
-!K = tensor<16x4096x64xf8E4M3FNUZ>
-!V = tensor<16x4096x64xf8E4M3FNUZ>
-!O = tensor<16x4096x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x8192x128x128x8192xf16.mlir b/attention/mlir/attention_16x8192x128x128x8192xf16.mlir
deleted file mode 100644
index e9642ad..0000000
--- a/attention/mlir/attention_16x8192x128x128x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x8192x128xf16>
-!K = tensor<16x8192x128xf16>
-!V = tensor<16x8192x128xf16>
-!O = tensor<16x8192x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x8192x128x128x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index 773130f..0000000
--- a/attention/mlir/attention_16x8192x128x128x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x8192x128xf8E4M3FNUZ>
-!K = tensor<16x8192x128xf8E4M3FNUZ>
-!V = tensor<16x8192x128xf8E4M3FNUZ>
-!O = tensor<16x8192x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x8192x64x64x8192xf16.mlir b/attention/mlir/attention_16x8192x64x64x8192xf16.mlir
deleted file mode 100644
index b7134c4..0000000
--- a/attention/mlir/attention_16x8192x64x64x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<16x8192x64xf16>
-!K = tensor<16x8192x64xf16>
-!V = tensor<16x8192x64xf16>
-!O = tensor<16x8192x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_16x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_16x8192x64x64x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index 385f513..0000000
--- a/attention/mlir/attention_16x8192x64x64x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<16x8192x64xf8E4M3FNUZ>
-!K = tensor<16x8192x64xf8E4M3FNUZ>
-!V = tensor<16x8192x64xf8E4M3FNUZ>
-!O = tensor<16x8192x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x1024x128x128x1024xf16.mlir b/attention/mlir/attention_192x1024x128x128x1024xf16.mlir
deleted file mode 100644
index d568d56..0000000
--- a/attention/mlir/attention_192x1024x128x128x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x1024x128xf16>
-!K = tensor<192x1024x128xf16>
-!V = tensor<192x1024x128xf16>
-!O = tensor<192x1024x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x1024x128x128x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index 198e2a9..0000000
--- a/attention/mlir/attention_192x1024x128x128x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x1024x128xf8E4M3FNUZ>
-!K = tensor<192x1024x128xf8E4M3FNUZ>
-!V = tensor<192x1024x128xf8E4M3FNUZ>
-!O = tensor<192x1024x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x1024x64x64x1024xf16.mlir b/attention/mlir/attention_192x1024x64x64x1024xf16.mlir
deleted file mode 100644
index 0ae8348..0000000
--- a/attention/mlir/attention_192x1024x64x64x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x1024x64xf16>
-!K = tensor<192x1024x64xf16>
-!V = tensor<192x1024x64xf16>
-!O = tensor<192x1024x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x1024x64x64x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index 5167cf9..0000000
--- a/attention/mlir/attention_192x1024x64x64x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x1024x64xf8E4M3FNUZ>
-!K = tensor<192x1024x64xf8E4M3FNUZ>
-!V = tensor<192x1024x64xf8E4M3FNUZ>
-!O = tensor<192x1024x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x16384x128x128x16384xf16.mlir b/attention/mlir/attention_192x16384x128x128x16384xf16.mlir
deleted file mode 100644
index 5a0016f..0000000
--- a/attention/mlir/attention_192x16384x128x128x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x16384x128xf16>
-!K = tensor<192x16384x128xf16>
-!V = tensor<192x16384x128xf16>
-!O = tensor<192x16384x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x16384x128x128x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index 4932489..0000000
--- a/attention/mlir/attention_192x16384x128x128x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x16384x128xf8E4M3FNUZ>
-!K = tensor<192x16384x128xf8E4M3FNUZ>
-!V = tensor<192x16384x128xf8E4M3FNUZ>
-!O = tensor<192x16384x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x16384x64x64x16384xf16.mlir b/attention/mlir/attention_192x16384x64x64x16384xf16.mlir
deleted file mode 100644
index e8d0bed..0000000
--- a/attention/mlir/attention_192x16384x64x64x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x16384x64xf16>
-!K = tensor<192x16384x64xf16>
-!V = tensor<192x16384x64xf16>
-!O = tensor<192x16384x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x16384x64x64x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index aa089b4..0000000
--- a/attention/mlir/attention_192x16384x64x64x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x16384x64xf8E4M3FNUZ>
-!K = tensor<192x16384x64xf8E4M3FNUZ>
-!V = tensor<192x16384x64xf8E4M3FNUZ>
-!O = tensor<192x16384x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x2048x128x128x2048xf16.mlir b/attention/mlir/attention_192x2048x128x128x2048xf16.mlir
deleted file mode 100644
index 3d248d7..0000000
--- a/attention/mlir/attention_192x2048x128x128x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x2048x128xf16>
-!K = tensor<192x2048x128xf16>
-!V = tensor<192x2048x128xf16>
-!O = tensor<192x2048x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x2048x128x128x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index ced5bb1..0000000
--- a/attention/mlir/attention_192x2048x128x128x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x2048x128xf8E4M3FNUZ>
-!K = tensor<192x2048x128xf8E4M3FNUZ>
-!V = tensor<192x2048x128xf8E4M3FNUZ>
-!O = tensor<192x2048x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x2048x64x64x2048xf16.mlir b/attention/mlir/attention_192x2048x64x64x2048xf16.mlir
deleted file mode 100644
index 95391b8..0000000
--- a/attention/mlir/attention_192x2048x64x64x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x2048x64xf16>
-!K = tensor<192x2048x64xf16>
-!V = tensor<192x2048x64xf16>
-!O = tensor<192x2048x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x2048x64x64x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index cd6006d..0000000
--- a/attention/mlir/attention_192x2048x64x64x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x2048x64xf8E4M3FNUZ>
-!K = tensor<192x2048x64xf8E4M3FNUZ>
-!V = tensor<192x2048x64xf8E4M3FNUZ>
-!O = tensor<192x2048x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x4096x128x128x4096xf16.mlir b/attention/mlir/attention_192x4096x128x128x4096xf16.mlir
deleted file mode 100644
index b33089f..0000000
--- a/attention/mlir/attention_192x4096x128x128x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x4096x128xf16>
-!K = tensor<192x4096x128xf16>
-!V = tensor<192x4096x128xf16>
-!O = tensor<192x4096x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x4096x128x128x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index c42edaa..0000000
--- a/attention/mlir/attention_192x4096x128x128x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x4096x128xf8E4M3FNUZ>
-!K = tensor<192x4096x128xf8E4M3FNUZ>
-!V = tensor<192x4096x128xf8E4M3FNUZ>
-!O = tensor<192x4096x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x4096x64x64x4096xf16.mlir b/attention/mlir/attention_192x4096x64x64x4096xf16.mlir
deleted file mode 100644
index 0a4df89..0000000
--- a/attention/mlir/attention_192x4096x64x64x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x4096x64xf16>
-!K = tensor<192x4096x64xf16>
-!V = tensor<192x4096x64xf16>
-!O = tensor<192x4096x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x4096x64x64x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index 7ba5e9a..0000000
--- a/attention/mlir/attention_192x4096x64x64x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x4096x64xf8E4M3FNUZ>
-!K = tensor<192x4096x64xf8E4M3FNUZ>
-!V = tensor<192x4096x64xf8E4M3FNUZ>
-!O = tensor<192x4096x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x8192x128x128x8192xf16.mlir b/attention/mlir/attention_192x8192x128x128x8192xf16.mlir
deleted file mode 100644
index 9f5cacd..0000000
--- a/attention/mlir/attention_192x8192x128x128x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x8192x128xf16>
-!K = tensor<192x8192x128xf16>
-!V = tensor<192x8192x128xf16>
-!O = tensor<192x8192x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x8192x128x128x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index f65c32f..0000000
--- a/attention/mlir/attention_192x8192x128x128x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x8192x128xf8E4M3FNUZ>
-!K = tensor<192x8192x128xf8E4M3FNUZ>
-!V = tensor<192x8192x128xf8E4M3FNUZ>
-!O = tensor<192x8192x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x8192x64x64x8192xf16.mlir b/attention/mlir/attention_192x8192x64x64x8192xf16.mlir
deleted file mode 100644
index dd75314..0000000
--- a/attention/mlir/attention_192x8192x64x64x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<192x8192x64xf16>
-!K = tensor<192x8192x64xf16>
-!V = tensor<192x8192x64xf16>
-!O = tensor<192x8192x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
- ,compilation_info = #tuning
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_192x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_192x8192x64x64x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index 6e02f7f..0000000
--- a/attention/mlir/attention_192x8192x64x64x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<192x8192x64xf8E4M3FNUZ>
-!K = tensor<192x8192x64xf8E4M3FNUZ>
-!V = tensor<192x8192x64xf8E4M3FNUZ>
-!O = tensor<192x8192x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
- %scale = arith.constant 1.0 : !dtype
- %empty = tensor.empty() : !O
- %O = iree_linalg_ext.attention
- { indexing_maps = [#Q, #K, #V, #S, #O]
-
- }
- ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
- outs(%empty : !O) -> !O
- return %O : !O
-}
diff --git a/attention/mlir/attention_1x1024x128x128x1024xf16.mlir b/attention/mlir/attention_1x1024x128x128x1024xf16.mlir
deleted file mode 100644
index d0033c3..0000000
--- a/attention/mlir/attention_1x1024x128x128x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<1x1024x128xf16>
-!K = tensor<1x1024x128xf16>
-!V = tensor<1x1024x128xf16>
-!O = tensor<1x1024x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
#iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_1x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x1024x128x128x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index 6f4262a..0000000 --- a/attention/mlir/attention_1x1024x128x128x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<1x1024x128xf8E4M3FNUZ> -!K = tensor<1x1024x128xf8E4M3FNUZ> -!V = tensor<1x1024x128xf8E4M3FNUZ> -!O = tensor<1x1024x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_1x1024x64x64x1024xf16.mlir b/attention/mlir/attention_1x1024x64x64x1024xf16.mlir deleted file mode 100644 index 801f7a1..0000000 --- a/attention/mlir/attention_1x1024x64x64x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<1x1024x64xf16> -!K = tensor<1x1024x64xf16> -!V = tensor<1x1024x64xf16> -!O = tensor<1x1024x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_1x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x1024x64x64x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index ff0265b..0000000 --- a/attention/mlir/attention_1x1024x64x64x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<1x1024x64xf8E4M3FNUZ> -!K = tensor<1x1024x64xf8E4M3FNUZ> -!V = tensor<1x1024x64xf8E4M3FNUZ> -!O = tensor<1x1024x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, 
k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_1x16384x128x128x16384xf16.mlir b/attention/mlir/attention_1x16384x128x128x16384xf16.mlir deleted file mode 100644 index 9de41f4..0000000 --- a/attention/mlir/attention_1x16384x128x128x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<1x16384x128xf16> -!K = tensor<1x16384x128xf16> -!V = tensor<1x16384x128xf16> -!O = tensor<1x16384x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_1x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x16384x128x128x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index e2a3332..0000000 --- a/attention/mlir/attention_1x16384x128x128x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<1x16384x128xf8E4M3FNUZ> -!K = tensor<1x16384x128xf8E4M3FNUZ> -!V = tensor<1x16384x128xf8E4M3FNUZ> -!O = tensor<1x16384x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_1x16384x64x64x16384xf16.mlir b/attention/mlir/attention_1x16384x64x64x16384xf16.mlir deleted file mode 100644 index c6c641c..0000000 --- a/attention/mlir/attention_1x16384x64x64x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<1x16384x64xf16> -!K = tensor<1x16384x64xf16> -!V = tensor<1x16384x64xf16> -!O = tensor<1x16384x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - 
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x16384x64x64x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index b1243c4..0000000
--- a/attention/mlir/attention_1x16384x64x64x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<1x16384x64xf8E4M3FNUZ>
-!K = tensor<1x16384x64xf8E4M3FNUZ>
-!V = tensor<1x16384x64xf8E4M3FNUZ>
-!O = tensor<1x16384x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x2048x128x128x2048xf16.mlir b/attention/mlir/attention_1x2048x128x128x2048xf16.mlir
deleted file mode 100644
index 33118e6..0000000
--- a/attention/mlir/attention_1x2048x128x128x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<1x2048x128xf16>
-!K = tensor<1x2048x128xf16>
-!V = tensor<1x2048x128xf16>
-!O = tensor<1x2048x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x2048x128x128x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index edc2893..0000000
--- a/attention/mlir/attention_1x2048x128x128x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<1x2048x128xf8E4M3FNUZ>
-!K = tensor<1x2048x128xf8E4M3FNUZ>
-!V = tensor<1x2048x128xf8E4M3FNUZ>
-!O = tensor<1x2048x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x2048x64x64x2048xf16.mlir b/attention/mlir/attention_1x2048x64x64x2048xf16.mlir
deleted file mode 100644
index af1fe5e..0000000
--- a/attention/mlir/attention_1x2048x64x64x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<1x2048x64xf16>
-!K = tensor<1x2048x64xf16>
-!V = tensor<1x2048x64xf16>
-!O = tensor<1x2048x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x2048x64x64x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index fc712b1..0000000
--- a/attention/mlir/attention_1x2048x64x64x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<1x2048x64xf8E4M3FNUZ>
-!K = tensor<1x2048x64xf8E4M3FNUZ>
-!V = tensor<1x2048x64xf8E4M3FNUZ>
-!O = tensor<1x2048x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x4096x128x128x4096xf16.mlir b/attention/mlir/attention_1x4096x128x128x4096xf16.mlir
deleted file mode 100644
index c65d072..0000000
--- a/attention/mlir/attention_1x4096x128x128x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<1x4096x128xf16>
-!K = tensor<1x4096x128xf16>
-!V = tensor<1x4096x128xf16>
-!O = tensor<1x4096x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x4096x128x128x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index 11503c2..0000000
--- a/attention/mlir/attention_1x4096x128x128x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<1x4096x128xf8E4M3FNUZ>
-!K = tensor<1x4096x128xf8E4M3FNUZ>
-!V = tensor<1x4096x128xf8E4M3FNUZ>
-!O = tensor<1x4096x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x4096x64x64x4096xf16.mlir b/attention/mlir/attention_1x4096x64x64x4096xf16.mlir
deleted file mode 100644
index ffb81b9..0000000
--- a/attention/mlir/attention_1x4096x64x64x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<1x4096x64xf16>
-!K = tensor<1x4096x64xf16>
-!V = tensor<1x4096x64xf16>
-!O = tensor<1x4096x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x4096x64x64x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index c06f231..0000000
--- a/attention/mlir/attention_1x4096x64x64x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<1x4096x64xf8E4M3FNUZ>
-!K = tensor<1x4096x64xf8E4M3FNUZ>
-!V = tensor<1x4096x64xf8E4M3FNUZ>
-!O = tensor<1x4096x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x4096x64x64x64xf16.mlir b/attention/mlir/attention_1x4096x64x64x64xf16.mlir
deleted file mode 100644
index f514c5f..0000000
--- a/attention/mlir/attention_1x4096x64x64x64xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<1x4096x64xf16>
-!K = tensor<1x64x64xf16>
-!V = tensor<1x64x64xf16>
-!O = tensor<1x4096x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x4096x64x64x64xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x4096x64x64x64xf8E4M3FNUZ.mlir
deleted file mode 100644
index 277080c..0000000
--- a/attention/mlir/attention_1x4096x64x64x64xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<1x4096x64xf8E4M3FNUZ>
-!K = tensor<1x64x64xf8E4M3FNUZ>
-!V = tensor<1x64x64xf8E4M3FNUZ>
-!O = tensor<1x4096x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x8192x128x128x8192xf16.mlir b/attention/mlir/attention_1x8192x128x128x8192xf16.mlir
deleted file mode 100644
index 8f95a6e..0000000
--- a/attention/mlir/attention_1x8192x128x128x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<1x8192x128xf16>
-!K = tensor<1x8192x128xf16>
-!V = tensor<1x8192x128xf16>
-!O = tensor<1x8192x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x8192x128x128x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index 296ea87..0000000
--- a/attention/mlir/attention_1x8192x128x128x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<1x8192x128xf8E4M3FNUZ>
-!K = tensor<1x8192x128xf8E4M3FNUZ>
-!V = tensor<1x8192x128xf8E4M3FNUZ>
-!O = tensor<1x8192x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x8192x64x64x8192xf16.mlir b/attention/mlir/attention_1x8192x64x64x8192xf16.mlir
deleted file mode 100644
index 8164497..0000000
--- a/attention/mlir/attention_1x8192x64x64x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<1x8192x64xf16>
-!K = tensor<1x8192x64xf16>
-!V = tensor<1x8192x64xf16>
-!O = tensor<1x8192x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_1x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_1x8192x64x64x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index ca010f7..0000000
--- a/attention/mlir/attention_1x8192x64x64x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<1x8192x64xf8E4M3FNUZ>
-!K = tensor<1x8192x64xf8E4M3FNUZ>
-!V = tensor<1x8192x64xf8E4M3FNUZ>
-!O = tensor<1x8192x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_20x4096x64x64x4096xf16.mlir b/attention/mlir/attention_20x4096x64x64x4096xf16.mlir
deleted file mode 100644
index 5902946..0000000
--- a/attention/mlir/attention_20x4096x64x64x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<20x4096x64xf16>
-!K = tensor<20x4096x64xf16>
-!V = tensor<20x4096x64xf16>
-!O = tensor<20x4096x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_20x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_20x4096x64x64x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index b1049ac..0000000
--- a/attention/mlir/attention_20x4096x64x64x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<20x4096x64xf8E4M3FNUZ>
-!K = tensor<20x4096x64xf8E4M3FNUZ>
-!V = tensor<20x4096x64xf8E4M3FNUZ>
-!O = tensor<20x4096x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_20x4096x64x64x64xf16.mlir b/attention/mlir/attention_20x4096x64x64x64xf16.mlir
deleted file mode 100644
index 125de49..0000000
--- a/attention/mlir/attention_20x4096x64x64x64xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<20x4096x64xf16>
-!K = tensor<20x64x64xf16>
-!V = tensor<20x64x64xf16>
-!O = tensor<20x4096x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_20x4096x64x64x64xf8E4M3FNUZ.mlir b/attention/mlir/attention_20x4096x64x64x64xf8E4M3FNUZ.mlir
deleted file mode 100644
index 9a419fa..0000000
--- a/attention/mlir/attention_20x4096x64x64x64xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<20x4096x64xf8E4M3FNUZ>
-!K = tensor<20x64x64xf8E4M3FNUZ>
-!V = tensor<20x64x64xf8E4M3FNUZ>
-!O = tensor<20x4096x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x1024x128x128x1024xf16.mlir b/attention/mlir/attention_2x1024x128x128x1024xf16.mlir
deleted file mode 100644
index 74b26d4..0000000
--- a/attention/mlir/attention_2x1024x128x128x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x1024x128xf16>
-!K = tensor<2x1024x128xf16>
-!V = tensor<2x1024x128xf16>
-!O = tensor<2x1024x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x1024x128x128x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index af9abef..0000000
--- a/attention/mlir/attention_2x1024x128x128x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x1024x128xf8E4M3FNUZ>
-!K = tensor<2x1024x128xf8E4M3FNUZ>
-!V = tensor<2x1024x128xf8E4M3FNUZ>
-!O = tensor<2x1024x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x1024x64x64x1024xf16.mlir b/attention/mlir/attention_2x1024x64x64x1024xf16.mlir
deleted file mode 100644
index 88d5950..0000000
--- a/attention/mlir/attention_2x1024x64x64x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x1024x64xf16>
-!K = tensor<2x1024x64xf16>
-!V = tensor<2x1024x64xf16>
-!O = tensor<2x1024x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x1024x64x64x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index e5c0053..0000000
--- a/attention/mlir/attention_2x1024x64x64x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x1024x64xf8E4M3FNUZ>
-!K = tensor<2x1024x64xf8E4M3FNUZ>
-!V = tensor<2x1024x64xf8E4M3FNUZ>
-!O = tensor<2x1024x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x1024x64x64x64xf16.mlir b/attention/mlir/attention_2x1024x64x64x64xf16.mlir
deleted file mode 100644
index 2bfed9b..0000000
--- a/attention/mlir/attention_2x1024x64x64x64xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x1024x64xf16>
-!K = tensor<2x64x64xf16>
-!V = tensor<2x64x64xf16>
-!O = tensor<2x1024x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x1024x64x64x64xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x1024x64x64x64xf8E4M3FNUZ.mlir
deleted file mode 100644
index 33facf8..0000000
--- a/attention/mlir/attention_2x1024x64x64x64xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x1024x64xf8E4M3FNUZ>
-!K = tensor<2x64x64xf8E4M3FNUZ>
-!V = tensor<2x64x64xf8E4M3FNUZ>
-!O = tensor<2x1024x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x16384x128x128x16384xf16.mlir b/attention/mlir/attention_2x16384x128x128x16384xf16.mlir
deleted file mode 100644
index 19d2a0b..0000000
--- a/attention/mlir/attention_2x16384x128x128x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x16384x128xf16>
-!K = tensor<2x16384x128xf16>
-!V = tensor<2x16384x128xf16>
-!O = tensor<2x16384x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x16384x128x128x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index 7183642..0000000
--- a/attention/mlir/attention_2x16384x128x128x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x16384x128xf8E4M3FNUZ>
-!K = tensor<2x16384x128xf8E4M3FNUZ>
-!V = tensor<2x16384x128xf8E4M3FNUZ>
-!O = tensor<2x16384x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x16384x64x64x16384xf16.mlir b/attention/mlir/attention_2x16384x64x64x16384xf16.mlir
deleted file mode 100644
index ce2a318..0000000
--- a/attention/mlir/attention_2x16384x64x64x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x16384x64xf16>
-!K = tensor<2x16384x64xf16>
-!V = tensor<2x16384x64xf16>
-!O = tensor<2x16384x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x16384x64x64x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index 4205d23..0000000
--- a/attention/mlir/attention_2x16384x64x64x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x16384x64xf8E4M3FNUZ>
-!K = tensor<2x16384x64xf8E4M3FNUZ>
-!V = tensor<2x16384x64xf8E4M3FNUZ>
-!O = tensor<2x16384x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x2048x128x128x2048xf16.mlir b/attention/mlir/attention_2x2048x128x128x2048xf16.mlir
deleted file mode 100644
index a9ee68d..0000000
--- a/attention/mlir/attention_2x2048x128x128x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x2048x128xf16>
-!K = tensor<2x2048x128xf16>
-!V = tensor<2x2048x128xf16>
-!O = tensor<2x2048x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x2048x128x128x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index b39d931..0000000
--- a/attention/mlir/attention_2x2048x128x128x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x2048x128xf8E4M3FNUZ>
-!K = tensor<2x2048x128xf8E4M3FNUZ>
-!V = tensor<2x2048x128xf8E4M3FNUZ>
-!O = tensor<2x2048x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x2048x64x64x2048xf16.mlir b/attention/mlir/attention_2x2048x64x64x2048xf16.mlir
deleted file mode 100644
index d5d1fff..0000000
--- a/attention/mlir/attention_2x2048x64x64x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x2048x64xf16>
-!K = tensor<2x2048x64xf16>
-!V = tensor<2x2048x64xf16>
-!O = tensor<2x2048x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x2048x64x64x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index 3b8d113..0000000
--- a/attention/mlir/attention_2x2048x64x64x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x2048x64xf8E4M3FNUZ>
-!K = tensor<2x2048x64xf8E4M3FNUZ>
-!V = tensor<2x2048x64xf8E4M3FNUZ>
-!O = tensor<2x2048x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x4096x128x128x4096xf16.mlir b/attention/mlir/attention_2x4096x128x128x4096xf16.mlir
deleted file mode 100644
index d150478..0000000
--- a/attention/mlir/attention_2x4096x128x128x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x4096x128xf16>
-!K = tensor<2x4096x128xf16>
-!V = tensor<2x4096x128xf16>
-!O = tensor<2x4096x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x4096x128x128x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index 838f730..0000000
--- a/attention/mlir/attention_2x4096x128x128x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x4096x128xf8E4M3FNUZ>
-!K = tensor<2x4096x128xf8E4M3FNUZ>
-!V = tensor<2x4096x128xf8E4M3FNUZ>
-!O = tensor<2x4096x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x4096x64x64x4096xf16.mlir b/attention/mlir/attention_2x4096x64x64x4096xf16.mlir
deleted file mode 100644
index f36c84d..0000000
--- a/attention/mlir/attention_2x4096x64x64x4096xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x4096x64xf16>
-!K = tensor<2x4096x64xf16>
-!V = tensor<2x4096x64xf16>
-!O = tensor<2x4096x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x4096x64x64x4096xf8E4M3FNUZ.mlir
deleted file mode 100644
index 6b96d01..0000000
--- a/attention/mlir/attention_2x4096x64x64x4096xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x4096x64xf8E4M3FNUZ>
-!K = tensor<2x4096x64xf8E4M3FNUZ>
-!V = tensor<2x4096x64xf8E4M3FNUZ>
-!O = tensor<2x4096x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x8192x128x128x8192xf16.mlir b/attention/mlir/attention_2x8192x128x128x8192xf16.mlir
deleted file mode 100644
index 91c7a35..0000000
--- a/attention/mlir/attention_2x8192x128x128x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x8192x128xf16>
-!K = tensor<2x8192x128xf16>
-!V = tensor<2x8192x128xf16>
-!O = tensor<2x8192x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x8192x128x128x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index 49022f9..0000000
--- a/attention/mlir/attention_2x8192x128x128x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x8192x128xf8E4M3FNUZ>
-!K = tensor<2x8192x128xf8E4M3FNUZ>
-!V = tensor<2x8192x128xf8E4M3FNUZ>
-!O = tensor<2x8192x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x8192x64x64x8192xf16.mlir b/attention/mlir/attention_2x8192x64x64x8192xf16.mlir
deleted file mode 100644
index 4a6854b..0000000
--- a/attention/mlir/attention_2x8192x64x64x8192xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<2x8192x64xf16>
-!K = tensor<2x8192x64xf16>
-!V = tensor<2x8192x64xf16>
-!O = tensor<2x8192x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_2x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_2x8192x64x64x8192xf8E4M3FNUZ.mlir
deleted file mode 100644
index 6c1be4a..0000000
--- a/attention/mlir/attention_2x8192x64x64x8192xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<2x8192x64xf8E4M3FNUZ>
-!K = tensor<2x8192x64xf8E4M3FNUZ>
-!V = tensor<2x8192x64xf8E4M3FNUZ>
-!O = tensor<2x8192x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x1024x128x128x1024xf16.mlir b/attention/mlir/attention_32x1024x128x128x1024xf16.mlir
deleted file mode 100644
index b49b18d..0000000
--- a/attention/mlir/attention_32x1024x128x128x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<32x1024x128xf16>
-!K = tensor<32x1024x128xf16>
-!V = tensor<32x1024x128xf16>
-!O = tensor<32x1024x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x1024x128x128x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index 42307f6..0000000
--- a/attention/mlir/attention_32x1024x128x128x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<32x1024x128xf8E4M3FNUZ>
-!K = tensor<32x1024x128xf8E4M3FNUZ>
-!V = tensor<32x1024x128xf8E4M3FNUZ>
-!O = tensor<32x1024x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x1024x64x64x1024xf16.mlir b/attention/mlir/attention_32x1024x64x64x1024xf16.mlir
deleted file mode 100644
index 368bdb0..0000000
--- a/attention/mlir/attention_32x1024x64x64x1024xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<32x1024x64xf16>
-!K = tensor<32x1024x64xf16>
-!V = tensor<32x1024x64xf16>
-!O = tensor<32x1024x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x1024x64x64x1024xf8E4M3FNUZ.mlir
deleted file mode 100644
index 2749162..0000000
--- a/attention/mlir/attention_32x1024x64x64x1024xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<32x1024x64xf8E4M3FNUZ>
-!K = tensor<32x1024x64xf8E4M3FNUZ>
-!V = tensor<32x1024x64xf8E4M3FNUZ>
-!O = tensor<32x1024x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x16384x128x128x16384xf16.mlir b/attention/mlir/attention_32x16384x128x128x16384xf16.mlir
deleted file mode 100644
index e8e0305..0000000
--- a/attention/mlir/attention_32x16384x128x128x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<32x16384x128xf16>
-!K = tensor<32x16384x128xf16>
-!V = tensor<32x16384x128xf16>
-!O = tensor<32x16384x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x16384x128x128x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index 678a5f4..0000000
--- a/attention/mlir/attention_32x16384x128x128x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<32x16384x128xf8E4M3FNUZ>
-!K = tensor<32x16384x128xf8E4M3FNUZ>
-!V = tensor<32x16384x128xf8E4M3FNUZ>
-!O = tensor<32x16384x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x16384x64x64x16384xf16.mlir b/attention/mlir/attention_32x16384x64x64x16384xf16.mlir
deleted file mode 100644
index f9a8903..0000000
--- a/attention/mlir/attention_32x16384x64x64x16384xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<32x16384x64xf16>
-!K = tensor<32x16384x64xf16>
-!V = tensor<32x16384x64xf16>
-!O = tensor<32x16384x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x16384x64x64x16384xf8E4M3FNUZ.mlir
deleted file mode 100644
index be3a447..0000000
--- a/attention/mlir/attention_32x16384x64x64x16384xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<32x16384x64xf8E4M3FNUZ>
-!K = tensor<32x16384x64xf8E4M3FNUZ>
-!V = tensor<32x16384x64xf8E4M3FNUZ>
-!O = tensor<32x16384x64xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x2048x128x128x2048xf16.mlir b/attention/mlir/attention_32x2048x128x128x2048xf16.mlir
deleted file mode 100644
index da02867..0000000
--- a/attention/mlir/attention_32x2048x128x128x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<32x2048x128xf16>
-!K = tensor<32x2048x128xf16>
-!V = tensor<32x2048x128xf16>
-!O = tensor<32x2048x128xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-     ,compilation_info = #tuning
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x2048x128x128x2048xf8E4M3FNUZ.mlir
deleted file mode 100644
index 3ab545a..0000000
--- a/attention/mlir/attention_32x2048x128x128x2048xf8E4M3FNUZ.mlir
+++ /dev/null
@@ -1,25 +0,0 @@
-!dtype = f8E4M3FNUZ
-!Q = tensor<32x2048x128xf8E4M3FNUZ>
-!K = tensor<32x2048x128xf8E4M3FNUZ>
-!V = tensor<32x2048x128xf8E4M3FNUZ>
-!O = tensor<32x2048x128xf8E4M3FNUZ>
-
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)>
-#S = affine_map<(b, m, n, k1, k2) -> ()>
-#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)>
-
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O {
-  %scale = arith.constant 1.0 : !dtype
-  %empty = tensor.empty() : !O
-  %O = iree_linalg_ext.attention
-    { indexing_maps = [#Q, #K, #V, #S, #O]
-
-    }
-    ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype)
-    outs(%empty : !O) -> !O
-  return %O : !O
-}
diff --git a/attention/mlir/attention_32x2048x64x64x2048xf16.mlir b/attention/mlir/attention_32x2048x64x64x2048xf16.mlir
deleted file mode 100644
index 8b4ec4f..0000000
--- a/attention/mlir/attention_32x2048x64x64x2048xf16.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-!dtype = f16
-!Q = tensor<32x2048x64xf16>
-!K = tensor<32x2048x64xf16>
-!V = tensor<32x2048x64xf16>
-!O = tensor<32x2048x64xf16>
-
-#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>>
-
-
-#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)>
-#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)>
-#V = 
affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_32x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x2048x64x64x2048xf8E4M3FNUZ.mlir deleted file mode 100644 index d256708..0000000 --- a/attention/mlir/attention_32x2048x64x64x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<32x2048x64xf8E4M3FNUZ> -!K = tensor<32x2048x64xf8E4M3FNUZ> -!V = tensor<32x2048x64xf8E4M3FNUZ> -!O = tensor<32x2048x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_32x4096x128x128x4096xf16.mlir b/attention/mlir/attention_32x4096x128x128x4096xf16.mlir deleted file mode 100644 index b12bd54..0000000 --- a/attention/mlir/attention_32x4096x128x128x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<32x4096x128xf16> -!K = tensor<32x4096x128xf16> -!V = tensor<32x4096x128xf16> -!O = tensor<32x4096x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_32x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x4096x128x128x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index b9f6c07..0000000 --- a/attention/mlir/attention_32x4096x128x128x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<32x4096x128xf8E4M3FNUZ> -!K = tensor<32x4096x128xf8E4M3FNUZ> -!V = tensor<32x4096x128xf8E4M3FNUZ> -!O = tensor<32x4096x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, 
#O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_32x4096x64x64x4096xf16.mlir b/attention/mlir/attention_32x4096x64x64x4096xf16.mlir deleted file mode 100644 index e7e4b2f..0000000 --- a/attention/mlir/attention_32x4096x64x64x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<32x4096x64xf16> -!K = tensor<32x4096x64xf16> -!V = tensor<32x4096x64xf16> -!O = tensor<32x4096x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_32x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x4096x64x64x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index df3cf40..0000000 --- a/attention/mlir/attention_32x4096x64x64x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<32x4096x64xf8E4M3FNUZ> -!K = tensor<32x4096x64xf8E4M3FNUZ> -!V = tensor<32x4096x64xf8E4M3FNUZ> -!O = tensor<32x4096x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_32x8192x128x128x8192xf16.mlir b/attention/mlir/attention_32x8192x128x128x8192xf16.mlir deleted file mode 100644 index 1198ace..0000000 --- a/attention/mlir/attention_32x8192x128x128x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<32x8192x128xf16> -!K = tensor<32x8192x128xf16> -!V = tensor<32x8192x128xf16> -!O = tensor<32x8192x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git 
a/attention/mlir/attention_32x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x8192x128x128x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index 2477ea2..0000000 --- a/attention/mlir/attention_32x8192x128x128x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<32x8192x128xf8E4M3FNUZ> -!K = tensor<32x8192x128xf8E4M3FNUZ> -!V = tensor<32x8192x128xf8E4M3FNUZ> -!O = tensor<32x8192x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_32x8192x64x64x8192xf16.mlir b/attention/mlir/attention_32x8192x64x64x8192xf16.mlir deleted file mode 100644 index 631bded..0000000 --- a/attention/mlir/attention_32x8192x64x64x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<32x8192x64xf16> -!K = tensor<32x8192x64xf16> -!V = tensor<32x8192x64xf16> -!O = tensor<32x8192x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_32x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_32x8192x64x64x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index 44d9a16..0000000 --- a/attention/mlir/attention_32x8192x64x64x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<32x8192x64xf8E4M3FNUZ> -!K = tensor<32x8192x64xf8E4M3FNUZ> -!V = tensor<32x8192x64xf8E4M3FNUZ> -!O = tensor<32x8192x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_40x1024x64x64x1024xf16.mlir b/attention/mlir/attention_40x1024x64x64x1024xf16.mlir deleted file mode 100644 index 30b8be9..0000000 --- a/attention/mlir/attention_40x1024x64x64x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<40x1024x64xf16> -!K = tensor<40x1024x64xf16> -!V = tensor<40x1024x64xf16> -!O = 
tensor<40x1024x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_40x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_40x1024x64x64x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index bbb52d5..0000000 --- a/attention/mlir/attention_40x1024x64x64x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<40x1024x64xf8E4M3FNUZ> -!K = tensor<40x1024x64xf8E4M3FNUZ> -!V = tensor<40x1024x64xf8E4M3FNUZ> -!O = tensor<40x1024x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_40x1024x64x64x64xf16.mlir b/attention/mlir/attention_40x1024x64x64x64xf16.mlir deleted file mode 100644 index 2b1ecab..0000000 --- a/attention/mlir/attention_40x1024x64x64x64xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<40x1024x64xf16> -!K = tensor<40x64x64xf16> -!V = tensor<40x64x64xf16> -!O = tensor<40x1024x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_40x1024x64x64x64xf8E4M3FNUZ.mlir b/attention/mlir/attention_40x1024x64x64x64xf8E4M3FNUZ.mlir deleted file mode 100644 index cc73d9a..0000000 --- a/attention/mlir/attention_40x1024x64x64x64xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<40x1024x64xf8E4M3FNUZ> -!K = tensor<40x64x64xf8E4M3FNUZ> -!V = tensor<40x64x64xf8E4M3FNUZ> -!O = tensor<40x1024x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = 
affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x1024x128x128x1024xf16.mlir b/attention/mlir/attention_48x1024x128x128x1024xf16.mlir deleted file mode 100644 index b145d0e..0000000 --- a/attention/mlir/attention_48x1024x128x128x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x1024x128xf16> -!K = tensor<48x1024x128xf16> -!V = tensor<48x1024x128xf16> -!O = tensor<48x1024x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x1024x128x128x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index 066c173..0000000 --- a/attention/mlir/attention_48x1024x128x128x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x1024x128xf8E4M3FNUZ> -!K = tensor<48x1024x128xf8E4M3FNUZ> -!V = tensor<48x1024x128xf8E4M3FNUZ> -!O = tensor<48x1024x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x1024x64x64x1024xf16.mlir b/attention/mlir/attention_48x1024x64x64x1024xf16.mlir deleted file mode 100644 index 53ea959..0000000 --- a/attention/mlir/attention_48x1024x64x64x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x1024x64xf16> -!K = tensor<48x1024x64xf16> -!V = tensor<48x1024x64xf16> -!O = tensor<48x1024x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) 
-> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x1024x64x64x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index 2f0c7f0..0000000 --- a/attention/mlir/attention_48x1024x64x64x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x1024x64xf8E4M3FNUZ> -!K = tensor<48x1024x64xf8E4M3FNUZ> -!V = tensor<48x1024x64xf8E4M3FNUZ> -!O = tensor<48x1024x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x16384x128x128x16384xf16.mlir b/attention/mlir/attention_48x16384x128x128x16384xf16.mlir deleted file mode 100644 index 3e6f137..0000000 --- a/attention/mlir/attention_48x16384x128x128x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x16384x128xf16> -!K = tensor<48x16384x128xf16> -!V = tensor<48x16384x128xf16> -!O = tensor<48x16384x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x16384x128x128x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index 956eb56..0000000 --- a/attention/mlir/attention_48x16384x128x128x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x16384x128xf8E4M3FNUZ> -!K = tensor<48x16384x128xf8E4M3FNUZ> -!V = tensor<48x16384x128xf8E4M3FNUZ> -!O = tensor<48x16384x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git 
a/attention/mlir/attention_48x16384x64x64x16384xf16.mlir b/attention/mlir/attention_48x16384x64x64x16384xf16.mlir deleted file mode 100644 index 09f27d3..0000000 --- a/attention/mlir/attention_48x16384x64x64x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x16384x64xf16> -!K = tensor<48x16384x64xf16> -!V = tensor<48x16384x64xf16> -!O = tensor<48x16384x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x16384x64x64x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index 64550e6..0000000 --- a/attention/mlir/attention_48x16384x64x64x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x16384x64xf8E4M3FNUZ> -!K = tensor<48x16384x64xf8E4M3FNUZ> -!V = tensor<48x16384x64xf8E4M3FNUZ> -!O = tensor<48x16384x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x2048x128x128x2048xf16.mlir b/attention/mlir/attention_48x2048x128x128x2048xf16.mlir deleted file mode 100644 index d509a38..0000000 --- a/attention/mlir/attention_48x2048x128x128x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x2048x128xf16> -!K = tensor<48x2048x128xf16> -!V = tensor<48x2048x128xf16> -!O = tensor<48x2048x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x2048x128x128x2048xf8E4M3FNUZ.mlir deleted file 
mode 100644 index 92dd2d1..0000000 --- a/attention/mlir/attention_48x2048x128x128x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x2048x128xf8E4M3FNUZ> -!K = tensor<48x2048x128xf8E4M3FNUZ> -!V = tensor<48x2048x128xf8E4M3FNUZ> -!O = tensor<48x2048x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x2048x64x64x2048xf16.mlir b/attention/mlir/attention_48x2048x64x64x2048xf16.mlir deleted file mode 100644 index 9076e81..0000000 --- a/attention/mlir/attention_48x2048x64x64x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x2048x64xf16> -!K = tensor<48x2048x64xf16> -!V = tensor<48x2048x64xf16> -!O = tensor<48x2048x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x2048x64x64x2048xf8E4M3FNUZ.mlir deleted file mode 100644 index 7890143..0000000 --- a/attention/mlir/attention_48x2048x64x64x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x2048x64xf8E4M3FNUZ> -!K = tensor<48x2048x64xf8E4M3FNUZ> -!V = tensor<48x2048x64xf8E4M3FNUZ> -!O = tensor<48x2048x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x4096x128x128x4096xf16.mlir b/attention/mlir/attention_48x4096x128x128x4096xf16.mlir deleted file mode 100644 index cefa7f5..0000000 --- a/attention/mlir/attention_48x4096x128x128x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x4096x128xf16> -!K = tensor<48x4096x128xf16> -!V = tensor<48x4096x128xf16> -!O = tensor<48x4096x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, 
subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x4096x128x128x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index 05d6106..0000000 --- a/attention/mlir/attention_48x4096x128x128x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x4096x128xf8E4M3FNUZ> -!K = tensor<48x4096x128xf8E4M3FNUZ> -!V = tensor<48x4096x128xf8E4M3FNUZ> -!O = tensor<48x4096x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x4096x64x64x4096xf16.mlir b/attention/mlir/attention_48x4096x64x64x4096xf16.mlir deleted file mode 100644 index 28a5ac9..0000000 --- a/attention/mlir/attention_48x4096x64x64x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x4096x64xf16> -!K = tensor<48x4096x64xf16> -!V = tensor<48x4096x64xf16> -!O = tensor<48x4096x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x4096x64x64x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index e28ab3c..0000000 --- a/attention/mlir/attention_48x4096x64x64x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x4096x64xf8E4M3FNUZ> -!K = tensor<48x4096x64xf8E4M3FNUZ> -!V = tensor<48x4096x64xf8E4M3FNUZ> -!O = tensor<48x4096x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, 
n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x8192x128x128x8192xf16.mlir b/attention/mlir/attention_48x8192x128x128x8192xf16.mlir deleted file mode 100644 index 00f17b2..0000000 --- a/attention/mlir/attention_48x8192x128x128x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x8192x128xf16> -!K = tensor<48x8192x128xf16> -!V = tensor<48x8192x128xf16> -!O = tensor<48x8192x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x8192x128x128x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index eecccd9..0000000 --- a/attention/mlir/attention_48x8192x128x128x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x8192x128xf8E4M3FNUZ> -!K = tensor<48x8192x128xf8E4M3FNUZ> -!V = tensor<48x8192x128xf8E4M3FNUZ> -!O = tensor<48x8192x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x8192x64x64x8192xf16.mlir b/attention/mlir/attention_48x8192x64x64x8192xf16.mlir deleted file mode 100644 index 38f7a02..0000000 --- a/attention/mlir/attention_48x8192x64x64x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<48x8192x64xf16> -!K = tensor<48x8192x64xf16> -!V = tensor<48x8192x64xf16> -!O = tensor<48x8192x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = 
iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_48x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_48x8192x64x64x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index 109d155..0000000 --- a/attention/mlir/attention_48x8192x64x64x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<48x8192x64xf8E4M3FNUZ> -!K = tensor<48x8192x64xf8E4M3FNUZ> -!V = tensor<48x8192x64xf8E4M3FNUZ> -!O = tensor<48x8192x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x1024x128x128x1024xf16.mlir b/attention/mlir/attention_4x1024x128x128x1024xf16.mlir deleted file mode 100644 index 6e34384..0000000 --- a/attention/mlir/attention_4x1024x128x128x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x1024x128xf16> -!K = tensor<4x1024x128xf16> -!V = tensor<4x1024x128xf16> -!O = tensor<4x1024x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x1024x128x128x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index 90d48c2..0000000 --- a/attention/mlir/attention_4x1024x128x128x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x1024x128xf8E4M3FNUZ> -!K = tensor<4x1024x128xf8E4M3FNUZ> -!V = tensor<4x1024x128xf8E4M3FNUZ> -!O = tensor<4x1024x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x1024x64x64x1024xf16.mlir b/attention/mlir/attention_4x1024x64x64x1024xf16.mlir deleted file mode 100644 index 381ad7d..0000000 --- 
a/attention/mlir/attention_4x1024x64x64x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x1024x64xf16> -!K = tensor<4x1024x64xf16> -!V = tensor<4x1024x64xf16> -!O = tensor<4x1024x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x1024x64x64x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index 7f920cc..0000000 --- a/attention/mlir/attention_4x1024x64x64x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x1024x64xf8E4M3FNUZ> -!K = tensor<4x1024x64xf8E4M3FNUZ> -!V = tensor<4x1024x64xf8E4M3FNUZ> -!O = tensor<4x1024x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x16384x128x128x16384xf16.mlir b/attention/mlir/attention_4x16384x128x128x16384xf16.mlir deleted file mode 100644 index 9571b4f..0000000 --- a/attention/mlir/attention_4x16384x128x128x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x16384x128xf16> -!K = tensor<4x16384x128xf16> -!V = tensor<4x16384x128xf16> -!O = tensor<4x16384x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x16384x128x128x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index 3c76d96..0000000 --- a/attention/mlir/attention_4x16384x128x128x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = 
tensor<4x16384x128xf8E4M3FNUZ> -!K = tensor<4x16384x128xf8E4M3FNUZ> -!V = tensor<4x16384x128xf8E4M3FNUZ> -!O = tensor<4x16384x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x16384x64x64x16384xf16.mlir b/attention/mlir/attention_4x16384x64x64x16384xf16.mlir deleted file mode 100644 index b97efb6..0000000 --- a/attention/mlir/attention_4x16384x64x64x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x16384x64xf16> -!K = tensor<4x16384x64xf16> -!V = tensor<4x16384x64xf16> -!O = tensor<4x16384x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x16384x64x64x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index dea5ee9..0000000 --- a/attention/mlir/attention_4x16384x64x64x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x16384x64xf8E4M3FNUZ> -!K = tensor<4x16384x64xf8E4M3FNUZ> -!V = tensor<4x16384x64xf8E4M3FNUZ> -!O = tensor<4x16384x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x2048x128x128x2048xf16.mlir b/attention/mlir/attention_4x2048x128x128x2048xf16.mlir deleted file mode 100644 index b7f74a9..0000000 --- a/attention/mlir/attention_4x2048x128x128x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x2048x128xf16> -!K = tensor<4x2048x128xf16> -!V = tensor<4x2048x128xf16> -!O = tensor<4x2048x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, 
m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x2048x128x128x2048xf8E4M3FNUZ.mlir deleted file mode 100644 index 4f228c2..0000000 --- a/attention/mlir/attention_4x2048x128x128x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x2048x128xf8E4M3FNUZ> -!K = tensor<4x2048x128xf8E4M3FNUZ> -!V = tensor<4x2048x128xf8E4M3FNUZ> -!O = tensor<4x2048x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x2048x64x64x2048xf16.mlir b/attention/mlir/attention_4x2048x64x64x2048xf16.mlir deleted file mode 100644 index 42df208..0000000 --- a/attention/mlir/attention_4x2048x64x64x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x2048x64xf16> -!K = tensor<4x2048x64xf16> -!V = tensor<4x2048x64xf16> -!O = tensor<4x2048x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x2048x64x64x2048xf8E4M3FNUZ.mlir deleted file mode 100644 index 5848865..0000000 --- a/attention/mlir/attention_4x2048x64x64x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x2048x64xf8E4M3FNUZ> -!K = tensor<4x2048x64xf8E4M3FNUZ> -!V = tensor<4x2048x64xf8E4M3FNUZ> -!O = tensor<4x2048x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { 
indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x4096x128x128x4096xf16.mlir b/attention/mlir/attention_4x4096x128x128x4096xf16.mlir deleted file mode 100644 index d66c261..0000000 --- a/attention/mlir/attention_4x4096x128x128x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x4096x128xf16> -!K = tensor<4x4096x128xf16> -!V = tensor<4x4096x128xf16> -!O = tensor<4x4096x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x4096x128x128x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index f818c13..0000000 --- a/attention/mlir/attention_4x4096x128x128x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x4096x128xf8E4M3FNUZ> -!K = tensor<4x4096x128xf8E4M3FNUZ> -!V = tensor<4x4096x128xf8E4M3FNUZ> -!O = tensor<4x4096x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x4096x64x64x4096xf16.mlir b/attention/mlir/attention_4x4096x64x64x4096xf16.mlir deleted file mode 100644 index 7d19ec1..0000000 --- a/attention/mlir/attention_4x4096x64x64x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x4096x64xf16> -!K = tensor<4x4096x64xf16> -!V = tensor<4x4096x64xf16> -!O = tensor<4x4096x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git 
a/attention/mlir/attention_4x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x4096x64x64x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index 494f39f..0000000 --- a/attention/mlir/attention_4x4096x64x64x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x4096x64xf8E4M3FNUZ> -!K = tensor<4x4096x64xf8E4M3FNUZ> -!V = tensor<4x4096x64xf8E4M3FNUZ> -!O = tensor<4x4096x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x4096x64x64x64xf16.mlir b/attention/mlir/attention_4x4096x64x64x64xf16.mlir deleted file mode 100644 index 6dc2d25..0000000 --- a/attention/mlir/attention_4x4096x64x64x64xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x4096x64xf16> -!K = tensor<4x64x64xf16> -!V = tensor<4x64x64xf16> -!O = tensor<4x4096x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x4096x64x64x64xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x4096x64x64x64xf8E4M3FNUZ.mlir deleted file mode 100644 index c10cff1..0000000 --- a/attention/mlir/attention_4x4096x64x64x64xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x4096x64xf8E4M3FNUZ> -!K = tensor<4x64x64xf8E4M3FNUZ> -!V = tensor<4x64x64xf8E4M3FNUZ> -!O = tensor<4x4096x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x8192x128x128x8192xf16.mlir b/attention/mlir/attention_4x8192x128x128x8192xf16.mlir deleted file mode 100644 index 463a0fb..0000000 --- a/attention/mlir/attention_4x8192x128x128x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x8192x128xf16> -!K = tensor<4x8192x128xf16> -!V = tensor<4x8192x128xf16> -!O = tensor<4x8192x128xf16> - -#tuning = 
#iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x8192x128x128x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index ee97189..0000000 --- a/attention/mlir/attention_4x8192x128x128x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x8192x128xf8E4M3FNUZ> -!K = tensor<4x8192x128xf8E4M3FNUZ> -!V = tensor<4x8192x128xf8E4M3FNUZ> -!O = tensor<4x8192x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x8192x64x64x8192xf16.mlir b/attention/mlir/attention_4x8192x64x64x8192xf16.mlir deleted file mode 100644 index 17cf3fb..0000000 --- a/attention/mlir/attention_4x8192x64x64x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<4x8192x64xf16> -!K = tensor<4x8192x64xf16> -!V = tensor<4x8192x64xf16> -!O = tensor<4x8192x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_4x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_4x8192x64x64x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index e2b6765..0000000 --- a/attention/mlir/attention_4x8192x64x64x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<4x8192x64xf8E4M3FNUZ> -!K = tensor<4x8192x64xf8E4M3FNUZ> -!V = tensor<4x8192x64xf8E4M3FNUZ> -!O = tensor<4x8192x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, 
k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x1024x128x128x1024xf16.mlir b/attention/mlir/attention_64x1024x128x128x1024xf16.mlir deleted file mode 100644 index 43c97e6..0000000 --- a/attention/mlir/attention_64x1024x128x128x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x1024x128xf16> -!K = tensor<64x1024x128xf16> -!V = tensor<64x1024x128xf16> -!O = tensor<64x1024x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x1024x128x128x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index b15dcd7..0000000 --- a/attention/mlir/attention_64x1024x128x128x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x1024x128xf8E4M3FNUZ> -!K = tensor<64x1024x128xf8E4M3FNUZ> -!V = tensor<64x1024x128xf8E4M3FNUZ> -!O = tensor<64x1024x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x1024x64x64x1024xf16.mlir b/attention/mlir/attention_64x1024x64x64x1024xf16.mlir deleted file mode 100644 index ca3f166..0000000 --- a/attention/mlir/attention_64x1024x64x64x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x1024x64xf16> -!K = tensor<64x1024x64xf16> -!V = tensor<64x1024x64xf16> -!O = tensor<64x1024x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = 
arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x1024x64x64x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index 6cf5181..0000000 --- a/attention/mlir/attention_64x1024x64x64x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x1024x64xf8E4M3FNUZ> -!K = tensor<64x1024x64xf8E4M3FNUZ> -!V = tensor<64x1024x64xf8E4M3FNUZ> -!O = tensor<64x1024x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x16384x128x128x16384xf16.mlir b/attention/mlir/attention_64x16384x128x128x16384xf16.mlir deleted file mode 100644 index 82eb394..0000000 --- a/attention/mlir/attention_64x16384x128x128x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x16384x128xf16> -!K = tensor<64x16384x128xf16> -!V = tensor<64x16384x128xf16> -!O = tensor<64x16384x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x16384x128x128x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index f76335c..0000000 --- a/attention/mlir/attention_64x16384x128x128x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x16384x128xf8E4M3FNUZ> -!K = tensor<64x16384x128xf8E4M3FNUZ> -!V = tensor<64x16384x128xf8E4M3FNUZ> -!O = tensor<64x16384x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x16384x64x64x16384xf16.mlir 
b/attention/mlir/attention_64x16384x64x64x16384xf16.mlir deleted file mode 100644 index 36d1ff8..0000000 --- a/attention/mlir/attention_64x16384x64x64x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x16384x64xf16> -!K = tensor<64x16384x64xf16> -!V = tensor<64x16384x64xf16> -!O = tensor<64x16384x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x16384x64x64x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index ae5c81b..0000000 --- a/attention/mlir/attention_64x16384x64x64x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x16384x64xf8E4M3FNUZ> -!K = tensor<64x16384x64xf8E4M3FNUZ> -!V = tensor<64x16384x64xf8E4M3FNUZ> -!O = tensor<64x16384x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x2048x128x128x2048xf16.mlir b/attention/mlir/attention_64x2048x128x128x2048xf16.mlir deleted file mode 100644 index 1ec1b4f..0000000 --- a/attention/mlir/attention_64x2048x128x128x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x2048x128xf16> -!K = tensor<64x2048x128xf16> -!V = tensor<64x2048x128xf16> -!O = tensor<64x2048x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x2048x128x128x2048xf8E4M3FNUZ.mlir deleted file mode 100644 index 5eaff5f..0000000 --- 
a/attention/mlir/attention_64x2048x128x128x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x2048x128xf8E4M3FNUZ> -!K = tensor<64x2048x128xf8E4M3FNUZ> -!V = tensor<64x2048x128xf8E4M3FNUZ> -!O = tensor<64x2048x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x2048x64x64x2048xf16.mlir b/attention/mlir/attention_64x2048x64x64x2048xf16.mlir deleted file mode 100644 index bd32514..0000000 --- a/attention/mlir/attention_64x2048x64x64x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x2048x64xf16> -!K = tensor<64x2048x64xf16> -!V = tensor<64x2048x64xf16> -!O = tensor<64x2048x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x2048x64x64x2048xf8E4M3FNUZ.mlir deleted file mode 100644 index b43c953..0000000 --- a/attention/mlir/attention_64x2048x64x64x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x2048x64xf8E4M3FNUZ> -!K = tensor<64x2048x64xf8E4M3FNUZ> -!V = tensor<64x2048x64xf8E4M3FNUZ> -!O = tensor<64x2048x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x4096x128x128x4096xf16.mlir b/attention/mlir/attention_64x4096x128x128x4096xf16.mlir deleted file mode 100644 index c88339f..0000000 --- a/attention/mlir/attention_64x4096x128x128x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x4096x128xf16> -!K = tensor<64x4096x128xf16> -!V = tensor<64x4096x128xf16> -!O = tensor<64x4096x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = 
{ "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x4096x128x128x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index 97beedf..0000000 --- a/attention/mlir/attention_64x4096x128x128x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x4096x128xf8E4M3FNUZ> -!K = tensor<64x4096x128xf8E4M3FNUZ> -!V = tensor<64x4096x128xf8E4M3FNUZ> -!O = tensor<64x4096x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x4096x64x64x4096xf16.mlir b/attention/mlir/attention_64x4096x64x64x4096xf16.mlir deleted file mode 100644 index c62c248..0000000 --- a/attention/mlir/attention_64x4096x64x64x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x4096x64xf16> -!K = tensor<64x4096x64xf16> -!V = tensor<64x4096x64xf16> -!O = tensor<64x4096x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x4096x64x64x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index 2d21334..0000000 --- a/attention/mlir/attention_64x4096x64x64x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x4096x64xf8E4M3FNUZ> -!K = tensor<64x4096x64xf8E4M3FNUZ> -!V = tensor<64x4096x64xf8E4M3FNUZ> -!O = tensor<64x4096x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func 
@main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x8192x128x128x8192xf16.mlir b/attention/mlir/attention_64x8192x128x128x8192xf16.mlir deleted file mode 100644 index d856d95..0000000 --- a/attention/mlir/attention_64x8192x128x128x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x8192x128xf16> -!K = tensor<64x8192x128xf16> -!V = tensor<64x8192x128xf16> -!O = tensor<64x8192x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x8192x128x128x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index eff571f..0000000 --- a/attention/mlir/attention_64x8192x128x128x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x8192x128xf8E4M3FNUZ> -!K = tensor<64x8192x128xf8E4M3FNUZ> -!V = tensor<64x8192x128xf8E4M3FNUZ> -!O = tensor<64x8192x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x8192x64x64x8192xf16.mlir b/attention/mlir/attention_64x8192x64x64x8192xf16.mlir deleted file mode 100644 index 26e6408..0000000 --- a/attention/mlir/attention_64x8192x64x64x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<64x8192x64xf16> -!K = tensor<64x8192x64xf16> -!V = tensor<64x8192x64xf16> -!O = tensor<64x8192x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, 
#O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_64x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_64x8192x64x64x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index 60bfd03..0000000 --- a/attention/mlir/attention_64x8192x64x64x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<64x8192x64xf8E4M3FNUZ> -!K = tensor<64x8192x64xf8E4M3FNUZ> -!V = tensor<64x8192x64xf8E4M3FNUZ> -!O = tensor<64x8192x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_768x4096x64x64x64xf16.mlir b/attention/mlir/attention_768x4096x64x64x64xf16.mlir deleted file mode 100644 index a21153c..0000000 --- a/attention/mlir/attention_768x4096x64x64x64xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<768x4096x64xf16> -!K = tensor<768x64x64xf16> -!V = tensor<768x64x64xf16> -!O = tensor<768x4096x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_768x4096x64x64x64xf8E4M3FNUZ.mlir b/attention/mlir/attention_768x4096x64x64x64xf8E4M3FNUZ.mlir deleted file mode 100644 index 26fa16f..0000000 --- a/attention/mlir/attention_768x4096x64x64x64xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<768x4096x64xf8E4M3FNUZ> -!K = tensor<768x64x64xf8E4M3FNUZ> -!V = tensor<768x64x64xf8E4M3FNUZ> -!O = tensor<768x4096x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x1024x128x128x1024xf16.mlir b/attention/mlir/attention_8x1024x128x128x1024xf16.mlir deleted file mode 100644 index c94c507..0000000 --- a/attention/mlir/attention_8x1024x128x128x1024xf16.mlir +++ /dev/null @@ -1,26 
+0,0 @@ -!dtype = f16 -!Q = tensor<8x1024x128xf16> -!K = tensor<8x1024x128xf16> -!V = tensor<8x1024x128xf16> -!O = tensor<8x1024x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x1024x128x128x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index f01cbda..0000000 --- a/attention/mlir/attention_8x1024x128x128x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x1024x128xf8E4M3FNUZ> -!K = tensor<8x1024x128xf8E4M3FNUZ> -!V = tensor<8x1024x128xf8E4M3FNUZ> -!O = tensor<8x1024x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x1024x64x64x1024xf16.mlir b/attention/mlir/attention_8x1024x64x64x1024xf16.mlir deleted file mode 100644 index 7390cc2..0000000 --- a/attention/mlir/attention_8x1024x64x64x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x1024x64xf16> -!K = tensor<8x1024x64xf16> -!V = tensor<8x1024x64xf16> -!O = tensor<8x1024x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x1024x64x64x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index 4ddbddd..0000000 --- a/attention/mlir/attention_8x1024x64x64x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x1024x64xf8E4M3FNUZ> -!K = tensor<8x1024x64xf8E4M3FNUZ> -!V = tensor<8x1024x64xf8E4M3FNUZ> -!O = 
tensor<8x1024x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x1024x64x64x64xf16.mlir b/attention/mlir/attention_8x1024x64x64x64xf16.mlir deleted file mode 100644 index 6e491d5..0000000 --- a/attention/mlir/attention_8x1024x64x64x64xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x1024x64xf16> -!K = tensor<8x64x64xf16> -!V = tensor<8x64x64xf16> -!O = tensor<8x1024x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x1024x64x64x64xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x1024x64x64x64xf8E4M3FNUZ.mlir deleted file mode 100644 index 87c5acd..0000000 --- a/attention/mlir/attention_8x1024x64x64x64xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x1024x64xf8E4M3FNUZ> -!K = tensor<8x64x64xf8E4M3FNUZ> -!V = tensor<8x64x64xf8E4M3FNUZ> -!O = tensor<8x1024x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x16384x128x128x16384xf16.mlir b/attention/mlir/attention_8x16384x128x128x16384xf16.mlir deleted file mode 100644 index 9ef86da..0000000 --- a/attention/mlir/attention_8x16384x128x128x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x16384x128xf16> -!K = tensor<8x16384x128xf16> -!V = tensor<8x16384x128xf16> -!O = tensor<8x16384x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) 
-> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x16384x128x128x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index e5f0fd2..0000000 --- a/attention/mlir/attention_8x16384x128x128x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x16384x128xf8E4M3FNUZ> -!K = tensor<8x16384x128xf8E4M3FNUZ> -!V = tensor<8x16384x128xf8E4M3FNUZ> -!O = tensor<8x16384x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x16384x64x64x16384xf16.mlir b/attention/mlir/attention_8x16384x64x64x16384xf16.mlir deleted file mode 100644 index f82cbcc..0000000 --- a/attention/mlir/attention_8x16384x64x64x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x16384x64xf16> -!K = tensor<8x16384x64xf16> -!V = tensor<8x16384x64xf16> -!O = tensor<8x16384x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x16384x64x64x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index 07f3508..0000000 --- a/attention/mlir/attention_8x16384x64x64x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x16384x64xf8E4M3FNUZ> -!K = tensor<8x16384x64xf8E4M3FNUZ> -!V = tensor<8x16384x64xf8E4M3FNUZ> -!O = tensor<8x16384x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> 
!O - return %O : !O -} diff --git a/attention/mlir/attention_8x2048x128x128x2048xf16.mlir b/attention/mlir/attention_8x2048x128x128x2048xf16.mlir deleted file mode 100644 index ae32ff6..0000000 --- a/attention/mlir/attention_8x2048x128x128x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x2048x128xf16> -!K = tensor<8x2048x128xf16> -!V = tensor<8x2048x128xf16> -!O = tensor<8x2048x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x2048x128x128x2048xf8E4M3FNUZ.mlir deleted file mode 100644 index 6577fb4..0000000 --- a/attention/mlir/attention_8x2048x128x128x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x2048x128xf8E4M3FNUZ> -!K = tensor<8x2048x128xf8E4M3FNUZ> -!V = tensor<8x2048x128xf8E4M3FNUZ> -!O = tensor<8x2048x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x2048x64x64x2048xf16.mlir b/attention/mlir/attention_8x2048x64x64x2048xf16.mlir deleted file mode 100644 index 992cd17..0000000 --- a/attention/mlir/attention_8x2048x64x64x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x2048x64xf16> -!K = tensor<8x2048x64xf16> -!V = tensor<8x2048x64xf16> -!O = tensor<8x2048x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x2048x64x64x2048xf8E4M3FNUZ.mlir deleted file mode 
100644 index dcbb8e2..0000000 --- a/attention/mlir/attention_8x2048x64x64x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x2048x64xf8E4M3FNUZ> -!K = tensor<8x2048x64xf8E4M3FNUZ> -!V = tensor<8x2048x64xf8E4M3FNUZ> -!O = tensor<8x2048x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x4096x128x128x4096xf16.mlir b/attention/mlir/attention_8x4096x128x128x4096xf16.mlir deleted file mode 100644 index ae7e70b..0000000 --- a/attention/mlir/attention_8x4096x128x128x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x4096x128xf16> -!K = tensor<8x4096x128xf16> -!V = tensor<8x4096x128xf16> -!O = tensor<8x4096x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x4096x128x128x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index 6507701..0000000 --- a/attention/mlir/attention_8x4096x128x128x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x4096x128xf8E4M3FNUZ> -!K = tensor<8x4096x128xf8E4M3FNUZ> -!V = tensor<8x4096x128xf8E4M3FNUZ> -!O = tensor<8x4096x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x4096x64x64x4096xf16.mlir b/attention/mlir/attention_8x4096x64x64x4096xf16.mlir deleted file mode 100644 index 0d0ed94..0000000 --- a/attention/mlir/attention_8x4096x64x64x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x4096x64xf16> -!K = tensor<8x4096x64xf16> -!V = tensor<8x4096x64xf16> -!O = tensor<8x4096x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , 
llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x4096x64x64x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index dc8ae0c..0000000 --- a/attention/mlir/attention_8x4096x64x64x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x4096x64xf8E4M3FNUZ> -!K = tensor<8x4096x64xf8E4M3FNUZ> -!V = tensor<8x4096x64xf8E4M3FNUZ> -!O = tensor<8x4096x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x8192x128x128x8192xf16.mlir b/attention/mlir/attention_8x8192x128x128x8192xf16.mlir deleted file mode 100644 index 458b367..0000000 --- a/attention/mlir/attention_8x8192x128x128x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x8192x128xf16> -!K = tensor<8x8192x128xf16> -!V = tensor<8x8192x128xf16> -!O = tensor<8x8192x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x8192x128x128x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index 6910322..0000000 --- a/attention/mlir/attention_8x8192x128x128x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x8192x128xf8E4M3FNUZ> -!K = tensor<8x8192x128xf8E4M3FNUZ> -!V = tensor<8x8192x128xf8E4M3FNUZ> -!O = tensor<8x8192x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - 
-func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x8192x64x64x8192xf16.mlir b/attention/mlir/attention_8x8192x64x64x8192xf16.mlir deleted file mode 100644 index 7e03007..0000000 --- a/attention/mlir/attention_8x8192x64x64x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<8x8192x64xf16> -!K = tensor<8x8192x64xf16> -!V = tensor<8x8192x64xf16> -!O = tensor<8x8192x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_8x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_8x8192x64x64x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index e8ce4e8..0000000 --- a/attention/mlir/attention_8x8192x64x64x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<8x8192x64xf8E4M3FNUZ> -!K = tensor<8x8192x64xf8E4M3FNUZ> -!V = tensor<8x8192x64xf8E4M3FNUZ> -!O = tensor<8x8192x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x1024x128x128x1024xf16.mlir b/attention/mlir/attention_96x1024x128x128x1024xf16.mlir deleted file mode 100644 index 7728731..0000000 --- a/attention/mlir/attention_96x1024x128x128x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x1024x128xf16> -!K = tensor<96x1024x128xf16> -!V = tensor<96x1024x128xf16> -!O = tensor<96x1024x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - 
,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x1024x128x128x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x1024x128x128x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index 0f440bf..0000000 --- a/attention/mlir/attention_96x1024x128x128x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x1024x128xf8E4M3FNUZ> -!K = tensor<96x1024x128xf8E4M3FNUZ> -!V = tensor<96x1024x128xf8E4M3FNUZ> -!O = tensor<96x1024x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x1024x64x64x1024xf16.mlir b/attention/mlir/attention_96x1024x64x64x1024xf16.mlir deleted file mode 100644 index 8d682cd..0000000 --- a/attention/mlir/attention_96x1024x64x64x1024xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x1024x64xf16> -!K = tensor<96x1024x64xf16> -!V = tensor<96x1024x64xf16> -!O = tensor<96x1024x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x1024x64x64x1024xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x1024x64x64x1024xf8E4M3FNUZ.mlir deleted file mode 100644 index 9a71011..0000000 --- a/attention/mlir/attention_96x1024x64x64x1024xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x1024x64xf8E4M3FNUZ> -!K = tensor<96x1024x64xf8E4M3FNUZ> -!V = tensor<96x1024x64xf8E4M3FNUZ> -!O = tensor<96x1024x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x16384x128x128x16384xf16.mlir b/attention/mlir/attention_96x16384x128x128x16384xf16.mlir deleted file mode 100644 index eaca82a..0000000 --- a/attention/mlir/attention_96x16384x128x128x16384xf16.mlir +++ 
/dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x16384x128xf16> -!K = tensor<96x16384x128xf16> -!V = tensor<96x16384x128xf16> -!O = tensor<96x16384x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x16384x128x128x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x16384x128x128x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index ca80b55..0000000 --- a/attention/mlir/attention_96x16384x128x128x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x16384x128xf8E4M3FNUZ> -!K = tensor<96x16384x128xf8E4M3FNUZ> -!V = tensor<96x16384x128xf8E4M3FNUZ> -!O = tensor<96x16384x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x16384x64x64x16384xf16.mlir b/attention/mlir/attention_96x16384x64x64x16384xf16.mlir deleted file mode 100644 index 67ec70b..0000000 --- a/attention/mlir/attention_96x16384x64x64x16384xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x16384x64xf16> -!K = tensor<96x16384x64xf16> -!V = tensor<96x16384x64xf16> -!O = tensor<96x16384x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x16384x64x64x16384xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x16384x64x64x16384xf8E4M3FNUZ.mlir deleted file mode 100644 index e611023..0000000 --- a/attention/mlir/attention_96x16384x64x64x16384xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x16384x64xf8E4M3FNUZ> -!K = 
tensor<96x16384x64xf8E4M3FNUZ> -!V = tensor<96x16384x64xf8E4M3FNUZ> -!O = tensor<96x16384x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x2048x128x128x2048xf16.mlir b/attention/mlir/attention_96x2048x128x128x2048xf16.mlir deleted file mode 100644 index 371b275..0000000 --- a/attention/mlir/attention_96x2048x128x128x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x2048x128xf16> -!K = tensor<96x2048x128xf16> -!V = tensor<96x2048x128xf16> -!O = tensor<96x2048x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x2048x128x128x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x2048x128x128x2048xf8E4M3FNUZ.mlir deleted file mode 100644 index 7ae181d..0000000 --- a/attention/mlir/attention_96x2048x128x128x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x2048x128xf8E4M3FNUZ> -!K = tensor<96x2048x128xf8E4M3FNUZ> -!V = tensor<96x2048x128xf8E4M3FNUZ> -!O = tensor<96x2048x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x2048x64x64x2048xf16.mlir b/attention/mlir/attention_96x2048x64x64x2048xf16.mlir deleted file mode 100644 index d0484ab..0000000 --- a/attention/mlir/attention_96x2048x64x64x2048xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x2048x64xf16> -!K = tensor<96x2048x64xf16> -!V = tensor<96x2048x64xf16> -!O = tensor<96x2048x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = 
affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x2048x64x64x2048xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x2048x64x64x2048xf8E4M3FNUZ.mlir deleted file mode 100644 index ec17a0a..0000000 --- a/attention/mlir/attention_96x2048x64x64x2048xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x2048x64xf8E4M3FNUZ> -!K = tensor<96x2048x64xf8E4M3FNUZ> -!V = tensor<96x2048x64xf8E4M3FNUZ> -!O = tensor<96x2048x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x4096x128x128x4096xf16.mlir b/attention/mlir/attention_96x4096x128x128x4096xf16.mlir deleted file mode 100644 index 5d72378..0000000 --- a/attention/mlir/attention_96x4096x128x128x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x4096x128xf16> -!K = tensor<96x4096x128xf16> -!V = tensor<96x4096x128xf16> -!O = tensor<96x4096x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x4096x128x128x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x4096x128x128x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index 1c90153..0000000 --- a/attention/mlir/attention_96x4096x128x128x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x4096x128xf8E4M3FNUZ> -!K = tensor<96x4096x128xf8E4M3FNUZ> -!V = tensor<96x4096x128xf8E4M3FNUZ> -!O = tensor<96x4096x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = 
iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x4096x64x64x4096xf16.mlir b/attention/mlir/attention_96x4096x64x64x4096xf16.mlir deleted file mode 100644 index d5b335f..0000000 --- a/attention/mlir/attention_96x4096x64x64x4096xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x4096x64xf16> -!K = tensor<96x4096x64xf16> -!V = tensor<96x4096x64xf16> -!O = tensor<96x4096x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x4096x64x64x4096xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x4096x64x64x4096xf8E4M3FNUZ.mlir deleted file mode 100644 index 9ce9417..0000000 --- a/attention/mlir/attention_96x4096x64x64x4096xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x4096x64xf8E4M3FNUZ> -!K = tensor<96x4096x64xf8E4M3FNUZ> -!V = tensor<96x4096x64xf8E4M3FNUZ> -!O = tensor<96x4096x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x8192x128x128x8192xf16.mlir b/attention/mlir/attention_96x8192x128x128x8192xf16.mlir deleted file mode 100644 index 105ebc1..0000000 --- a/attention/mlir/attention_96x8192x128x128x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x8192x128xf16> -!K = tensor<96x8192x128xf16> -!V = tensor<96x8192x128xf16> -!O = tensor<96x8192x128xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O 
-} diff --git a/attention/mlir/attention_96x8192x128x128x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x8192x128x128x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index 9edf095..0000000 --- a/attention/mlir/attention_96x8192x128x128x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x8192x128xf8E4M3FNUZ> -!K = tensor<96x8192x128xf8E4M3FNUZ> -!V = tensor<96x8192x128xf8E4M3FNUZ> -!O = tensor<96x8192x128xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x8192x64x64x8192xf16.mlir b/attention/mlir/attention_96x8192x64x64x8192xf16.mlir deleted file mode 100644 index 436ab86..0000000 --- a/attention/mlir/attention_96x8192x64x64x8192xf16.mlir +++ /dev/null @@ -1,26 +0,0 @@ -!dtype = f16 -!Q = tensor<96x8192x64xf16> -!K = tensor<96x8192x64xf16> -!V = tensor<96x8192x64xf16> -!O = tensor<96x8192x64xf16> - -#tuning = #iree_codegen.compilation_info, translation_info = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1> , llvm_func_attrs = { "amdgpu-waves-per-eu" = "2","denormal-fp-math-f32" = "preserve-sign" }}>> - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - ,compilation_info = #tuning - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/attention/mlir/attention_96x8192x64x64x8192xf8E4M3FNUZ.mlir b/attention/mlir/attention_96x8192x64x64x8192xf8E4M3FNUZ.mlir deleted file mode 100644 index 9301a91..0000000 --- a/attention/mlir/attention_96x8192x64x64x8192xf8E4M3FNUZ.mlir +++ /dev/null @@ -1,25 +0,0 @@ -!dtype = f8E4M3FNUZ -!Q = tensor<96x8192x64xf8E4M3FNUZ> -!K = tensor<96x8192x64xf8E4M3FNUZ> -!V = tensor<96x8192x64xf8E4M3FNUZ> -!O = tensor<96x8192x64xf8E4M3FNUZ> - - - -#Q = affine_map<(b, m, n, k1, k2) -> (b, m, k1)> -#K = affine_map<(b, m, n, k1, k2) -> (b, k2, k1)> -#V = affine_map<(b, m, n, k1, k2) -> (b, k2, n)> -#S = affine_map<(b, m, n, k1, k2) -> ()> -#O = affine_map<(b, m, n, k1, k2) -> (b, m, n)> - -func.func @main(%Q : !Q, %K : !K, %V : !V) -> !O { - %scale = arith.constant 1.0 : !dtype - %empty = tensor.empty() : !O - %O = iree_linalg_ext.attention - { indexing_maps = [#Q, #K, #V, #S, #O] - - } - ins(%Q, %K, %V, %scale : !Q, !K, !V, !dtype) - outs(%empty : !O) -> !O - return %O : !O -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x112x112x64x7x7x3_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_16x112x112x64x7x7x3_f32xf32xf32_stride2.mlir deleted file mode 100644 index 1f136f0..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x112x112x64x7x7x3_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: 
tensor<16x64x230x230xf32>, %arg1: tensor<3x64x7x7xf32>) -> tensor<16x3x112x112xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x3x112x112xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x3x112x112xf32>) -> tensor<16x3x112x112xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x64x230x230xf32>, tensor<3x64x7x7xf32>) outs(%10 : tensor<16x3x112x112xf32>) -> tensor<16x3x112x112xf32> - util.return %11 : tensor<16x3x112x112xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_16x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir deleted file mode 100644 index d919e2b..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x1024x28x28xf32>, %arg1: tensor<512x1024x1x1xf32>) -> tensor<16x512x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x512x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x512x14x14xf32>) -> tensor<16x512x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x1024x28x28xf32>, tensor<512x1024x1x1xf32>) outs(%10 : tensor<16x512x14x14xf32>) -> tensor<16x512x14x14xf32> - util.return %11 : tensor<16x512x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride1.mlir deleted file mode 100644 index 679e9c1..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x256x16x16xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<16x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x256x14x14xf32>) -> tensor<16x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<16x256x14x14xf32>) -> tensor<16x256x14x14xf32> - util.return %11 : tensor<16x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index 7f3b898..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x14x14x256x3x3x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x256x30x30xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<16x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x256x14x14xf32>) -> tensor<16x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x256x30x30xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<16x256x14x14xf32>) -> tensor<16x256x14x14xf32> - util.return %11 : tensor<16x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride1.mlir deleted file mode 100644 index ad8f50b..0000000 --- 
a/conv/mlir/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x128x30x30xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<16x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x128x28x28xf32>) -> tensor<16x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<16x128x28x28xf32>) -> tensor<16x128x28x28xf32> - util.return %11 : tensor<16x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride2.mlir deleted file mode 100644 index dfa1206..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x28x28x128x3x3x128_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x128x58x58xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<16x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x128x28x28xf32>) -> tensor<16x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x128x58x58xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<16x128x28x28xf32>) -> tensor<16x128x28x28xf32> - util.return %11 : tensor<16x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x28x28x512x1x1x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_16x28x28x512x1x1x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index 1a54757..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x28x28x512x1x1x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x512x56x56xf32>, %arg1: tensor<256x512x1x1xf32>) -> tensor<16x256x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x256x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x256x28x28xf32>) -> tensor<16x256x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x512x56x56xf32>, tensor<256x512x1x1xf32>) outs(%10 : tensor<16x256x28x28xf32>) -> tensor<16x256x28x28xf32> - util.return %11 : tensor<16x256x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x56x56x64x3x3x64_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_16x56x56x64x3x3x64_f32xf32xf32_stride1.mlir deleted file mode 100644 index cd5bb55..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x56x56x64x3x3x64_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x64x58x58xf32>, %arg1: tensor<64x64x3x3xf32>) -> tensor<16x64x56x56xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x64x56x56xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x64x56x56xf32>) -> tensor<16x64x56x56xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%10 : tensor<16x64x56x56xf32>) -> tensor<16x64x56x56xf32> - util.return %11 : tensor<16x64x56x56xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir 
b/conv/mlir/conv_2d_nchw_fchw_16x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir deleted file mode 100644 index 65bb530..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x2048x14x14xf32>, %arg1: tensor<1024x2048x1x1xf32>) -> tensor<16x1024x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x1024x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x1024x7x7xf32>) -> tensor<16x1024x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x2048x14x14xf32>, tensor<1024x2048x1x1xf32>) outs(%10 : tensor<16x1024x7x7xf32>) -> tensor<16x1024x7x7xf32> - util.return %11 : tensor<16x1024x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride1.mlir deleted file mode 100644 index 8be6811..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x512x9x9xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<16x512x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x512x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x512x7x7xf32>) -> tensor<16x512x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<16x512x7x7xf32>) -> tensor<16x512x7x7xf32> - util.return %11 : tensor<16x512x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride2.mlir deleted file mode 100644 index c29dbad..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_16x7x7x512x3x3x512_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<16x512x16x16xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<16x512x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<16x512x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<16x512x7x7xf32>) -> tensor<16x512x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<16x512x16x16xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<16x512x7x7xf32>) -> tensor<16x512x7x7xf32> - util.return %11 : tensor<16x512x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x112x112x64x7x7x3_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_1x112x112x64x7x7x3_f32xf32xf32_stride2.mlir deleted file mode 100644 index 6b8f39c..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x112x112x64x7x7x3_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x64x230x230xf32>, %arg1: tensor<3x64x7x7xf32>) -> tensor<1x3x112x112xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x3x112x112xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x3x112x112xf32>) -> tensor<1x3x112x112xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x64x230x230xf32>, tensor<3x64x7x7xf32>) outs(%10 : tensor<1x3x112x112xf32>) -> tensor<1x3x112x112xf32> - util.return %11 : tensor<1x3x112x112xf32> -} diff --git 
a/conv/mlir/conv_2d_nchw_fchw_1x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_1x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir deleted file mode 100644 index 979447f..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x1024x28x28xf32>, %arg1: tensor<512x1024x1x1xf32>) -> tensor<1x512x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x512x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x512x14x14xf32>) -> tensor<1x512x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x1024x28x28xf32>, tensor<512x1024x1x1xf32>) outs(%10 : tensor<1x512x14x14xf32>) -> tensor<1x512x14x14xf32> - util.return %11 : tensor<1x512x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride1.mlir deleted file mode 100644 index 5333cb1..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x256x16x16xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<1x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32> - util.return %11 : tensor<1x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index 9bde2cb..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x14x14x256x3x3x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x256x30x30xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<1x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x256x30x30xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32> - util.return %11 : tensor<1x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride1.mlir deleted file mode 100644 index 3fade95..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x128x30x30xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<1x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<1x128x28x28xf32>) -> 
tensor<1x128x28x28xf32> - util.return %11 : tensor<1x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride2.mlir deleted file mode 100644 index 1d69b02..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x28x28x128x3x3x128_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x128x58x58xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<1x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x128x58x58xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32> - util.return %11 : tensor<1x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x28x28x512x1x1x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_1x28x28x512x1x1x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index f64ac9d..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x28x28x512x1x1x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x512x56x56xf32>, %arg1: tensor<256x512x1x1xf32>) -> tensor<1x256x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x256x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x512x56x56xf32>, tensor<256x512x1x1xf32>) outs(%10 : tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32> - util.return %11 : tensor<1x256x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x56x56x64x3x3x64_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_1x56x56x64x3x3x64_f32xf32xf32_stride1.mlir deleted file mode 100644 index a519332..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x56x56x64x3x3x64_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x64x58x58xf32>, %arg1: tensor<64x64x3x3xf32>) -> tensor<1x64x56x56xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x64x56x56xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%10 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32> - util.return %11 : tensor<1x64x56x56xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x56x56x64x7x7x3_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_1x56x56x64x7x7x3_f32xf32xf32_stride1.mlir deleted file mode 100644 index e64129d..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x56x56x64x7x7x3_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ - -util.func public @main(%arg0: tensor<1x64x58x58xf32>, %arg1: tensor<3x64x7x7xf32>) -> tensor<1x64x56x56xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x64x56x56xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x64x58x58xf32>, tensor<3x64x7x7xf32>) outs(%10 : 
tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32> - util.return %11 : tensor<1x64x56x56xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_1x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir deleted file mode 100644 index 3e7dc3a..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x2048x14x14xf32>, %arg1: tensor<1024x2048x1x1xf32>) -> tensor<1x1024x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x1024x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x1024x7x7xf32>) -> tensor<1x1024x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x2048x14x14xf32>, tensor<1024x2048x1x1xf32>) outs(%10 : tensor<1x1024x7x7xf32>) -> tensor<1x1024x7x7xf32> - util.return %11 : tensor<1x1024x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride1.mlir deleted file mode 100644 index 437a522..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x512x9x9xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<1x512x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x512x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32> - util.return %11 : tensor<1x512x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride2.mlir deleted file mode 100644 index 745dff7..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_1x7x7x512x3x3x512_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<1x512x16x16xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<1x512x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<1x512x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x512x16x16xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32> - util.return %11 : tensor<1x512x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x112x112x64x7x7x3_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_2x112x112x64x7x7x3_f32xf32xf32_stride2.mlir deleted file mode 100644 index be31d37..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x112x112x64x7x7x3_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x64x230x230xf32>, %arg1: tensor<3x64x7x7xf32>) -> tensor<2x3x112x112xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x3x112x112xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x3x112x112xf32>) -> tensor<2x3x112x112xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x64x230x230xf32>, tensor<3x64x7x7xf32>) outs(%10 : 
tensor<2x3x112x112xf32>) -> tensor<2x3x112x112xf32> - util.return %11 : tensor<2x3x112x112xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_2x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir deleted file mode 100644 index 75c53d7..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x1024x28x28xf32>, %arg1: tensor<512x1024x1x1xf32>) -> tensor<2x512x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x512x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x512x14x14xf32>) -> tensor<2x512x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x1024x28x28xf32>, tensor<512x1024x1x1xf32>) outs(%10 : tensor<2x512x14x14xf32>) -> tensor<2x512x14x14xf32> - util.return %11 : tensor<2x512x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride1.mlir deleted file mode 100644 index 0086840..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x256x16x16xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<2x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x256x14x14xf32>) -> tensor<2x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<2x256x14x14xf32>) -> tensor<2x256x14x14xf32> - util.return %11 : tensor<2x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index 799d59b..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x14x14x256x3x3x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x256x30x30xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<2x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x256x14x14xf32>) -> tensor<2x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x256x30x30xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<2x256x14x14xf32>) -> tensor<2x256x14x14xf32> - util.return %11 : tensor<2x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride1.mlir deleted file mode 100644 index 4e3ac52..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x128x30x30xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<2x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x128x28x28xf32>) -> tensor<2x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : 
tensor<2x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<2x128x28x28xf32>) -> tensor<2x128x28x28xf32> - util.return %11 : tensor<2x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride2.mlir deleted file mode 100644 index 03f9ca2..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x28x28x128x3x3x128_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x128x58x58xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<2x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x128x28x28xf32>) -> tensor<2x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x128x58x58xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<2x128x28x28xf32>) -> tensor<2x128x28x28xf32> - util.return %11 : tensor<2x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x28x28x512x1x1x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_2x28x28x512x1x1x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index 837a4fe..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x28x28x512x1x1x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x512x56x56xf32>, %arg1: tensor<256x512x1x1xf32>) -> tensor<2x256x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x256x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x256x28x28xf32>) -> tensor<2x256x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x512x56x56xf32>, tensor<256x512x1x1xf32>) outs(%10 : tensor<2x256x28x28xf32>) -> tensor<2x256x28x28xf32> - util.return %11 : tensor<2x256x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x56x56x64x3x3x64_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_2x56x56x64x3x3x64_f32xf32xf32_stride1.mlir deleted file mode 100644 index 5a86b3d..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x56x56x64x3x3x64_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x64x58x58xf32>, %arg1: tensor<64x64x3x3xf32>) -> tensor<2x64x56x56xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x64x56x56xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x64x56x56xf32>) -> tensor<2x64x56x56xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%10 : tensor<2x64x56x56xf32>) -> tensor<2x64x56x56xf32> - util.return %11 : tensor<2x64x56x56xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_2x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir deleted file mode 100644 index 0febd5a..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x2048x14x14xf32>, %arg1: tensor<1024x2048x1x1xf32>) -> tensor<2x1024x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x1024x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x1024x7x7xf32>) -> tensor<2x1024x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : 
vector<2xi64>} ins(%arg0, %arg1 : tensor<2x2048x14x14xf32>, tensor<1024x2048x1x1xf32>) outs(%10 : tensor<2x1024x7x7xf32>) -> tensor<2x1024x7x7xf32> - util.return %11 : tensor<2x1024x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride1.mlir deleted file mode 100644 index 80c002a..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x512x9x9xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<2x512x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x512x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x512x7x7xf32>) -> tensor<2x512x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<2x512x7x7xf32>) -> tensor<2x512x7x7xf32> - util.return %11 : tensor<2x512x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride2.mlir deleted file mode 100644 index 9d3daa1..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_2x7x7x512x3x3x512_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<2x512x16x16xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<2x512x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<2x512x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x512x7x7xf32>) -> tensor<2x512x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x512x16x16xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<2x512x7x7xf32>) -> tensor<2x512x7x7xf32> - util.return %11 : tensor<2x512x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x112x112x64x7x7x3_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_32x112x112x64x7x7x3_f32xf32xf32_stride2.mlir deleted file mode 100644 index 78b135b..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x112x112x64x7x7x3_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x64x230x230xf32>, %arg1: tensor<3x64x7x7xf32>) -> tensor<32x3x112x112xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x3x112x112xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x3x112x112xf32>) -> tensor<32x3x112x112xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x64x230x230xf32>, tensor<3x64x7x7xf32>) outs(%10 : tensor<32x3x112x112xf32>) -> tensor<32x3x112x112xf32> - util.return %11 : tensor<32x3x112x112xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_32x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir deleted file mode 100644 index 7856bdf..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x1024x28x28xf32>, %arg1: tensor<512x1024x1x1xf32>) -> tensor<32x512x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x512x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x512x14x14xf32>) -> tensor<32x512x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : 
vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x1024x28x28xf32>, tensor<512x1024x1x1xf32>) outs(%10 : tensor<32x512x14x14xf32>) -> tensor<32x512x14x14xf32> - util.return %11 : tensor<32x512x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride1.mlir deleted file mode 100644 index 1e0e58c..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x256x16x16xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<32x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x256x14x14xf32>) -> tensor<32x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<32x256x14x14xf32>) -> tensor<32x256x14x14xf32> - util.return %11 : tensor<32x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index 56cf448..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x14x14x256x3x3x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x256x30x30xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<32x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x256x14x14xf32>) -> tensor<32x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x256x30x30xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<32x256x14x14xf32>) -> tensor<32x256x14x14xf32> - util.return %11 : tensor<32x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride1.mlir deleted file mode 100644 index a720340..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x128x30x30xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<32x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x128x28x28xf32>) -> tensor<32x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<32x128x28x28xf32>) -> tensor<32x128x28x28xf32> - util.return %11 : tensor<32x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride2.mlir deleted file mode 100644 index c206973..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x28x28x128x3x3x128_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x128x58x58xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<32x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : 
tensor<32x128x28x28xf32>) -> tensor<32x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x128x58x58xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<32x128x28x28xf32>) -> tensor<32x128x28x28xf32> - util.return %11 : tensor<32x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x28x28x512x1x1x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_32x28x28x512x1x1x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index 492581c..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x28x28x512x1x1x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x512x56x56xf32>, %arg1: tensor<256x512x1x1xf32>) -> tensor<32x256x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x256x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x256x28x28xf32>) -> tensor<32x256x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x512x56x56xf32>, tensor<256x512x1x1xf32>) outs(%10 : tensor<32x256x28x28xf32>) -> tensor<32x256x28x28xf32> - util.return %11 : tensor<32x256x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x56x56x64x3x3x64_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_32x56x56x64x3x3x64_f32xf32xf32_stride1.mlir deleted file mode 100644 index e3ffe53..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x56x56x64x3x3x64_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x64x58x58xf32>, %arg1: tensor<64x64x3x3xf32>) -> tensor<32x64x56x56xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x64x56x56xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x64x56x56xf32>) -> tensor<32x64x56x56xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%10 : tensor<32x64x56x56xf32>) -> tensor<32x64x56x56xf32> - util.return %11 : tensor<32x64x56x56xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_32x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir deleted file mode 100644 index 131cbd7..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x2048x14x14xf32>, %arg1: tensor<1024x2048x1x1xf32>) -> tensor<32x1024x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x1024x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x1024x7x7xf32>) -> tensor<32x1024x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x2048x14x14xf32>, tensor<1024x2048x1x1xf32>) outs(%10 : tensor<32x1024x7x7xf32>) -> tensor<32x1024x7x7xf32> - util.return %11 : tensor<32x1024x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride1.mlir deleted file mode 100644 index 254d882..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x512x9x9xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<32x512x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : 
tensor<32x512x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x512x7x7xf32>) -> tensor<32x512x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<32x512x7x7xf32>) -> tensor<32x512x7x7xf32> - util.return %11 : tensor<32x512x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride2.mlir deleted file mode 100644 index 2227e1b..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_32x7x7x512x3x3x512_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<32x512x16x16xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<32x512x7x7xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<32x512x7x7xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<32x512x7x7xf32>) -> tensor<32x512x7x7xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<32x512x16x16xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<32x512x7x7xf32>) -> tensor<32x512x7x7xf32> - util.return %11 : tensor<32x512x7x7xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_48x112x112x64x7x7x3_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_48x112x112x64x7x7x3_f32xf32xf32_stride2.mlir deleted file mode 100644 index c40a1d1..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_48x112x112x64x7x7x3_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<48x64x230x230xf32>, %arg1: tensor<3x64x7x7xf32>) -> tensor<48x3x112x112xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<48x3x112x112xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x3x112x112xf32>) -> tensor<48x3x112x112xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x64x230x230xf32>, tensor<3x64x7x7xf32>) outs(%10 : tensor<48x3x112x112xf32>) -> tensor<48x3x112x112xf32> - util.return %11 : tensor<48x3x112x112xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_48x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_48x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir deleted file mode 100644 index 1e19c6b..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_48x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<48x1024x28x28xf32>, %arg1: tensor<512x1024x1x1xf32>) -> tensor<48x512x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<48x512x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x512x14x14xf32>) -> tensor<48x512x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x1024x28x28xf32>, tensor<512x1024x1x1xf32>) outs(%10 : tensor<48x512x14x14xf32>) -> tensor<48x512x14x14xf32> - util.return %11 : tensor<48x512x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride1.mlir deleted file mode 100644 index e7eb010..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<48x256x16x16xf32>, %arg1: tensor<256x256x3x3xf32>) -> 
tensor<48x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<48x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x256x14x14xf32>) -> tensor<48x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<48x256x14x14xf32>) -> tensor<48x256x14x14xf32> - util.return %11 : tensor<48x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index 60a80b3..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_48x14x14x256x3x3x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<48x256x30x30xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<48x256x14x14xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<48x256x14x14xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x256x14x14xf32>) -> tensor<48x256x14x14xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x256x30x30xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<48x256x14x14xf32>) -> tensor<48x256x14x14xf32> - util.return %11 : tensor<48x256x14x14xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride1.mlir deleted file mode 100644 index 0c8451f..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride1.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<48x128x30x30xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<48x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<48x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x128x28x28xf32>) -> tensor<48x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<48x128x28x28xf32>) -> tensor<48x128x28x28xf32> - util.return %11 : tensor<48x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride2.mlir deleted file mode 100644 index 5da75c9..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_48x28x28x128x3x3x128_f32xf32xf32_stride2.mlir +++ /dev/null @@ -1,7 +0,0 @@ -util.func public @main(%arg0: tensor<48x128x58x58xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<48x128x28x28xf32> { - %cst = arith.constant 0.0 : f32 - %9 = tensor.empty() : tensor<48x128x28x28xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x128x28x28xf32>) -> tensor<48x128x28x28xf32> - %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x128x58x58xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<48x128x28x28xf32>) -> tensor<48x128x28x28xf32> - util.return %11 : tensor<48x128x28x28xf32> -} diff --git a/conv/mlir/conv_2d_nchw_fchw_48x28x28x512x1x1x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_48x28x28x512x1x1x256_f32xf32xf32_stride2.mlir deleted file mode 100644 index 111edbc..0000000 --- a/conv/mlir/conv_2d_nchw_fchw_48x28x28x512x1x1x256_f32xf32xf32_stride2.mlir +++ /dev/null @@ 
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<48x512x56x56xf32>, %arg1: tensor<256x512x1x1xf32>) -> tensor<48x256x28x28xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<48x256x28x28xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x256x28x28xf32>) -> tensor<48x256x28x28xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x512x56x56xf32>, tensor<256x512x1x1xf32>) outs(%10 : tensor<48x256x28x28xf32>) -> tensor<48x256x28x28xf32>
-  util.return %11 : tensor<48x256x28x28xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_48x56x56x64x3x3x64_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_48x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
deleted file mode 100644
index 976b273..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_48x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<48x64x58x58xf32>, %arg1: tensor<64x64x3x3xf32>) -> tensor<48x64x56x56xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<48x64x56x56xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x64x56x56xf32>) -> tensor<48x64x56x56xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%10 : tensor<48x64x56x56xf32>) -> tensor<48x64x56x56xf32>
-  util.return %11 : tensor<48x64x56x56xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_48x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_48x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
deleted file mode 100644
index 91ef8a1..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_48x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<48x2048x14x14xf32>, %arg1: tensor<1024x2048x1x1xf32>) -> tensor<48x1024x7x7xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<48x1024x7x7xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x1024x7x7xf32>) -> tensor<48x1024x7x7xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x2048x14x14xf32>, tensor<1024x2048x1x1xf32>) outs(%10 : tensor<48x1024x7x7xf32>) -> tensor<48x1024x7x7xf32>
-  util.return %11 : tensor<48x1024x7x7xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
deleted file mode 100644
index b4b108b..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<48x512x9x9xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<48x512x7x7xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<48x512x7x7xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x512x7x7xf32>) -> tensor<48x512x7x7xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<48x512x7x7xf32>) -> tensor<48x512x7x7xf32>
-  util.return %11 : tensor<48x512x7x7xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
deleted file mode 100644
index 4e64124..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_48x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<48x512x16x16xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<48x512x7x7xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<48x512x7x7xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<48x512x7x7xf32>) -> tensor<48x512x7x7xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<48x512x16x16xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<48x512x7x7xf32>) -> tensor<48x512x7x7xf32>
-  util.return %11 : tensor<48x512x7x7xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x112x112x64x7x7x3_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_4x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
deleted file mode 100644
index dfec8cd..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x64x230x230xf32>, %arg1: tensor<3x64x7x7xf32>) -> tensor<4x3x112x112xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x3x112x112xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x3x112x112xf32>) -> tensor<4x3x112x112xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x64x230x230xf32>, tensor<3x64x7x7xf32>) outs(%10 : tensor<4x3x112x112xf32>) -> tensor<4x3x112x112xf32>
-  util.return %11 : tensor<4x3x112x112xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_4x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
deleted file mode 100644
index a4be022..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x1024x28x28xf32>, %arg1: tensor<512x1024x1x1xf32>) -> tensor<4x512x14x14xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x512x14x14xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x512x14x14xf32>) -> tensor<4x512x14x14xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x1024x28x28xf32>, tensor<512x1024x1x1xf32>) outs(%10 : tensor<4x512x14x14xf32>) -> tensor<4x512x14x14xf32>
-  util.return %11 : tensor<4x512x14x14xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
deleted file mode 100644
index 84cb673..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x256x16x16xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<4x256x14x14xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x256x14x14xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf32>
-  util.return %11 : tensor<4x256x14x14xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
deleted file mode 100644
index 0e72024..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x256x30x30xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<4x256x14x14xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x256x14x14xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x256x30x30xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf32>
-  util.return %11 : tensor<4x256x14x14xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
deleted file mode 100644
index f4b6e62..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x128x30x30xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<4x128x28x28xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x128x28x28xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf32>
-  util.return %11 : tensor<4x128x28x28xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
deleted file mode 100644
index b8b87bb..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x128x58x58xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<4x128x28x28xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x128x28x28xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x128x58x58xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf32>
-  util.return %11 : tensor<4x128x28x28xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x28x28x512x1x1x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_4x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
deleted file mode 100644
index 29e7dc8..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x512x56x56xf32>, %arg1: tensor<256x512x1x1xf32>) -> tensor<4x256x28x28xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x256x28x28xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x256x28x28xf32>) -> tensor<4x256x28x28xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x512x56x56xf32>, tensor<256x512x1x1xf32>) outs(%10 : tensor<4x256x28x28xf32>) -> tensor<4x256x28x28xf32>
-  util.return %11 : tensor<4x256x28x28xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x56x56x64x3x3x64_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_4x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
deleted file mode 100644
index 3e1b498..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x64x58x58xf32>, %arg1: tensor<64x64x3x3xf32>) -> tensor<4x64x56x56xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x64x56x56xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%10 : tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf32>
-  util.return %11 : tensor<4x64x56x56xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_4x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
deleted file mode 100644
index 351a3f1..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x2048x14x14xf32>, %arg1: tensor<1024x2048x1x1xf32>) -> tensor<4x1024x7x7xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x1024x7x7xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x1024x7x7xf32>) -> tensor<4x1024x7x7xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x2048x14x14xf32>, tensor<1024x2048x1x1xf32>) outs(%10 : tensor<4x1024x7x7xf32>) -> tensor<4x1024x7x7xf32>
-  util.return %11 : tensor<4x1024x7x7xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
deleted file mode 100644
index 106e477..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x512x9x9xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<4x512x7x7xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x512x7x7xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf32>
-  util.return %11 : tensor<4x512x7x7xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
deleted file mode 100644
index d3535b6..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_4x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<4x512x16x16xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<4x512x7x7xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<4x512x7x7xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<4x512x16x16xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf32>
-  util.return %11 : tensor<4x512x7x7xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x112x112x64x7x7x3_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_8x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
deleted file mode 100644
index c280b6c..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x112x112x64x7x7x3_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x64x230x230xf32>, %arg1: tensor<3x64x7x7xf32>) -> tensor<8x3x112x112xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x3x112x112xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x3x112x112xf32>) -> tensor<8x3x112x112xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x64x230x230xf32>, tensor<3x64x7x7xf32>) outs(%10 : tensor<8x3x112x112xf32>) -> tensor<8x3x112x112xf32>
-  util.return %11 : tensor<8x3x112x112xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_8x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
deleted file mode 100644
index 3a1b502..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x14x14x1024x1x1x512_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x1024x28x28xf32>, %arg1: tensor<512x1024x1x1xf32>) -> tensor<8x512x14x14xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x512x14x14xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x512x14x14xf32>) -> tensor<8x512x14x14xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x1024x28x28xf32>, tensor<512x1024x1x1xf32>) outs(%10 : tensor<8x512x14x14xf32>) -> tensor<8x512x14x14xf32>
-  util.return %11 : tensor<8x512x14x14xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
deleted file mode 100644
index dbdd40c..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x256x16x16xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<8x256x14x14xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x256x14x14xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x256x14x14xf32>) -> tensor<8x256x14x14xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<8x256x14x14xf32>) -> tensor<8x256x14x14xf32>
-  util.return %11 : tensor<8x256x14x14xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
deleted file mode 100644
index d8c468b..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x14x14x256x3x3x256_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x256x30x30xf32>, %arg1: tensor<256x256x3x3xf32>) -> tensor<8x256x14x14xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x256x14x14xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x256x14x14xf32>) -> tensor<8x256x14x14xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x256x30x30xf32>, tensor<256x256x3x3xf32>) outs(%10 : tensor<8x256x14x14xf32>) -> tensor<8x256x14x14xf32>
-  util.return %11 : tensor<8x256x14x14xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
deleted file mode 100644
index 4ff5f80..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x128x30x30xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<8x128x28x28xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x128x28x28xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x128x28x28xf32>) -> tensor<8x128x28x28xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<8x128x28x28xf32>) -> tensor<8x128x28x28xf32>
-  util.return %11 : tensor<8x128x28x28xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
deleted file mode 100644
index 511eb75..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x28x28x128x3x3x128_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x128x58x58xf32>, %arg1: tensor<128x128x3x3xf32>) -> tensor<8x128x28x28xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x128x28x28xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x128x28x28xf32>) -> tensor<8x128x28x28xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x128x58x58xf32>, tensor<128x128x3x3xf32>) outs(%10 : tensor<8x128x28x28xf32>) -> tensor<8x128x28x28xf32>
-  util.return %11 : tensor<8x128x28x28xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x28x28x512x1x1x256_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_8x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
deleted file mode 100644
index c318b1f..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x28x28x512x1x1x256_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x512x56x56xf32>, %arg1: tensor<256x512x1x1xf32>) -> tensor<8x256x28x28xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x256x28x28xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x256x28x28xf32>) -> tensor<8x256x28x28xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x512x56x56xf32>, tensor<256x512x1x1xf32>) outs(%10 : tensor<8x256x28x28xf32>) -> tensor<8x256x28x28xf32>
-  util.return %11 : tensor<8x256x28x28xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x56x56x64x3x3x64_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_8x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
deleted file mode 100644
index ce3336c..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x56x56x64x3x3x64_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x64x58x58xf32>, %arg1: tensor<64x64x3x3xf32>) -> tensor<8x64x56x56xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x64x56x56xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x64x56x56xf32>) -> tensor<8x64x56x56xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%10 : tensor<8x64x56x56xf32>) -> tensor<8x64x56x56xf32>
-  util.return %11 : tensor<8x64x56x56xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_8x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
deleted file mode 100644
index e910d9d..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x7x7x2048x1x1x1024_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x2048x14x14xf32>, %arg1: tensor<1024x2048x1x1xf32>) -> tensor<8x1024x7x7xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x1024x7x7xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x1024x7x7xf32>) -> tensor<8x1024x7x7xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x2048x14x14xf32>, tensor<1024x2048x1x1xf32>) outs(%10 : tensor<8x1024x7x7xf32>) -> tensor<8x1024x7x7xf32>
-  util.return %11 : tensor<8x1024x7x7xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride1.mlir b/conv/mlir/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
deleted file mode 100644
index e2700fc..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride1.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x512x9x9xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<8x512x7x7xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x512x7x7xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x512x7x7xf32>) -> tensor<8x512x7x7xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<8x512x7x7xf32>) -> tensor<8x512x7x7xf32>
-  util.return %11 : tensor<8x512x7x7xf32>
-}
diff --git a/conv/mlir/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride2.mlir b/conv/mlir/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
deleted file mode 100644
index 4b6bdae..0000000
--- a/conv/mlir/conv_2d_nchw_fchw_8x7x7x512x3x3x512_f32xf32xf32_stride2.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-util.func public @main(%arg0: tensor<8x512x16x16xf32>, %arg1: tensor<512x512x3x3xf32>) -> tensor<8x512x7x7xf32> {
-  %cst = arith.constant 0.0 : f32
-  %9 = tensor.empty() : tensor<8x512x7x7xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x512x7x7xf32>) -> tensor<8x512x7x7xf32>
-  %11 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1 : tensor<8x512x16x16xf32>, tensor<512x512x3x3xf32>) outs(%10 : tensor<8x512x7x7xf32>) -> tensor<8x512x7x7xf32>
-  util.return %11 : tensor<8x512x7x7xf32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x112x112x64x7x7x3_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 51aa523..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x230x230x64xi8>, %arg1: tensor<7x7x64x3xi8>) -> tensor<16x112x112x3xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x112x112x3xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x112x112x3xi32>) -> tensor<16x112x112x3xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x230x230x64xi8>, tensor<7x7x64x3xi8>, i32, i32) outs(%10 : tensor<16x112x112x3xi32>) -> tensor<16x112x112x3xi32>
-  util.return %11 : tensor<16x112x112x3xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 901f76d..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x28x28x1024xi8>, %arg1: tensor<1x1x1024x512xi8>) -> tensor<16x14x14x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x14x14x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x14x14x512xi32>) -> tensor<16x14x14x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x28x28x1024xi8>, tensor<1x1x1024x512xi8>, i32, i32) outs(%10 : tensor<16x14x14x512xi32>) -> tensor<16x14x14x512xi32>
-  util.return %11 : tensor<16x14x14x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 9cccdc5..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x16x16x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<16x14x14x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x14x14x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x14x14x256xi32>) -> tensor<16x14x14x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x16x16x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<16x14x14x256xi32>) -> tensor<16x14x14x256xi32>
-  util.return %11 : tensor<16x14x14x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 5ef0460..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x30x30x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<16x14x14x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x14x14x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x14x14x256xi32>) -> tensor<16x14x14x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x30x30x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<16x14x14x256xi32>) -> tensor<16x14x14x256xi32>
-  util.return %11 : tensor<16x14x14x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 39443af..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x30x30x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<16x28x28x128xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x28x28x128xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x28x28x128xi32>) -> tensor<16x28x28x128xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x30x30x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<16x28x28x128xi32>) -> tensor<16x28x28x128xi32>
-  util.return %11 : tensor<16x28x28x128xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 1b2c1e8..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x58x58x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<16x28x28x128xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x28x28x128xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x28x28x128xi32>) -> tensor<16x28x28x128xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x58x58x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<16x28x28x128xi32>) -> tensor<16x28x28x128xi32>
-  util.return %11 : tensor<16x28x28x128xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x512x1x1x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
deleted file mode 100644
index dc625e6..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x56x56x512xi8>, %arg1: tensor<1x1x512x256xi8>) -> tensor<16x28x28x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x28x28x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x28x28x256xi32>) -> tensor<16x28x28x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x56x56x512xi8>, tensor<1x1x512x256xi8>, i32, i32) outs(%10 : tensor<16x28x28x256xi32>) -> tensor<16x28x28x256xi32>
-  util.return %11 : tensor<16x28x28x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x56x56x64x3x3x64_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
deleted file mode 100644
index ada033c..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x58x58x64xi8>, %arg1: tensor<3x3x64x64xi8>) -> tensor<16x56x56x64xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x56x56x64xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x56x56x64xi32>) -> tensor<16x56x56x64xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x58x58x64xi8>, tensor<3x3x64x64xi8>, i32, i32) outs(%10 : tensor<16x56x56x64xi32>) -> tensor<16x56x56x64xi32>
-  util.return %11 : tensor<16x56x56x64xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
deleted file mode 100644
index d0273cb..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x14x14x2048xi8>, %arg1: tensor<1x1x2048x1024xi8>) -> tensor<16x7x7x1024xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x7x7x1024xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x7x7x1024xi32>) -> tensor<16x7x7x1024xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x14x14x2048xi8>, tensor<1x1x2048x1024xi8>, i32, i32) outs(%10 : tensor<16x7x7x1024xi32>) -> tensor<16x7x7x1024xi32>
-  util.return %11 : tensor<16x7x7x1024xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 79d6bd8..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x9x9x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<16x7x7x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x7x7x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x7x7x512xi32>) -> tensor<16x7x7x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x9x9x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<16x7x7x512xi32>) -> tensor<16x7x7x512xi32>
-  util.return %11 : tensor<16x7x7x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
deleted file mode 100644
index d815eba..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_16x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<16x16x16x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<16x7x7x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<16x7x7x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<16x7x7x512xi32>) -> tensor<16x7x7x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<16x16x16x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<16x7x7x512xi32>) -> tensor<16x7x7x512xi32>
-  util.return %11 : tensor<16x7x7x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x112x112x64x7x7x3_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 8f8aef5..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x230x230x64xi8>, %arg1: tensor<7x7x64x3xi8>) -> tensor<1x112x112x3xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x112x112x3xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x112x112x3xi32>) -> tensor<1x112x112x3xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x230x230x64xi8>, tensor<7x7x64x3xi8>, i32, i32) outs(%10 : tensor<1x112x112x3xi32>) -> tensor<1x112x112x3xi32>
-  util.return %11 : tensor<1x112x112x3xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 22483c7..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x28x28x1024xi8>, %arg1: tensor<1x1x1024x512xi8>) -> tensor<1x14x14x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x14x14x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x14x14x512xi32>) -> tensor<1x14x14x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x28x28x1024xi8>, tensor<1x1x1024x512xi8>, i32, i32) outs(%10 : tensor<1x14x14x512xi32>) -> tensor<1x14x14x512xi32>
-  util.return %11 : tensor<1x14x14x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 0f680d6..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x16x16x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<1x14x14x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x14x14x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x14x14x256xi32>) -> tensor<1x14x14x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x16x16x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<1x14x14x256xi32>) -> tensor<1x14x14x256xi32>
-  util.return %11 : tensor<1x14x14x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
deleted file mode 100644
index ac0f447..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x30x30x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<1x14x14x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x14x14x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x14x14x256xi32>) -> tensor<1x14x14x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x30x30x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<1x14x14x256xi32>) -> tensor<1x14x14x256xi32>
-  util.return %11 : tensor<1x14x14x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
deleted file mode 100644
index d26a99b..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x30x30x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<1x28x28x128xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x28x28x128xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x28x28x128xi32>) -> tensor<1x28x28x128xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x30x30x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<1x28x28x128xi32>) -> tensor<1x28x28x128xi32>
-  util.return %11 : tensor<1x28x28x128xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 5b5d2e7..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x58x58x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<1x28x28x128xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x28x28x128xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x28x28x128xi32>) -> tensor<1x28x28x128xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x58x58x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<1x28x28x128xi32>) -> tensor<1x28x28x128xi32>
-  util.return %11 : tensor<1x28x28x128xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x512x1x1x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
deleted file mode 100644
index e8d1574..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x56x56x512xi8>, %arg1: tensor<1x1x512x256xi8>) -> tensor<1x28x28x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x28x28x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x28x28x256xi32>) -> tensor<1x28x28x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x56x56x512xi8>, tensor<1x1x512x256xi8>, i32, i32) outs(%10 : tensor<1x28x28x256xi32>) -> tensor<1x28x28x256xi32>
-  util.return %11 : tensor<1x28x28x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x56x56x64x3x3x64_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
deleted file mode 100644
index f33d50a..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x58x58x64xi8>, %arg1: tensor<3x3x64x64xi8>) -> tensor<1x56x56x64xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x56x56x64xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x56x56x64xi32>) -> tensor<1x56x56x64xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x58x58x64xi8>, tensor<3x3x64x64xi8>, i32, i32) outs(%10 : tensor<1x56x56x64xi32>) -> tensor<1x56x56x64xi32>
-  util.return %11 : tensor<1x56x56x64xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
deleted file mode 100644
index ba91ced..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x14x14x2048xi8>, %arg1: tensor<1x1x2048x1024xi8>) -> tensor<1x7x7x1024xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x7x7x1024xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x7x7x1024xi32>) -> tensor<1x7x7x1024xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x14x14x2048xi8>, tensor<1x1x2048x1024xi8>, i32, i32) outs(%10 : tensor<1x7x7x1024xi32>) -> tensor<1x7x7x1024xi32>
-  util.return %11 : tensor<1x7x7x1024xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 9b3d433..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x9x9x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<1x7x7x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x7x7x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x7x7x512xi32>) -> tensor<1x7x7x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x9x9x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<1x7x7x512xi32>) -> tensor<1x7x7x512xi32>
-  util.return %11 : tensor<1x7x7x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 57902ba..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_1x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<1x16x16x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<1x7x7x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<1x7x7x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<1x7x7x512xi32>) -> tensor<1x7x7x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<1x16x16x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<1x7x7x512xi32>) -> tensor<1x7x7x512xi32>
-  util.return %11 : tensor<1x7x7x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x112x112x64x7x7x3_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 980db88..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x230x230x64xi8>, %arg1: tensor<7x7x64x3xi8>) -> tensor<2x112x112x3xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x112x112x3xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x112x112x3xi32>) -> tensor<2x112x112x3xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x230x230x64xi8>, tensor<7x7x64x3xi8>, i32, i32) outs(%10 : tensor<2x112x112x3xi32>) -> tensor<2x112x112x3xi32>
-  util.return %11 : tensor<2x112x112x3xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 17c720e..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x28x28x1024xi8>, %arg1: tensor<1x1x1024x512xi8>) -> tensor<2x14x14x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x14x14x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x14x14x512xi32>) -> tensor<2x14x14x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x28x28x1024xi8>, tensor<1x1x1024x512xi8>, i32, i32) outs(%10 : tensor<2x14x14x512xi32>) -> tensor<2x14x14x512xi32>
-  util.return %11 : tensor<2x14x14x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 07690ae..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x16x16x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<2x14x14x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x14x14x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x14x14x256xi32>) -> tensor<2x14x14x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x16x16x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<2x14x14x256xi32>) -> tensor<2x14x14x256xi32>
-  util.return %11 : tensor<2x14x14x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 0a72f46..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x30x30x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<2x14x14x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x14x14x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x14x14x256xi32>) -> tensor<2x14x14x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x30x30x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<2x14x14x256xi32>) -> tensor<2x14x14x256xi32>
-  util.return %11 : tensor<2x14x14x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 701dfa9..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x30x30x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<2x28x28x128xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x28x28x128xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x28x28x128xi32>) -> tensor<2x28x28x128xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x30x30x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<2x28x28x128xi32>) -> tensor<2x28x28x128xi32>
-  util.return %11 : tensor<2x28x28x128xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
deleted file mode 100644
index fb40589..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x128x3x3x128_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x58x58x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<2x28x28x128xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x28x28x128xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x28x28x128xi32>) -> tensor<2x28x28x128xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x58x58x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<2x28x28x128xi32>) -> tensor<2x28x28x128xi32>
-  util.return %11 : tensor<2x28x28x128xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x512x1x1x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 96af59a..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x28x28x512x1x1x256_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x56x56x512xi8>, %arg1: tensor<1x1x512x256xi8>) -> tensor<2x28x28x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x28x28x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x28x28x256xi32>) -> tensor<2x28x28x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x56x56x512xi8>, tensor<1x1x512x256xi8>, i32, i32) outs(%10 : tensor<2x28x28x256xi32>) -> tensor<2x28x28x256xi32>
-  util.return %11 : tensor<2x28x28x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x56x56x64x3x3x64_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
deleted file mode 100644
index be5037c..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x56x56x64x3x3x64_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x58x58x64xi8>, %arg1: tensor<3x3x64x64xi8>) -> tensor<2x56x56x64xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x56x56x64xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x56x56x64xi32>) -> tensor<2x56x56x64xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x58x58x64xi8>, tensor<3x3x64x64xi8>, i32, i32) outs(%10 : tensor<2x56x56x64xi32>) -> tensor<2x56x56x64xi32>
-  util.return %11 : tensor<2x56x56x64xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 630dbb4..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x14x14x2048xi8>, %arg1: tensor<1x1x2048x1024xi8>) -> tensor<2x7x7x1024xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x7x7x1024xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x7x7x1024xi32>) -> tensor<2x7x7x1024xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x14x14x2048xi8>, tensor<1x1x2048x1024xi8>, i32, i32) outs(%10 : tensor<2x7x7x1024xi32>) -> tensor<2x7x7x1024xi32>
-  util.return %11 : tensor<2x7x7x1024xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 63fddc7..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x9x9x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<2x7x7x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x7x7x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x7x7x512xi32>) -> tensor<2x7x7x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x9x9x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<2x7x7x512xi32>) -> tensor<2x7x7x512xi32>
-  util.return %11 : tensor<2x7x7x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 3a7790a..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_2x7x7x512x3x3x512_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<2x16x16x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<2x7x7x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<2x7x7x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<2x7x7x512xi32>) -> tensor<2x7x7x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<2x16x16x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<2x7x7x512xi32>) -> tensor<2x7x7x512xi32>
-  util.return %11 : tensor<2x7x7x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x112x112x64x7x7x3_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
deleted file mode 100644
index b402202..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x112x112x64x7x7x3_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<32x230x230x64xi8>, %arg1: tensor<7x7x64x3xi8>) -> tensor<32x112x112x3xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<32x112x112x3xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x112x112x3xi32>) -> tensor<32x112x112x3xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x230x230x64xi8>, tensor<7x7x64x3xi8>, i32, i32) outs(%10 : tensor<32x112x112x3xi32>) -> tensor<32x112x112x3xi32>
-  util.return %11 : tensor<32x112x112x3xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
deleted file mode 100644
index c74cd1c..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<32x28x28x1024xi8>, %arg1: tensor<1x1x1024x512xi8>) -> tensor<32x14x14x512xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<32x14x14x512xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x14x14x512xi32>) -> tensor<32x14x14x512xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x28x28x1024xi8>, tensor<1x1x1024x512xi8>, i32, i32) outs(%10 : tensor<32x14x14x512xi32>) -> tensor<32x14x14x512xi32>
-  util.return %11 : tensor<32x14x14x512xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 22ae40a..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<32x16x16x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<32x14x14x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<32x14x14x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x14x14x256xi32>) -> tensor<32x14x14x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x16x16x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<32x14x14x256xi32>) -> tensor<32x14x14x256xi32>
-  util.return %11 : tensor<32x14x14x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
deleted file mode 100644
index 4526e5e..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x14x14x256x3x3x256_i8xi8xi32_stride2.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<32x30x30x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<32x14x14x256xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<32x14x14x256xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x14x14x256xi32>) -> tensor<32x14x14x256xi32>
-  %c0_i32 = arith.constant 0 : i32
-  %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x30x30x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<32x14x14x256xi32>) -> tensor<32x14x14x256xi32>
-  util.return %11 : tensor<32x14x14x256xi32>
-}
diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
deleted file mode 100644
index 9e6be94..0000000
--- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride1.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-util.func public @main(%arg0: tensor<32x30x30x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<32x28x28x128xi32> {
-  %cst = arith.constant 0 : i32
-  %9 = tensor.empty() : tensor<32x28x28x128xi32>
-  %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x28x28x128xi32>) -> tensor<32x28x28x128xi32>
-  %c0_i32 = arith.constant 0 : i32
vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x30x30x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<32x28x28x128xi32>) -> tensor<32x28x28x128xi32> - util.return %11 : tensor<32x28x28x128xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride2.mlir deleted file mode 100644 index bc1357a..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x128x3x3x128_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<32x58x58x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<32x28x28x128xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<32x28x28x128xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x28x28x128xi32>) -> tensor<32x28x28x128xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x58x58x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<32x28x28x128xi32>) -> tensor<32x28x28x128xi32> - util.return %11 : tensor<32x28x28x128xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x512x1x1x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x512x1x1x256_i8xi8xi32_stride2.mlir deleted file mode 100644 index 769904d..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x28x28x512x1x1x256_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<32x56x56x512xi8>, %arg1: tensor<1x1x512x256xi8>) -> tensor<32x28x28x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<32x28x28x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x28x28x256xi32>) -> tensor<32x28x28x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x56x56x512xi8>, tensor<1x1x512x256xi8>, i32, i32) outs(%10 : tensor<32x28x28x256xi32>) -> tensor<32x28x28x256xi32> - util.return %11 : tensor<32x28x28x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x56x56x64x3x3x64_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x56x56x64x3x3x64_i8xi8xi32_stride1.mlir deleted file mode 100644 index 91cfcb4..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x56x56x64x3x3x64_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<32x58x58x64xi8>, %arg1: tensor<3x3x64x64xi8>) -> tensor<32x56x56x64xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<32x56x56x64xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x56x56x64xi32>) -> tensor<32x56x56x64xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x58x58x64xi8>, tensor<3x3x64x64xi8>, i32, i32) outs(%10 : tensor<32x56x56x64xi32>) -> tensor<32x56x56x64xi32> - util.return %11 : tensor<32x56x56x64xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir deleted file mode 100644 index 8cd1114..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<32x14x14x2048xi8>, %arg1: 
tensor<1x1x2048x1024xi8>) -> tensor<32x7x7x1024xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<32x7x7x1024xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x7x7x1024xi32>) -> tensor<32x7x7x1024xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x14x14x2048xi8>, tensor<1x1x2048x1024xi8>, i32, i32) outs(%10 : tensor<32x7x7x1024xi32>) -> tensor<32x7x7x1024xi32> - util.return %11 : tensor<32x7x7x1024xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride1.mlir deleted file mode 100644 index 2ae453e..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<32x9x9x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<32x7x7x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<32x7x7x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x7x7x512xi32>) -> tensor<32x7x7x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x9x9x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<32x7x7x512xi32>) -> tensor<32x7x7x512xi32> - util.return %11 : tensor<32x7x7x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride2.mlir deleted file mode 100644 index 58c5b1e..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_32x7x7x512x3x3x512_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<32x16x16x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<32x7x7x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<32x7x7x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<32x7x7x512xi32>) -> tensor<32x7x7x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<32x16x16x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<32x7x7x512xi32>) -> tensor<32x7x7x512xi32> - util.return %11 : tensor<32x7x7x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x112x112x64x7x7x3_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x112x112x64x7x7x3_i8xi8xi32_stride2.mlir deleted file mode 100644 index 2c8253e..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x112x112x64x7x7x3_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x230x230x64xi8>, %arg1: tensor<7x7x64x3xi8>) -> tensor<48x112x112x3xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x112x112x3xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x112x112x3xi32>) -> tensor<48x112x112x3xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x230x230x64xi8>, tensor<7x7x64x3xi8>, i32, i32) outs(%10 : tensor<48x112x112x3xi32>) -> tensor<48x112x112x3xi32> - util.return %11 : tensor<48x112x112x3xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir 
b/conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir deleted file mode 100644 index 5289794..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x28x28x1024xi8>, %arg1: tensor<1x1x1024x512xi8>) -> tensor<48x14x14x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x14x14x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x14x14x512xi32>) -> tensor<48x14x14x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x28x28x1024xi8>, tensor<1x1x1024x512xi8>, i32, i32) outs(%10 : tensor<48x14x14x512xi32>) -> tensor<48x14x14x512xi32> - util.return %11 : tensor<48x14x14x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride1.mlir deleted file mode 100644 index 3f1f5dd..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x16x16x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<48x14x14x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x14x14x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x14x14x256xi32>) -> tensor<48x14x14x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x16x16x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<48x14x14x256xi32>) -> tensor<48x14x14x256xi32> - util.return %11 : tensor<48x14x14x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride2.mlir deleted file mode 100644 index 8a9306c..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x14x14x256x3x3x256_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x30x30x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<48x14x14x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x14x14x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x14x14x256xi32>) -> tensor<48x14x14x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x30x30x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<48x14x14x256xi32>) -> tensor<48x14x14x256xi32> - util.return %11 : tensor<48x14x14x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride1.mlir deleted file mode 100644 index 30cdd37..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x30x30x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<48x28x28x128xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x28x28x128xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x28x28x128xi32>) -> tensor<48x28x28x128xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : 
vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x30x30x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<48x28x28x128xi32>) -> tensor<48x28x28x128xi32> - util.return %11 : tensor<48x28x28x128xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride2.mlir deleted file mode 100644 index c044d56..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x128x3x3x128_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x58x58x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<48x28x28x128xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x28x28x128xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x28x28x128xi32>) -> tensor<48x28x28x128xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x58x58x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<48x28x28x128xi32>) -> tensor<48x28x28x128xi32> - util.return %11 : tensor<48x28x28x128xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x512x1x1x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x512x1x1x256_i8xi8xi32_stride2.mlir deleted file mode 100644 index 78cc002..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x28x28x512x1x1x256_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x56x56x512xi8>, %arg1: tensor<1x1x512x256xi8>) -> tensor<48x28x28x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x28x28x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x28x28x256xi32>) -> tensor<48x28x28x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x56x56x512xi8>, tensor<1x1x512x256xi8>, i32, i32) outs(%10 : tensor<48x28x28x256xi32>) -> tensor<48x28x28x256xi32> - util.return %11 : tensor<48x28x28x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x56x56x64x3x3x64_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x56x56x64x3x3x64_i8xi8xi32_stride1.mlir deleted file mode 100644 index 5c4839e..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x56x56x64x3x3x64_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x58x58x64xi8>, %arg1: tensor<3x3x64x64xi8>) -> tensor<48x56x56x64xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x56x56x64xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x56x56x64xi32>) -> tensor<48x56x56x64xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x58x58x64xi8>, tensor<3x3x64x64xi8>, i32, i32) outs(%10 : tensor<48x56x56x64xi32>) -> tensor<48x56x56x64xi32> - util.return %11 : tensor<48x56x56x64xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir deleted file mode 100644 index 7a087b0..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x14x14x2048xi8>, %arg1: 
tensor<1x1x2048x1024xi8>) -> tensor<48x7x7x1024xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x7x7x1024xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x7x7x1024xi32>) -> tensor<48x7x7x1024xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x14x14x2048xi8>, tensor<1x1x2048x1024xi8>, i32, i32) outs(%10 : tensor<48x7x7x1024xi32>) -> tensor<48x7x7x1024xi32> - util.return %11 : tensor<48x7x7x1024xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride1.mlir deleted file mode 100644 index 8873a93..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x9x9x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<48x7x7x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x7x7x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x7x7x512xi32>) -> tensor<48x7x7x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x9x9x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<48x7x7x512xi32>) -> tensor<48x7x7x512xi32> - util.return %11 : tensor<48x7x7x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride2.mlir deleted file mode 100644 index b4fa224..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_48x7x7x512x3x3x512_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<48x16x16x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<48x7x7x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<48x7x7x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<48x7x7x512xi32>) -> tensor<48x7x7x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<48x16x16x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<48x7x7x512xi32>) -> tensor<48x7x7x512xi32> - util.return %11 : tensor<48x7x7x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x112x112x64x7x7x3_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x112x112x64x7x7x3_i8xi8xi32_stride2.mlir deleted file mode 100644 index 19cfe1e..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x112x112x64x7x7x3_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x230x230x64xi8>, %arg1: tensor<7x7x64x3xi8>) -> tensor<4x112x112x3xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x112x112x3xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x112x112x3xi32>) -> tensor<4x112x112x3xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x230x230x64xi8>, tensor<7x7x64x3xi8>, i32, i32) outs(%10 : tensor<4x112x112x3xi32>) -> tensor<4x112x112x3xi32> - util.return %11 : tensor<4x112x112x3xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir 
b/conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir deleted file mode 100644 index d05f703..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x28x28x1024xi8>, %arg1: tensor<1x1x1024x512xi8>) -> tensor<4x14x14x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x14x14x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x14x14x512xi32>) -> tensor<4x14x14x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x28x28x1024xi8>, tensor<1x1x1024x512xi8>, i32, i32) outs(%10 : tensor<4x14x14x512xi32>) -> tensor<4x14x14x512xi32> - util.return %11 : tensor<4x14x14x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride1.mlir deleted file mode 100644 index 8087212..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x16x16x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<4x14x14x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x14x14x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x14x14x256xi32>) -> tensor<4x14x14x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x16x16x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<4x14x14x256xi32>) -> tensor<4x14x14x256xi32> - util.return %11 : tensor<4x14x14x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride2.mlir deleted file mode 100644 index 0757c3d..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x14x14x256x3x3x256_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x30x30x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<4x14x14x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x14x14x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x14x14x256xi32>) -> tensor<4x14x14x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x30x30x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<4x14x14x256xi32>) -> tensor<4x14x14x256xi32> - util.return %11 : tensor<4x14x14x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride1.mlir deleted file mode 100644 index 65c9515..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x30x30x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<4x28x28x128xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x28x28x128xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x28x28x128xi32>) -> tensor<4x28x28x128xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : 
vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x30x30x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<4x28x28x128xi32>) -> tensor<4x28x28x128xi32> - util.return %11 : tensor<4x28x28x128xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride2.mlir deleted file mode 100644 index b1977ac..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x128x3x3x128_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x58x58x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<4x28x28x128xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x28x28x128xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x28x28x128xi32>) -> tensor<4x28x28x128xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x58x58x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<4x28x28x128xi32>) -> tensor<4x28x28x128xi32> - util.return %11 : tensor<4x28x28x128xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x512x1x1x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x512x1x1x256_i8xi8xi32_stride2.mlir deleted file mode 100644 index 954f14c..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x28x28x512x1x1x256_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x56x56x512xi8>, %arg1: tensor<1x1x512x256xi8>) -> tensor<4x28x28x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x28x28x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x28x28x256xi32>) -> tensor<4x28x28x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x56x56x512xi8>, tensor<1x1x512x256xi8>, i32, i32) outs(%10 : tensor<4x28x28x256xi32>) -> tensor<4x28x28x256xi32> - util.return %11 : tensor<4x28x28x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x56x56x64x3x3x64_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x56x56x64x3x3x64_i8xi8xi32_stride1.mlir deleted file mode 100644 index 13553f3..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x56x56x64x3x3x64_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x58x58x64xi8>, %arg1: tensor<3x3x64x64xi8>) -> tensor<4x56x56x64xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x56x56x64xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x56x56x64xi32>) -> tensor<4x56x56x64xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x58x58x64xi8>, tensor<3x3x64x64xi8>, i32, i32) outs(%10 : tensor<4x56x56x64xi32>) -> tensor<4x56x56x64xi32> - util.return %11 : tensor<4x56x56x64xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir deleted file mode 100644 index e4c857e..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x14x14x2048xi8>, %arg1: tensor<1x1x2048x1024xi8>) -> tensor<4x7x7x1024xi32> { - %cst = arith.constant 0 : i32 - 
%9 = tensor.empty() : tensor<4x7x7x1024xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x7x7x1024xi32>) -> tensor<4x7x7x1024xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x14x14x2048xi8>, tensor<1x1x2048x1024xi8>, i32, i32) outs(%10 : tensor<4x7x7x1024xi32>) -> tensor<4x7x7x1024xi32> - util.return %11 : tensor<4x7x7x1024xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride1.mlir deleted file mode 100644 index 37b65ed..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x9x9x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<4x7x7x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x7x7x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x7x7x512xi32>) -> tensor<4x7x7x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x9x9x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<4x7x7x512xi32>) -> tensor<4x7x7x512xi32> - util.return %11 : tensor<4x7x7x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride2.mlir deleted file mode 100644 index 0d84e71..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_4x7x7x512x3x3x512_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<4x16x16x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<4x7x7x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<4x7x7x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<4x7x7x512xi32>) -> tensor<4x7x7x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<4x16x16x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<4x7x7x512xi32>) -> tensor<4x7x7x512xi32> - util.return %11 : tensor<4x7x7x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x112x112x64x7x7x3_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x112x112x64x7x7x3_i8xi8xi32_stride2.mlir deleted file mode 100644 index 40d6ef8..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x112x112x64x7x7x3_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x230x230x64xi8>, %arg1: tensor<7x7x64x3xi8>) -> tensor<8x112x112x3xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x112x112x3xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x112x112x3xi32>) -> tensor<8x112x112x3xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x230x230x64xi8>, tensor<7x7x64x3xi8>, i32, i32) outs(%10 : tensor<8x112x112x3xi32>) -> tensor<8x112x112x3xi32> - util.return %11 : tensor<8x112x112x3xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir deleted file mode 100644 index c3e3ba6..0000000 --- 
a/conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x1024x1x1x512_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x28x28x1024xi8>, %arg1: tensor<1x1x1024x512xi8>) -> tensor<8x14x14x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x14x14x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x14x14x512xi32>) -> tensor<8x14x14x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x28x28x1024xi8>, tensor<1x1x1024x512xi8>, i32, i32) outs(%10 : tensor<8x14x14x512xi32>) -> tensor<8x14x14x512xi32> - util.return %11 : tensor<8x14x14x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride1.mlir deleted file mode 100644 index d641165..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x16x16x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<8x14x14x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x14x14x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x14x14x256xi32>) -> tensor<8x14x14x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x16x16x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<8x14x14x256xi32>) -> tensor<8x14x14x256xi32> - util.return %11 : tensor<8x14x14x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride2.mlir deleted file mode 100644 index 9d5b6a7..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x14x14x256x3x3x256_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x30x30x256xi8>, %arg1: tensor<3x3x256x256xi8>) -> tensor<8x14x14x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x14x14x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x14x14x256xi32>) -> tensor<8x14x14x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x30x30x256xi8>, tensor<3x3x256x256xi8>, i32, i32) outs(%10 : tensor<8x14x14x256xi32>) -> tensor<8x14x14x256xi32> - util.return %11 : tensor<8x14x14x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride1.mlir deleted file mode 100644 index c65bba4..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x30x30x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<8x28x28x128xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x28x28x128xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x28x28x128xi32>) -> tensor<8x28x28x128xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x30x30x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : 
tensor<8x28x28x128xi32>) -> tensor<8x28x28x128xi32> - util.return %11 : tensor<8x28x28x128xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride2.mlir deleted file mode 100644 index 94378ef..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x128x3x3x128_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x58x58x128xi8>, %arg1: tensor<3x3x128x128xi8>) -> tensor<8x28x28x128xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x28x28x128xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x28x28x128xi32>) -> tensor<8x28x28x128xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x58x58x128xi8>, tensor<3x3x128x128xi8>, i32, i32) outs(%10 : tensor<8x28x28x128xi32>) -> tensor<8x28x28x128xi32> - util.return %11 : tensor<8x28x28x128xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x512x1x1x256_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x512x1x1x256_i8xi8xi32_stride2.mlir deleted file mode 100644 index cfed53f..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x28x28x512x1x1x256_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x56x56x512xi8>, %arg1: tensor<1x1x512x256xi8>) -> tensor<8x28x28x256xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x28x28x256xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x28x28x256xi32>) -> tensor<8x28x28x256xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x56x56x512xi8>, tensor<1x1x512x256xi8>, i32, i32) outs(%10 : tensor<8x28x28x256xi32>) -> tensor<8x28x28x256xi32> - util.return %11 : tensor<8x28x28x256xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x56x56x64x3x3x64_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x56x56x64x3x3x64_i8xi8xi32_stride1.mlir deleted file mode 100644 index 5bca844..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x56x56x64x3x3x64_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x58x58x64xi8>, %arg1: tensor<3x3x64x64xi8>) -> tensor<8x56x56x64xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x56x56x64xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x56x56x64xi32>) -> tensor<8x56x56x64xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x58x58x64xi8>, tensor<3x3x64x64xi8>, i32, i32) outs(%10 : tensor<8x56x56x64xi32>) -> tensor<8x56x56x64xi32> - util.return %11 : tensor<8x56x56x64xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir deleted file mode 100644 index 6dba97c..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x2048x1x1x1024_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x14x14x2048xi8>, %arg1: tensor<1x1x2048x1024xi8>) -> tensor<8x7x7x1024xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x7x7x1024xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x7x7x1024xi32>) -> 
tensor<8x7x7x1024xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x14x14x2048xi8>, tensor<1x1x2048x1024xi8>, i32, i32) outs(%10 : tensor<8x7x7x1024xi32>) -> tensor<8x7x7x1024xi32> - util.return %11 : tensor<8x7x7x1024xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride1.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride1.mlir deleted file mode 100644 index 1c218cf..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride1.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x9x9x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<8x7x7x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x7x7x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x7x7x512xi32>) -> tensor<8x7x7x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x9x9x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<8x7x7x512xi32>) -> tensor<8x7x7x512xi32> - util.return %11 : tensor<8x7x7x512xi32> -} diff --git a/conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride2.mlir b/conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride2.mlir deleted file mode 100644 index 25d0638..0000000 --- a/conv/mlir/conv_2d_nhwc_hwcf_q_8x7x7x512x3x3x512_i8xi8xi32_stride2.mlir +++ /dev/null @@ -1,8 +0,0 @@ -util.func public @main(%arg0: tensor<8x16x16x512xi8>, %arg1: tensor<3x3x512x512xi8>) -> tensor<8x7x7x512xi32> { - %cst = arith.constant 0 : i32 - %9 = tensor.empty() : tensor<8x7x7x512xi32> - %10 = linalg.fill ins(%cst : i32) outs(%9 : tensor<8x7x7x512xi32>) -> tensor<8x7x7x512xi32> - %c0_i32 = arith.constant 0 : i32 - %11 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32 : tensor<8x16x16x512xi8>, tensor<3x3x512x512xi8>, i32, i32) outs(%10 : tensor<8x7x7x512xi32>) -> tensor<8x7x7x512xi32> - util.return %11 : tensor<8x7x7x512xi32> -} diff --git a/gemm/mlir/gemm_10240_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_16_8192_bf16_tA.mlir deleted file mode 100644 index 2f56e73..0000000 --- a/gemm/mlir/gemm_10240_16_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<10240x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<10240x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x16xbf16>) -> tensor<10240x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<10240x16xbf16>) -> tensor<10240x16xbf16> - return %2 : tensor<10240x16xbf16> - } -} diff --git a/gemm/mlir/gemm_10240_16_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_16_8192_f16_tA.mlir deleted file mode 100644 index 78c8d49..0000000 --- a/gemm/mlir/gemm_10240_16_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x16xf16>) -> tensor<10240x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<10240x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x16xf16>) -> tensor<10240x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : 
tensor<8192x10240xf16>, tensor<8192x16xf16>) outs(%1 : tensor<10240x16xf16>) -> tensor<10240x16xf16> - return %2 : tensor<10240x16xf16> - } -} diff --git a/gemm/mlir/gemm_10240_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_1_8192_bf16_tA.mlir deleted file mode 100644 index ad452e2..0000000 --- a/gemm/mlir/gemm_10240_1_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<10240x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<10240x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x1xbf16>) -> tensor<10240x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<10240x1xbf16>) -> tensor<10240x1xbf16> - return %2 : tensor<10240x1xbf16> - } -} diff --git a/gemm/mlir/gemm_10240_1_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_1_8192_f16_tA.mlir deleted file mode 100644 index 71b8145..0000000 --- a/gemm/mlir/gemm_10240_1_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x1xf16>) -> tensor<10240x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<10240x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x1xf16>) -> tensor<10240x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x1xf16>) outs(%1 : tensor<10240x1xf16>) -> tensor<10240x1xf16> - return %2 : tensor<10240x1xf16> - } -} diff --git a/gemm/mlir/gemm_10240_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_2_8192_bf16_tA.mlir deleted file mode 100644 index b0f1298..0000000 --- a/gemm/mlir/gemm_10240_2_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<10240x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<10240x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x2xbf16>) -> tensor<10240x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<10240x2xbf16>) -> tensor<10240x2xbf16> - return %2 : tensor<10240x2xbf16> - } -} diff --git a/gemm/mlir/gemm_10240_2_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_2_8192_f16_tA.mlir deleted file mode 100644 index 273354c..0000000 --- a/gemm/mlir/gemm_10240_2_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x2xf16>) -> tensor<10240x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<10240x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x2xf16>) -> tensor<10240x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x2xf16>) outs(%1 : tensor<10240x2xf16>) -> tensor<10240x2xf16> - return %2 : tensor<10240x2xf16> - } -} diff --git a/gemm/mlir/gemm_10240_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_32_8192_bf16_tA.mlir deleted file mode 100644 index f1ec0ed..0000000 --- a/gemm/mlir/gemm_10240_32_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<10240x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<10240x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x32xbf16>) -> tensor<10240x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : 
tensor<8192x10240xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<10240x32xbf16>) -> tensor<10240x32xbf16> - return %2 : tensor<10240x32xbf16> - } -} diff --git a/gemm/mlir/gemm_10240_32_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_32_8192_f16_tA.mlir deleted file mode 100644 index 3a3e10a..0000000 --- a/gemm/mlir/gemm_10240_32_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x32xf16>) -> tensor<10240x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<10240x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x32xf16>) -> tensor<10240x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x32xf16>) outs(%1 : tensor<10240x32xf16>) -> tensor<10240x32xf16> - return %2 : tensor<10240x32xf16> - } -} diff --git a/gemm/mlir/gemm_10240_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_4_8192_bf16_tA.mlir deleted file mode 100644 index 2b73883..0000000 --- a/gemm/mlir/gemm_10240_4_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<10240x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<10240x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x4xbf16>) -> tensor<10240x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<10240x4xbf16>) -> tensor<10240x4xbf16> - return %2 : tensor<10240x4xbf16> - } -} diff --git a/gemm/mlir/gemm_10240_4_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_4_8192_f16_tA.mlir deleted file mode 100644 index 2a97ec8..0000000 --- a/gemm/mlir/gemm_10240_4_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x4xf16>) -> tensor<10240x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<10240x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x4xf16>) -> tensor<10240x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xf16>, tensor<8192x4xf16>) outs(%1 : tensor<10240x4xf16>) -> tensor<10240x4xf16> - return %2 : tensor<10240x4xf16> - } -} diff --git a/gemm/mlir/gemm_10240_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_10240_8_8192_bf16_tA.mlir deleted file mode 100644 index a5c4f70..0000000 --- a/gemm/mlir/gemm_10240_8_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<10240x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<10240x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<10240x8xbf16>) -> tensor<10240x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x10240xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<10240x8xbf16>) -> tensor<10240x8xbf16> - return %2 : tensor<10240x8xbf16> - } -} diff --git a/gemm/mlir/gemm_10240_8_8192_f16_tA.mlir b/gemm/mlir/gemm_10240_8_8192_f16_tA.mlir deleted file mode 100644 index 96ca8f3..0000000 --- a/gemm/mlir/gemm_10240_8_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x10240xf16>, %arg1: tensor<8192x8xf16>) -> tensor<10240x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<10240x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10240x8xf16>) -> tensor<10240x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : 
tensor<8192x10240xf16>, tensor<8192x8xf16>) outs(%1 : tensor<10240x8xf16>) -> tensor<10240x8xf16> - return %2 : tensor<10240x8xf16> - } -} diff --git a/gemm/mlir/gemm_1024_5120_640_f16_tB.mlir b/gemm/mlir/gemm_1024_5120_640_f16_tB.mlir deleted file mode 100644 index 7e31313..0000000 --- a/gemm/mlir/gemm_1024_5120_640_f16_tB.mlir +++ /dev/null @@ -1,145 +0,0 @@ -#translation = #iree_codegen.translation_info -module attributes {transform.with_named_sequence} { - stream.executable private @gemm { - stream.executable.export public @gemm workgroups() -> (index, index, index) { - %c16 = arith.constant 16 : index - %c80 = arith.constant 80 : index - %c1 = arith.constant 1 : index - stream.return %c16, %c80, %c1 : index, index, index - } - builtin.module { - func.func @gemm(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) attributes {translation_info = #translation} { - %c19 = arith.constant 19 : index - %c18 = arith.constant 18 : index - %c17 = arith.constant 17 : index - %c3 = arith.constant 3 : index - %c2 = arith.constant 2 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c4 = arith.constant 4 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c20 = arith.constant 20 : index - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.000000e+00> : vector<4xf32> - %workgroup_id_0 = stream.dispatch.workgroup.id[0] : index - %workgroup_id_1 = stream.dispatch.workgroup.id[1] : index - %thread_id_x = gpu.thread_id x - %thread_id_y = gpu.thread_id y - %alloc = memref.alloc() : memref<64x32xf16, #gpu.address_space> - %alloc_0 = memref.alloc() : memref<64x32xf16, #gpu.address_space> - %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> memref<1024x640xf16, strided<[640, 1], offset: ?>> - %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> memref<5120x640xf16, strided<[640, 1], offset: ?>> - %2 = arith.muli %workgroup_id_0, %c64 : index - %3 = arith.muli %thread_id_y, %c32 : index - %4 = arith.divsi %thread_id_x, %c4 : index - %5 = arith.addi %4, %3 : index - %6 = arith.remsi %5, %c64 : index - %7 = arith.addi %6, %2 : index - %8 = arith.remsi %thread_id_x, %c4 : index - %9 = arith.muli %8, %c8 : index - %10 = arith.divsi %thread_id_x, %c64 : index - %11 = arith.muli %10, %c32 : index - %12 = arith.remsi %thread_id_x, %c16 : index - %13 = arith.addi %12, %11 : index - %14 = arith.remsi %thread_id_x, %c64 : index - %15 = arith.divsi %14, %c16 : index - %16 = arith.muli %15, %c4 : index - %17 = arith.addi %16, %c16 : index - %18 = arith.addi %13, %c16 : index - %19 = arith.muli %workgroup_id_1, %c64 : index - %20 = arith.addi %6, %19 : index - %21 = arith.addi %12, %3 : index - %22 = arith.addi %21, %c16 : index - %23:4 = scf.for %arg3 = %c0 to %c20 step %c1 iter_args(%arg4 = %cst, %arg5 = %cst, %arg6 = %cst, %arg7 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) { - %62 = arith.muli %arg3, %c32 : index - %63 = arith.addi %62, %9 : index - %64 = vector.load %0[%7, %63] : memref<1024x640xf16, strided<[640, 1], offset: ?>>, vector<8xf16> - vector.store %64, %alloc[%6, %9] : memref<64x32xf16, #gpu.address_space>, vector<8xf16> - amdgpu.lds_barrier - %65 = vector.load %alloc[%13, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %66 = vector.load %alloc[%13, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %67 = vector.load %alloc[%18, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %68 = vector.load %alloc[%18, 
%17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %69 = vector.load %1[%20, %63] : memref<5120x640xf16, strided<[640, 1], offset: ?>>, vector<8xf16> - amdgpu.lds_barrier - vector.store %69, %alloc_0[%6, %9] : memref<64x32xf16, #gpu.address_space>, vector<8xf16> - amdgpu.lds_barrier - %70 = vector.load %alloc_0[%21, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %71 = vector.load %alloc_0[%21, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %72 = vector.load %alloc_0[%22, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %73 = vector.load %alloc_0[%22, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %74 = amdgpu.mfma %65 * %70 + %arg4 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %75 = amdgpu.mfma %66 * %71 + %74 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %76 = amdgpu.mfma %67 * %72 + %arg7 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %77 = amdgpu.mfma %68 * %73 + %76 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %78 = amdgpu.mfma %67 * %70 + %arg6 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %79 = amdgpu.mfma %68 * %71 + %78 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %80 = amdgpu.mfma %65 * %72 + %arg5 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %81 = amdgpu.mfma %66 * %73 + %80 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - scf.yield %75, %81, %79, %77 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32> - } - %24 = vector.extract_strided_slice %23#0 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %25 = stream.binding.subspan %arg2[%c0] : !stream.binding -> memref<1024x5120xf32, strided<[5120, 1], offset: ?>> - %26 = arith.remsi %thread_id_x, %c64 : index - %27 = arith.divsi %26, %c16 : index - %28 = arith.muli %27, %c4 : index - %29 = arith.divsi %thread_id_x, %c64 : index - %30 = arith.muli %29, %c32 : index - %31 = arith.muli %workgroup_id_0, %c64 : index - %32 = arith.addi %31, %30 : index - %33 = arith.addi %32, %28 : index - %34 = arith.muli %thread_id_y, %c32 : index - %35 = arith.muli %workgroup_id_1, %c64 : index - %36 = arith.remsi %thread_id_x, %c16 : index - %37 = arith.addi %36, %35 : index - %38 = arith.addi %37, %34 : index - vector.store %24, %25[%33, %38] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %39 = vector.extract_strided_slice %23#0 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %40 = arith.addi %33, %c1 : index - vector.store %39, %25[%40, %38] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %41 = vector.extract_strided_slice %23#0 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %42 = arith.addi %33, %c2 : index - vector.store %41, %25[%42, %38] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %43 = vector.extract_strided_slice %23#0 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %44 = arith.addi 
%33, %c3 : index - vector.store %43, %25[%44, %38] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %45 = vector.extract_strided_slice %23#3 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %46 = arith.addi %33, %c16 : index - %47 = arith.addi %38, %c16 : index - vector.store %45, %25[%46, %47] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %48 = vector.extract_strided_slice %23#3 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %49 = arith.addi %33, %c17 : index - vector.store %48, %25[%49, %47] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %50 = vector.extract_strided_slice %23#3 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %51 = arith.addi %33, %c18 : index - vector.store %50, %25[%51, %47] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %52 = vector.extract_strided_slice %23#3 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %53 = arith.addi %33, %c19 : index - vector.store %52, %25[%53, %47] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %54 = vector.extract_strided_slice %23#2 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %54, %25[%46, %38] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %55 = vector.extract_strided_slice %23#2 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %55, %25[%49, %38] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %56 = vector.extract_strided_slice %23#2 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %56, %25[%51, %38] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %57 = vector.extract_strided_slice %23#2 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %57, %25[%53, %38] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %58 = vector.extract_strided_slice %23#1 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %58, %25[%33, %47] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %59 = vector.extract_strided_slice %23#1 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %59, %25[%40, %47] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %60 = vector.extract_strided_slice %23#1 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %60, %25[%42, %47] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - %61 = vector.extract_strided_slice %23#1 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %61, %25[%44, %47] : memref<1024x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32> - return - } - } - } - func.func @isolated_benchmark(%arg0: tensor<1024x640xf16>, %arg1: tensor<5120x640xf16>) -> tensor<1024x5120xf32> { - %0 = flow.dispatch @gemm::@gemm(%arg0, %arg1) : (tensor<1024x640xf16>, tensor<5120x640xf16>) -> tensor<1024x5120xf32> - return %0 : tensor<1024x5120xf32> - } -} diff --git a/gemm/mlir/gemm_1280_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_16_8192_bf16_tA.mlir deleted file mode 100644 index 3baa555..0000000 --- a/gemm/mlir/gemm_1280_16_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: 
tensor<8192x1280xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<1280x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1280x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x16xbf16>) -> tensor<1280x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<1280x16xbf16>) -> tensor<1280x16xbf16> - return %2 : tensor<1280x16xbf16> - } -} diff --git a/gemm/mlir/gemm_1280_16_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_16_8192_f16_tA.mlir deleted file mode 100644 index 3fe4759..0000000 --- a/gemm/mlir/gemm_1280_16_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x16xf16>) -> tensor<1280x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1280x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x16xf16>) -> tensor<1280x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x16xf16>) outs(%1 : tensor<1280x16xf16>) -> tensor<1280x16xf16> - return %2 : tensor<1280x16xf16> - } -} diff --git a/gemm/mlir/gemm_1280_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_1_8192_bf16_tA.mlir deleted file mode 100644 index 3d2ccc5..0000000 --- a/gemm/mlir/gemm_1280_1_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<1280x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1280x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x1xbf16>) -> tensor<1280x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<1280x1xbf16>) -> tensor<1280x1xbf16> - return %2 : tensor<1280x1xbf16> - } -} diff --git a/gemm/mlir/gemm_1280_1_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_1_8192_f16_tA.mlir deleted file mode 100644 index b723290..0000000 --- a/gemm/mlir/gemm_1280_1_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x1xf16>) -> tensor<1280x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1280x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x1xf16>) -> tensor<1280x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x1xf16>) outs(%1 : tensor<1280x1xf16>) -> tensor<1280x1xf16> - return %2 : tensor<1280x1xf16> - } -} diff --git a/gemm/mlir/gemm_1280_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_2_8192_bf16_tA.mlir deleted file mode 100644 index 3f23515..0000000 --- a/gemm/mlir/gemm_1280_2_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<1280x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1280x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x2xbf16>) -> tensor<1280x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<1280x2xbf16>) -> tensor<1280x2xbf16> - return %2 : tensor<1280x2xbf16> - } -} diff --git a/gemm/mlir/gemm_1280_2_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_2_8192_f16_tA.mlir deleted file mode 100644 index 32fdd34..0000000 --- a/gemm/mlir/gemm_1280_2_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: 
tensor<8192x2xf16>) -> tensor<1280x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1280x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x2xf16>) -> tensor<1280x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x2xf16>) outs(%1 : tensor<1280x2xf16>) -> tensor<1280x2xf16> - return %2 : tensor<1280x2xf16> - } -} diff --git a/gemm/mlir/gemm_1280_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_32_8192_bf16_tA.mlir deleted file mode 100644 index e9bf063..0000000 --- a/gemm/mlir/gemm_1280_32_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<1280x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1280x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x32xbf16>) -> tensor<1280x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<1280x32xbf16>) -> tensor<1280x32xbf16> - return %2 : tensor<1280x32xbf16> - } -} diff --git a/gemm/mlir/gemm_1280_32_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_32_8192_f16_tA.mlir deleted file mode 100644 index faf8f1a..0000000 --- a/gemm/mlir/gemm_1280_32_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x32xf16>) -> tensor<1280x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1280x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x32xf16>) -> tensor<1280x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x32xf16>) outs(%1 : tensor<1280x32xf16>) -> tensor<1280x32xf16> - return %2 : tensor<1280x32xf16> - } -} diff --git a/gemm/mlir/gemm_1280_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_4_8192_bf16_tA.mlir deleted file mode 100644 index d844019..0000000 --- a/gemm/mlir/gemm_1280_4_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<1280x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1280x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x4xbf16>) -> tensor<1280x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<1280x4xbf16>) -> tensor<1280x4xbf16> - return %2 : tensor<1280x4xbf16> - } -} diff --git a/gemm/mlir/gemm_1280_4_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_4_8192_f16_tA.mlir deleted file mode 100644 index f7ead50..0000000 --- a/gemm/mlir/gemm_1280_4_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x4xf16>) -> tensor<1280x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1280x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x4xf16>) -> tensor<1280x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x4xf16>) outs(%1 : tensor<1280x4xf16>) -> tensor<1280x4xf16> - return %2 : tensor<1280x4xf16> - } -} diff --git a/gemm/mlir/gemm_1280_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_1280_8_8192_bf16_tA.mlir deleted file mode 100644 index 8f2da95..0000000 --- a/gemm/mlir/gemm_1280_8_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<1280x8xbf16> { - %cst = 
arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1280x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1280x8xbf16>) -> tensor<1280x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<1280x8xbf16>) -> tensor<1280x8xbf16> - return %2 : tensor<1280x8xbf16> - } -} diff --git a/gemm/mlir/gemm_1280_8_8192_f16_tA.mlir b/gemm/mlir/gemm_1280_8_8192_f16_tA.mlir deleted file mode 100644 index 4c96f74..0000000 --- a/gemm/mlir/gemm_1280_8_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x1280xf16>, %arg1: tensor<8192x8xf16>) -> tensor<1280x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1280x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1280x8xf16>) -> tensor<1280x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x1280xf16>, tensor<8192x8xf16>) outs(%1 : tensor<1280x8xf16>) -> tensor<1280x8xf16> - return %2 : tensor<1280x8xf16> - } -} diff --git a/gemm/mlir/gemm_128_1280_2048_bf16.mlir b/gemm/mlir/gemm_128_1280_2048_bf16.mlir deleted file mode 100644 index c758c9d..0000000 --- a/gemm/mlir/gemm_128_1280_2048_bf16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<128x2048xbf16>, %arg1: tensor<2048x1280xbf16>) -> tensor<128x1280xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<128x1280xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<128x2048xbf16>, tensor<2048x1280xbf16>) outs(%1 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16> - return %2 : tensor<128x1280xbf16> - } -} diff --git a/gemm/mlir/gemm_128_1280_2048_bf16_tA.mlir b/gemm/mlir/gemm_128_1280_2048_bf16_tA.mlir deleted file mode 100644 index 0cb012c..0000000 --- a/gemm/mlir/gemm_128_1280_2048_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2048x128xbf16>, %arg1: tensor<2048x1280xbf16>) -> tensor<128x1280xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<128x1280xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x128xbf16>, tensor<2048x1280xbf16>) outs(%1 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16> - return %2 : tensor<128x1280xbf16> - } -} diff --git a/gemm/mlir/gemm_128_1280_2048_bf16_tB.mlir b/gemm/mlir/gemm_128_1280_2048_bf16_tB.mlir deleted file mode 100644 index 32f5e6f..0000000 --- a/gemm/mlir/gemm_128_1280_2048_bf16_tB.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<128x2048xbf16>, %arg1: tensor<1280x2048xbf16>) -> tensor<128x1280xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<128x1280xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16> - %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<128x2048xbf16>, tensor<1280x2048xbf16>) outs(%1 : tensor<128x1280xbf16>) -> tensor<128x1280xbf16> - return %2 : tensor<128x1280xbf16> - } -} diff --git a/gemm/mlir/gemm_128_1280_2048_f16.mlir b/gemm/mlir/gemm_128_1280_2048_f16.mlir deleted file mode 100644 index 84ea04a..0000000 --- a/gemm/mlir/gemm_128_1280_2048_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<128x2048xf16>, %arg1: tensor<2048x1280xf16>) -> tensor<128x1280xf16> { - %cst = 
arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<128x1280xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<128x1280xf16>) -> tensor<128x1280xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<128x2048xf16>, tensor<2048x1280xf16>) outs(%1 : tensor<128x1280xf16>) -> tensor<128x1280xf16> - return %2 : tensor<128x1280xf16> - } -} diff --git a/gemm/mlir/gemm_128_1280_2048_f16_tA.mlir b/gemm/mlir/gemm_128_1280_2048_f16_tA.mlir deleted file mode 100644 index 45cda80..0000000 --- a/gemm/mlir/gemm_128_1280_2048_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2048x128xf16>, %arg1: tensor<2048x1280xf16>) -> tensor<128x1280xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<128x1280xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<128x1280xf16>) -> tensor<128x1280xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x128xf16>, tensor<2048x1280xf16>) outs(%1 : tensor<128x1280xf16>) -> tensor<128x1280xf16> - return %2 : tensor<128x1280xf16> - } -} diff --git a/gemm/mlir/gemm_128_1280_2048_f16_tB.mlir b/gemm/mlir/gemm_128_1280_2048_f16_tB.mlir deleted file mode 100644 index 785a854..0000000 --- a/gemm/mlir/gemm_128_1280_2048_f16_tB.mlir +++ /dev/null @@ -1,144 +0,0 @@ -#translation = #iree_codegen.translation_info -module attributes {transform.with_named_sequence} { - stream.executable private @gemm { - stream.executable.export public @gemm workgroups() -> (index, index, index) { - %c2 = arith.constant 2 : index - %c20 = arith.constant 20 : index - %c1 = arith.constant 1 : index - stream.return %c2, %c20, %c1 : index, index, index - } - builtin.module { - func.func @gemm(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) attributes {translation_info = #translation} { - %c19 = arith.constant 19 : index - %c18 = arith.constant 18 : index - %c17 = arith.constant 17 : index - %c3 = arith.constant 3 : index - %c2 = arith.constant 2 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c4 = arith.constant 4 : index - %c32 = arith.constant 32 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.000000e+00> : vector<4xf32> - %workgroup_id_0 = stream.dispatch.workgroup.id[0] : index - %workgroup_id_1 = stream.dispatch.workgroup.id[1] : index - %thread_id_x = gpu.thread_id x - %thread_id_y = gpu.thread_id y - %alloc = memref.alloc() : memref<64x32xf16, #gpu.address_space> - %alloc_0 = memref.alloc() : memref<64x32xf16, #gpu.address_space> - %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> memref<128x2048xf16, strided<[2048, 1], offset: ?>> - %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> memref<1280x2048xf16, strided<[2048, 1], offset: ?>> - %2 = arith.muli %workgroup_id_0, %c64 : index - %3 = arith.muli %thread_id_y, %c32 : index - %4 = arith.divsi %thread_id_x, %c4 : index - %5 = arith.addi %4, %3 : index - %6 = arith.remsi %5, %c64 : index - %7 = arith.addi %6, %2 : index - %8 = arith.remsi %thread_id_x, %c4 : index - %9 = arith.muli %8, %c8 : index - %10 = arith.divsi %thread_id_x, %c64 : index - %11 = arith.muli %10, %c32 : index - %12 = arith.remsi %thread_id_x, %c16 : index - %13 = arith.addi %12, %11 : index - %14 = arith.remsi %thread_id_x, %c64 : index - %15 = arith.divsi %14, %c16 : index - %16 = arith.muli %15, %c4 : index - %17 = arith.addi %16, %c16 : index - %18 = arith.addi %13, %c16 : index - %19 = arith.muli %workgroup_id_1, 
%c64 : index - %20 = arith.addi %6, %19 : index - %21 = arith.addi %12, %3 : index - %22 = arith.addi %21, %c16 : index - %23:4 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %cst, %arg5 = %cst, %arg6 = %cst, %arg7 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) { - %62 = arith.muli %arg3, %c32 : index - %63 = arith.addi %62, %9 : index - %64 = vector.load %0[%7, %63] : memref<128x2048xf16, strided<[2048, 1], offset: ?>>, vector<8xf16> - vector.store %64, %alloc[%6, %9] : memref<64x32xf16, #gpu.address_space>, vector<8xf16> - amdgpu.lds_barrier - %65 = vector.load %alloc[%13, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %66 = vector.load %alloc[%13, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %67 = vector.load %alloc[%18, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %68 = vector.load %alloc[%18, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %69 = vector.load %1[%20, %63] : memref<1280x2048xf16, strided<[2048, 1], offset: ?>>, vector<8xf16> - amdgpu.lds_barrier - vector.store %69, %alloc_0[%6, %9] : memref<64x32xf16, #gpu.address_space>, vector<8xf16> - amdgpu.lds_barrier - %70 = vector.load %alloc_0[%21, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %71 = vector.load %alloc_0[%21, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %72 = vector.load %alloc_0[%22, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %73 = vector.load %alloc_0[%22, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16> - %74 = amdgpu.mfma %65 * %70 + %arg4 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %75 = amdgpu.mfma %66 * %71 + %74 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %76 = amdgpu.mfma %67 * %72 + %arg7 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %77 = amdgpu.mfma %68 * %73 + %76 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %78 = amdgpu.mfma %67 * %70 + %arg6 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %79 = amdgpu.mfma %68 * %71 + %78 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %80 = amdgpu.mfma %65 * %72 + %arg5 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %81 = amdgpu.mfma %66 * %73 + %80 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - scf.yield %75, %81, %79, %77 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32> - } - %24 = vector.extract_strided_slice %23#0 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %25 = stream.binding.subspan %arg2[%c0] : !stream.binding -> memref<128x1280xf32, strided<[1280, 1], offset: ?>> - %26 = arith.remsi %thread_id_x, %c64 : index - %27 = arith.divsi %26, %c16 : index - %28 = arith.muli %27, %c4 : index - %29 = arith.divsi %thread_id_x, %c64 : index - %30 = arith.muli %29, %c32 : index - %31 = arith.muli %workgroup_id_0, %c64 : index - %32 = arith.addi %31, %30 : index - %33 = arith.addi %32, %28 : index - %34 = arith.muli %thread_id_y, %c32 : index - %35 = arith.muli %workgroup_id_1, 
%c64 : index - %36 = arith.remsi %thread_id_x, %c16 : index - %37 = arith.addi %36, %35 : index - %38 = arith.addi %37, %34 : index - vector.store %24, %25[%33, %38] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %39 = vector.extract_strided_slice %23#0 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %40 = arith.addi %33, %c1 : index - vector.store %39, %25[%40, %38] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %41 = vector.extract_strided_slice %23#0 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %42 = arith.addi %33, %c2 : index - vector.store %41, %25[%42, %38] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %43 = vector.extract_strided_slice %23#0 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %44 = arith.addi %33, %c3 : index - vector.store %43, %25[%44, %38] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %45 = vector.extract_strided_slice %23#3 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %46 = arith.addi %33, %c16 : index - %47 = arith.addi %38, %c16 : index - vector.store %45, %25[%46, %47] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %48 = vector.extract_strided_slice %23#3 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %49 = arith.addi %33, %c17 : index - vector.store %48, %25[%49, %47] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %50 = vector.extract_strided_slice %23#3 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %51 = arith.addi %33, %c18 : index - vector.store %50, %25[%51, %47] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %52 = vector.extract_strided_slice %23#3 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %53 = arith.addi %33, %c19 : index - vector.store %52, %25[%53, %47] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %54 = vector.extract_strided_slice %23#2 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %54, %25[%46, %38] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %55 = vector.extract_strided_slice %23#2 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %55, %25[%49, %38] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %56 = vector.extract_strided_slice %23#2 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %56, %25[%51, %38] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %57 = vector.extract_strided_slice %23#2 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %57, %25[%53, %38] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %58 = vector.extract_strided_slice %23#1 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %58, %25[%33, %47] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %59 = vector.extract_strided_slice %23#1 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %59, %25[%40, %47] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %60 = vector.extract_strided_slice %23#1 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - 
vector.store %60, %25[%42, %47] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %61 = vector.extract_strided_slice %23#1 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %61, %25[%44, %47] : memref<128x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - return - } - } - } - func.func @isolated_benchmark(%arg0: tensor<128x2048xf16>, %arg1: tensor<1280x2048xf16>) -> tensor<128x1280xf32> { - %0 = flow.dispatch @gemm::@gemm(%arg0, %arg1) : (tensor<128x2048xf16>, tensor<1280x2048xf16>) -> tensor<128x1280xf32> - return %0 : tensor<128x1280xf32> - } -} diff --git a/gemm/mlir/gemm_13824_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_16_5120_bf16_tA.mlir deleted file mode 100644 index bf06e53..0000000 --- a/gemm/mlir/gemm_13824_16_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<13824x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<13824x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x16xbf16>) -> tensor<13824x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<13824x16xbf16>) -> tensor<13824x16xbf16> - return %2 : tensor<13824x16xbf16> - } -} diff --git a/gemm/mlir/gemm_13824_16_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_16_5120_f16_tA.mlir deleted file mode 100644 index 6820445..0000000 --- a/gemm/mlir/gemm_13824_16_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x16xf16>) -> tensor<13824x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<13824x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x16xf16>) -> tensor<13824x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x16xf16>) outs(%1 : tensor<13824x16xf16>) -> tensor<13824x16xf16> - return %2 : tensor<13824x16xf16> - } -} diff --git a/gemm/mlir/gemm_13824_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_1_5120_bf16_tA.mlir deleted file mode 100644 index bddc513..0000000 --- a/gemm/mlir/gemm_13824_1_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<13824x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<13824x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x1xbf16>) -> tensor<13824x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<13824x1xbf16>) -> tensor<13824x1xbf16> - return %2 : tensor<13824x1xbf16> - } -} diff --git a/gemm/mlir/gemm_13824_1_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_1_5120_f16_tA.mlir deleted file mode 100644 index de51690..0000000 --- a/gemm/mlir/gemm_13824_1_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x1xf16>) -> tensor<13824x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<13824x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x1xf16>) -> tensor<13824x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x1xf16>) outs(%1 : tensor<13824x1xf16>) -> tensor<13824x1xf16> - return %2 : tensor<13824x1xf16> - } -} diff --git a/gemm/mlir/gemm_13824_2_5120_bf16_tA.mlir 
b/gemm/mlir/gemm_13824_2_5120_bf16_tA.mlir deleted file mode 100644 index b73977b..0000000 --- a/gemm/mlir/gemm_13824_2_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<13824x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<13824x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x2xbf16>) -> tensor<13824x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<13824x2xbf16>) -> tensor<13824x2xbf16> - return %2 : tensor<13824x2xbf16> - } -} diff --git a/gemm/mlir/gemm_13824_2_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_2_5120_f16_tA.mlir deleted file mode 100644 index b763847..0000000 --- a/gemm/mlir/gemm_13824_2_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x2xf16>) -> tensor<13824x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<13824x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x2xf16>) -> tensor<13824x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x2xf16>) outs(%1 : tensor<13824x2xf16>) -> tensor<13824x2xf16> - return %2 : tensor<13824x2xf16> - } -} diff --git a/gemm/mlir/gemm_13824_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_32_5120_bf16_tA.mlir deleted file mode 100644 index 3be8ecf..0000000 --- a/gemm/mlir/gemm_13824_32_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<13824x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<13824x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x32xbf16>) -> tensor<13824x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<13824x32xbf16>) -> tensor<13824x32xbf16> - return %2 : tensor<13824x32xbf16> - } -} diff --git a/gemm/mlir/gemm_13824_32_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_32_5120_f16_tA.mlir deleted file mode 100644 index 2069eef..0000000 --- a/gemm/mlir/gemm_13824_32_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x32xf16>) -> tensor<13824x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<13824x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x32xf16>) -> tensor<13824x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x32xf16>) outs(%1 : tensor<13824x32xf16>) -> tensor<13824x32xf16> - return %2 : tensor<13824x32xf16> - } -} diff --git a/gemm/mlir/gemm_13824_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_4_5120_bf16_tA.mlir deleted file mode 100644 index 3ac974f..0000000 --- a/gemm/mlir/gemm_13824_4_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<13824x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<13824x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x4xbf16>) -> tensor<13824x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<13824x4xbf16>) -> tensor<13824x4xbf16> - return %2 : tensor<13824x4xbf16> - } -} diff --git 
a/gemm/mlir/gemm_13824_4_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_4_5120_f16_tA.mlir deleted file mode 100644 index 2d2dbaf..0000000 --- a/gemm/mlir/gemm_13824_4_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x4xf16>) -> tensor<13824x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<13824x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x4xf16>) -> tensor<13824x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x4xf16>) outs(%1 : tensor<13824x4xf16>) -> tensor<13824x4xf16> - return %2 : tensor<13824x4xf16> - } -} diff --git a/gemm/mlir/gemm_13824_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_13824_8_5120_bf16_tA.mlir deleted file mode 100644 index 30c7d55..0000000 --- a/gemm/mlir/gemm_13824_8_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<13824x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<13824x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<13824x8xbf16>) -> tensor<13824x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<13824x8xbf16>) -> tensor<13824x8xbf16> - return %2 : tensor<13824x8xbf16> - } -} diff --git a/gemm/mlir/gemm_13824_8_5120_f16_tA.mlir b/gemm/mlir/gemm_13824_8_5120_f16_tA.mlir deleted file mode 100644 index 96d5e3c..0000000 --- a/gemm/mlir/gemm_13824_8_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x13824xf16>, %arg1: tensor<5120x8xf16>) -> tensor<13824x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<13824x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<13824x8xf16>) -> tensor<13824x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x13824xf16>, tensor<5120x8xf16>) outs(%1 : tensor<13824x8xf16>) -> tensor<13824x8xf16> - return %2 : tensor<13824x8xf16> - } -} diff --git a/gemm/mlir/gemm_14336_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_16_8192_bf16_tA.mlir deleted file mode 100644 index ebb53bd..0000000 --- a/gemm/mlir/gemm_14336_16_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<14336x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<14336x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x16xbf16>) -> tensor<14336x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<14336x16xbf16>) -> tensor<14336x16xbf16> - return %2 : tensor<14336x16xbf16> - } -} diff --git a/gemm/mlir/gemm_14336_16_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_16_8192_f16_tA.mlir deleted file mode 100644 index 1c62bae..0000000 --- a/gemm/mlir/gemm_14336_16_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x16xf16>) -> tensor<14336x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<14336x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x16xf16>) -> tensor<14336x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x16xf16>) outs(%1 : tensor<14336x16xf16>) -> tensor<14336x16xf16> - return %2 : tensor<14336x16xf16> - } -} diff 
--git a/gemm/mlir/gemm_14336_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_1_8192_bf16_tA.mlir deleted file mode 100644 index 12e1750..0000000 --- a/gemm/mlir/gemm_14336_1_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<14336x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<14336x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x1xbf16>) -> tensor<14336x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<14336x1xbf16>) -> tensor<14336x1xbf16> - return %2 : tensor<14336x1xbf16> - } -} diff --git a/gemm/mlir/gemm_14336_1_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_1_8192_f16_tA.mlir deleted file mode 100644 index b3cee07..0000000 --- a/gemm/mlir/gemm_14336_1_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x1xf16>) -> tensor<14336x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<14336x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x1xf16>) -> tensor<14336x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x1xf16>) outs(%1 : tensor<14336x1xf16>) -> tensor<14336x1xf16> - return %2 : tensor<14336x1xf16> - } -} diff --git a/gemm/mlir/gemm_14336_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_2_8192_bf16_tA.mlir deleted file mode 100644 index ce3f701..0000000 --- a/gemm/mlir/gemm_14336_2_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<14336x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<14336x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x2xbf16>) -> tensor<14336x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<14336x2xbf16>) -> tensor<14336x2xbf16> - return %2 : tensor<14336x2xbf16> - } -} diff --git a/gemm/mlir/gemm_14336_2_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_2_8192_f16_tA.mlir deleted file mode 100644 index 100d62f..0000000 --- a/gemm/mlir/gemm_14336_2_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x2xf16>) -> tensor<14336x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<14336x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x2xf16>) -> tensor<14336x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x2xf16>) outs(%1 : tensor<14336x2xf16>) -> tensor<14336x2xf16> - return %2 : tensor<14336x2xf16> - } -} diff --git a/gemm/mlir/gemm_14336_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_32_8192_bf16_tA.mlir deleted file mode 100644 index 39a012e..0000000 --- a/gemm/mlir/gemm_14336_32_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<14336x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<14336x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x32xbf16>) -> tensor<14336x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<14336x32xbf16>) -> tensor<14336x32xbf16> - return %2 : tensor<14336x32xbf16> - } -} 
diff --git a/gemm/mlir/gemm_14336_32_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_32_8192_f16_tA.mlir deleted file mode 100644 index 6457a07..0000000 --- a/gemm/mlir/gemm_14336_32_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x32xf16>) -> tensor<14336x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<14336x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x32xf16>) -> tensor<14336x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x32xf16>) outs(%1 : tensor<14336x32xf16>) -> tensor<14336x32xf16> - return %2 : tensor<14336x32xf16> - } -} diff --git a/gemm/mlir/gemm_14336_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_4_8192_bf16_tA.mlir deleted file mode 100644 index 99bcffb..0000000 --- a/gemm/mlir/gemm_14336_4_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<14336x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<14336x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x4xbf16>) -> tensor<14336x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<14336x4xbf16>) -> tensor<14336x4xbf16> - return %2 : tensor<14336x4xbf16> - } -} diff --git a/gemm/mlir/gemm_14336_4_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_4_8192_f16_tA.mlir deleted file mode 100644 index 6c93d68..0000000 --- a/gemm/mlir/gemm_14336_4_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x4xf16>) -> tensor<14336x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<14336x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x4xf16>) -> tensor<14336x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x4xf16>) outs(%1 : tensor<14336x4xf16>) -> tensor<14336x4xf16> - return %2 : tensor<14336x4xf16> - } -} diff --git a/gemm/mlir/gemm_14336_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_14336_8_8192_bf16_tA.mlir deleted file mode 100644 index 22146cb..0000000 --- a/gemm/mlir/gemm_14336_8_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<14336x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<14336x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<14336x8xbf16>) -> tensor<14336x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<14336x8xbf16>) -> tensor<14336x8xbf16> - return %2 : tensor<14336x8xbf16> - } -} diff --git a/gemm/mlir/gemm_14336_8_8192_f16_tA.mlir b/gemm/mlir/gemm_14336_8_8192_f16_tA.mlir deleted file mode 100644 index 452edf9..0000000 --- a/gemm/mlir/gemm_14336_8_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x14336xf16>, %arg1: tensor<8192x8xf16>) -> tensor<14336x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<14336x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<14336x8xf16>) -> tensor<14336x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x14336xf16>, tensor<8192x8xf16>) outs(%1 : tensor<14336x8xf16>) -> tensor<14336x8xf16> - return %2 : tensor<14336x8xf16> - } -} diff --git 
a/gemm/mlir/gemm_15360_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_16_5120_bf16_tA.mlir deleted file mode 100644 index da57d0c..0000000 --- a/gemm/mlir/gemm_15360_16_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<15360x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<15360x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x16xbf16>) -> tensor<15360x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<15360x16xbf16>) -> tensor<15360x16xbf16> - return %2 : tensor<15360x16xbf16> - } -} diff --git a/gemm/mlir/gemm_15360_16_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_16_5120_f16_tA.mlir deleted file mode 100644 index b15d265..0000000 --- a/gemm/mlir/gemm_15360_16_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x16xf16>) -> tensor<15360x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<15360x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x16xf16>) -> tensor<15360x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x16xf16>) outs(%1 : tensor<15360x16xf16>) -> tensor<15360x16xf16> - return %2 : tensor<15360x16xf16> - } -} diff --git a/gemm/mlir/gemm_15360_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_1_5120_bf16_tA.mlir deleted file mode 100644 index b0d9c92..0000000 --- a/gemm/mlir/gemm_15360_1_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<15360x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<15360x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x1xbf16>) -> tensor<15360x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<15360x1xbf16>) -> tensor<15360x1xbf16> - return %2 : tensor<15360x1xbf16> - } -} diff --git a/gemm/mlir/gemm_15360_1_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_1_5120_f16_tA.mlir deleted file mode 100644 index d458ee9..0000000 --- a/gemm/mlir/gemm_15360_1_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x1xf16>) -> tensor<15360x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<15360x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x1xf16>) -> tensor<15360x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x1xf16>) outs(%1 : tensor<15360x1xf16>) -> tensor<15360x1xf16> - return %2 : tensor<15360x1xf16> - } -} diff --git a/gemm/mlir/gemm_15360_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_2_5120_bf16_tA.mlir deleted file mode 100644 index 032eae5..0000000 --- a/gemm/mlir/gemm_15360_2_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<15360x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<15360x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x2xbf16>) -> tensor<15360x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<15360x2xbf16>) -> tensor<15360x2xbf16> - return %2 : tensor<15360x2xbf16> 
- } -} diff --git a/gemm/mlir/gemm_15360_2_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_2_5120_f16_tA.mlir deleted file mode 100644 index 18a0d50..0000000 --- a/gemm/mlir/gemm_15360_2_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x2xf16>) -> tensor<15360x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<15360x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x2xf16>) -> tensor<15360x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x2xf16>) outs(%1 : tensor<15360x2xf16>) -> tensor<15360x2xf16> - return %2 : tensor<15360x2xf16> - } -} diff --git a/gemm/mlir/gemm_15360_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_32_5120_bf16_tA.mlir deleted file mode 100644 index 8f7fa25..0000000 --- a/gemm/mlir/gemm_15360_32_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<15360x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<15360x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x32xbf16>) -> tensor<15360x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<15360x32xbf16>) -> tensor<15360x32xbf16> - return %2 : tensor<15360x32xbf16> - } -} diff --git a/gemm/mlir/gemm_15360_32_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_32_5120_f16_tA.mlir deleted file mode 100644 index fc86593..0000000 --- a/gemm/mlir/gemm_15360_32_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x32xf16>) -> tensor<15360x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<15360x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x32xf16>) -> tensor<15360x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x32xf16>) outs(%1 : tensor<15360x32xf16>) -> tensor<15360x32xf16> - return %2 : tensor<15360x32xf16> - } -} diff --git a/gemm/mlir/gemm_15360_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_4_5120_bf16_tA.mlir deleted file mode 100644 index f388bfc..0000000 --- a/gemm/mlir/gemm_15360_4_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<15360x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<15360x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x4xbf16>) -> tensor<15360x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<15360x4xbf16>) -> tensor<15360x4xbf16> - return %2 : tensor<15360x4xbf16> - } -} diff --git a/gemm/mlir/gemm_15360_4_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_4_5120_f16_tA.mlir deleted file mode 100644 index c8666aa..0000000 --- a/gemm/mlir/gemm_15360_4_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x4xf16>) -> tensor<15360x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<15360x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x4xf16>) -> tensor<15360x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x4xf16>) outs(%1 : tensor<15360x4xf16>) -> tensor<15360x4xf16> - return %2 : 
tensor<15360x4xf16> - } -} diff --git a/gemm/mlir/gemm_15360_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_15360_8_5120_bf16_tA.mlir deleted file mode 100644 index 813f5a1..0000000 --- a/gemm/mlir/gemm_15360_8_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<15360x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<15360x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<15360x8xbf16>) -> tensor<15360x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<15360x8xbf16>) -> tensor<15360x8xbf16> - return %2 : tensor<15360x8xbf16> - } -} diff --git a/gemm/mlir/gemm_15360_8_5120_f16_tA.mlir b/gemm/mlir/gemm_15360_8_5120_f16_tA.mlir deleted file mode 100644 index 5df7526..0000000 --- a/gemm/mlir/gemm_15360_8_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x15360xf16>, %arg1: tensor<5120x8xf16>) -> tensor<15360x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<15360x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<15360x8xf16>) -> tensor<15360x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x15360xf16>, tensor<5120x8xf16>) outs(%1 : tensor<15360x8xf16>) -> tensor<15360x8xf16> - return %2 : tensor<15360x8xf16> - } -} diff --git a/gemm/mlir/gemm_16000_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_16_5120_bf16_tA.mlir deleted file mode 100644 index 50136f8..0000000 --- a/gemm/mlir/gemm_16000_16_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<16000x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x16xbf16>) -> tensor<16000x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<16000x16xbf16>) -> tensor<16000x16xbf16> - return %2 : tensor<16000x16xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_16_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_16_5120_f16_tA.mlir deleted file mode 100644 index e0ebb71..0000000 --- a/gemm/mlir/gemm_16000_16_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<16000x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x16xf16>) -> tensor<16000x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x16xf16>) outs(%1 : tensor<16000x16xf16>) -> tensor<16000x16xf16> - return %2 : tensor<16000x16xf16> - } -} diff --git a/gemm/mlir/gemm_16000_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_16_8192_bf16_tA.mlir deleted file mode 100644 index 95ae5e6..0000000 --- a/gemm/mlir/gemm_16000_16_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<16000x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x16xbf16>) -> tensor<16000x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<16000x16xbf16>) -> 
tensor<16000x16xbf16> - return %2 : tensor<16000x16xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_16_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_16_8192_f16_tA.mlir deleted file mode 100644 index c1107cc..0000000 --- a/gemm/mlir/gemm_16000_16_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<16000x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x16xf16>) -> tensor<16000x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x16xf16>) outs(%1 : tensor<16000x16xf16>) -> tensor<16000x16xf16> - return %2 : tensor<16000x16xf16> - } -} diff --git a/gemm/mlir/gemm_16000_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_1_5120_bf16_tA.mlir deleted file mode 100644 index d0fc2f2..0000000 --- a/gemm/mlir/gemm_16000_1_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<16000x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x1xbf16>) -> tensor<16000x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<16000x1xbf16>) -> tensor<16000x1xbf16> - return %2 : tensor<16000x1xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_1_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_1_5120_f16_tA.mlir deleted file mode 100644 index 7182791..0000000 --- a/gemm/mlir/gemm_16000_1_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<16000x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x1xf16>) -> tensor<16000x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x1xf16>) outs(%1 : tensor<16000x1xf16>) -> tensor<16000x1xf16> - return %2 : tensor<16000x1xf16> - } -} diff --git a/gemm/mlir/gemm_16000_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_1_8192_bf16_tA.mlir deleted file mode 100644 index 8258663..0000000 --- a/gemm/mlir/gemm_16000_1_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<16000x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x1xbf16>) -> tensor<16000x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<16000x1xbf16>) -> tensor<16000x1xbf16> - return %2 : tensor<16000x1xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_1_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_1_8192_f16_tA.mlir deleted file mode 100644 index 8186ad5..0000000 --- a/gemm/mlir/gemm_16000_1_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<16000x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x1xf16>) -> tensor<16000x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x1xf16>) outs(%1 : tensor<16000x1xf16>) -> 
tensor<16000x1xf16> - return %2 : tensor<16000x1xf16> - } -} diff --git a/gemm/mlir/gemm_16000_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_2_5120_bf16_tA.mlir deleted file mode 100644 index 11c07f2..0000000 --- a/gemm/mlir/gemm_16000_2_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<16000x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x2xbf16>) -> tensor<16000x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<16000x2xbf16>) -> tensor<16000x2xbf16> - return %2 : tensor<16000x2xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_2_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_2_5120_f16_tA.mlir deleted file mode 100644 index 3efeb6a..0000000 --- a/gemm/mlir/gemm_16000_2_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<16000x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x2xf16>) -> tensor<16000x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x2xf16>) outs(%1 : tensor<16000x2xf16>) -> tensor<16000x2xf16> - return %2 : tensor<16000x2xf16> - } -} diff --git a/gemm/mlir/gemm_16000_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_2_8192_bf16_tA.mlir deleted file mode 100644 index 28e4d63..0000000 --- a/gemm/mlir/gemm_16000_2_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<16000x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x2xbf16>) -> tensor<16000x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<16000x2xbf16>) -> tensor<16000x2xbf16> - return %2 : tensor<16000x2xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_2_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_2_8192_f16_tA.mlir deleted file mode 100644 index 8c125de..0000000 --- a/gemm/mlir/gemm_16000_2_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<16000x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x2xf16>) -> tensor<16000x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x2xf16>) outs(%1 : tensor<16000x2xf16>) -> tensor<16000x2xf16> - return %2 : tensor<16000x2xf16> - } -} diff --git a/gemm/mlir/gemm_16000_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_32_5120_bf16_tA.mlir deleted file mode 100644 index a47ce25..0000000 --- a/gemm/mlir/gemm_16000_32_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<16000x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x32xbf16>) -> tensor<16000x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<16000x32xbf16>) 
-> tensor<16000x32xbf16> - return %2 : tensor<16000x32xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_32_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_32_5120_f16_tA.mlir deleted file mode 100644 index 5ea27d7..0000000 --- a/gemm/mlir/gemm_16000_32_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<16000x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x32xf16>) -> tensor<16000x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x32xf16>) outs(%1 : tensor<16000x32xf16>) -> tensor<16000x32xf16> - return %2 : tensor<16000x32xf16> - } -} diff --git a/gemm/mlir/gemm_16000_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_32_8192_bf16_tA.mlir deleted file mode 100644 index 72308e0..0000000 --- a/gemm/mlir/gemm_16000_32_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<16000x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x32xbf16>) -> tensor<16000x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<16000x32xbf16>) -> tensor<16000x32xbf16> - return %2 : tensor<16000x32xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_32_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_32_8192_f16_tA.mlir deleted file mode 100644 index e5f6d3b..0000000 --- a/gemm/mlir/gemm_16000_32_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<16000x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x32xf16>) -> tensor<16000x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x32xf16>) outs(%1 : tensor<16000x32xf16>) -> tensor<16000x32xf16> - return %2 : tensor<16000x32xf16> - } -} diff --git a/gemm/mlir/gemm_16000_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_4_5120_bf16_tA.mlir deleted file mode 100644 index a514a47..0000000 --- a/gemm/mlir/gemm_16000_4_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<16000x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x4xbf16>) -> tensor<16000x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<16000x4xbf16>) -> tensor<16000x4xbf16> - return %2 : tensor<16000x4xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_4_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_4_5120_f16_tA.mlir deleted file mode 100644 index 1b73c07..0000000 --- a/gemm/mlir/gemm_16000_4_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<16000x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x4xf16>) -> tensor<16000x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x4xf16>) outs(%1 : 
tensor<16000x4xf16>) -> tensor<16000x4xf16> - return %2 : tensor<16000x4xf16> - } -} diff --git a/gemm/mlir/gemm_16000_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_4_8192_bf16_tA.mlir deleted file mode 100644 index 1de70e2..0000000 --- a/gemm/mlir/gemm_16000_4_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<16000x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x4xbf16>) -> tensor<16000x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<16000x4xbf16>) -> tensor<16000x4xbf16> - return %2 : tensor<16000x4xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_4_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_4_8192_f16_tA.mlir deleted file mode 100644 index a035de1..0000000 --- a/gemm/mlir/gemm_16000_4_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<16000x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x4xf16>) -> tensor<16000x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x4xf16>) outs(%1 : tensor<16000x4xf16>) -> tensor<16000x4xf16> - return %2 : tensor<16000x4xf16> - } -} diff --git a/gemm/mlir/gemm_16000_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_16000_8_5120_bf16_tA.mlir deleted file mode 100644 index 23c98e5..0000000 --- a/gemm/mlir/gemm_16000_8_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<16000x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x8xbf16>) -> tensor<16000x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<16000x8xbf16>) -> tensor<16000x8xbf16> - return %2 : tensor<16000x8xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_8_5120_f16_tA.mlir b/gemm/mlir/gemm_16000_8_5120_f16_tA.mlir deleted file mode 100644 index 25ea2f2..0000000 --- a/gemm/mlir/gemm_16000_8_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x16000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<16000x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x8xf16>) -> tensor<16000x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x16000xf16>, tensor<5120x8xf16>) outs(%1 : tensor<16000x8xf16>) -> tensor<16000x8xf16> - return %2 : tensor<16000x8xf16> - } -} diff --git a/gemm/mlir/gemm_16000_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_16000_8_8192_bf16_tA.mlir deleted file mode 100644 index 8b5ce5a..0000000 --- a/gemm/mlir/gemm_16000_8_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<16000x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<16000x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<16000x8xbf16>) -> tensor<16000x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xbf16>, tensor<8192x8xbf16>) outs(%1 : 
tensor<16000x8xbf16>) -> tensor<16000x8xbf16> - return %2 : tensor<16000x8xbf16> - } -} diff --git a/gemm/mlir/gemm_16000_8_8192_f16_tA.mlir b/gemm/mlir/gemm_16000_8_8192_f16_tA.mlir deleted file mode 100644 index b53f1c0..0000000 --- a/gemm/mlir/gemm_16000_8_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x16000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<16000x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<16000x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<16000x8xf16>) -> tensor<16000x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x16000xf16>, tensor<8192x8xf16>) outs(%1 : tensor<16000x8xf16>) -> tensor<16000x8xf16> - return %2 : tensor<16000x8xf16> - } -} diff --git a/gemm/mlir/gemm_1920_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_16_5120_bf16_tA.mlir deleted file mode 100644 index 0498cb4..0000000 --- a/gemm/mlir/gemm_1920_16_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<1920x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1920x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x16xbf16>) -> tensor<1920x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<1920x16xbf16>) -> tensor<1920x16xbf16> - return %2 : tensor<1920x16xbf16> - } -} diff --git a/gemm/mlir/gemm_1920_16_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_16_5120_f16_tA.mlir deleted file mode 100644 index 7a26a60..0000000 --- a/gemm/mlir/gemm_1920_16_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x16xf16>) -> tensor<1920x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1920x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x16xf16>) -> tensor<1920x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x16xf16>) outs(%1 : tensor<1920x16xf16>) -> tensor<1920x16xf16> - return %2 : tensor<1920x16xf16> - } -} diff --git a/gemm/mlir/gemm_1920_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_1_5120_bf16_tA.mlir deleted file mode 100644 index 69a8142..0000000 --- a/gemm/mlir/gemm_1920_1_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<1920x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1920x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x1xbf16>) -> tensor<1920x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<1920x1xbf16>) -> tensor<1920x1xbf16> - return %2 : tensor<1920x1xbf16> - } -} diff --git a/gemm/mlir/gemm_1920_1_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_1_5120_f16_tA.mlir deleted file mode 100644 index 7f56072..0000000 --- a/gemm/mlir/gemm_1920_1_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x1xf16>) -> tensor<1920x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1920x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x1xf16>) -> tensor<1920x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x1xf16>) outs(%1 : tensor<1920x1xf16>) -> 
tensor<1920x1xf16> - return %2 : tensor<1920x1xf16> - } -} diff --git a/gemm/mlir/gemm_1920_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_2_5120_bf16_tA.mlir deleted file mode 100644 index 8241b87..0000000 --- a/gemm/mlir/gemm_1920_2_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<1920x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1920x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x2xbf16>) -> tensor<1920x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<1920x2xbf16>) -> tensor<1920x2xbf16> - return %2 : tensor<1920x2xbf16> - } -} diff --git a/gemm/mlir/gemm_1920_2_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_2_5120_f16_tA.mlir deleted file mode 100644 index 8410b70..0000000 --- a/gemm/mlir/gemm_1920_2_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x2xf16>) -> tensor<1920x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1920x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x2xf16>) -> tensor<1920x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x2xf16>) outs(%1 : tensor<1920x2xf16>) -> tensor<1920x2xf16> - return %2 : tensor<1920x2xf16> - } -} diff --git a/gemm/mlir/gemm_1920_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_32_5120_bf16_tA.mlir deleted file mode 100644 index fb33ba0..0000000 --- a/gemm/mlir/gemm_1920_32_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<1920x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1920x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x32xbf16>) -> tensor<1920x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<1920x32xbf16>) -> tensor<1920x32xbf16> - return %2 : tensor<1920x32xbf16> - } -} diff --git a/gemm/mlir/gemm_1920_32_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_32_5120_f16_tA.mlir deleted file mode 100644 index 17e9ebc..0000000 --- a/gemm/mlir/gemm_1920_32_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x32xf16>) -> tensor<1920x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1920x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x32xf16>) -> tensor<1920x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x32xf16>) outs(%1 : tensor<1920x32xf16>) -> tensor<1920x32xf16> - return %2 : tensor<1920x32xf16> - } -} diff --git a/gemm/mlir/gemm_1920_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_4_5120_bf16_tA.mlir deleted file mode 100644 index 5f1c806..0000000 --- a/gemm/mlir/gemm_1920_4_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<1920x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1920x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x4xbf16>) -> tensor<1920x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<1920x4xbf16>) -> tensor<1920x4xbf16> - return %2 : 
tensor<1920x4xbf16> - } -} diff --git a/gemm/mlir/gemm_1920_4_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_4_5120_f16_tA.mlir deleted file mode 100644 index cd45416..0000000 --- a/gemm/mlir/gemm_1920_4_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x4xf16>) -> tensor<1920x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1920x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x4xf16>) -> tensor<1920x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x4xf16>) outs(%1 : tensor<1920x4xf16>) -> tensor<1920x4xf16> - return %2 : tensor<1920x4xf16> - } -} diff --git a/gemm/mlir/gemm_1920_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_1920_8_5120_bf16_tA.mlir deleted file mode 100644 index bb5ee3c..0000000 --- a/gemm/mlir/gemm_1920_8_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<1920x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<1920x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<1920x8xbf16>) -> tensor<1920x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<1920x8xbf16>) -> tensor<1920x8xbf16> - return %2 : tensor<1920x8xbf16> - } -} diff --git a/gemm/mlir/gemm_1920_8_5120_f16_tA.mlir b/gemm/mlir/gemm_1920_8_5120_f16_tA.mlir deleted file mode 100644 index 7f94a48..0000000 --- a/gemm/mlir/gemm_1920_8_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x1920xf16>, %arg1: tensor<5120x8xf16>) -> tensor<1920x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1920x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<1920x8xf16>) -> tensor<1920x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x1920xf16>, tensor<5120x8xf16>) outs(%1 : tensor<1920x8xf16>) -> tensor<1920x8xf16> - return %2 : tensor<1920x8xf16> - } -} diff --git a/gemm/mlir/gemm_2048_10240_1280_bf16.mlir b/gemm/mlir/gemm_2048_10240_1280_bf16.mlir deleted file mode 100644 index d65d3a7..0000000 --- a/gemm/mlir/gemm_2048_10240_1280_bf16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<1280x10240xbf16>) -> tensor<2048x10240xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2048x10240xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1280xbf16>, tensor<1280x10240xbf16>) outs(%1 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> - return %2 : tensor<2048x10240xbf16> - } -} diff --git a/gemm/mlir/gemm_2048_10240_1280_bf16_tA.mlir b/gemm/mlir/gemm_2048_10240_1280_bf16_tA.mlir deleted file mode 100644 index 84241c7..0000000 --- a/gemm/mlir/gemm_2048_10240_1280_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x2048xbf16>, %arg1: tensor<1280x10240xbf16>) -> tensor<2048x10240xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2048x10240xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x2048xbf16>, tensor<1280x10240xbf16>) outs(%1 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> - return %2 
: tensor<2048x10240xbf16> - } -} diff --git a/gemm/mlir/gemm_2048_10240_1280_bf16_tB.mlir b/gemm/mlir/gemm_2048_10240_1280_bf16_tB.mlir deleted file mode 100644 index 28e61ff..0000000 --- a/gemm/mlir/gemm_2048_10240_1280_bf16_tB.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<10240x1280xbf16>) -> tensor<2048x10240xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2048x10240xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> - %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2048x1280xbf16>, tensor<10240x1280xbf16>) outs(%1 : tensor<2048x10240xbf16>) -> tensor<2048x10240xbf16> - return %2 : tensor<2048x10240xbf16> - } -} diff --git a/gemm/mlir/gemm_2048_10240_1280_f16.mlir b/gemm/mlir/gemm_2048_10240_1280_f16.mlir deleted file mode 100644 index e3bbec5..0000000 --- a/gemm/mlir/gemm_2048_10240_1280_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x1280xf16>, %arg1: tensor<1280x10240xf16>) -> tensor<2048x10240xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x10240xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1280xf16>, tensor<1280x10240xf16>) outs(%1 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> - return %2 : tensor<2048x10240xf16> - } -} diff --git a/gemm/mlir/gemm_2048_10240_1280_f16_tA.mlir b/gemm/mlir/gemm_2048_10240_1280_f16_tA.mlir deleted file mode 100644 index 6a0033c..0000000 --- a/gemm/mlir/gemm_2048_10240_1280_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x2048xf16>, %arg1: tensor<1280x10240xf16>) -> tensor<2048x10240xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x10240xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x2048xf16>, tensor<1280x10240xf16>) outs(%1 : tensor<2048x10240xf16>) -> tensor<2048x10240xf16> - return %2 : tensor<2048x10240xf16> - } -} diff --git a/gemm/mlir/gemm_2048_10240_1280_f16_tB.mlir b/gemm/mlir/gemm_2048_10240_1280_f16_tB.mlir deleted file mode 100644 index 5513769..0000000 --- a/gemm/mlir/gemm_2048_10240_1280_f16_tB.mlir +++ /dev/null @@ -1,145 +0,0 @@ -#translation = #iree_codegen.translation_info -module attributes {transform.with_named_sequence} { - stream.executable private @gemm { - stream.executable.export public @gemm workgroups() -> (index, index, index) { - %c32 = arith.constant 32 : index - %c160 = arith.constant 160 : index - %c1 = arith.constant 1 : index - stream.return %c32, %c160, %c1 : index, index, index - } - builtin.module { - func.func @gemm(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) attributes {translation_info = #translation} { - %c19 = arith.constant 19 : index - %c18 = arith.constant 18 : index - %c17 = arith.constant 17 : index - %c3 = arith.constant 3 : index - %c2 = arith.constant 2 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c4 = arith.constant 4 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c40 = arith.constant 40 : index - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.000000e+00> : vector<4xf32> - %workgroup_id_0 = stream.dispatch.workgroup.id[0] 
: index - %workgroup_id_1 = stream.dispatch.workgroup.id[1] : index - %thread_id_x = gpu.thread_id x - %thread_id_y = gpu.thread_id y - %alloc = memref.alloc() : memref<64x32xf16, #gpu.address_space<workgroup>> - %alloc_0 = memref.alloc() : memref<64x32xf16, #gpu.address_space<workgroup>> - %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> memref<2048x1280xf16, strided<[1280, 1], offset: ?>> - %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> memref<10240x1280xf16, strided<[1280, 1], offset: ?>> - %2 = arith.muli %workgroup_id_0, %c64 : index - %3 = arith.muli %thread_id_y, %c32 : index - %4 = arith.divsi %thread_id_x, %c4 : index - %5 = arith.addi %4, %3 : index - %6 = arith.remsi %5, %c64 : index - %7 = arith.addi %6, %2 : index - %8 = arith.remsi %thread_id_x, %c4 : index - %9 = arith.muli %8, %c8 : index - %10 = arith.divsi %thread_id_x, %c64 : index - %11 = arith.muli %10, %c32 : index - %12 = arith.remsi %thread_id_x, %c16 : index - %13 = arith.addi %12, %11 : index - %14 = arith.remsi %thread_id_x, %c64 : index - %15 = arith.divsi %14, %c16 : index - %16 = arith.muli %15, %c4 : index - %17 = arith.addi %16, %c16 : index - %18 = arith.addi %13, %c16 : index - %19 = arith.muli %workgroup_id_1, %c64 : index - %20 = arith.addi %6, %19 : index - %21 = arith.addi %12, %3 : index - %22 = arith.addi %21, %c16 : index - %23:4 = scf.for %arg3 = %c0 to %c40 step %c1 iter_args(%arg4 = %cst, %arg5 = %cst, %arg6 = %cst, %arg7 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) { - %62 = arith.muli %arg3, %c32 : index - %63 = arith.addi %62, %9 : index - %64 = vector.load %0[%7, %63] : memref<2048x1280xf16, strided<[1280, 1], offset: ?>>, vector<8xf16> - vector.store %64, %alloc[%6, %9] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<8xf16> - amdgpu.lds_barrier - %65 = vector.load %alloc[%13, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %66 = vector.load %alloc[%13, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %67 = vector.load %alloc[%18, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %68 = vector.load %alloc[%18, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %69 = vector.load %1[%20, %63] : memref<10240x1280xf16, strided<[1280, 1], offset: ?>>, vector<8xf16> - amdgpu.lds_barrier - vector.store %69, %alloc_0[%6, %9] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<8xf16> - amdgpu.lds_barrier - %70 = vector.load %alloc_0[%21, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %71 = vector.load %alloc_0[%21, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %72 = vector.load %alloc_0[%22, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %73 = vector.load %alloc_0[%22, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %74 = amdgpu.mfma %65 * %70 + %arg4 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %75 = amdgpu.mfma %66 * %71 + %74 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %76 = amdgpu.mfma %67 * %72 + %arg7 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %77 = amdgpu.mfma %68 * %73 + %76 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %78 = amdgpu.mfma %67 * %70 + %arg6 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, 
vector<4xf32> - %79 = amdgpu.mfma %68 * %71 + %78 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %80 = amdgpu.mfma %65 * %72 + %arg5 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %81 = amdgpu.mfma %66 * %73 + %80 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - scf.yield %75, %81, %79, %77 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32> - } - %24 = vector.extract_strided_slice %23#0 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %25 = stream.binding.subspan %arg2[%c0] : !stream.binding -> memref<2048x10240xf32, strided<[10240, 1], offset: ?>> - %26 = arith.remsi %thread_id_x, %c64 : index - %27 = arith.divsi %26, %c16 : index - %28 = arith.muli %27, %c4 : index - %29 = arith.divsi %thread_id_x, %c64 : index - %30 = arith.muli %29, %c32 : index - %31 = arith.muli %workgroup_id_0, %c64 : index - %32 = arith.addi %31, %30 : index - %33 = arith.addi %32, %28 : index - %34 = arith.muli %thread_id_y, %c32 : index - %35 = arith.muli %workgroup_id_1, %c64 : index - %36 = arith.remsi %thread_id_x, %c16 : index - %37 = arith.addi %36, %35 : index - %38 = arith.addi %37, %34 : index - vector.store %24, %25[%33, %38] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %39 = vector.extract_strided_slice %23#0 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %40 = arith.addi %33, %c1 : index - vector.store %39, %25[%40, %38] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %41 = vector.extract_strided_slice %23#0 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %42 = arith.addi %33, %c2 : index - vector.store %41, %25[%42, %38] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %43 = vector.extract_strided_slice %23#0 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %44 = arith.addi %33, %c3 : index - vector.store %43, %25[%44, %38] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %45 = vector.extract_strided_slice %23#3 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %46 = arith.addi %33, %c16 : index - %47 = arith.addi %38, %c16 : index - vector.store %45, %25[%46, %47] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %48 = vector.extract_strided_slice %23#3 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %49 = arith.addi %33, %c17 : index - vector.store %48, %25[%49, %47] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %50 = vector.extract_strided_slice %23#3 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %51 = arith.addi %33, %c18 : index - vector.store %50, %25[%51, %47] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %52 = vector.extract_strided_slice %23#3 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %53 = arith.addi %33, %c19 : index - vector.store %52, %25[%53, %47] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %54 = vector.extract_strided_slice %23#2 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %54, %25[%46, %38] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, 
vector<1xf32> - %55 = vector.extract_strided_slice %23#2 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %55, %25[%49, %38] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %56 = vector.extract_strided_slice %23#2 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %56, %25[%51, %38] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %57 = vector.extract_strided_slice %23#2 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %57, %25[%53, %38] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %58 = vector.extract_strided_slice %23#1 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %58, %25[%33, %47] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %59 = vector.extract_strided_slice %23#1 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %59, %25[%40, %47] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %60 = vector.extract_strided_slice %23#1 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %60, %25[%42, %47] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - %61 = vector.extract_strided_slice %23#1 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %61, %25[%44, %47] : memref<2048x10240xf32, strided<[10240, 1], offset: ?>>, vector<1xf32> - return - } - } - } - func.func @isolated_benchmark(%arg0: tensor<2048x1280xf16>, %arg1: tensor<10240x1280xf16>) -> tensor<2048x10240xf32> { - %0 = flow.dispatch @gemm::@gemm(%arg0, %arg1) : (tensor<2048x1280xf16>, tensor<10240x1280xf16>) -> tensor<2048x10240xf32> - return %0 : tensor<2048x10240xf32> - } -} diff --git a/gemm/mlir/gemm_2048_1280_1280_bf16.mlir b/gemm/mlir/gemm_2048_1280_1280_bf16.mlir deleted file mode 100644 index 91c8ae2..0000000 --- a/gemm/mlir/gemm_2048_1280_1280_bf16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<1280x1280xbf16>) -> tensor<2048x1280xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2048x1280xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1280xbf16>, tensor<1280x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - return %2 : tensor<2048x1280xbf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_1280_bf16_tA.mlir b/gemm/mlir/gemm_2048_1280_1280_bf16_tA.mlir deleted file mode 100644 index a155776..0000000 --- a/gemm/mlir/gemm_2048_1280_1280_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x2048xbf16>, %arg1: tensor<1280x1280xbf16>) -> tensor<2048x1280xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2048x1280xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x2048xbf16>, tensor<1280x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - return %2 : tensor<2048x1280xbf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_1280_bf16_tB.mlir b/gemm/mlir/gemm_2048_1280_1280_bf16_tB.mlir deleted file mode 100644 index 2087786..0000000 --- 
a/gemm/mlir/gemm_2048_1280_1280_bf16_tB.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2048x1280xbf16>, %arg1: tensor<1280x1280xbf16>) -> tensor<2048x1280xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2048x1280xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2048x1280xbf16>, tensor<1280x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - return %2 : tensor<2048x1280xbf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_1280_f16.mlir b/gemm/mlir/gemm_2048_1280_1280_f16.mlir deleted file mode 100644 index ceb58a2..0000000 --- a/gemm/mlir/gemm_2048_1280_1280_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x1280xf16>, %arg1: tensor<1280x1280xf16>) -> tensor<2048x1280xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x1280xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> - return %2 : tensor<2048x1280xf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_1280_f16_tA.mlir b/gemm/mlir/gemm_2048_1280_1280_f16_tA.mlir deleted file mode 100644 index 13b6466..0000000 --- a/gemm/mlir/gemm_2048_1280_1280_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x2048xf16>, %arg1: tensor<1280x1280xf16>) -> tensor<2048x1280xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x1280xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x2048xf16>, tensor<1280x1280xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> - return %2 : tensor<2048x1280xf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_1280_f16_tB.mlir b/gemm/mlir/gemm_2048_1280_1280_f16_tB.mlir deleted file mode 100644 index 54d210b..0000000 --- a/gemm/mlir/gemm_2048_1280_1280_f16_tB.mlir +++ /dev/null @@ -1,145 +0,0 @@ -#translation = #iree_codegen.translation_info -module attributes {transform.with_named_sequence} { - stream.executable private @gemm { - stream.executable.export public @gemm workgroups() -> (index, index, index) { - %c32 = arith.constant 32 : index - %c20 = arith.constant 20 : index - %c1 = arith.constant 1 : index - stream.return %c32, %c20, %c1 : index, index, index - } - builtin.module { - func.func @gemm(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) attributes {translation_info = #translation} { - %c19 = arith.constant 19 : index - %c18 = arith.constant 18 : index - %c17 = arith.constant 17 : index - %c3 = arith.constant 3 : index - %c2 = arith.constant 2 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c4 = arith.constant 4 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c40 = arith.constant 40 : index - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.000000e+00> : vector<4xf32> - %workgroup_id_0 = stream.dispatch.workgroup.id[0] : index - %workgroup_id_1 = stream.dispatch.workgroup.id[1] : index - %thread_id_x = gpu.thread_id x - %thread_id_y = gpu.thread_id y - %alloc = memref.alloc() : memref<64x32xf16, #gpu.address_space<workgroup>> - %alloc_0 = memref.alloc() 
: memref<64x32xf16, #gpu.address_space<workgroup>> - %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> memref<2048x1280xf16, strided<[1280, 1], offset: ?>> - %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> memref<1280x1280xf16, strided<[1280, 1], offset: ?>> - %2 = arith.muli %workgroup_id_0, %c64 : index - %3 = arith.muli %thread_id_y, %c32 : index - %4 = arith.divsi %thread_id_x, %c4 : index - %5 = arith.addi %4, %3 : index - %6 = arith.remsi %5, %c64 : index - %7 = arith.addi %6, %2 : index - %8 = arith.remsi %thread_id_x, %c4 : index - %9 = arith.muli %8, %c8 : index - %10 = arith.divsi %thread_id_x, %c64 : index - %11 = arith.muli %10, %c32 : index - %12 = arith.remsi %thread_id_x, %c16 : index - %13 = arith.addi %12, %11 : index - %14 = arith.remsi %thread_id_x, %c64 : index - %15 = arith.divsi %14, %c16 : index - %16 = arith.muli %15, %c4 : index - %17 = arith.addi %16, %c16 : index - %18 = arith.addi %13, %c16 : index - %19 = arith.muli %workgroup_id_1, %c64 : index - %20 = arith.addi %6, %19 : index - %21 = arith.addi %12, %3 : index - %22 = arith.addi %21, %c16 : index - %23:4 = scf.for %arg3 = %c0 to %c40 step %c1 iter_args(%arg4 = %cst, %arg5 = %cst, %arg6 = %cst, %arg7 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) { - %62 = arith.muli %arg3, %c32 : index - %63 = arith.addi %62, %9 : index - %64 = vector.load %0[%7, %63] : memref<2048x1280xf16, strided<[1280, 1], offset: ?>>, vector<8xf16> - vector.store %64, %alloc[%6, %9] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<8xf16> - amdgpu.lds_barrier - %65 = vector.load %alloc[%13, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %66 = vector.load %alloc[%13, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %67 = vector.load %alloc[%18, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %68 = vector.load %alloc[%18, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %69 = vector.load %1[%20, %63] : memref<1280x1280xf16, strided<[1280, 1], offset: ?>>, vector<8xf16> - amdgpu.lds_barrier - vector.store %69, %alloc_0[%6, %9] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<8xf16> - amdgpu.lds_barrier - %70 = vector.load %alloc_0[%21, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %71 = vector.load %alloc_0[%21, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %72 = vector.load %alloc_0[%22, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %73 = vector.load %alloc_0[%22, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %74 = amdgpu.mfma %65 * %70 + %arg4 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %75 = amdgpu.mfma %66 * %71 + %74 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %76 = amdgpu.mfma %67 * %72 + %arg7 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %77 = amdgpu.mfma %68 * %73 + %76 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %78 = amdgpu.mfma %67 * %70 + %arg6 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %79 = amdgpu.mfma %68 * %71 + %78 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %80 = amdgpu.mfma %65 * %72 + %arg5 {blocks = 1 : i32, k = 16 
: i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %81 = amdgpu.mfma %66 * %73 + %80 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - scf.yield %75, %81, %79, %77 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32> - } - %24 = vector.extract_strided_slice %23#0 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %25 = stream.binding.subspan %arg2[%c0] : !stream.binding -> memref<2048x1280xf32, strided<[1280, 1], offset: ?>> - %26 = arith.remsi %thread_id_x, %c64 : index - %27 = arith.divsi %26, %c16 : index - %28 = arith.muli %27, %c4 : index - %29 = arith.divsi %thread_id_x, %c64 : index - %30 = arith.muli %29, %c32 : index - %31 = arith.muli %workgroup_id_0, %c64 : index - %32 = arith.addi %31, %30 : index - %33 = arith.addi %32, %28 : index - %34 = arith.muli %thread_id_y, %c32 : index - %35 = arith.muli %workgroup_id_1, %c64 : index - %36 = arith.remsi %thread_id_x, %c16 : index - %37 = arith.addi %36, %35 : index - %38 = arith.addi %37, %34 : index - vector.store %24, %25[%33, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %39 = vector.extract_strided_slice %23#0 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %40 = arith.addi %33, %c1 : index - vector.store %39, %25[%40, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %41 = vector.extract_strided_slice %23#0 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %42 = arith.addi %33, %c2 : index - vector.store %41, %25[%42, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %43 = vector.extract_strided_slice %23#0 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %44 = arith.addi %33, %c3 : index - vector.store %43, %25[%44, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %45 = vector.extract_strided_slice %23#3 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %46 = arith.addi %33, %c16 : index - %47 = arith.addi %38, %c16 : index - vector.store %45, %25[%46, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %48 = vector.extract_strided_slice %23#3 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %49 = arith.addi %33, %c17 : index - vector.store %48, %25[%49, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %50 = vector.extract_strided_slice %23#3 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %51 = arith.addi %33, %c18 : index - vector.store %50, %25[%51, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %52 = vector.extract_strided_slice %23#3 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %53 = arith.addi %33, %c19 : index - vector.store %52, %25[%53, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %54 = vector.extract_strided_slice %23#2 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %54, %25[%46, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %55 = vector.extract_strided_slice %23#2 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %55, %25[%49, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %56 = 
vector.extract_strided_slice %23#2 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %56, %25[%51, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %57 = vector.extract_strided_slice %23#2 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %57, %25[%53, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %58 = vector.extract_strided_slice %23#1 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %58, %25[%33, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %59 = vector.extract_strided_slice %23#1 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %59, %25[%40, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %60 = vector.extract_strided_slice %23#1 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %60, %25[%42, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %61 = vector.extract_strided_slice %23#1 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %61, %25[%44, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - return - } - } - } - func.func @isolated_benchmark(%arg0: tensor<2048x1280xf16>, %arg1: tensor<1280x1280xf16>) -> tensor<2048x1280xf32> { - %0 = flow.dispatch @gemm::@gemm(%arg0, %arg1) : (tensor<2048x1280xf16>, tensor<1280x1280xf16>) -> tensor<2048x1280xf32> - return %0 : tensor<2048x1280xf32> - } -} diff --git a/gemm/mlir/gemm_2048_1280_5120_bf16.mlir b/gemm/mlir/gemm_2048_1280_5120_bf16.mlir deleted file mode 100644 index 6739dcc..0000000 --- a/gemm/mlir/gemm_2048_1280_5120_bf16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x5120xbf16>, %arg1: tensor<5120x1280xbf16>) -> tensor<2048x1280xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2048x1280xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x5120xbf16>, tensor<5120x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - return %2 : tensor<2048x1280xbf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_5120_bf16_tA.mlir b/gemm/mlir/gemm_2048_1280_5120_bf16_tA.mlir deleted file mode 100644 index d99f327..0000000 --- a/gemm/mlir/gemm_2048_1280_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x2048xbf16>, %arg1: tensor<5120x1280xbf16>) -> tensor<2048x1280xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2048x1280xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x2048xbf16>, tensor<5120x1280xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - return %2 : tensor<2048x1280xbf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_5120_bf16_tB.mlir b/gemm/mlir/gemm_2048_1280_5120_bf16_tB.mlir deleted file mode 100644 index ef0bd8e..0000000 --- a/gemm/mlir/gemm_2048_1280_5120_bf16_tB.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2048x5120xbf16>, %arg1: tensor<1280x5120xbf16>) -> tensor<2048x1280xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2048x1280xbf16> - %1 = 
linalg.fill ins(%cst : bf16) outs(%0 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2048x5120xbf16>, tensor<1280x5120xbf16>) outs(%1 : tensor<2048x1280xbf16>) -> tensor<2048x1280xbf16> - return %2 : tensor<2048x1280xbf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_5120_f16.mlir b/gemm/mlir/gemm_2048_1280_5120_f16.mlir deleted file mode 100644 index c75885c..0000000 --- a/gemm/mlir/gemm_2048_1280_5120_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x5120xf16>, %arg1: tensor<5120x1280xf16>) -> tensor<2048x1280xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x1280xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x5120xf16>, tensor<5120x1280xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> - return %2 : tensor<2048x1280xf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_5120_f16_tA.mlir b/gemm/mlir/gemm_2048_1280_5120_f16_tA.mlir deleted file mode 100644 index 3d6fa99..0000000 --- a/gemm/mlir/gemm_2048_1280_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x2048xf16>, %arg1: tensor<5120x1280xf16>) -> tensor<2048x1280xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x1280xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x2048xf16>, tensor<5120x1280xf16>) outs(%1 : tensor<2048x1280xf16>) -> tensor<2048x1280xf16> - return %2 : tensor<2048x1280xf16> - } -} diff --git a/gemm/mlir/gemm_2048_1280_5120_f16_tB.mlir b/gemm/mlir/gemm_2048_1280_5120_f16_tB.mlir deleted file mode 100644 index 9a35119..0000000 --- a/gemm/mlir/gemm_2048_1280_5120_f16_tB.mlir +++ /dev/null @@ -1,145 +0,0 @@ -#translation = #iree_codegen.translation_info -module attributes {transform.with_named_sequence} { - stream.executable private @gemm { - stream.executable.export public @gemm workgroups() -> (index, index, index) { - %c32 = arith.constant 32 : index - %c20 = arith.constant 20 : index - %c1 = arith.constant 1 : index - stream.return %c32, %c20, %c1 : index, index, index - } - builtin.module { - func.func @gemm(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) attributes {translation_info = #translation} { - %c19 = arith.constant 19 : index - %c18 = arith.constant 18 : index - %c17 = arith.constant 17 : index - %c3 = arith.constant 3 : index - %c2 = arith.constant 2 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c4 = arith.constant 4 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c160 = arith.constant 160 : index - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.000000e+00> : vector<4xf32> - %workgroup_id_0 = stream.dispatch.workgroup.id[0] : index - %workgroup_id_1 = stream.dispatch.workgroup.id[1] : index - %thread_id_x = gpu.thread_id x - %thread_id_y = gpu.thread_id y - %alloc = memref.alloc() : memref<64x32xf16, #gpu.address_space<workgroup>> - %alloc_0 = memref.alloc() : memref<64x32xf16, #gpu.address_space<workgroup>> - %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> memref<2048x5120xf16, strided<[5120, 1], offset: ?>> - %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> memref<1280x5120xf16, strided<[5120, 1], offset: ?>> - %2 = arith.muli 
%workgroup_id_0, %c64 : index - %3 = arith.muli %thread_id_y, %c32 : index - %4 = arith.divsi %thread_id_x, %c4 : index - %5 = arith.addi %4, %3 : index - %6 = arith.remsi %5, %c64 : index - %7 = arith.addi %6, %2 : index - %8 = arith.remsi %thread_id_x, %c4 : index - %9 = arith.muli %8, %c8 : index - %10 = arith.divsi %thread_id_x, %c64 : index - %11 = arith.muli %10, %c32 : index - %12 = arith.remsi %thread_id_x, %c16 : index - %13 = arith.addi %12, %11 : index - %14 = arith.remsi %thread_id_x, %c64 : index - %15 = arith.divsi %14, %c16 : index - %16 = arith.muli %15, %c4 : index - %17 = arith.addi %16, %c16 : index - %18 = arith.addi %13, %c16 : index - %19 = arith.muli %workgroup_id_1, %c64 : index - %20 = arith.addi %6, %19 : index - %21 = arith.addi %12, %3 : index - %22 = arith.addi %21, %c16 : index - %23:4 = scf.for %arg3 = %c0 to %c160 step %c1 iter_args(%arg4 = %cst, %arg5 = %cst, %arg6 = %cst, %arg7 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) { - %62 = arith.muli %arg3, %c32 : index - %63 = arith.addi %62, %9 : index - %64 = vector.load %0[%7, %63] : memref<2048x5120xf16, strided<[5120, 1], offset: ?>>, vector<8xf16> - vector.store %64, %alloc[%6, %9] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<8xf16> - amdgpu.lds_barrier - %65 = vector.load %alloc[%13, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %66 = vector.load %alloc[%13, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %67 = vector.load %alloc[%18, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %68 = vector.load %alloc[%18, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %69 = vector.load %1[%20, %63] : memref<1280x5120xf16, strided<[5120, 1], offset: ?>>, vector<8xf16> - amdgpu.lds_barrier - vector.store %69, %alloc_0[%6, %9] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<8xf16> - amdgpu.lds_barrier - %70 = vector.load %alloc_0[%21, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %71 = vector.load %alloc_0[%21, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %72 = vector.load %alloc_0[%22, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %73 = vector.load %alloc_0[%22, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16> - %74 = amdgpu.mfma %65 * %70 + %arg4 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %75 = amdgpu.mfma %66 * %71 + %74 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %76 = amdgpu.mfma %67 * %72 + %arg7 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %77 = amdgpu.mfma %68 * %73 + %76 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %78 = amdgpu.mfma %67 * %70 + %arg6 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %79 = amdgpu.mfma %68 * %71 + %78 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %80 = amdgpu.mfma %65 * %72 + %arg5 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - %81 = amdgpu.mfma %66 * %73 + %80 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - scf.yield %75, %81, %79, %77 : 
vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32> - } - %24 = vector.extract_strided_slice %23#0 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %25 = stream.binding.subspan %arg2[%c0] : !stream.binding -> memref<2048x1280xf32, strided<[1280, 1], offset: ?>> - %26 = arith.remsi %thread_id_x, %c64 : index - %27 = arith.divsi %26, %c16 : index - %28 = arith.muli %27, %c4 : index - %29 = arith.divsi %thread_id_x, %c64 : index - %30 = arith.muli %29, %c32 : index - %31 = arith.muli %workgroup_id_0, %c64 : index - %32 = arith.addi %31, %30 : index - %33 = arith.addi %32, %28 : index - %34 = arith.muli %thread_id_y, %c32 : index - %35 = arith.muli %workgroup_id_1, %c64 : index - %36 = arith.remsi %thread_id_x, %c16 : index - %37 = arith.addi %36, %35 : index - %38 = arith.addi %37, %34 : index - vector.store %24, %25[%33, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %39 = vector.extract_strided_slice %23#0 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %40 = arith.addi %33, %c1 : index - vector.store %39, %25[%40, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %41 = vector.extract_strided_slice %23#0 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %42 = arith.addi %33, %c2 : index - vector.store %41, %25[%42, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %43 = vector.extract_strided_slice %23#0 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %44 = arith.addi %33, %c3 : index - vector.store %43, %25[%44, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %45 = vector.extract_strided_slice %23#3 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %46 = arith.addi %33, %c16 : index - %47 = arith.addi %38, %c16 : index - vector.store %45, %25[%46, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %48 = vector.extract_strided_slice %23#3 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %49 = arith.addi %33, %c17 : index - vector.store %48, %25[%49, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %50 = vector.extract_strided_slice %23#3 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %51 = arith.addi %33, %c18 : index - vector.store %50, %25[%51, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %52 = vector.extract_strided_slice %23#3 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - %53 = arith.addi %33, %c19 : index - vector.store %52, %25[%53, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %54 = vector.extract_strided_slice %23#2 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %54, %25[%46, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %55 = vector.extract_strided_slice %23#2 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %55, %25[%49, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %56 = vector.extract_strided_slice %23#2 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %56, %25[%51, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %57 = vector.extract_strided_slice %23#2 {offsets = [3], sizes = [1], 
strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %57, %25[%53, %38] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %58 = vector.extract_strided_slice %23#1 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %58, %25[%33, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %59 = vector.extract_strided_slice %23#1 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %59, %25[%40, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %60 = vector.extract_strided_slice %23#1 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %60, %25[%42, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - %61 = vector.extract_strided_slice %23#1 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32> - vector.store %61, %25[%44, %47] : memref<2048x1280xf32, strided<[1280, 1], offset: ?>>, vector<1xf32> - return - } - } - } - func.func @isolated_benchmark(%arg0: tensor<2048x5120xf16>, %arg1: tensor<1280x5120xf16>) -> tensor<2048x1280xf32> { - %0 = flow.dispatch @gemm::@gemm(%arg0, %arg1) : (tensor<2048x5120xf16>, tensor<1280x5120xf16>) -> tensor<2048x1280xf32> - return %0 : tensor<2048x1280xf32> - } -} diff --git a/gemm/mlir/gemm_2048_2048_1024_f16.mlir b/gemm/mlir/gemm_2048_2048_1024_f16.mlir deleted file mode 100644 index cc77455..0000000 --- a/gemm/mlir/gemm_2048_2048_1024_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x1024xf16>, %arg1: tensor<1024x2048xf16>) -> tensor<2048x2048xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x2048xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1024xf16>, tensor<1024x2048xf16>) outs(%1 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> - return %2 : tensor<2048x2048xf16> - } -} diff --git a/gemm/mlir/gemm_2048_2048_65536_f16.mlir b/gemm/mlir/gemm_2048_2048_65536_f16.mlir deleted file mode 100644 index 34b9849..0000000 --- a/gemm/mlir/gemm_2048_2048_65536_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x65536xf16>, %arg1: tensor<65536x2048xf16>) -> tensor<2048x2048xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x2048xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x65536xf16>, tensor<65536x2048xf16>) outs(%1 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> - return %2 : tensor<2048x2048xf16> - } -} diff --git a/gemm/mlir/gemm_2048_2048_8192_f16.mlir b/gemm/mlir/gemm_2048_2048_8192_f16.mlir deleted file mode 100644 index e9f3dd8..0000000 --- a/gemm/mlir/gemm_2048_2048_8192_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<8192x2048xf16>) -> tensor<2048x2048xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x2048xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<8192x2048xf16>) outs(%1 : tensor<2048x2048xf16>) -> tensor<2048x2048xf16> - return %2 : tensor<2048x2048xf16> - } -} diff --git a/gemm/mlir/gemm_2048_8192_1024_f16.mlir b/gemm/mlir/gemm_2048_8192_1024_f16.mlir 
deleted file mode 100644 index edfa213..0000000 --- a/gemm/mlir/gemm_2048_8192_1024_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x1024xf16>, %arg1: tensor<1024x8192xf16>) -> tensor<2048x8192xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x8192xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x1024xf16>, tensor<1024x8192xf16>) outs(%1 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> - return %2 : tensor<2048x8192xf16> - } -} diff --git a/gemm/mlir/gemm_2048_8192_65536_f16.mlir b/gemm/mlir/gemm_2048_8192_65536_f16.mlir deleted file mode 100644 index e419b78..0000000 --- a/gemm/mlir/gemm_2048_8192_65536_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x65536xf16>, %arg1: tensor<65536x8192xf16>) -> tensor<2048x8192xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x8192xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x65536xf16>, tensor<65536x8192xf16>) outs(%1 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> - return %2 : tensor<2048x8192xf16> - } -} diff --git a/gemm/mlir/gemm_2048_8192_8192_f16.mlir b/gemm/mlir/gemm_2048_8192_8192_f16.mlir deleted file mode 100644 index cc93de1..0000000 --- a/gemm/mlir/gemm_2048_8192_8192_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<8192x8192xf16>) -> tensor<2048x8192xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2048x8192xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<8192x8192xf16>) outs(%1 : tensor<2048x8192xf16>) -> tensor<2048x8192xf16> - return %2 : tensor<2048x8192xf16> - } -} diff --git a/gemm/mlir/gemm_2560_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_16_8192_bf16_tA.mlir deleted file mode 100644 index 1ab9cc0..0000000 --- a/gemm/mlir/gemm_2560_16_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<2560x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2560x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x16xbf16>) -> tensor<2560x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<2560x16xbf16>) -> tensor<2560x16xbf16> - return %2 : tensor<2560x16xbf16> - } -} diff --git a/gemm/mlir/gemm_2560_16_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_16_8192_f16_tA.mlir deleted file mode 100644 index fd4d377..0000000 --- a/gemm/mlir/gemm_2560_16_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x16xf16>) -> tensor<2560x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2560x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x16xf16>) -> tensor<2560x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x16xf16>) outs(%1 : tensor<2560x16xf16>) -> tensor<2560x16xf16> - return %2 : tensor<2560x16xf16> - } -} diff --git a/gemm/mlir/gemm_2560_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_1_8192_bf16_tA.mlir deleted file mode 100644 
index bf23aca..0000000 --- a/gemm/mlir/gemm_2560_1_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<2560x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2560x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x1xbf16>) -> tensor<2560x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<2560x1xbf16>) -> tensor<2560x1xbf16> - return %2 : tensor<2560x1xbf16> - } -} diff --git a/gemm/mlir/gemm_2560_1_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_1_8192_f16_tA.mlir deleted file mode 100644 index e6b86b4..0000000 --- a/gemm/mlir/gemm_2560_1_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x1xf16>) -> tensor<2560x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2560x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x1xf16>) -> tensor<2560x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x1xf16>) outs(%1 : tensor<2560x1xf16>) -> tensor<2560x1xf16> - return %2 : tensor<2560x1xf16> - } -} diff --git a/gemm/mlir/gemm_2560_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_2_8192_bf16_tA.mlir deleted file mode 100644 index de185be..0000000 --- a/gemm/mlir/gemm_2560_2_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<2560x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2560x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x2xbf16>) -> tensor<2560x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<2560x2xbf16>) -> tensor<2560x2xbf16> - return %2 : tensor<2560x2xbf16> - } -} diff --git a/gemm/mlir/gemm_2560_2_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_2_8192_f16_tA.mlir deleted file mode 100644 index 3e6664e..0000000 --- a/gemm/mlir/gemm_2560_2_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x2xf16>) -> tensor<2560x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<2560x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x2xf16>) -> tensor<2560x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x2xf16>) outs(%1 : tensor<2560x2xf16>) -> tensor<2560x2xf16> - return %2 : tensor<2560x2xf16> - } -} diff --git a/gemm/mlir/gemm_2560_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_32_8192_bf16_tA.mlir deleted file mode 100644 index 45d0840..0000000 --- a/gemm/mlir/gemm_2560_32_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<2560x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2560x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x32xbf16>) -> tensor<2560x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<2560x32xbf16>) -> tensor<2560x32xbf16> - return %2 : tensor<2560x32xbf16> - } -} diff --git a/gemm/mlir/gemm_2560_32_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_32_8192_f16_tA.mlir deleted file mode 100644 index 456b6c6..0000000 --- 
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x32xf16>) -> tensor<2560x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<2560x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x32xf16>) -> tensor<2560x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x32xf16>) outs(%1 : tensor<2560x32xf16>) -> tensor<2560x32xf16>
-    return %2 : tensor<2560x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_2560_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_4_8192_bf16_tA.mlir
deleted file mode 100644
index d377ec1..0000000
--- a/gemm/mlir/gemm_2560_4_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<2560x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<2560x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x4xbf16>) -> tensor<2560x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<2560x4xbf16>) -> tensor<2560x4xbf16>
-    return %2 : tensor<2560x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_2560_4_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_4_8192_f16_tA.mlir
deleted file mode 100644
index a152ec3..0000000
--- a/gemm/mlir/gemm_2560_4_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x4xf16>) -> tensor<2560x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<2560x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x4xf16>) -> tensor<2560x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x4xf16>) outs(%1 : tensor<2560x4xf16>) -> tensor<2560x4xf16>
-    return %2 : tensor<2560x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_2560_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_2560_8_8192_bf16_tA.mlir
deleted file mode 100644
index 76c1250..0000000
--- a/gemm/mlir/gemm_2560_8_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x2560xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<2560x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<2560x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2560x8xbf16>) -> tensor<2560x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<2560x8xbf16>) -> tensor<2560x8xbf16>
-    return %2 : tensor<2560x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_2560_8_8192_f16_tA.mlir b/gemm/mlir/gemm_2560_8_8192_f16_tA.mlir
deleted file mode 100644
index fff4a68..0000000
--- a/gemm/mlir/gemm_2560_8_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x2560xf16>, %arg1: tensor<8192x8xf16>) -> tensor<2560x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<2560x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<2560x8xf16>) -> tensor<2560x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x2560xf16>, tensor<8192x8xf16>) outs(%1 : tensor<2560x8xf16>) -> tensor<2560x8xf16>
-    return %2 : tensor<2560x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_16_5120_bf16_tA.mlir
deleted file mode 100644
index e06171a..0000000
--- a/gemm/mlir/gemm_27648_16_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<27648x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<27648x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x16xbf16>) -> tensor<27648x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<27648x16xbf16>) -> tensor<27648x16xbf16>
-    return %2 : tensor<27648x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_16_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_16_5120_f16_tA.mlir
deleted file mode 100644
index 9af970d..0000000
--- a/gemm/mlir/gemm_27648_16_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x16xf16>) -> tensor<27648x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<27648x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x16xf16>) -> tensor<27648x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x16xf16>) outs(%1 : tensor<27648x16xf16>) -> tensor<27648x16xf16>
-    return %2 : tensor<27648x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_1_5120_bf16_tA.mlir
deleted file mode 100644
index dda9b15..0000000
--- a/gemm/mlir/gemm_27648_1_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<27648x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<27648x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x1xbf16>) -> tensor<27648x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<27648x1xbf16>) -> tensor<27648x1xbf16>
-    return %2 : tensor<27648x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_1_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_1_5120_f16_tA.mlir
deleted file mode 100644
index f2d5c42..0000000
--- a/gemm/mlir/gemm_27648_1_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x1xf16>) -> tensor<27648x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<27648x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x1xf16>) -> tensor<27648x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x1xf16>) outs(%1 : tensor<27648x1xf16>) -> tensor<27648x1xf16>
-    return %2 : tensor<27648x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_2_5120_bf16_tA.mlir
deleted file mode 100644
index e16cd24..0000000
--- a/gemm/mlir/gemm_27648_2_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<27648x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<27648x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x2xbf16>) -> tensor<27648x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<27648x2xbf16>) -> tensor<27648x2xbf16>
-    return %2 : tensor<27648x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_2_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_2_5120_f16_tA.mlir
deleted file mode 100644
index dcf4508..0000000
--- a/gemm/mlir/gemm_27648_2_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x2xf16>) -> tensor<27648x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<27648x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x2xf16>) -> tensor<27648x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x2xf16>) outs(%1 : tensor<27648x2xf16>) -> tensor<27648x2xf16>
-    return %2 : tensor<27648x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_32_5120_bf16_tA.mlir
deleted file mode 100644
index 0a408fd..0000000
--- a/gemm/mlir/gemm_27648_32_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<27648x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<27648x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x32xbf16>) -> tensor<27648x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<27648x32xbf16>) -> tensor<27648x32xbf16>
-    return %2 : tensor<27648x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_32_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_32_5120_f16_tA.mlir
deleted file mode 100644
index 90927a3..0000000
--- a/gemm/mlir/gemm_27648_32_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x32xf16>) -> tensor<27648x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<27648x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x32xf16>) -> tensor<27648x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x32xf16>) outs(%1 : tensor<27648x32xf16>) -> tensor<27648x32xf16>
-    return %2 : tensor<27648x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_4_5120_bf16_tA.mlir
deleted file mode 100644
index 20f2150..0000000
--- a/gemm/mlir/gemm_27648_4_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<27648x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<27648x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x4xbf16>) -> tensor<27648x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<27648x4xbf16>) -> tensor<27648x4xbf16>
-    return %2 : tensor<27648x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_4_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_4_5120_f16_tA.mlir
deleted file mode 100644
index af948b5..0000000
--- a/gemm/mlir/gemm_27648_4_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x4xf16>) -> tensor<27648x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<27648x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x4xf16>) -> tensor<27648x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x4xf16>) outs(%1 : tensor<27648x4xf16>) -> tensor<27648x4xf16>
-    return %2 : tensor<27648x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_27648_8_5120_bf16_tA.mlir
deleted file mode 100644
index fd43a3e..0000000
--- a/gemm/mlir/gemm_27648_8_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<27648x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<27648x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<27648x8xbf16>) -> tensor<27648x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<27648x8xbf16>) -> tensor<27648x8xbf16>
-    return %2 : tensor<27648x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_27648_8_5120_f16_tA.mlir b/gemm/mlir/gemm_27648_8_5120_f16_tA.mlir
deleted file mode 100644
index 6d0ec2e..0000000
--- a/gemm/mlir/gemm_27648_8_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x27648xf16>, %arg1: tensor<5120x8xf16>) -> tensor<27648x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<27648x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<27648x8xf16>) -> tensor<27648x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x27648xf16>, tensor<5120x8xf16>) outs(%1 : tensor<27648x8xf16>) -> tensor<27648x8xf16>
-    return %2 : tensor<27648x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_16_8192_bf16_tA.mlir
deleted file mode 100644
index 10c20ee..0000000
--- a/gemm/mlir/gemm_28672_16_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<28672x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<28672x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x16xbf16>) -> tensor<28672x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<28672x16xbf16>) -> tensor<28672x16xbf16>
-    return %2 : tensor<28672x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_16_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_16_8192_f16_tA.mlir
deleted file mode 100644
index f923157..0000000
--- a/gemm/mlir/gemm_28672_16_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x16xf16>) -> tensor<28672x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<28672x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x16xf16>) -> tensor<28672x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x16xf16>) outs(%1 : tensor<28672x16xf16>) -> tensor<28672x16xf16>
-    return %2 : tensor<28672x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_1_8192_bf16_tA.mlir
deleted file mode 100644
index 6a24568..0000000
--- a/gemm/mlir/gemm_28672_1_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<28672x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<28672x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x1xbf16>) -> tensor<28672x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<28672x1xbf16>) -> tensor<28672x1xbf16>
-    return %2 : tensor<28672x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_1_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_1_8192_f16_tA.mlir
deleted file mode 100644
index a4bb37c..0000000
--- a/gemm/mlir/gemm_28672_1_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x1xf16>) -> tensor<28672x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<28672x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x1xf16>) -> tensor<28672x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x1xf16>) outs(%1 : tensor<28672x1xf16>) -> tensor<28672x1xf16>
-    return %2 : tensor<28672x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_2_8192_bf16_tA.mlir
deleted file mode 100644
index 24fd156..0000000
--- a/gemm/mlir/gemm_28672_2_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<28672x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<28672x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x2xbf16>) -> tensor<28672x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<28672x2xbf16>) -> tensor<28672x2xbf16>
-    return %2 : tensor<28672x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_2_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_2_8192_f16_tA.mlir
deleted file mode 100644
index 85df0ac..0000000
--- a/gemm/mlir/gemm_28672_2_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x2xf16>) -> tensor<28672x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<28672x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x2xf16>) -> tensor<28672x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x2xf16>) outs(%1 : tensor<28672x2xf16>) -> tensor<28672x2xf16>
-    return %2 : tensor<28672x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_32_8192_bf16_tA.mlir
deleted file mode 100644
index e920955..0000000
--- a/gemm/mlir/gemm_28672_32_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<28672x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<28672x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x32xbf16>) -> tensor<28672x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<28672x32xbf16>) -> tensor<28672x32xbf16>
-    return %2 : tensor<28672x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_32_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_32_8192_f16_tA.mlir
deleted file mode 100644
index 44a1361..0000000
--- a/gemm/mlir/gemm_28672_32_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x32xf16>) -> tensor<28672x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<28672x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x32xf16>) -> tensor<28672x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x32xf16>) outs(%1 : tensor<28672x32xf16>) -> tensor<28672x32xf16>
-    return %2 : tensor<28672x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_4_8192_bf16_tA.mlir
deleted file mode 100644
index 7ce0353..0000000
--- a/gemm/mlir/gemm_28672_4_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<28672x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<28672x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x4xbf16>) -> tensor<28672x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<28672x4xbf16>) -> tensor<28672x4xbf16>
-    return %2 : tensor<28672x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_4_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_4_8192_f16_tA.mlir
deleted file mode 100644
index a773111..0000000
--- a/gemm/mlir/gemm_28672_4_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x4xf16>) -> tensor<28672x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<28672x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x4xf16>) -> tensor<28672x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x4xf16>) outs(%1 : tensor<28672x4xf16>) -> tensor<28672x4xf16>
-    return %2 : tensor<28672x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_28672_8_8192_bf16_tA.mlir
deleted file mode 100644
index 5a2541f..0000000
--- a/gemm/mlir/gemm_28672_8_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<28672x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<28672x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<28672x8xbf16>) -> tensor<28672x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<28672x8xbf16>) -> tensor<28672x8xbf16>
-    return %2 : tensor<28672x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_28672_8_8192_f16_tA.mlir b/gemm/mlir/gemm_28672_8_8192_f16_tA.mlir
deleted file mode 100644
index 9226cfe..0000000
--- a/gemm/mlir/gemm_28672_8_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x28672xf16>, %arg1: tensor<8192x8xf16>) -> tensor<28672x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<28672x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<28672x8xf16>) -> tensor<28672x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x28672xf16>, tensor<8192x8xf16>) outs(%1 : tensor<28672x8xf16>) -> tensor<28672x8xf16>
-    return %2 : tensor<28672x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_2_1280_8192_bf16_tB.mlir b/gemm/mlir/gemm_2_1280_8192_bf16_tB.mlir
deleted file mode 100644
index 1040350..0000000
--- a/gemm/mlir/gemm_2_1280_8192_bf16_tB.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2x8192xbf16>, %arg1: tensor<1280x8192xbf16>) -> tensor<2x1280xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<2x1280xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2x1280xbf16>) -> tensor<2x1280xbf16>
-    %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2x8192xbf16>, tensor<1280x8192xbf16>) outs(%1 : tensor<2x1280xbf16>) -> tensor<2x1280xbf16>
-    return %2 : tensor<2x1280xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_2_3584_8192_bf16_tB.mlir b/gemm/mlir/gemm_2_3584_8192_bf16_tB.mlir
deleted file mode 100644
index 7f6b6ea..0000000
--- a/gemm/mlir/gemm_2_3584_8192_bf16_tB.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2x8192xbf16>, %arg1: tensor<3584x8192xbf16>) -> tensor<2x3584xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<2x3584xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2x3584xbf16>) -> tensor<2x3584xbf16>
-    %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2x8192xbf16>, tensor<3584x8192xbf16>) outs(%1 : tensor<2x3584xbf16>) -> tensor<2x3584xbf16>
-    return %2 : tensor<2x3584xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_2_7168_8192_bf16_tB.mlir b/gemm/mlir/gemm_2_7168_8192_bf16_tB.mlir
deleted file mode 100644
index 6ac8002..0000000
--- a/gemm/mlir/gemm_2_7168_8192_bf16_tB.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2x8192xbf16>, %arg1: tensor<7168x8192xbf16>) -> tensor<2x7168xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<2x7168xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<2x7168xbf16>) -> tensor<2x7168xbf16>
-    %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<2x8192xbf16>, tensor<7168x8192xbf16>) outs(%1 : tensor<2x7168xbf16>) -> tensor<2x7168xbf16>
-    return %2 : tensor<2x7168xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_16_5120_bf16_tA.mlir
deleted file mode 100644
index 986fbe3..0000000
--- a/gemm/mlir/gemm_32000_16_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<32000x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x16xbf16>) -> tensor<32000x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<32000x16xbf16>) -> tensor<32000x16xbf16>
-    return %2 : tensor<32000x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_16_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_16_5120_f16_tA.mlir
deleted file mode 100644
index bb83872..0000000
--- a/gemm/mlir/gemm_32000_16_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<32000x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x16xf16>) -> tensor<32000x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x16xf16>) outs(%1 : tensor<32000x16xf16>) -> tensor<32000x16xf16>
-    return %2 : tensor<32000x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_16_8192_bf16_tA.mlir
deleted file mode 100644
index af63a99..0000000
--- a/gemm/mlir/gemm_32000_16_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<32000x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x16xbf16>) -> tensor<32000x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<32000x16xbf16>) -> tensor<32000x16xbf16>
-    return %2 : tensor<32000x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_16_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_16_8192_f16_tA.mlir
deleted file mode 100644
index 9881c6e..0000000
--- a/gemm/mlir/gemm_32000_16_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<32000x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x16xf16>) -> tensor<32000x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x16xf16>) outs(%1 : tensor<32000x16xf16>) -> tensor<32000x16xf16>
-    return %2 : tensor<32000x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_1_5120_bf16_tA.mlir
deleted file mode 100644
index 4d33257..0000000
--- a/gemm/mlir/gemm_32000_1_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<32000x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x1xbf16>) -> tensor<32000x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<32000x1xbf16>) -> tensor<32000x1xbf16>
-    return %2 : tensor<32000x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_1_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_1_5120_f16_tA.mlir
deleted file mode 100644
index 9849f9c..0000000
--- a/gemm/mlir/gemm_32000_1_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<32000x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x1xf16>) -> tensor<32000x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x1xf16>) outs(%1 : tensor<32000x1xf16>) -> tensor<32000x1xf16>
-    return %2 : tensor<32000x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_1_8192_bf16_tA.mlir
deleted file mode 100644
index cdf30e8..0000000
--- a/gemm/mlir/gemm_32000_1_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<32000x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x1xbf16>) -> tensor<32000x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<32000x1xbf16>) -> tensor<32000x1xbf16>
-    return %2 : tensor<32000x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_1_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_1_8192_f16_tA.mlir
deleted file mode 100644
index fb063c9..0000000
--- a/gemm/mlir/gemm_32000_1_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<32000x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x1xf16>) -> tensor<32000x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x1xf16>) outs(%1 : tensor<32000x1xf16>) -> tensor<32000x1xf16>
-    return %2 : tensor<32000x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_2_5120_bf16_tA.mlir
deleted file mode 100644
index ffcff1f..0000000
--- a/gemm/mlir/gemm_32000_2_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<32000x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x2xbf16>) -> tensor<32000x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<32000x2xbf16>) -> tensor<32000x2xbf16>
-    return %2 : tensor<32000x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_2_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_2_5120_f16_tA.mlir
deleted file mode 100644
index 74b1e6a..0000000
--- a/gemm/mlir/gemm_32000_2_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<32000x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x2xf16>) outs(%1 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
-    return %2 : tensor<32000x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_2_8192_bf16_tA.mlir
deleted file mode 100644
index 5c6b46d..0000000
--- a/gemm/mlir/gemm_32000_2_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<32000x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x2xbf16>) -> tensor<32000x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<32000x2xbf16>) -> tensor<32000x2xbf16>
-    return %2 : tensor<32000x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_2_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_2_8192_f16_tA.mlir
deleted file mode 100644
index 5623d69..0000000
--- a/gemm/mlir/gemm_32000_2_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<32000x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x2xf16>) outs(%1 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
-    return %2 : tensor<32000x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_32_5120_bf16_tA.mlir
deleted file mode 100644
index 6585842..0000000
--- a/gemm/mlir/gemm_32000_32_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<32000x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x32xbf16>) -> tensor<32000x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<32000x32xbf16>) -> tensor<32000x32xbf16>
-    return %2 : tensor<32000x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_32_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_32_5120_f16_tA.mlir
deleted file mode 100644
index dfc38c7..0000000
--- a/gemm/mlir/gemm_32000_32_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<32000x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x32xf16>) -> tensor<32000x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x32xf16>) outs(%1 : tensor<32000x32xf16>) -> tensor<32000x32xf16>
-    return %2 : tensor<32000x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_32_8192_bf16_tA.mlir
deleted file mode 100644
index efaefd2..0000000
--- a/gemm/mlir/gemm_32000_32_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<32000x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x32xbf16>) -> tensor<32000x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<32000x32xbf16>) -> tensor<32000x32xbf16>
-    return %2 : tensor<32000x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_32_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_32_8192_f16_tA.mlir
deleted file mode 100644
index d82b086..0000000
--- a/gemm/mlir/gemm_32000_32_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<32000x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x32xf16>) -> tensor<32000x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x32xf16>) outs(%1 : tensor<32000x32xf16>) -> tensor<32000x32xf16>
-    return %2 : tensor<32000x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_4_5120_bf16_tA.mlir
deleted file mode 100644
index f52612c..0000000
--- a/gemm/mlir/gemm_32000_4_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<32000x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x4xbf16>) -> tensor<32000x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<32000x4xbf16>) -> tensor<32000x4xbf16>
-    return %2 : tensor<32000x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_4_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_4_5120_f16_tA.mlir
deleted file mode 100644
index 43e179b..0000000
--- a/gemm/mlir/gemm_32000_4_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<32000x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x4xf16>) -> tensor<32000x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x4xf16>) outs(%1 : tensor<32000x4xf16>) -> tensor<32000x4xf16>
-    return %2 : tensor<32000x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_4_8192_bf16_tA.mlir
deleted file mode 100644
index e3a7fcc..0000000
--- a/gemm/mlir/gemm_32000_4_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<32000x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x4xbf16>) -> tensor<32000x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<32000x4xbf16>) -> tensor<32000x4xbf16>
-    return %2 : tensor<32000x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_4_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_4_8192_f16_tA.mlir
deleted file mode 100644
index c430b43..0000000
--- a/gemm/mlir/gemm_32000_4_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<32000x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x4xf16>) -> tensor<32000x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x4xf16>) outs(%1 : tensor<32000x4xf16>) -> tensor<32000x4xf16>
-    return %2 : tensor<32000x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_32000_8_5120_bf16_tA.mlir
deleted file mode 100644
index c3082b6..0000000
--- a/gemm/mlir/gemm_32000_8_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<32000x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x8xbf16>) -> tensor<32000x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<32000x8xbf16>) -> tensor<32000x8xbf16>
-    return %2 : tensor<32000x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_8_5120_f16_tA.mlir b/gemm/mlir/gemm_32000_8_5120_f16_tA.mlir
deleted file mode 100644
index 84959d3..0000000
--- a/gemm/mlir/gemm_32000_8_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x32000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<32000x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x8xf16>) -> tensor<32000x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x32000xf16>, tensor<5120x8xf16>) outs(%1 : tensor<32000x8xf16>) -> tensor<32000x8xf16>
-    return %2 : tensor<32000x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_32000_8_8192_bf16_tA.mlir
deleted file mode 100644
index 7cbee49..0000000
--- a/gemm/mlir/gemm_32000_8_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<32000x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<32000x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<32000x8xbf16>) -> tensor<32000x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<32000x8xbf16>) -> tensor<32000x8xbf16>
-    return %2 : tensor<32000x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_32000_8_8192_f16_tA.mlir b/gemm/mlir/gemm_32000_8_8192_f16_tA.mlir
deleted file mode 100644
index 67d245e..0000000
--- a/gemm/mlir/gemm_32000_8_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x32000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<32000x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<32000x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32000x8xf16>) -> tensor<32000x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x32000xf16>, tensor<8192x8xf16>) outs(%1 : tensor<32000x8xf16>) -> tensor<32000x8xf16>
-    return %2 : tensor<32000x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_16_5120_bf16_tA.mlir
deleted file mode 100644
index ab4fa46..0000000
--- a/gemm/mlir/gemm_3456_16_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<3456x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3456x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x16xbf16>) -> tensor<3456x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<3456x16xbf16>) -> tensor<3456x16xbf16>
-    return %2 : tensor<3456x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_16_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_16_5120_f16_tA.mlir
deleted file mode 100644
index 0c15001..0000000
--- a/gemm/mlir/gemm_3456_16_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x16xf16>) -> tensor<3456x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3456x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x16xf16>) -> tensor<3456x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x16xf16>) outs(%1 : tensor<3456x16xf16>) -> tensor<3456x16xf16>
-    return %2 : tensor<3456x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_1_5120_bf16_tA.mlir
deleted file mode 100644
index 754923e..0000000
--- a/gemm/mlir/gemm_3456_1_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<3456x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3456x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x1xbf16>) -> tensor<3456x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<3456x1xbf16>) -> tensor<3456x1xbf16>
-    return %2 : tensor<3456x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_1_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_1_5120_f16_tA.mlir
deleted file mode 100644
index a179e69..0000000
--- a/gemm/mlir/gemm_3456_1_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x1xf16>) -> tensor<3456x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3456x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x1xf16>) -> tensor<3456x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x1xf16>) outs(%1 : tensor<3456x1xf16>) -> tensor<3456x1xf16>
-    return %2 : tensor<3456x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_2_5120_bf16_tA.mlir
deleted file mode 100644
index 68afe12..0000000
--- a/gemm/mlir/gemm_3456_2_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<3456x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3456x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x2xbf16>) -> tensor<3456x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<3456x2xbf16>) -> tensor<3456x2xbf16>
-    return %2 : tensor<3456x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_2_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_2_5120_f16_tA.mlir
deleted file mode 100644
index c0fe5f9..0000000
--- a/gemm/mlir/gemm_3456_2_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x2xf16>) -> tensor<3456x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3456x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x2xf16>) -> tensor<3456x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x2xf16>) outs(%1 : tensor<3456x2xf16>) -> tensor<3456x2xf16>
-    return %2 : tensor<3456x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_32_5120_bf16_tA.mlir
deleted file mode 100644
index 9b8159a..0000000
--- a/gemm/mlir/gemm_3456_32_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<3456x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3456x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x32xbf16>) -> tensor<3456x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<3456x32xbf16>) -> tensor<3456x32xbf16>
-    return %2 : tensor<3456x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_32_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_32_5120_f16_tA.mlir
deleted file mode 100644
index fe43487..0000000
--- a/gemm/mlir/gemm_3456_32_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x32xf16>) -> tensor<3456x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3456x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x32xf16>) -> tensor<3456x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x32xf16>) outs(%1 : tensor<3456x32xf16>) -> tensor<3456x32xf16>
-    return %2 : tensor<3456x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_4_5120_bf16_tA.mlir
deleted file mode 100644
index d6bbdaa..0000000
--- a/gemm/mlir/gemm_3456_4_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<3456x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3456x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x4xbf16>) -> tensor<3456x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<3456x4xbf16>) -> tensor<3456x4xbf16>
-    return %2 : tensor<3456x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_4_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_4_5120_f16_tA.mlir
deleted file mode 100644
index d1ba93e..0000000
--- a/gemm/mlir/gemm_3456_4_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x4xf16>) -> tensor<3456x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3456x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x4xf16>) -> tensor<3456x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x4xf16>) outs(%1 : tensor<3456x4xf16>) -> tensor<3456x4xf16>
-    return %2 : tensor<3456x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_3456_8_5120_bf16_tA.mlir
deleted file mode 100644
index b7b3a1e..0000000
--- a/gemm/mlir/gemm_3456_8_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<3456x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3456x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3456x8xbf16>) -> tensor<3456x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<3456x8xbf16>) -> tensor<3456x8xbf16>
-    return %2 : tensor<3456x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3456_8_5120_f16_tA.mlir b/gemm/mlir/gemm_3456_8_5120_f16_tA.mlir
deleted file mode 100644
index 60f9e0c..0000000
--- a/gemm/mlir/gemm_3456_8_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3456xf16>, %arg1: tensor<5120x8xf16>) -> tensor<3456x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3456x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3456x8xf16>) -> tensor<3456x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3456xf16>, tensor<5120x8xf16>) outs(%1 : tensor<3456x8xf16>) -> tensor<3456x8xf16>
-    return %2 : tensor<3456x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_16_5120_bf16_tA.mlir
deleted file mode 100644
index 63c122d..0000000
--- a/gemm/mlir/gemm_3840_16_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<3840x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3840x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x16xbf16>) -> tensor<3840x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<3840x16xbf16>) -> tensor<3840x16xbf16>
-    return %2 : tensor<3840x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_16_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_16_5120_f16_tA.mlir
deleted file mode 100644
index 5ed7814..0000000
--- a/gemm/mlir/gemm_3840_16_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x16xf16>) -> tensor<3840x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3840x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x16xf16>) -> tensor<3840x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x16xf16>) outs(%1 : tensor<3840x16xf16>) -> tensor<3840x16xf16>
-    return %2 : tensor<3840x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_1_5120_bf16_tA.mlir
deleted file mode 100644
index 30fce43..0000000
--- a/gemm/mlir/gemm_3840_1_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<3840x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3840x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x1xbf16>) -> tensor<3840x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<3840x1xbf16>) -> tensor<3840x1xbf16>
-    return %2 : tensor<3840x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_1_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_1_5120_f16_tA.mlir
deleted file mode 100644
index c83b20c..0000000
--- a/gemm/mlir/gemm_3840_1_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x1xf16>) -> tensor<3840x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3840x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x1xf16>) -> tensor<3840x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x1xf16>) outs(%1 : tensor<3840x1xf16>) -> tensor<3840x1xf16>
-    return %2 : tensor<3840x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_2_5120_bf16_tA.mlir
deleted file mode 100644
index fde61e4..0000000
--- a/gemm/mlir/gemm_3840_2_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<3840x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3840x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x2xbf16>) -> tensor<3840x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<3840x2xbf16>) -> tensor<3840x2xbf16>
-    return %2 : tensor<3840x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_2_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_2_5120_f16_tA.mlir
deleted file mode 100644
index 3526c21..0000000
--- a/gemm/mlir/gemm_3840_2_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x2xf16>) -> tensor<3840x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3840x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x2xf16>) -> tensor<3840x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x2xf16>) outs(%1 : tensor<3840x2xf16>) -> tensor<3840x2xf16>
-    return %2 : tensor<3840x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_32_5120_bf16_tA.mlir
deleted file mode 100644
index aae821a..0000000
--- a/gemm/mlir/gemm_3840_32_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<3840x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3840x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x32xbf16>) -> tensor<3840x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<3840x32xbf16>) -> tensor<3840x32xbf16>
-    return %2 : tensor<3840x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_32_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_32_5120_f16_tA.mlir
deleted file mode 100644
index 1491630..0000000
--- a/gemm/mlir/gemm_3840_32_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x32xf16>) -> tensor<3840x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3840x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x32xf16>) -> tensor<3840x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x32xf16>) outs(%1 : tensor<3840x32xf16>) -> tensor<3840x32xf16>
-    return %2 : tensor<3840x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_4_5120_bf16_tA.mlir
deleted file mode 100644
index fe34d3f..0000000
--- a/gemm/mlir/gemm_3840_4_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<3840x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3840x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x4xbf16>) -> tensor<3840x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<3840x4xbf16>) -> tensor<3840x4xbf16>
-    return %2 : tensor<3840x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_4_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_4_5120_f16_tA.mlir
deleted file mode 100644
index eab6a7c..0000000
--- a/gemm/mlir/gemm_3840_4_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x4xf16>) -> tensor<3840x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3840x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x4xf16>) -> tensor<3840x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x4xf16>) outs(%1 : tensor<3840x4xf16>) -> tensor<3840x4xf16>
-    return %2 : tensor<3840x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_3840_8_5120_bf16_tA.mlir
deleted file mode 100644
index 84bb52a..0000000
--- a/gemm/mlir/gemm_3840_8_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<3840x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<3840x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<3840x8xbf16>) -> tensor<3840x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<3840x8xbf16>) -> tensor<3840x8xbf16>
-    return %2 : tensor<3840x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_3840_8_5120_f16_tA.mlir b/gemm/mlir/gemm_3840_8_5120_f16_tA.mlir
deleted file mode 100644
index 8c91198..0000000
--- a/gemm/mlir/gemm_3840_8_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x3840xf16>, %arg1: tensor<5120x8xf16>) -> tensor<3840x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<3840x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<3840x8xf16>) -> tensor<3840x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x3840xf16>, tensor<5120x8xf16>) outs(%1 : tensor<3840x8xf16>) -> tensor<3840x8xf16>
-    return %2 : tensor<3840x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4000_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_16_5120_bf16_tA.mlir
deleted file mode 100644
index 01c0a78..0000000
--- a/gemm/mlir/gemm_4000_16_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<4000x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<4000x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x16xbf16>) -> tensor<4000x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<4000x16xbf16>) -> tensor<4000x16xbf16>
-    return %2 : tensor<4000x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4000_16_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_16_5120_f16_tA.mlir
deleted file mode 100644
index 3eb9fe7..0000000
--- a/gemm/mlir/gemm_4000_16_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<4000x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<4000x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x16xf16>) -> tensor<4000x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x16xf16>) outs(%1 : tensor<4000x16xf16>) -> tensor<4000x16xf16>
-    return %2 : tensor<4000x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4000_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_16_8192_bf16_tA.mlir
deleted file mode 100644
index a64464a..0000000
--- a/gemm/mlir/gemm_4000_16_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<4000x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<4000x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x16xbf16>) -> tensor<4000x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<4000x16xbf16>) -> tensor<4000x16xbf16>
-    return %2 : tensor<4000x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4000_16_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_16_8192_f16_tA.mlir
deleted file mode 100644
index 68f9cda..0000000
--- a/gemm/mlir/gemm_4000_16_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<4000x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<4000x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x16xf16>) -> tensor<4000x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x16xf16>) outs(%1 : tensor<4000x16xf16>) -> tensor<4000x16xf16>
-    return %2 : tensor<4000x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4000_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_1_5120_bf16_tA.mlir
deleted file mode 100644
index 857de41..0000000
--- a/gemm/mlir/gemm_4000_1_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<4000x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<4000x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x1xbf16>) -> tensor<4000x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<4000x1xbf16>) -> tensor<4000x1xbf16>
-    return %2 : tensor<4000x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4000_1_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_1_5120_f16_tA.mlir
deleted file mode 100644
index f64c226..0000000
--- a/gemm/mlir/gemm_4000_1_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<4000x1xf16> {
f16 - %0 = tensor.empty() : tensor<4000x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x1xf16>) -> tensor<4000x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x1xf16>) outs(%1 : tensor<4000x1xf16>) -> tensor<4000x1xf16> - return %2 : tensor<4000x1xf16> - } -} diff --git a/gemm/mlir/gemm_4000_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_1_8192_bf16_tA.mlir deleted file mode 100644 index c98f58c..0000000 --- a/gemm/mlir/gemm_4000_1_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<4000x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<4000x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x1xbf16>) -> tensor<4000x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<4000x1xbf16>) -> tensor<4000x1xbf16> - return %2 : tensor<4000x1xbf16> - } -} diff --git a/gemm/mlir/gemm_4000_1_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_1_8192_f16_tA.mlir deleted file mode 100644 index 5aaef53..0000000 --- a/gemm/mlir/gemm_4000_1_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<4000x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4000x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x1xf16>) -> tensor<4000x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x1xf16>) outs(%1 : tensor<4000x1xf16>) -> tensor<4000x1xf16> - return %2 : tensor<4000x1xf16> - } -} diff --git a/gemm/mlir/gemm_4000_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_2_5120_bf16_tA.mlir deleted file mode 100644 index cf6d890..0000000 --- a/gemm/mlir/gemm_4000_2_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<4000x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<4000x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x2xbf16>) -> tensor<4000x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<4000x2xbf16>) -> tensor<4000x2xbf16> - return %2 : tensor<4000x2xbf16> - } -} diff --git a/gemm/mlir/gemm_4000_2_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_2_5120_f16_tA.mlir deleted file mode 100644 index 1d7ef35..0000000 --- a/gemm/mlir/gemm_4000_2_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<4000x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4000x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x2xf16>) -> tensor<4000x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x2xf16>) outs(%1 : tensor<4000x2xf16>) -> tensor<4000x2xf16> - return %2 : tensor<4000x2xf16> - } -} diff --git a/gemm/mlir/gemm_4000_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_2_8192_bf16_tA.mlir deleted file mode 100644 index 1081115..0000000 --- a/gemm/mlir/gemm_4000_2_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<4000x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<4000x2xbf16> - %1 = linalg.fill ins(%cst 
: bf16) outs(%0 : tensor<4000x2xbf16>) -> tensor<4000x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<4000x2xbf16>) -> tensor<4000x2xbf16> - return %2 : tensor<4000x2xbf16> - } -} diff --git a/gemm/mlir/gemm_4000_2_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_2_8192_f16_tA.mlir deleted file mode 100644 index 5d645df..0000000 --- a/gemm/mlir/gemm_4000_2_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<4000x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4000x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x2xf16>) -> tensor<4000x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x2xf16>) outs(%1 : tensor<4000x2xf16>) -> tensor<4000x2xf16> - return %2 : tensor<4000x2xf16> - } -} diff --git a/gemm/mlir/gemm_4000_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_32_5120_bf16_tA.mlir deleted file mode 100644 index faa22ff..0000000 --- a/gemm/mlir/gemm_4000_32_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<4000x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<4000x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x32xbf16>) -> tensor<4000x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<4000x32xbf16>) -> tensor<4000x32xbf16> - return %2 : tensor<4000x32xbf16> - } -} diff --git a/gemm/mlir/gemm_4000_32_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_32_5120_f16_tA.mlir deleted file mode 100644 index eb8e87e..0000000 --- a/gemm/mlir/gemm_4000_32_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<4000x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4000x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x32xf16>) -> tensor<4000x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x32xf16>) outs(%1 : tensor<4000x32xf16>) -> tensor<4000x32xf16> - return %2 : tensor<4000x32xf16> - } -} diff --git a/gemm/mlir/gemm_4000_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_32_8192_bf16_tA.mlir deleted file mode 100644 index 0688fe2..0000000 --- a/gemm/mlir/gemm_4000_32_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<4000x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<4000x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x32xbf16>) -> tensor<4000x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<4000x32xbf16>) -> tensor<4000x32xbf16> - return %2 : tensor<4000x32xbf16> - } -} diff --git a/gemm/mlir/gemm_4000_32_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_32_8192_f16_tA.mlir deleted file mode 100644 index d261394..0000000 --- a/gemm/mlir/gemm_4000_32_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<4000x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4000x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : 
tensor<4000x32xf16>) -> tensor<4000x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x32xf16>) outs(%1 : tensor<4000x32xf16>) -> tensor<4000x32xf16> - return %2 : tensor<4000x32xf16> - } -} diff --git a/gemm/mlir/gemm_4000_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_4_5120_bf16_tA.mlir deleted file mode 100644 index ee32dc1..0000000 --- a/gemm/mlir/gemm_4000_4_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<4000x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<4000x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x4xbf16>) -> tensor<4000x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<4000x4xbf16>) -> tensor<4000x4xbf16> - return %2 : tensor<4000x4xbf16> - } -} diff --git a/gemm/mlir/gemm_4000_4_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_4_5120_f16_tA.mlir deleted file mode 100644 index 61b5e3d..0000000 --- a/gemm/mlir/gemm_4000_4_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<4000x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4000x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x4xf16>) -> tensor<4000x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x4xf16>) outs(%1 : tensor<4000x4xf16>) -> tensor<4000x4xf16> - return %2 : tensor<4000x4xf16> - } -} diff --git a/gemm/mlir/gemm_4000_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_4_8192_bf16_tA.mlir deleted file mode 100644 index 1f73b7e..0000000 --- a/gemm/mlir/gemm_4000_4_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<4000x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<4000x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x4xbf16>) -> tensor<4000x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<4000x4xbf16>) -> tensor<4000x4xbf16> - return %2 : tensor<4000x4xbf16> - } -} diff --git a/gemm/mlir/gemm_4000_4_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_4_8192_f16_tA.mlir deleted file mode 100644 index f85ff47..0000000 --- a/gemm/mlir/gemm_4000_4_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<4000x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4000x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x4xf16>) -> tensor<4000x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x4xf16>) outs(%1 : tensor<4000x4xf16>) -> tensor<4000x4xf16> - return %2 : tensor<4000x4xf16> - } -} diff --git a/gemm/mlir/gemm_4000_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_4000_8_5120_bf16_tA.mlir deleted file mode 100644 index a59e9b6..0000000 --- a/gemm/mlir/gemm_4000_8_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x4000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<4000x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<4000x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x8xbf16>) -> tensor<4000x8xbf16> - %2 = 
linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<4000x8xbf16>) -> tensor<4000x8xbf16>
-    return %2 : tensor<4000x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4000_8_5120_f16_tA.mlir b/gemm/mlir/gemm_4000_8_5120_f16_tA.mlir
deleted file mode 100644
index 2821933..0000000
--- a/gemm/mlir/gemm_4000_8_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x4000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<4000x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<4000x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x8xf16>) -> tensor<4000x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x4000xf16>, tensor<5120x8xf16>) outs(%1 : tensor<4000x8xf16>) -> tensor<4000x8xf16>
-    return %2 : tensor<4000x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4000_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_4000_8_8192_bf16_tA.mlir
deleted file mode 100644
index bbaeb69..0000000
--- a/gemm/mlir/gemm_4000_8_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x4000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<4000x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<4000x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4000x8xbf16>) -> tensor<4000x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<4000x8xbf16>) -> tensor<4000x8xbf16>
-    return %2 : tensor<4000x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4000_8_8192_f16_tA.mlir b/gemm/mlir/gemm_4000_8_8192_f16_tA.mlir
deleted file mode 100644
index 3bd900f..0000000
--- a/gemm/mlir/gemm_4000_8_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x4000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<4000x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<4000x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4000x8xf16>) -> tensor<4000x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4000xf16>, tensor<8192x8xf16>) outs(%1 : tensor<4000x8xf16>) -> tensor<4000x8xf16>
-    return %2 : tensor<4000x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4096_20480_2560_f16_tB.mlir b/gemm/mlir/gemm_4096_20480_2560_f16_tB.mlir
deleted file mode 100644
index 461726c..0000000
--- a/gemm/mlir/gemm_4096_20480_2560_f16_tB.mlir
+++ /dev/null
@@ -1,145 +0,0 @@
-#translation = #iree_codegen.translation_info
-module attributes {transform.with_named_sequence} {
-  stream.executable private @gemm {
-    stream.executable.export public @gemm workgroups() -> (index, index, index) {
-      %c64 = arith.constant 64 : index
-      %c320 = arith.constant 320 : index
-      %c1 = arith.constant 1 : index
-      stream.return %c64, %c320, %c1 : index, index, index
-    }
-    builtin.module {
-      func.func @gemm(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) attributes {translation_info = #translation} {
-        %c19 = arith.constant 19 : index
-        %c18 = arith.constant 18 : index
-        %c17 = arith.constant 17 : index
-        %c3 = arith.constant 3 : index
-        %c2 = arith.constant 2 : index
-        %c16 = arith.constant 16 : index
-        %c8 = arith.constant 8 : index
-        %c4 = arith.constant 4 : index
-        %c32 = arith.constant 32 : index
-        %c64 = arith.constant 64 : index
-        %c1 = arith.constant 1 : index
-        %c80 = arith.constant 80 : index
-        %c0 = arith.constant 0 : index
-        %cst = arith.constant dense<0.000000e+00> : vector<4xf32>
-        %workgroup_id_0 = stream.dispatch.workgroup.id[0] : index
-        %workgroup_id_1 = stream.dispatch.workgroup.id[1] : index
-        %thread_id_x = gpu.thread_id x
-        %thread_id_y = gpu.thread_id y
-        %alloc = memref.alloc() : memref<64x32xf16, #gpu.address_space>
-        %alloc_0 = memref.alloc() : memref<64x32xf16, #gpu.address_space>
-        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> memref<4096x2560xf16, strided<[2560, 1], offset: ?>>
-        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> memref<20480x2560xf16, strided<[2560, 1], offset: ?>>
-        %2 = arith.muli %workgroup_id_0, %c64 : index
-        %3 = arith.muli %thread_id_y, %c32 : index
-        %4 = arith.divsi %thread_id_x, %c4 : index
-        %5 = arith.addi %4, %3 : index
-        %6 = arith.remsi %5, %c64 : index
-        %7 = arith.addi %6, %2 : index
-        %8 = arith.remsi %thread_id_x, %c4 : index
-        %9 = arith.muli %8, %c8 : index
-        %10 = arith.divsi %thread_id_x, %c64 : index
-        %11 = arith.muli %10, %c32 : index
-        %12 = arith.remsi %thread_id_x, %c16 : index
-        %13 = arith.addi %12, %11 : index
-        %14 = arith.remsi %thread_id_x, %c64 : index
-        %15 = arith.divsi %14, %c16 : index
-        %16 = arith.muli %15, %c4 : index
-        %17 = arith.addi %16, %c16 : index
-        %18 = arith.addi %13, %c16 : index
-        %19 = arith.muli %workgroup_id_1, %c64 : index
-        %20 = arith.addi %6, %19 : index
-        %21 = arith.addi %12, %3 : index
-        %22 = arith.addi %21, %c16 : index
-        %23:4 = scf.for %arg3 = %c0 to %c80 step %c1 iter_args(%arg4 = %cst, %arg5 = %cst, %arg6 = %cst, %arg7 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
-          %62 = arith.muli %arg3, %c32 : index
-          %63 = arith.addi %62, %9 : index
-          %64 = vector.load %0[%7, %63] : memref<4096x2560xf16, strided<[2560, 1], offset: ?>>, vector<8xf16>
-          vector.store %64, %alloc[%6, %9] : memref<64x32xf16, #gpu.address_space>, vector<8xf16>
-          amdgpu.lds_barrier
-          %65 = vector.load %alloc[%13, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16>
-          %66 = vector.load %alloc[%13, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16>
-          %67 = vector.load %alloc[%18, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16>
-          %68 = vector.load %alloc[%18, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16>
-          %69 = vector.load %1[%20, %63] : memref<20480x2560xf16, strided<[2560, 1], offset: ?>>, vector<8xf16>
-          amdgpu.lds_barrier
-          vector.store %69, %alloc_0[%6, %9] : memref<64x32xf16, #gpu.address_space>, vector<8xf16>
-          amdgpu.lds_barrier
-          %70 = vector.load %alloc_0[%21, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16>
-          %71 = vector.load %alloc_0[%21, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16>
-          %72 = vector.load %alloc_0[%22, %16] : memref<64x32xf16, #gpu.address_space>, vector<4xf16>
-          %73 = vector.load %alloc_0[%22, %17] : memref<64x32xf16, #gpu.address_space>, vector<4xf16>
-          %74 = amdgpu.mfma %65 * %70 + %arg4 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %75 = amdgpu.mfma %66 * %71 + %74 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %76 = amdgpu.mfma %67 * %72 + %arg7 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %77 = amdgpu.mfma %68 * %73 + %76 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %78 = amdgpu.mfma %67 * %70 + %arg6 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %79 = amdgpu.mfma %68 * %71 + %78 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %80 = amdgpu.mfma %65 * %72 + %arg5 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %81 = amdgpu.mfma %66 * %73 + %80 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          scf.yield %75, %81, %79, %77 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
-        }
-        %24 = vector.extract_strided_slice %23#0 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %25 = stream.binding.subspan %arg2[%c0] : !stream.binding -> memref<4096x20480xf32, strided<[20480, 1], offset: ?>>
-        %26 = arith.remsi %thread_id_x, %c64 : index
-        %27 = arith.divsi %26, %c16 : index
-        %28 = arith.muli %27, %c4 : index
-        %29 = arith.divsi %thread_id_x, %c64 : index
-        %30 = arith.muli %29, %c32 : index
-        %31 = arith.muli %workgroup_id_0, %c64 : index
-        %32 = arith.addi %31, %30 : index
-        %33 = arith.addi %32, %28 : index
-        %34 = arith.muli %thread_id_y, %c32 : index
-        %35 = arith.muli %workgroup_id_1, %c64 : index
-        %36 = arith.remsi %thread_id_x, %c16 : index
-        %37 = arith.addi %36, %35 : index
-        %38 = arith.addi %37, %34 : index
-        vector.store %24, %25[%33, %38] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %39 = vector.extract_strided_slice %23#0 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %40 = arith.addi %33, %c1 : index
-        vector.store %39, %25[%40, %38] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %41 = vector.extract_strided_slice %23#0 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %42 = arith.addi %33, %c2 : index
-        vector.store %41, %25[%42, %38] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %43 = vector.extract_strided_slice %23#0 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %44 = arith.addi %33, %c3 : index
-        vector.store %43, %25[%44, %38] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %45 = vector.extract_strided_slice %23#3 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %46 = arith.addi %33, %c16 : index
-        %47 = arith.addi %38, %c16 : index
-        vector.store %45, %25[%46, %47] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %48 = vector.extract_strided_slice %23#3 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %49 = arith.addi %33, %c17 : index
-        vector.store %48, %25[%49, %47] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %50 = vector.extract_strided_slice %23#3 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %51 = arith.addi %33, %c18 : index
-        vector.store %50, %25[%51, %47] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %52 = vector.extract_strided_slice %23#3 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %53 = arith.addi %33, %c19 : index
-        vector.store %52, %25[%53, %47] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %54 = vector.extract_strided_slice %23#2 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %54, %25[%46, %38] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %55 = vector.extract_strided_slice %23#2 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %55, %25[%49, %38] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %56 = vector.extract_strided_slice %23#2 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %56, %25[%51, %38] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %57 = vector.extract_strided_slice %23#2 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %57, %25[%53, %38] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %58 = vector.extract_strided_slice %23#1 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %58, %25[%33, %47] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %59 = vector.extract_strided_slice %23#1 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %59, %25[%40, %47] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %60 = vector.extract_strided_slice %23#1 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %60, %25[%42, %47] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        %61 = vector.extract_strided_slice %23#1 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %61, %25[%44, %47] : memref<4096x20480xf32, strided<[20480, 1], offset: ?>>, vector<1xf32>
-        return
-      }
-    }
-  }
-  func.func @isolated_benchmark(%arg0: tensor<4096x2560xf16>, %arg1: tensor<20480x2560xf16>) -> tensor<4096x20480xf32> {
-    %0 = flow.dispatch @gemm::@gemm(%arg0, %arg1) : (tensor<4096x2560xf16>, tensor<20480x2560xf16>) -> tensor<4096x20480xf32>
-    return %0 : tensor<4096x20480xf32>
-  }
-}
diff --git a/gemm/mlir/gemm_4096_4096_8192_bf16.mlir b/gemm/mlir/gemm_4096_4096_8192_bf16.mlir
deleted file mode 100644
index da783d2..0000000
--- a/gemm/mlir/gemm_4096_4096_8192_bf16.mlir
+++ /dev/null
@@ -1,9 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<8192x4096xbf16>) -> tensor<4096x4096xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<4096x4096xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
-    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<8192x4096xbf16>) outs(%1 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
-    return %2 : tensor<4096x4096xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4096_4096_8192_bf16_tA.mlir b/gemm/mlir/gemm_4096_4096_8192_bf16_tA.mlir
deleted file mode 100644
index f9c0df8..0000000
--- a/gemm/mlir/gemm_4096_4096_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x4096xbf16>, %arg1: tensor<8192x4096xbf16>) -> tensor<4096x4096xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<4096x4096xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4096xbf16>, tensor<8192x4096xbf16>) outs(%1 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
-    return %2 : tensor<4096x4096xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_4096_4096_8192_bf16_tB.mlir b/gemm/mlir/gemm_4096_4096_8192_bf16_tB.mlir
deleted file mode 100644
index ff2a1ac..0000000
--- 
a/gemm/mlir/gemm_4096_4096_8192_bf16_tB.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x8192xbf16>) -> tensor<4096x4096xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<4096x4096xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16> - %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x8192xbf16>) outs(%1 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16> - return %2 : tensor<4096x4096xbf16> - } -} diff --git a/gemm/mlir/gemm_4096_4096_8192_f16.mlir b/gemm/mlir/gemm_4096_4096_8192_f16.mlir deleted file mode 100644 index d21690a..0000000 --- a/gemm/mlir/gemm_4096_4096_8192_f16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<8192x4096xf16>) -> tensor<4096x4096xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4096x4096xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<8192x4096xf16>) outs(%1 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> - return %2 : tensor<4096x4096xf16> - } -} diff --git a/gemm/mlir/gemm_4096_4096_8192_f16_tA.mlir b/gemm/mlir/gemm_4096_4096_8192_f16_tA.mlir deleted file mode 100644 index f4ba892..0000000 --- a/gemm/mlir/gemm_4096_4096_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x4096xf16>, %arg1: tensor<8192x4096xf16>) -> tensor<4096x4096xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4096x4096xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x4096xf16>, tensor<8192x4096xf16>) outs(%1 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> - return %2 : tensor<4096x4096xf16> - } -} diff --git a/gemm/mlir/gemm_4096_4096_8192_f16_tB.mlir b/gemm/mlir/gemm_4096_4096_8192_f16_tB.mlir deleted file mode 100644 index d96e00f..0000000 --- a/gemm/mlir/gemm_4096_4096_8192_f16_tB.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x8192xf16>) -> tensor<4096x4096xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4096x4096xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> - %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x8192xf16>) outs(%1 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16> - return %2 : tensor<4096x4096xf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_1280_bf16_tA.mlir deleted file mode 100644 index 7e21b10..0000000 --- a/gemm/mlir/gemm_5120_16_1280_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x16xbf16>) -> tensor<5120x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - return %2 : tensor<5120x16xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_16_1280_f16_tA.mlir deleted 
file mode 100644 index e777fe8..0000000 --- a/gemm/mlir/gemm_5120_16_1280_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x16xf16>) -> tensor<5120x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - return %2 : tensor<5120x16xf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_13824_bf16_tA.mlir deleted file mode 100644 index 712a5a3..0000000 --- a/gemm/mlir/gemm_5120_16_13824_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x16xbf16>) -> tensor<5120x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - return %2 : tensor<5120x16xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_16_13824_f16_tA.mlir deleted file mode 100644 index e95a174..0000000 --- a/gemm/mlir/gemm_5120_16_13824_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x16xf16>) -> tensor<5120x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - return %2 : tensor<5120x16xf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_1728_bf16_tA.mlir deleted file mode 100644 index 1f0b6cf..0000000 --- a/gemm/mlir/gemm_5120_16_1728_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x16xbf16>) -> tensor<5120x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - return %2 : tensor<5120x16xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_16_1728_f16_tA.mlir deleted file mode 100644 index c0efaf2..0000000 --- a/gemm/mlir/gemm_5120_16_1728_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x16xf16>) -> tensor<5120x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - return %2 : tensor<5120x16xf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_2560_bf16_tA.mlir deleted file 
mode 100644 index d850d73..0000000 --- a/gemm/mlir/gemm_5120_16_2560_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x16xbf16>) -> tensor<5120x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - return %2 : tensor<5120x16xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_16_2560_f16_tA.mlir deleted file mode 100644 index e4183f4..0000000 --- a/gemm/mlir/gemm_5120_16_2560_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x16xf16>) -> tensor<5120x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - return %2 : tensor<5120x16xf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_3456_bf16_tA.mlir deleted file mode 100644 index dab5177..0000000 --- a/gemm/mlir/gemm_5120_16_3456_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x16xbf16>) -> tensor<5120x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - return %2 : tensor<5120x16xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_16_3456_f16_tA.mlir deleted file mode 100644 index e4d9277..0000000 --- a/gemm/mlir/gemm_5120_16_3456_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x16xf16>) -> tensor<5120x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - return %2 : tensor<5120x16xf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_5120_bf16_tA.mlir deleted file mode 100644 index f5dfe26..0000000 --- a/gemm/mlir/gemm_5120_16_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<5120x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - return %2 : tensor<5120x16xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_16_5120_f16_tA.mlir deleted file mode 
100644 index 71c7f1f..0000000 --- a/gemm/mlir/gemm_5120_16_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x16xf16>) -> tensor<5120x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - return %2 : tensor<5120x16xf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_640_bf16_tA.mlir deleted file mode 100644 index 20d9a68..0000000 --- a/gemm/mlir/gemm_5120_16_640_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x16xbf16>) -> tensor<5120x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - return %2 : tensor<5120x16xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_640_f16_tA.mlir b/gemm/mlir/gemm_5120_16_640_f16_tA.mlir deleted file mode 100644 index bf06141..0000000 --- a/gemm/mlir/gemm_5120_16_640_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x16xf16>) -> tensor<5120x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - return %2 : tensor<5120x16xf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_6912_bf16_tA.mlir deleted file mode 100644 index 4ab4378..0000000 --- a/gemm/mlir/gemm_5120_16_6912_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x16xbf16>) -> tensor<5120x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - return %2 : tensor<5120x16xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_16_6912_f16_tA.mlir deleted file mode 100644 index 476253e..0000000 --- a/gemm/mlir/gemm_5120_16_6912_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x16xf16>) -> tensor<5120x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - return %2 : tensor<5120x16xf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_16_8192_bf16_tA.mlir deleted file mode 100644 index af65c87..0000000 --- 
a/gemm/mlir/gemm_5120_16_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<5120x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<5120x16xbf16>) -> tensor<5120x16xbf16> - return %2 : tensor<5120x16xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_16_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_16_8192_f16_tA.mlir deleted file mode 100644 index 9acb611..0000000 --- a/gemm/mlir/gemm_5120_16_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x16xf16>) -> tensor<5120x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x16xf16>) outs(%1 : tensor<5120x16xf16>) -> tensor<5120x16xf16> - return %2 : tensor<5120x16xf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_1280_bf16_tA.mlir deleted file mode 100644 index fbad7cb..0000000 --- a/gemm/mlir/gemm_5120_1_1280_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x1xbf16>) -> tensor<5120x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - return %2 : tensor<5120x1xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_1_1280_f16_tA.mlir deleted file mode 100644 index a7e29cd..0000000 --- a/gemm/mlir/gemm_5120_1_1280_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x1xf16>) -> tensor<5120x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - return %2 : tensor<5120x1xf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_13824_bf16_tA.mlir deleted file mode 100644 index d006ff7..0000000 --- a/gemm/mlir/gemm_5120_1_13824_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x1xbf16>) -> tensor<5120x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - return %2 : tensor<5120x1xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_1_13824_f16_tA.mlir deleted file mode 100644 index a9fcf15..0000000 --- 
a/gemm/mlir/gemm_5120_1_13824_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x1xf16>) -> tensor<5120x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - return %2 : tensor<5120x1xf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_1728_bf16_tA.mlir deleted file mode 100644 index 9417831..0000000 --- a/gemm/mlir/gemm_5120_1_1728_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x1xbf16>) -> tensor<5120x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - return %2 : tensor<5120x1xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_1_1728_f16_tA.mlir deleted file mode 100644 index 124f5a6..0000000 --- a/gemm/mlir/gemm_5120_1_1728_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x1xf16>) -> tensor<5120x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - return %2 : tensor<5120x1xf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_2560_bf16_tA.mlir deleted file mode 100644 index 3779817..0000000 --- a/gemm/mlir/gemm_5120_1_2560_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x1xbf16>) -> tensor<5120x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - return %2 : tensor<5120x1xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_1_2560_f16_tA.mlir deleted file mode 100644 index 6258f4f..0000000 --- a/gemm/mlir/gemm_5120_1_2560_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x1xf16>) -> tensor<5120x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - return %2 : tensor<5120x1xf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_3456_bf16_tA.mlir deleted file mode 100644 index c2c0363..0000000 --- a/gemm/mlir/gemm_5120_1_3456_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - 
-module { - func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x1xbf16>) -> tensor<5120x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - return %2 : tensor<5120x1xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_1_3456_f16_tA.mlir deleted file mode 100644 index 27728e7..0000000 --- a/gemm/mlir/gemm_5120_1_3456_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x1xf16>) -> tensor<5120x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - return %2 : tensor<5120x1xf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_5120_bf16_tA.mlir deleted file mode 100644 index e8652a1..0000000 --- a/gemm/mlir/gemm_5120_1_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<5120x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - return %2 : tensor<5120x1xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_1_5120_f16_tA.mlir deleted file mode 100644 index d36e54c..0000000 --- a/gemm/mlir/gemm_5120_1_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x1xf16>) -> tensor<5120x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - return %2 : tensor<5120x1xf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_640_bf16_tA.mlir deleted file mode 100644 index 3b414a8..0000000 --- a/gemm/mlir/gemm_5120_1_640_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x1xbf16>) -> tensor<5120x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - return %2 : tensor<5120x1xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_640_f16_tA.mlir b/gemm/mlir/gemm_5120_1_640_f16_tA.mlir deleted file mode 100644 index f8bbbe2..0000000 --- a/gemm/mlir/gemm_5120_1_640_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xf16>, %arg1: 
tensor<640x1xf16>) -> tensor<5120x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - return %2 : tensor<5120x1xf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_6912_bf16_tA.mlir deleted file mode 100644 index fdc2298..0000000 --- a/gemm/mlir/gemm_5120_1_6912_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x1xbf16>) -> tensor<5120x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - return %2 : tensor<5120x1xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_1_6912_f16_tA.mlir deleted file mode 100644 index be5c109..0000000 --- a/gemm/mlir/gemm_5120_1_6912_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x1xf16>) -> tensor<5120x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - return %2 : tensor<5120x1xf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_1_8192_bf16_tA.mlir deleted file mode 100644 index 13e6f69..0000000 --- a/gemm/mlir/gemm_5120_1_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<5120x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<5120x1xbf16>) -> tensor<5120x1xbf16> - return %2 : tensor<5120x1xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_1_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_1_8192_f16_tA.mlir deleted file mode 100644 index 572ff85..0000000 --- a/gemm/mlir/gemm_5120_1_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x1xf16>) -> tensor<5120x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x1xf16>) outs(%1 : tensor<5120x1xf16>) -> tensor<5120x1xf16> - return %2 : tensor<5120x1xf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_1280_bf16_tA.mlir deleted file mode 100644 index 07b6e62..0000000 --- a/gemm/mlir/gemm_5120_2_1280_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x2xbf16>) -> tensor<5120x2xbf16> { - %cst = arith.constant 
0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - return %2 : tensor<5120x2xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_2_1280_f16_tA.mlir deleted file mode 100644 index 70ad768..0000000 --- a/gemm/mlir/gemm_5120_2_1280_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x2xf16>) -> tensor<5120x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - return %2 : tensor<5120x2xf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_13824_bf16_tA.mlir deleted file mode 100644 index e83f65d..0000000 --- a/gemm/mlir/gemm_5120_2_13824_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x2xbf16>) -> tensor<5120x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - return %2 : tensor<5120x2xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_2_13824_f16_tA.mlir deleted file mode 100644 index e30738c..0000000 --- a/gemm/mlir/gemm_5120_2_13824_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x2xf16>) -> tensor<5120x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - return %2 : tensor<5120x2xf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_1728_bf16_tA.mlir deleted file mode 100644 index 8a04fb2..0000000 --- a/gemm/mlir/gemm_5120_2_1728_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x2xbf16>) -> tensor<5120x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - return %2 : tensor<5120x2xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_2_1728_f16_tA.mlir deleted file mode 100644 index 2c77846..0000000 --- a/gemm/mlir/gemm_5120_2_1728_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x2xf16>) -> tensor<5120x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : 
tensor<5120x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - return %2 : tensor<5120x2xf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_2560_bf16_tA.mlir deleted file mode 100644 index 25d142a..0000000 --- a/gemm/mlir/gemm_5120_2_2560_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x2xbf16>) -> tensor<5120x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - return %2 : tensor<5120x2xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_2_2560_f16_tA.mlir deleted file mode 100644 index 414bd86..0000000 --- a/gemm/mlir/gemm_5120_2_2560_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x2xf16>) -> tensor<5120x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - return %2 : tensor<5120x2xf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_3456_bf16_tA.mlir deleted file mode 100644 index 3b81d86..0000000 --- a/gemm/mlir/gemm_5120_2_3456_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x2xbf16>) -> tensor<5120x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - return %2 : tensor<5120x2xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_2_3456_f16_tA.mlir deleted file mode 100644 index fe954d2..0000000 --- a/gemm/mlir/gemm_5120_2_3456_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x2xf16>) -> tensor<5120x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - return %2 : tensor<5120x2xf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_5120_bf16_tA.mlir deleted file mode 100644 index 6599984..0000000 --- a/gemm/mlir/gemm_5120_2_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<5120x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : 
tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - return %2 : tensor<5120x2xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_2_5120_f16_tA.mlir deleted file mode 100644 index f88163e..0000000 --- a/gemm/mlir/gemm_5120_2_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x2xf16>) -> tensor<5120x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - return %2 : tensor<5120x2xf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_640_bf16_tA.mlir deleted file mode 100644 index 8ade0ca..0000000 --- a/gemm/mlir/gemm_5120_2_640_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x2xbf16>) -> tensor<5120x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - return %2 : tensor<5120x2xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_640_f16_tA.mlir b/gemm/mlir/gemm_5120_2_640_f16_tA.mlir deleted file mode 100644 index 3c50f2f..0000000 --- a/gemm/mlir/gemm_5120_2_640_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x2xf16>) -> tensor<5120x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - return %2 : tensor<5120x2xf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_6912_bf16_tA.mlir deleted file mode 100644 index 5f8b20a..0000000 --- a/gemm/mlir/gemm_5120_2_6912_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x2xbf16>) -> tensor<5120x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - return %2 : tensor<5120x2xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_2_6912_f16_tA.mlir deleted file mode 100644 index 7fe73cd..0000000 --- a/gemm/mlir/gemm_5120_2_6912_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x2xf16>) -> tensor<5120x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, 
%arg1 : tensor<6912x5120xf16>, tensor<6912x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - return %2 : tensor<5120x2xf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_2_8192_bf16_tA.mlir deleted file mode 100644 index 4460592..0000000 --- a/gemm/mlir/gemm_5120_2_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<5120x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<5120x2xbf16>) -> tensor<5120x2xbf16> - return %2 : tensor<5120x2xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_2_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_2_8192_f16_tA.mlir deleted file mode 100644 index 6e9ac82..0000000 --- a/gemm/mlir/gemm_5120_2_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x2xf16>) -> tensor<5120x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x2xf16>) outs(%1 : tensor<5120x2xf16>) -> tensor<5120x2xf16> - return %2 : tensor<5120x2xf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_1280_bf16_tA.mlir deleted file mode 100644 index 256678e..0000000 --- a/gemm/mlir/gemm_5120_32_1280_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x32xbf16>) -> tensor<5120x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - return %2 : tensor<5120x32xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_32_1280_f16_tA.mlir deleted file mode 100644 index e7f5580..0000000 --- a/gemm/mlir/gemm_5120_32_1280_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x32xf16>) -> tensor<5120x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - return %2 : tensor<5120x32xf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_13824_bf16_tA.mlir deleted file mode 100644 index d84ed24..0000000 --- a/gemm/mlir/gemm_5120_32_13824_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x32xbf16>) -> tensor<5120x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, 
tensor<13824x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - return %2 : tensor<5120x32xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_32_13824_f16_tA.mlir deleted file mode 100644 index f50d0d0..0000000 --- a/gemm/mlir/gemm_5120_32_13824_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x32xf16>) -> tensor<5120x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - return %2 : tensor<5120x32xf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_1728_bf16_tA.mlir deleted file mode 100644 index a4af4b4..0000000 --- a/gemm/mlir/gemm_5120_32_1728_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x32xbf16>) -> tensor<5120x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - return %2 : tensor<5120x32xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_32_1728_f16_tA.mlir deleted file mode 100644 index 16e7179..0000000 --- a/gemm/mlir/gemm_5120_32_1728_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x32xf16>) -> tensor<5120x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - return %2 : tensor<5120x32xf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_2560_bf16_tA.mlir deleted file mode 100644 index bea8cb5..0000000 --- a/gemm/mlir/gemm_5120_32_2560_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x32xbf16>) -> tensor<5120x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - return %2 : tensor<5120x32xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_32_2560_f16_tA.mlir deleted file mode 100644 index d4d7491..0000000 --- a/gemm/mlir/gemm_5120_32_2560_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x32xf16>) -> tensor<5120x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, 
tensor<2560x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - return %2 : tensor<5120x32xf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_3456_bf16_tA.mlir deleted file mode 100644 index a1ec40e..0000000 --- a/gemm/mlir/gemm_5120_32_3456_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x32xbf16>) -> tensor<5120x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - return %2 : tensor<5120x32xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_32_3456_f16_tA.mlir deleted file mode 100644 index 8f6301c..0000000 --- a/gemm/mlir/gemm_5120_32_3456_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x32xf16>) -> tensor<5120x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - return %2 : tensor<5120x32xf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_5120_bf16_tA.mlir deleted file mode 100644 index 4c72158..0000000 --- a/gemm/mlir/gemm_5120_32_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<5120x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - return %2 : tensor<5120x32xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_32_5120_f16_tA.mlir deleted file mode 100644 index 027a09f..0000000 --- a/gemm/mlir/gemm_5120_32_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x32xf16>) -> tensor<5120x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - return %2 : tensor<5120x32xf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_640_bf16_tA.mlir deleted file mode 100644 index fec70cb..0000000 --- a/gemm/mlir/gemm_5120_32_640_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x32xbf16>) -> tensor<5120x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, 
tensor<640x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - return %2 : tensor<5120x32xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_640_f16_tA.mlir b/gemm/mlir/gemm_5120_32_640_f16_tA.mlir deleted file mode 100644 index d2e3949..0000000 --- a/gemm/mlir/gemm_5120_32_640_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x32xf16>) -> tensor<5120x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - return %2 : tensor<5120x32xf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_6912_bf16_tA.mlir deleted file mode 100644 index 7e22180..0000000 --- a/gemm/mlir/gemm_5120_32_6912_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x32xbf16>) -> tensor<5120x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - return %2 : tensor<5120x32xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_32_6912_f16_tA.mlir deleted file mode 100644 index 1d9947a..0000000 --- a/gemm/mlir/gemm_5120_32_6912_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x32xf16>) -> tensor<5120x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x32xf16>) outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - return %2 : tensor<5120x32xf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_32_8192_bf16_tA.mlir deleted file mode 100644 index 323437a..0000000 --- a/gemm/mlir/gemm_5120_32_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<5120x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<5120x32xbf16>) -> tensor<5120x32xbf16> - return %2 : tensor<5120x32xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_32_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_32_8192_f16_tA.mlir deleted file mode 100644 index 91e0026..0000000 --- a/gemm/mlir/gemm_5120_32_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x32xf16>) -> tensor<5120x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x32xf16>) 
outs(%1 : tensor<5120x32xf16>) -> tensor<5120x32xf16> - return %2 : tensor<5120x32xf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_1280_bf16_tA.mlir deleted file mode 100644 index b02b975..0000000 --- a/gemm/mlir/gemm_5120_4_1280_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x4xbf16>) -> tensor<5120x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - return %2 : tensor<5120x4xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_4_1280_f16_tA.mlir deleted file mode 100644 index cdbe240..0000000 --- a/gemm/mlir/gemm_5120_4_1280_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x4xf16>) -> tensor<5120x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - return %2 : tensor<5120x4xf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_13824_bf16_tA.mlir deleted file mode 100644 index c024c59..0000000 --- a/gemm/mlir/gemm_5120_4_13824_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x4xbf16>) -> tensor<5120x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - return %2 : tensor<5120x4xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_4_13824_f16_tA.mlir deleted file mode 100644 index 1b355e9..0000000 --- a/gemm/mlir/gemm_5120_4_13824_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x4xf16>) -> tensor<5120x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - return %2 : tensor<5120x4xf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_1728_bf16_tA.mlir deleted file mode 100644 index 77d316d..0000000 --- a/gemm/mlir/gemm_5120_4_1728_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x4xbf16>) -> tensor<5120x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> 
tensor<5120x4xbf16> - return %2 : tensor<5120x4xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_4_1728_f16_tA.mlir deleted file mode 100644 index b77fd46..0000000 --- a/gemm/mlir/gemm_5120_4_1728_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x4xf16>) -> tensor<5120x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - return %2 : tensor<5120x4xf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_2560_bf16_tA.mlir deleted file mode 100644 index b441065..0000000 --- a/gemm/mlir/gemm_5120_4_2560_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x4xbf16>) -> tensor<5120x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - return %2 : tensor<5120x4xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_4_2560_f16_tA.mlir deleted file mode 100644 index 78af1ae..0000000 --- a/gemm/mlir/gemm_5120_4_2560_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x4xf16>) -> tensor<5120x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - return %2 : tensor<5120x4xf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_3456_bf16_tA.mlir deleted file mode 100644 index 65e3813..0000000 --- a/gemm/mlir/gemm_5120_4_3456_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x4xbf16>) -> tensor<5120x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - return %2 : tensor<5120x4xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_4_3456_f16_tA.mlir deleted file mode 100644 index 055a56a..0000000 --- a/gemm/mlir/gemm_5120_4_3456_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x4xf16>) -> tensor<5120x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - return %2 : tensor<5120x4xf16> - } -} diff --git 
a/gemm/mlir/gemm_5120_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_5120_bf16_tA.mlir deleted file mode 100644 index 133c6e2..0000000 --- a/gemm/mlir/gemm_5120_4_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<5120x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - return %2 : tensor<5120x4xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_4_5120_f16_tA.mlir deleted file mode 100644 index 3b6cabf..0000000 --- a/gemm/mlir/gemm_5120_4_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x4xf16>) -> tensor<5120x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - return %2 : tensor<5120x4xf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_640_bf16_tA.mlir deleted file mode 100644 index 1e22dd9..0000000 --- a/gemm/mlir/gemm_5120_4_640_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x4xbf16>) -> tensor<5120x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - return %2 : tensor<5120x4xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_640_f16_tA.mlir b/gemm/mlir/gemm_5120_4_640_f16_tA.mlir deleted file mode 100644 index f7459f4..0000000 --- a/gemm/mlir/gemm_5120_4_640_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x4xf16>) -> tensor<5120x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - return %2 : tensor<5120x4xf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_6912_bf16_tA.mlir deleted file mode 100644 index 9244683..0000000 --- a/gemm/mlir/gemm_5120_4_6912_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x4xbf16>) -> tensor<5120x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - return %2 : tensor<5120x4xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_6912_f16_tA.mlir 
b/gemm/mlir/gemm_5120_4_6912_f16_tA.mlir deleted file mode 100644 index f3c0b6a..0000000 --- a/gemm/mlir/gemm_5120_4_6912_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x4xf16>) -> tensor<5120x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - return %2 : tensor<5120x4xf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_4_8192_bf16_tA.mlir deleted file mode 100644 index 1e39bcc..0000000 --- a/gemm/mlir/gemm_5120_4_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<5120x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<5120x4xbf16>) -> tensor<5120x4xbf16> - return %2 : tensor<5120x4xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_4_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_4_8192_f16_tA.mlir deleted file mode 100644 index 59ff5c1..0000000 --- a/gemm/mlir/gemm_5120_4_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x4xf16>) -> tensor<5120x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x4xf16>) outs(%1 : tensor<5120x4xf16>) -> tensor<5120x4xf16> - return %2 : tensor<5120x4xf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_1280_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_1280_bf16_tA.mlir deleted file mode 100644 index 090d0a3..0000000 --- a/gemm/mlir/gemm_5120_8_1280_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xbf16>, %arg1: tensor<1280x8xbf16>) -> tensor<5120x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xbf16>, tensor<1280x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - return %2 : tensor<5120x8xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_1280_f16_tA.mlir b/gemm/mlir/gemm_5120_8_1280_f16_tA.mlir deleted file mode 100644 index 68c2973..0000000 --- a/gemm/mlir/gemm_5120_8_1280_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1280x5120xf16>, %arg1: tensor<1280x8xf16>) -> tensor<5120x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1280x5120xf16>, tensor<1280x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - return %2 : tensor<5120x8xf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_13824_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_13824_bf16_tA.mlir deleted file mode 100644 index 
b80c0d8..0000000 --- a/gemm/mlir/gemm_5120_8_13824_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xbf16>, %arg1: tensor<13824x8xbf16>) -> tensor<5120x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xbf16>, tensor<13824x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - return %2 : tensor<5120x8xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_13824_f16_tA.mlir b/gemm/mlir/gemm_5120_8_13824_f16_tA.mlir deleted file mode 100644 index 77658a9..0000000 --- a/gemm/mlir/gemm_5120_8_13824_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<13824x5120xf16>, %arg1: tensor<13824x8xf16>) -> tensor<5120x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<13824x5120xf16>, tensor<13824x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - return %2 : tensor<5120x8xf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_1728_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_1728_bf16_tA.mlir deleted file mode 100644 index 3d405b3..0000000 --- a/gemm/mlir/gemm_5120_8_1728_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xbf16>, %arg1: tensor<1728x8xbf16>) -> tensor<5120x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xbf16>, tensor<1728x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - return %2 : tensor<5120x8xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_1728_f16_tA.mlir b/gemm/mlir/gemm_5120_8_1728_f16_tA.mlir deleted file mode 100644 index 9717a1c..0000000 --- a/gemm/mlir/gemm_5120_8_1728_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1728x5120xf16>, %arg1: tensor<1728x8xf16>) -> tensor<5120x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1728x5120xf16>, tensor<1728x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - return %2 : tensor<5120x8xf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_2560_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_2560_bf16_tA.mlir deleted file mode 100644 index e20b534..0000000 --- a/gemm/mlir/gemm_5120_8_2560_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xbf16>, %arg1: tensor<2560x8xbf16>) -> tensor<5120x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xbf16>, tensor<2560x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - return %2 : tensor<5120x8xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_2560_f16_tA.mlir b/gemm/mlir/gemm_5120_8_2560_f16_tA.mlir deleted file mode 100644 index fcb3692..0000000 --- 
a/gemm/mlir/gemm_5120_8_2560_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2560x5120xf16>, %arg1: tensor<2560x8xf16>) -> tensor<5120x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2560x5120xf16>, tensor<2560x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - return %2 : tensor<5120x8xf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_3456_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_3456_bf16_tA.mlir deleted file mode 100644 index e86a941..0000000 --- a/gemm/mlir/gemm_5120_8_3456_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xbf16>, %arg1: tensor<3456x8xbf16>) -> tensor<5120x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xbf16>, tensor<3456x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - return %2 : tensor<5120x8xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_3456_f16_tA.mlir b/gemm/mlir/gemm_5120_8_3456_f16_tA.mlir deleted file mode 100644 index b81b946..0000000 --- a/gemm/mlir/gemm_5120_8_3456_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3456x5120xf16>, %arg1: tensor<3456x8xf16>) -> tensor<5120x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3456x5120xf16>, tensor<3456x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - return %2 : tensor<5120x8xf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_5120_bf16_tA.mlir deleted file mode 100644 index b66fabd..0000000 --- a/gemm/mlir/gemm_5120_8_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<5120x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - return %2 : tensor<5120x8xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_5120_f16_tA.mlir b/gemm/mlir/gemm_5120_8_5120_f16_tA.mlir deleted file mode 100644 index b42ef4d..0000000 --- a/gemm/mlir/gemm_5120_8_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x5120xf16>, %arg1: tensor<5120x8xf16>) -> tensor<5120x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x5120xf16>, tensor<5120x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - return %2 : tensor<5120x8xf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_640_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_640_bf16_tA.mlir deleted file mode 100644 index 919f4aa..0000000 --- a/gemm/mlir/gemm_5120_8_640_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module 
{ - func.func @main(%arg0: tensor<640x5120xbf16>, %arg1: tensor<640x8xbf16>) -> tensor<5120x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xbf16>, tensor<640x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - return %2 : tensor<5120x8xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_640_f16_tA.mlir b/gemm/mlir/gemm_5120_8_640_f16_tA.mlir deleted file mode 100644 index 2667615..0000000 --- a/gemm/mlir/gemm_5120_8_640_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<640x5120xf16>, %arg1: tensor<640x8xf16>) -> tensor<5120x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x5120xf16>, tensor<640x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - return %2 : tensor<5120x8xf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_6912_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_6912_bf16_tA.mlir deleted file mode 100644 index 68be7d6..0000000 --- a/gemm/mlir/gemm_5120_8_6912_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xbf16>, %arg1: tensor<6912x8xbf16>) -> tensor<5120x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xbf16>, tensor<6912x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - return %2 : tensor<5120x8xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_6912_f16_tA.mlir b/gemm/mlir/gemm_5120_8_6912_f16_tA.mlir deleted file mode 100644 index b423ad4..0000000 --- a/gemm/mlir/gemm_5120_8_6912_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<6912x5120xf16>, %arg1: tensor<6912x8xf16>) -> tensor<5120x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<6912x5120xf16>, tensor<6912x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - return %2 : tensor<5120x8xf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_5120_8_8192_bf16_tA.mlir deleted file mode 100644 index 70c44a9..0000000 --- a/gemm/mlir/gemm_5120_8_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<5120x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<5120x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<5120x8xbf16>) -> tensor<5120x8xbf16> - return %2 : tensor<5120x8xbf16> - } -} diff --git a/gemm/mlir/gemm_5120_8_8192_f16_tA.mlir b/gemm/mlir/gemm_5120_8_8192_f16_tA.mlir deleted file mode 100644 index 79a3420..0000000 --- a/gemm/mlir/gemm_5120_8_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x5120xf16>, %arg1: tensor<8192x8xf16>) -> 
tensor<5120x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<5120x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x5120xf16>, tensor<8192x8xf16>) outs(%1 : tensor<5120x8xf16>) -> tensor<5120x8xf16> - return %2 : tensor<5120x8xf16> - } -} diff --git a/gemm/mlir/gemm_57344_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_16_8192_bf16_tA.mlir deleted file mode 100644 index c05fd42..0000000 --- a/gemm/mlir/gemm_57344_16_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<57344x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<57344x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x16xbf16>) -> tensor<57344x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<57344x16xbf16>) -> tensor<57344x16xbf16> - return %2 : tensor<57344x16xbf16> - } -} diff --git a/gemm/mlir/gemm_57344_16_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_16_8192_f16_tA.mlir deleted file mode 100644 index a27c3ca..0000000 --- a/gemm/mlir/gemm_57344_16_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x16xf16>) -> tensor<57344x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<57344x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x16xf16>) -> tensor<57344x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x16xf16>) outs(%1 : tensor<57344x16xf16>) -> tensor<57344x16xf16> - return %2 : tensor<57344x16xf16> - } -} diff --git a/gemm/mlir/gemm_57344_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_1_8192_bf16_tA.mlir deleted file mode 100644 index 6b50b47..0000000 --- a/gemm/mlir/gemm_57344_1_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<57344x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<57344x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x1xbf16>) -> tensor<57344x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<57344x1xbf16>) -> tensor<57344x1xbf16> - return %2 : tensor<57344x1xbf16> - } -} diff --git a/gemm/mlir/gemm_57344_1_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_1_8192_f16_tA.mlir deleted file mode 100644 index a391e24..0000000 --- a/gemm/mlir/gemm_57344_1_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x1xf16>) -> tensor<57344x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<57344x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x1xf16>) -> tensor<57344x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x1xf16>) outs(%1 : tensor<57344x1xf16>) -> tensor<57344x1xf16> - return %2 : tensor<57344x1xf16> - } -} diff --git a/gemm/mlir/gemm_57344_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_2_8192_bf16_tA.mlir deleted file mode 100644 index b176f2c..0000000 --- a/gemm/mlir/gemm_57344_2_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x2xbf16>) -> 
tensor<57344x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<57344x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x2xbf16>) -> tensor<57344x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<57344x2xbf16>) -> tensor<57344x2xbf16> - return %2 : tensor<57344x2xbf16> - } -} diff --git a/gemm/mlir/gemm_57344_2_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_2_8192_f16_tA.mlir deleted file mode 100644 index ffac68f..0000000 --- a/gemm/mlir/gemm_57344_2_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x2xf16>) -> tensor<57344x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<57344x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x2xf16>) -> tensor<57344x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x2xf16>) outs(%1 : tensor<57344x2xf16>) -> tensor<57344x2xf16> - return %2 : tensor<57344x2xf16> - } -} diff --git a/gemm/mlir/gemm_57344_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_32_8192_bf16_tA.mlir deleted file mode 100644 index bbe0c75..0000000 --- a/gemm/mlir/gemm_57344_32_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<57344x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<57344x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x32xbf16>) -> tensor<57344x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<57344x32xbf16>) -> tensor<57344x32xbf16> - return %2 : tensor<57344x32xbf16> - } -} diff --git a/gemm/mlir/gemm_57344_32_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_32_8192_f16_tA.mlir deleted file mode 100644 index 34675d0..0000000 --- a/gemm/mlir/gemm_57344_32_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x32xf16>) -> tensor<57344x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<57344x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x32xf16>) -> tensor<57344x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x32xf16>) outs(%1 : tensor<57344x32xf16>) -> tensor<57344x32xf16> - return %2 : tensor<57344x32xf16> - } -} diff --git a/gemm/mlir/gemm_57344_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_4_8192_bf16_tA.mlir deleted file mode 100644 index 2189c7e..0000000 --- a/gemm/mlir/gemm_57344_4_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<57344x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<57344x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x4xbf16>) -> tensor<57344x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<57344x4xbf16>) -> tensor<57344x4xbf16> - return %2 : tensor<57344x4xbf16> - } -} diff --git a/gemm/mlir/gemm_57344_4_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_4_8192_f16_tA.mlir deleted file mode 100644 index 5419137..0000000 --- a/gemm/mlir/gemm_57344_4_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: 
tensor<8192x4xf16>) -> tensor<57344x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<57344x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x4xf16>) -> tensor<57344x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x4xf16>) outs(%1 : tensor<57344x4xf16>) -> tensor<57344x4xf16> - return %2 : tensor<57344x4xf16> - } -} diff --git a/gemm/mlir/gemm_57344_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_57344_8_8192_bf16_tA.mlir deleted file mode 100644 index 84785f7..0000000 --- a/gemm/mlir/gemm_57344_8_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<57344x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<57344x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<57344x8xbf16>) -> tensor<57344x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<57344x8xbf16>) -> tensor<57344x8xbf16> - return %2 : tensor<57344x8xbf16> - } -} diff --git a/gemm/mlir/gemm_57344_8_8192_f16_tA.mlir b/gemm/mlir/gemm_57344_8_8192_f16_tA.mlir deleted file mode 100644 index 58d7ded..0000000 --- a/gemm/mlir/gemm_57344_8_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x57344xf16>, %arg1: tensor<8192x8xf16>) -> tensor<57344x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<57344x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<57344x8xf16>) -> tensor<57344x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x57344xf16>, tensor<8192x8xf16>) outs(%1 : tensor<57344x8xf16>) -> tensor<57344x8xf16> - return %2 : tensor<57344x8xf16> - } -} diff --git a/gemm/mlir/gemm_6912_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_16_5120_bf16_tA.mlir deleted file mode 100644 index a9180ad..0000000 --- a/gemm/mlir/gemm_6912_16_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<6912x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<6912x16xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x16xbf16>) -> tensor<6912x16xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<6912x16xbf16>) -> tensor<6912x16xbf16> - return %2 : tensor<6912x16xbf16> - } -} diff --git a/gemm/mlir/gemm_6912_16_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_16_5120_f16_tA.mlir deleted file mode 100644 index 47aaf92..0000000 --- a/gemm/mlir/gemm_6912_16_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x16xf16>) -> tensor<6912x16xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<6912x16xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x16xf16>) -> tensor<6912x16xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x16xf16>) outs(%1 : tensor<6912x16xf16>) -> tensor<6912x16xf16> - return %2 : tensor<6912x16xf16> - } -} diff --git a/gemm/mlir/gemm_6912_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_1_5120_bf16_tA.mlir deleted file mode 100644 index f087893..0000000 --- a/gemm/mlir/gemm_6912_1_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x1xbf16>) 
-> tensor<6912x1xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<6912x1xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x1xbf16>) -> tensor<6912x1xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<6912x1xbf16>) -> tensor<6912x1xbf16> - return %2 : tensor<6912x1xbf16> - } -} diff --git a/gemm/mlir/gemm_6912_1_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_1_5120_f16_tA.mlir deleted file mode 100644 index beee00d..0000000 --- a/gemm/mlir/gemm_6912_1_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x1xf16>) -> tensor<6912x1xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<6912x1xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x1xf16>) -> tensor<6912x1xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x1xf16>) outs(%1 : tensor<6912x1xf16>) -> tensor<6912x1xf16> - return %2 : tensor<6912x1xf16> - } -} diff --git a/gemm/mlir/gemm_6912_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_2_5120_bf16_tA.mlir deleted file mode 100644 index 441ec83..0000000 --- a/gemm/mlir/gemm_6912_2_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<6912x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<6912x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x2xbf16>) -> tensor<6912x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<6912x2xbf16>) -> tensor<6912x2xbf16> - return %2 : tensor<6912x2xbf16> - } -} diff --git a/gemm/mlir/gemm_6912_2_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_2_5120_f16_tA.mlir deleted file mode 100644 index 397c7b2..0000000 --- a/gemm/mlir/gemm_6912_2_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x2xf16>) -> tensor<6912x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<6912x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x2xf16>) -> tensor<6912x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x2xf16>) outs(%1 : tensor<6912x2xf16>) -> tensor<6912x2xf16> - return %2 : tensor<6912x2xf16> - } -} diff --git a/gemm/mlir/gemm_6912_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_32_5120_bf16_tA.mlir deleted file mode 100644 index 926a24a..0000000 --- a/gemm/mlir/gemm_6912_32_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<6912x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<6912x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x32xbf16>) -> tensor<6912x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<6912x32xbf16>) -> tensor<6912x32xbf16> - return %2 : tensor<6912x32xbf16> - } -} diff --git a/gemm/mlir/gemm_6912_32_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_32_5120_f16_tA.mlir deleted file mode 100644 index 75888ec..0000000 --- a/gemm/mlir/gemm_6912_32_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x32xf16>) -> tensor<6912x32xf16> { - %cst = arith.constant 
0.000000e+00 : f16 - %0 = tensor.empty() : tensor<6912x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x32xf16>) -> tensor<6912x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x32xf16>) outs(%1 : tensor<6912x32xf16>) -> tensor<6912x32xf16> - return %2 : tensor<6912x32xf16> - } -} diff --git a/gemm/mlir/gemm_6912_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_4_5120_bf16_tA.mlir deleted file mode 100644 index 105402a..0000000 --- a/gemm/mlir/gemm_6912_4_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<6912x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<6912x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x4xbf16>) -> tensor<6912x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<6912x4xbf16>) -> tensor<6912x4xbf16> - return %2 : tensor<6912x4xbf16> - } -} diff --git a/gemm/mlir/gemm_6912_4_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_4_5120_f16_tA.mlir deleted file mode 100644 index 2938490..0000000 --- a/gemm/mlir/gemm_6912_4_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x4xf16>) -> tensor<6912x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<6912x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x4xf16>) -> tensor<6912x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x4xf16>) outs(%1 : tensor<6912x4xf16>) -> tensor<6912x4xf16> - return %2 : tensor<6912x4xf16> - } -} diff --git a/gemm/mlir/gemm_6912_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_6912_8_5120_bf16_tA.mlir deleted file mode 100644 index c62dc28..0000000 --- a/gemm/mlir/gemm_6912_8_5120_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<6912x8xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<6912x8xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<6912x8xbf16>) -> tensor<6912x8xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<6912x8xbf16>) -> tensor<6912x8xbf16> - return %2 : tensor<6912x8xbf16> - } -} diff --git a/gemm/mlir/gemm_6912_8_5120_f16_tA.mlir b/gemm/mlir/gemm_6912_8_5120_f16_tA.mlir deleted file mode 100644 index 0fc7b88..0000000 --- a/gemm/mlir/gemm_6912_8_5120_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<5120x6912xf16>, %arg1: tensor<5120x8xf16>) -> tensor<6912x8xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<6912x8xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<6912x8xf16>) -> tensor<6912x8xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x6912xf16>, tensor<5120x8xf16>) outs(%1 : tensor<6912x8xf16>) -> tensor<6912x8xf16> - return %2 : tensor<6912x8xf16> - } -} diff --git a/gemm/mlir/gemm_7168_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_16_8192_bf16_tA.mlir deleted file mode 100644 index c7660f1..0000000 --- a/gemm/mlir/gemm_7168_16_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<7168x16xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<7168x16xbf16> 
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x16xbf16>) -> tensor<7168x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<7168x16xbf16>) -> tensor<7168x16xbf16>
-    return %2 : tensor<7168x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_16_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_16_8192_f16_tA.mlir
deleted file mode 100644
index 3b4e48c..0000000
--- a/gemm/mlir/gemm_7168_16_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x16xf16>) -> tensor<7168x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7168x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x16xf16>) -> tensor<7168x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x16xf16>) outs(%1 : tensor<7168x16xf16>) -> tensor<7168x16xf16>
-    return %2 : tensor<7168x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_1_8192_bf16_tA.mlir
deleted file mode 100644
index 41d8ee8..0000000
--- a/gemm/mlir/gemm_7168_1_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<7168x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7168x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x1xbf16>) -> tensor<7168x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<7168x1xbf16>) -> tensor<7168x1xbf16>
-    return %2 : tensor<7168x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_1_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_1_8192_f16_tA.mlir
deleted file mode 100644
index 93b1d5e..0000000
--- a/gemm/mlir/gemm_7168_1_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x1xf16>) -> tensor<7168x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7168x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x1xf16>) -> tensor<7168x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x1xf16>) outs(%1 : tensor<7168x1xf16>) -> tensor<7168x1xf16>
-    return %2 : tensor<7168x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_2_8192_bf16_tA.mlir
deleted file mode 100644
index 555cca9..0000000
--- a/gemm/mlir/gemm_7168_2_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<7168x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7168x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x2xbf16>) -> tensor<7168x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<7168x2xbf16>) -> tensor<7168x2xbf16>
-    return %2 : tensor<7168x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_2_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_2_8192_f16_tA.mlir
deleted file mode 100644
index 4ab13c2..0000000
--- a/gemm/mlir/gemm_7168_2_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x2xf16>) -> tensor<7168x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7168x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x2xf16>) -> tensor<7168x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x2xf16>) outs(%1 : tensor<7168x2xf16>) -> tensor<7168x2xf16>
-    return %2 : tensor<7168x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_32_8192_bf16_tA.mlir
deleted file mode 100644
index e6b536c..0000000
--- a/gemm/mlir/gemm_7168_32_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<7168x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7168x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x32xbf16>) -> tensor<7168x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<7168x32xbf16>) -> tensor<7168x32xbf16>
-    return %2 : tensor<7168x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_32_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_32_8192_f16_tA.mlir
deleted file mode 100644
index 2d2744f..0000000
--- a/gemm/mlir/gemm_7168_32_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x32xf16>) -> tensor<7168x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7168x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x32xf16>) -> tensor<7168x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x32xf16>) outs(%1 : tensor<7168x32xf16>) -> tensor<7168x32xf16>
-    return %2 : tensor<7168x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_4_8192_bf16_tA.mlir
deleted file mode 100644
index 98c5839..0000000
--- a/gemm/mlir/gemm_7168_4_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<7168x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7168x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x4xbf16>) -> tensor<7168x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<7168x4xbf16>) -> tensor<7168x4xbf16>
-    return %2 : tensor<7168x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_4_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_4_8192_f16_tA.mlir
deleted file mode 100644
index 1bf5e1c..0000000
--- a/gemm/mlir/gemm_7168_4_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x4xf16>) -> tensor<7168x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7168x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x4xf16>) -> tensor<7168x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x4xf16>) outs(%1 : tensor<7168x4xf16>) -> tensor<7168x4xf16>
-    return %2 : tensor<7168x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_7168_8_8192_bf16_tA.mlir
deleted file mode 100644
index c7dbcb9..0000000
--- a/gemm/mlir/gemm_7168_8_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<7168x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7168x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7168x8xbf16>) -> tensor<7168x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<7168x8xbf16>) -> tensor<7168x8xbf16>
-    return %2 : tensor<7168x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7168_8_8192_f16_tA.mlir b/gemm/mlir/gemm_7168_8_8192_f16_tA.mlir
deleted file mode 100644
index f36208c..0000000
--- a/gemm/mlir/gemm_7168_8_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x7168xf16>, %arg1: tensor<8192x8xf16>) -> tensor<7168x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7168x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7168x8xf16>) -> tensor<7168x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x7168xf16>, tensor<8192x8xf16>) outs(%1 : tensor<7168x8xf16>) -> tensor<7168x8xf16>
-    return %2 : tensor<7168x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_16_5120_bf16_tA.mlir
deleted file mode 100644
index db4ed5e..0000000
--- a/gemm/mlir/gemm_7680_16_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<7680x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7680x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x16xbf16>) -> tensor<7680x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<7680x16xbf16>) -> tensor<7680x16xbf16>
-    return %2 : tensor<7680x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_16_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_16_5120_f16_tA.mlir
deleted file mode 100644
index 884fae5..0000000
--- a/gemm/mlir/gemm_7680_16_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x16xf16>) -> tensor<7680x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7680x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x16xf16>) -> tensor<7680x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x16xf16>) outs(%1 : tensor<7680x16xf16>) -> tensor<7680x16xf16>
-    return %2 : tensor<7680x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_1_5120_bf16_tA.mlir
deleted file mode 100644
index 3e9229a..0000000
--- a/gemm/mlir/gemm_7680_1_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<7680x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7680x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x1xbf16>) -> tensor<7680x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<7680x1xbf16>) -> tensor<7680x1xbf16>
-    return %2 : tensor<7680x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_1_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_1_5120_f16_tA.mlir
deleted file mode 100644
index 8852272..0000000
--- a/gemm/mlir/gemm_7680_1_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x1xf16>) -> tensor<7680x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7680x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x1xf16>) -> tensor<7680x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x1xf16>) outs(%1 : tensor<7680x1xf16>) -> tensor<7680x1xf16>
-    return %2 : tensor<7680x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_2_5120_bf16_tA.mlir
deleted file mode 100644
index 91b162d..0000000
--- a/gemm/mlir/gemm_7680_2_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<7680x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7680x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x2xbf16>) -> tensor<7680x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<7680x2xbf16>) -> tensor<7680x2xbf16>
-    return %2 : tensor<7680x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_2_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_2_5120_f16_tA.mlir
deleted file mode 100644
index 0b11af3..0000000
--- a/gemm/mlir/gemm_7680_2_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x2xf16>) -> tensor<7680x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7680x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x2xf16>) -> tensor<7680x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x2xf16>) outs(%1 : tensor<7680x2xf16>) -> tensor<7680x2xf16>
-    return %2 : tensor<7680x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_32_5120_bf16_tA.mlir
deleted file mode 100644
index a89c462..0000000
--- a/gemm/mlir/gemm_7680_32_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<7680x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7680x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x32xbf16>) -> tensor<7680x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<7680x32xbf16>) -> tensor<7680x32xbf16>
-    return %2 : tensor<7680x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_32_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_32_5120_f16_tA.mlir
deleted file mode 100644
index 6dd24ce..0000000
--- a/gemm/mlir/gemm_7680_32_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x32xf16>) -> tensor<7680x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7680x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x32xf16>) -> tensor<7680x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x32xf16>) outs(%1 : tensor<7680x32xf16>) -> tensor<7680x32xf16>
-    return %2 : tensor<7680x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_4_5120_bf16_tA.mlir
deleted file mode 100644
index b0334e4..0000000
--- a/gemm/mlir/gemm_7680_4_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<7680x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7680x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x4xbf16>) -> tensor<7680x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<7680x4xbf16>) -> tensor<7680x4xbf16>
-    return %2 : tensor<7680x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_4_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_4_5120_f16_tA.mlir
deleted file mode 100644
index c927588..0000000
--- a/gemm/mlir/gemm_7680_4_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x4xf16>) -> tensor<7680x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7680x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x4xf16>) -> tensor<7680x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x4xf16>) outs(%1 : tensor<7680x4xf16>) -> tensor<7680x4xf16>
-    return %2 : tensor<7680x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_7680_8_5120_bf16_tA.mlir
deleted file mode 100644
index 4d799fa..0000000
--- a/gemm/mlir/gemm_7680_8_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<7680x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<7680x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<7680x8xbf16>) -> tensor<7680x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<7680x8xbf16>) -> tensor<7680x8xbf16>
-    return %2 : tensor<7680x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_7680_8_5120_f16_tA.mlir b/gemm/mlir/gemm_7680_8_5120_f16_tA.mlir
deleted file mode 100644
index f817f9d..0000000
--- a/gemm/mlir/gemm_7680_8_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x7680xf16>, %arg1: tensor<5120x8xf16>) -> tensor<7680x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<7680x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<7680x8xf16>) -> tensor<7680x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x7680xf16>, tensor<5120x8xf16>) outs(%1 : tensor<7680x8xf16>) -> tensor<7680x8xf16>
-    return %2 : tensor<7680x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_16_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_16_5120_bf16_tA.mlir
deleted file mode 100644
index 50cb640..0000000
--- a/gemm/mlir/gemm_8000_16_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x16xbf16>) -> tensor<8000x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x16xbf16>) -> tensor<8000x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x16xbf16>) outs(%1 : tensor<8000x16xbf16>) -> tensor<8000x16xbf16>
-    return %2 : tensor<8000x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_16_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_16_5120_f16_tA.mlir
deleted file mode 100644
index 53b5315..0000000
--- a/gemm/mlir/gemm_8000_16_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x16xf16>) -> tensor<8000x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x16xf16>) -> tensor<8000x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x16xf16>) outs(%1 : tensor<8000x16xf16>) -> tensor<8000x16xf16>
-    return %2 : tensor<8000x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_16_8192_bf16_tA.mlir
deleted file mode 100644
index d61ae44..0000000
--- a/gemm/mlir/gemm_8000_16_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<8000x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x16xbf16>) -> tensor<8000x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<8000x16xbf16>) -> tensor<8000x16xbf16>
-    return %2 : tensor<8000x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_16_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_16_8192_f16_tA.mlir
deleted file mode 100644
index 2fdaae0..0000000
--- a/gemm/mlir/gemm_8000_16_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x16xf16>) -> tensor<8000x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x16xf16>) -> tensor<8000x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x16xf16>) outs(%1 : tensor<8000x16xf16>) -> tensor<8000x16xf16>
-    return %2 : tensor<8000x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_1_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_1_5120_bf16_tA.mlir
deleted file mode 100644
index 2103508..0000000
--- a/gemm/mlir/gemm_8000_1_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x1xbf16>) -> tensor<8000x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x1xbf16>) -> tensor<8000x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x1xbf16>) outs(%1 : tensor<8000x1xbf16>) -> tensor<8000x1xbf16>
-    return %2 : tensor<8000x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_1_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_1_5120_f16_tA.mlir
deleted file mode 100644
index d168465..0000000
--- a/gemm/mlir/gemm_8000_1_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x1xf16>) -> tensor<8000x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x1xf16>) -> tensor<8000x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x1xf16>) outs(%1 : tensor<8000x1xf16>) -> tensor<8000x1xf16>
-    return %2 : tensor<8000x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_1_8192_bf16_tA.mlir
deleted file mode 100644
index 0f58095..0000000
--- a/gemm/mlir/gemm_8000_1_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<8000x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x1xbf16>) -> tensor<8000x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<8000x1xbf16>) -> tensor<8000x1xbf16>
-    return %2 : tensor<8000x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_1_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_1_8192_f16_tA.mlir
deleted file mode 100644
index 52e5c03..0000000
--- a/gemm/mlir/gemm_8000_1_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x1xf16>) -> tensor<8000x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x1xf16>) -> tensor<8000x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x1xf16>) outs(%1 : tensor<8000x1xf16>) -> tensor<8000x1xf16>
-    return %2 : tensor<8000x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_2_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_2_5120_bf16_tA.mlir
deleted file mode 100644
index 668917b..0000000
--- a/gemm/mlir/gemm_8000_2_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x2xbf16>) -> tensor<8000x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x2xbf16>) -> tensor<8000x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x2xbf16>) outs(%1 : tensor<8000x2xbf16>) -> tensor<8000x2xbf16>
-    return %2 : tensor<8000x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_2_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_2_5120_f16_tA.mlir
deleted file mode 100644
index d85abf9..0000000
--- a/gemm/mlir/gemm_8000_2_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x2xf16>) -> tensor<8000x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x2xf16>) -> tensor<8000x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x2xf16>) outs(%1 : tensor<8000x2xf16>) -> tensor<8000x2xf16>
-    return %2 : tensor<8000x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_2_8192_bf16_tA.mlir
deleted file mode 100644
index fa3aeca..0000000
--- a/gemm/mlir/gemm_8000_2_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<8000x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x2xbf16>) -> tensor<8000x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<8000x2xbf16>) -> tensor<8000x2xbf16>
-    return %2 : tensor<8000x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_2_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_2_8192_f16_tA.mlir
deleted file mode 100644
index 4d8cb01..0000000
--- a/gemm/mlir/gemm_8000_2_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x2xf16>) -> tensor<8000x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x2xf16>) -> tensor<8000x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x2xf16>) outs(%1 : tensor<8000x2xf16>) -> tensor<8000x2xf16>
-    return %2 : tensor<8000x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_32_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_32_5120_bf16_tA.mlir
deleted file mode 100644
index e76d224..0000000
--- a/gemm/mlir/gemm_8000_32_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x32xbf16>) -> tensor<8000x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x32xbf16>) -> tensor<8000x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x32xbf16>) outs(%1 : tensor<8000x32xbf16>) -> tensor<8000x32xbf16>
-    return %2 : tensor<8000x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_32_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_32_5120_f16_tA.mlir
deleted file mode 100644
index 5c226af..0000000
--- a/gemm/mlir/gemm_8000_32_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x32xf16>) -> tensor<8000x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x32xf16>) -> tensor<8000x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x32xf16>) outs(%1 : tensor<8000x32xf16>) -> tensor<8000x32xf16>
-    return %2 : tensor<8000x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_32_8192_bf16_tA.mlir
deleted file mode 100644
index 4df2655..0000000
--- a/gemm/mlir/gemm_8000_32_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<8000x32xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x32xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x32xbf16>) -> tensor<8000x32xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<8000x32xbf16>) -> tensor<8000x32xbf16>
-    return %2 : tensor<8000x32xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_32_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_32_8192_f16_tA.mlir
deleted file mode 100644
index 656010a..0000000
--- a/gemm/mlir/gemm_8000_32_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x32xf16>) -> tensor<8000x32xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x32xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x32xf16>) -> tensor<8000x32xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x32xf16>) outs(%1 : tensor<8000x32xf16>) -> tensor<8000x32xf16>
-    return %2 : tensor<8000x32xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_4_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_4_5120_bf16_tA.mlir
deleted file mode 100644
index f45eab2..0000000
--- a/gemm/mlir/gemm_8000_4_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x4xbf16>) -> tensor<8000x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x4xbf16>) -> tensor<8000x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x4xbf16>) outs(%1 : tensor<8000x4xbf16>) -> tensor<8000x4xbf16>
-    return %2 : tensor<8000x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_4_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_4_5120_f16_tA.mlir
deleted file mode 100644
index a715200..0000000
--- a/gemm/mlir/gemm_8000_4_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x4xf16>) -> tensor<8000x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x4xf16>) -> tensor<8000x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x4xf16>) outs(%1 : tensor<8000x4xf16>) -> tensor<8000x4xf16>
-    return %2 : tensor<8000x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_4_8192_bf16_tA.mlir
deleted file mode 100644
index a0bf7e7..0000000
--- a/gemm/mlir/gemm_8000_4_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<8000x4xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x4xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x4xbf16>) -> tensor<8000x4xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<8000x4xbf16>) -> tensor<8000x4xbf16>
-    return %2 : tensor<8000x4xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_4_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_4_8192_f16_tA.mlir
deleted file mode 100644
index 4d5e8c5..0000000
--- a/gemm/mlir/gemm_8000_4_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x4xf16>) -> tensor<8000x4xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x4xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x4xf16>) -> tensor<8000x4xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x4xf16>) outs(%1 : tensor<8000x4xf16>) -> tensor<8000x4xf16>
-    return %2 : tensor<8000x4xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_8_5120_bf16_tA.mlir b/gemm/mlir/gemm_8000_8_5120_bf16_tA.mlir
deleted file mode 100644
index 5a7f7e8..0000000
--- a/gemm/mlir/gemm_8000_8_5120_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xbf16>, %arg1: tensor<5120x8xbf16>) -> tensor<8000x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x8xbf16>) -> tensor<8000x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xbf16>, tensor<5120x8xbf16>) outs(%1 : tensor<8000x8xbf16>) -> tensor<8000x8xbf16>
-    return %2 : tensor<8000x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_8_5120_f16_tA.mlir b/gemm/mlir/gemm_8000_8_5120_f16_tA.mlir
deleted file mode 100644
index 5552aa7..0000000
--- a/gemm/mlir/gemm_8000_8_5120_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<5120x8000xf16>, %arg1: tensor<5120x8xf16>) -> tensor<8000x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x8xf16>) -> tensor<8000x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<5120x8000xf16>, tensor<5120x8xf16>) outs(%1 : tensor<8000x8xf16>) -> tensor<8000x8xf16>
-    return %2 : tensor<8000x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_8000_8_8192_bf16_tA.mlir
deleted file mode 100644
index a79aac9..0000000
--- a/gemm/mlir/gemm_8000_8_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<8000x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8000x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8000x8xbf16>) -> tensor<8000x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<8000x8xbf16>) -> tensor<8000x8xbf16>
-    return %2 : tensor<8000x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8000_8_8192_f16_tA.mlir b/gemm/mlir/gemm_8000_8_8192_f16_tA.mlir
deleted file mode 100644
index 4f1ed4c..0000000
--- a/gemm/mlir/gemm_8000_8_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8000xf16>, %arg1: tensor<8192x8xf16>) -> tensor<8000x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8000x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8000x8xf16>) -> tensor<8000x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8000xf16>, tensor<8192x8xf16>) outs(%1 : tensor<8000x8xf16>) -> tensor<8000x8xf16>
-    return %2 : tensor<8000x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_1024_bf16_tA.mlir
deleted file mode 100644
index 665dc34..0000000
--- a/gemm/mlir/gemm_8192_16_1024_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x16xbf16>) -> tensor<8192x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    return %2 : tensor<8192x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_16_1024_f16_tA.mlir
deleted file mode 100644
index b37806e..0000000
--- a/gemm/mlir/gemm_8192_16_1024_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x16xf16>) -> tensor<8192x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    return %2 : tensor<8192x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_14336_bf16_tA.mlir
deleted file mode 100644
index 5a46495..0000000
--- a/gemm/mlir/gemm_8192_16_14336_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x16xbf16>) -> tensor<8192x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    return %2 : tensor<8192x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_16_14336_f16_tA.mlir
deleted file mode 100644
index e9fab90..0000000
--- a/gemm/mlir/gemm_8192_16_14336_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x16xf16>) -> tensor<8192x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    return %2 : tensor<8192x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_2048_bf16_tA.mlir
deleted file mode 100644
index d5390e1..0000000
--- a/gemm/mlir/gemm_8192_16_2048_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x16xbf16>) -> tensor<8192x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    return %2 : tensor<8192x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_16_2048_f16_tA.mlir
deleted file mode 100644
index 899b396..0000000
--- a/gemm/mlir/gemm_8192_16_2048_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x16xf16>) -> tensor<8192x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    return %2 : tensor<8192x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_28672_bf16_tA.mlir
deleted file mode 100644
index a052c9d..0000000
--- a/gemm/mlir/gemm_8192_16_28672_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x16xbf16>) -> tensor<8192x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    return %2 : tensor<8192x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_16_28672_f16_tA.mlir
deleted file mode 100644
index 937f6ca..0000000
--- a/gemm/mlir/gemm_8192_16_28672_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x16xf16>) -> tensor<8192x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    return %2 : tensor<8192x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_3584_bf16_tA.mlir
deleted file mode 100644
index 956f501..0000000
--- a/gemm/mlir/gemm_8192_16_3584_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x16xbf16>) -> tensor<8192x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    return %2 : tensor<8192x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_16_3584_f16_tA.mlir
deleted file mode 100644
index 5182fe9..0000000
--- a/gemm/mlir/gemm_8192_16_3584_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x16xf16>) -> tensor<8192x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    return %2 : tensor<8192x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_4096_bf16_tA.mlir
deleted file mode 100644
index ae4ae02..0000000
--- a/gemm/mlir/gemm_8192_16_4096_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x16xbf16>) -> tensor<8192x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    return %2 : tensor<8192x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_16_4096_f16_tA.mlir
deleted file mode 100644
index 1510fbb..0000000
--- a/gemm/mlir/gemm_8192_16_4096_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x16xf16>) -> tensor<8192x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    return %2 : tensor<8192x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_7168_bf16_tA.mlir
deleted file mode 100644
index ce83d31..0000000
--- a/gemm/mlir/gemm_8192_16_7168_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x16xbf16>) -> tensor<8192x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    return %2 : tensor<8192x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_16_7168_f16_tA.mlir
deleted file mode 100644
index 63da0a2..0000000
--- a/gemm/mlir/gemm_8192_16_7168_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x16xf16>) -> tensor<8192x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    return %2 : tensor<8192x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_16_8192_bf16_tA.mlir
deleted file mode 100644
index b269adb..0000000
--- a/gemm/mlir/gemm_8192_16_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x16xbf16>) -> tensor<8192x16xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x16xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x16xbf16>) outs(%1 : tensor<8192x16xbf16>) -> tensor<8192x16xbf16>
-    return %2 : tensor<8192x16xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_16_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_16_8192_f16_tA.mlir
deleted file mode 100644
index a00d5ed..0000000
--- a/gemm/mlir/gemm_8192_16_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x16xf16>) -> tensor<8192x16xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x16xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x16xf16>) outs(%1 : tensor<8192x16xf16>) -> tensor<8192x16xf16>
-    return %2 : tensor<8192x16xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_1024_bf16_tA.mlir
deleted file mode 100644
index 4dbf154..0000000
--- a/gemm/mlir/gemm_8192_1_1024_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x1xbf16>) -> tensor<8192x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    return %2 : tensor<8192x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_1_1024_f16_tA.mlir
deleted file mode 100644
index cfe91b7..0000000
--- a/gemm/mlir/gemm_8192_1_1024_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x1xf16>) -> tensor<8192x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    return %2 : tensor<8192x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_14336_bf16_tA.mlir
deleted file mode 100644
index bb3ee2c..0000000
--- a/gemm/mlir/gemm_8192_1_14336_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x1xbf16>) -> tensor<8192x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    return %2 : tensor<8192x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_1_14336_f16_tA.mlir
deleted file mode 100644
index d1bf657..0000000
--- a/gemm/mlir/gemm_8192_1_14336_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x1xf16>) -> tensor<8192x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    return %2 : tensor<8192x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_2048_bf16_tA.mlir
deleted file mode 100644
index dcb9440..0000000
--- a/gemm/mlir/gemm_8192_1_2048_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x1xbf16>) -> tensor<8192x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    return %2 : tensor<8192x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_1_2048_f16_tA.mlir
deleted file mode 100644
index bb68ab7..0000000
--- a/gemm/mlir/gemm_8192_1_2048_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x1xf16>) -> tensor<8192x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    return %2 : tensor<8192x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_28672_bf16_tA.mlir
deleted file mode 100644
index 9d8ba4c..0000000
--- a/gemm/mlir/gemm_8192_1_28672_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x1xbf16>) -> tensor<8192x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    return %2 : tensor<8192x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_1_28672_f16_tA.mlir
deleted file mode 100644
index 7f24658..0000000
--- a/gemm/mlir/gemm_8192_1_28672_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x1xf16>) -> tensor<8192x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    return %2 : tensor<8192x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_3584_bf16_tA.mlir
deleted file mode 100644
index db96ed7..0000000
--- a/gemm/mlir/gemm_8192_1_3584_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x1xbf16>) -> tensor<8192x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    return %2 : tensor<8192x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_1_3584_f16_tA.mlir
deleted file mode 100644
index 0c6617c..0000000
--- a/gemm/mlir/gemm_8192_1_3584_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x1xf16>) -> tensor<8192x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    return %2 : tensor<8192x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_4096_bf16_tA.mlir
deleted file mode 100644
index fef2bfe..0000000
--- a/gemm/mlir/gemm_8192_1_4096_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x1xbf16>) -> tensor<8192x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    return %2 : tensor<8192x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_1_4096_f16_tA.mlir
deleted file mode 100644
index 76cdec0..0000000
--- a/gemm/mlir/gemm_8192_1_4096_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x1xf16>) -> tensor<8192x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    return %2 : tensor<8192x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_7168_bf16_tA.mlir
deleted file mode 100644
index 83f7005..0000000
--- a/gemm/mlir/gemm_8192_1_7168_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x1xbf16>) -> tensor<8192x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    return %2 : tensor<8192x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_1_7168_f16_tA.mlir
deleted file mode 100644
index 1eedb82..0000000
--- a/gemm/mlir/gemm_8192_1_7168_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x1xf16>) -> tensor<8192x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    return %2 : tensor<8192x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_1_8192_bf16_tA.mlir
deleted file mode 100644
index 2ea5414..0000000
--- a/gemm/mlir/gemm_8192_1_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x1xbf16>) -> tensor<8192x1xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x1xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x1xbf16>) outs(%1 : tensor<8192x1xbf16>) -> tensor<8192x1xbf16>
-    return %2 : tensor<8192x1xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_1_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_1_8192_f16_tA.mlir
deleted file mode 100644
index ea88565..0000000
--- a/gemm/mlir/gemm_8192_1_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x1xf16>) -> tensor<8192x1xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x1xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x1xf16>) outs(%1 : tensor<8192x1xf16>) -> tensor<8192x1xf16>
-    return %2 : tensor<8192x1xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2048_1024_f16.mlir b/gemm/mlir/gemm_8192_2048_1024_f16.mlir
deleted file mode 100644
index f42dfba..0000000
--- a/gemm/mlir/gemm_8192_2048_1024_f16.mlir
+++ /dev/null
@@ -1,9 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<8192x1024xf16>, %arg1: tensor<1024x2048xf16>) -> tensor<8192x2048xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2048xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16>
-    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x1024xf16>, tensor<1024x2048xf16>) outs(%1 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16>
-    return %2 : tensor<8192x2048xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2048_65536_f16.mlir b/gemm/mlir/gemm_8192_2048_65536_f16.mlir
deleted file mode 100644
index 59ae1bc..0000000
--- a/gemm/mlir/gemm_8192_2048_65536_f16.mlir
+++ /dev/null
@@ -1,9 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<8192x65536xf16>, %arg1: tensor<65536x2048xf16>) -> tensor<8192x2048xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2048xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16>
-    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x65536xf16>, tensor<65536x2048xf16>) outs(%1 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16>
-    return %2 : tensor<8192x2048xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2048_8192_f16.mlir b/gemm/mlir/gemm_8192_2048_8192_f16.mlir
deleted file mode 100644
index 8e0a0a6..0000000
--- a/gemm/mlir/gemm_8192_2048_8192_f16.mlir
+++ /dev/null
@@ -1,9 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x2048xf16>) -> tensor<8192x2048xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2048xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16>
-    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x2048xf16>) outs(%1 : tensor<8192x2048xf16>) -> tensor<8192x2048xf16>
-    return %2 : tensor<8192x2048xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_1024_bf16_tA.mlir
deleted file mode 100644
index 7b07b58..0000000
--- a/gemm/mlir/gemm_8192_2_1024_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x2xbf16>) -> tensor<8192x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    return %2 : tensor<8192x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_2_1024_f16_tA.mlir
deleted file mode 100644
index fad9863..0000000
--- a/gemm/mlir/gemm_8192_2_1024_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x2xf16>) -> tensor<8192x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    return %2 : tensor<8192x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_14336_bf16_tA.mlir
deleted file mode 100644
index c3ba4e7..0000000
--- a/gemm/mlir/gemm_8192_2_14336_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x2xbf16>) -> tensor<8192x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    return %2 : tensor<8192x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_2_14336_f16_tA.mlir
deleted file mode 100644
index ac6a2f1..0000000
--- a/gemm/mlir/gemm_8192_2_14336_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x2xf16>) -> tensor<8192x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    return %2 : tensor<8192x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_2048_bf16_tA.mlir
deleted file mode 100644
index 154421f..0000000
--- a/gemm/mlir/gemm_8192_2_2048_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x2xbf16>) -> tensor<8192x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    return %2 : tensor<8192x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_2_2048_f16_tA.mlir
deleted file mode 100644
index 531fb51..0000000
--- a/gemm/mlir/gemm_8192_2_2048_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x2xf16>) -> tensor<8192x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    return %2 : tensor<8192x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_28672_bf16_tA.mlir
deleted file mode 100644
index a26e286..0000000
--- a/gemm/mlir/gemm_8192_2_28672_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x2xbf16>) -> tensor<8192x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    return %2 : tensor<8192x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_2_28672_f16_tA.mlir
deleted file mode 100644
index fa64b0d..0000000
--- a/gemm/mlir/gemm_8192_2_28672_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x2xf16>) -> tensor<8192x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    return %2 : tensor<8192x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_3584_bf16_tA.mlir
deleted file mode 100644
index 6bfc9dc..0000000
--- a/gemm/mlir/gemm_8192_2_3584_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x2xbf16>) -> tensor<8192x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    return %2 : tensor<8192x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_2_3584_f16_tA.mlir
deleted file mode 100644
index 5891198..0000000
--- a/gemm/mlir/gemm_8192_2_3584_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x2xf16>) -> tensor<8192x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    return %2 : tensor<8192x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_4096_bf16_tA.mlir
deleted file mode 100644
index 5bad65e..0000000
--- a/gemm/mlir/gemm_8192_2_4096_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x2xbf16>) -> tensor<8192x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    return %2 : tensor<8192x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_2_4096_f16_tA.mlir
deleted file mode 100644
index 2ff588d..0000000
--- a/gemm/mlir/gemm_8192_2_4096_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x2xf16>) -> tensor<8192x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    return %2 : tensor<8192x2xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_7168_bf16_tA.mlir
deleted file mode 100644
index 6017644..0000000
--- a/gemm/mlir/gemm_8192_2_7168_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x2xbf16>) -> tensor<8192x2xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x2xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16>
-    return %2 : tensor<8192x2xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_2_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_2_7168_f16_tA.mlir
deleted file mode 100644
index ada61d1..0000000
--- a/gemm/mlir/gemm_8192_2_7168_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x2xf16>) -> tensor<8192x2xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x2xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16>
-    %2 = linalg.matmul_transpose_a
ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> - return %2 : tensor<8192x2xf16> - } -} diff --git a/gemm/mlir/gemm_8192_2_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_2_8192_bf16_tA.mlir deleted file mode 100644 index 4a5c210..0000000 --- a/gemm/mlir/gemm_8192_2_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x2xbf16>) -> tensor<8192x2xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x2xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x2xbf16>) outs(%1 : tensor<8192x2xbf16>) -> tensor<8192x2xbf16> - return %2 : tensor<8192x2xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_2_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_2_8192_f16_tA.mlir deleted file mode 100644 index 070a5ba..0000000 --- a/gemm/mlir/gemm_8192_2_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x2xf16>) -> tensor<8192x2xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x2xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x2xf16>) -> tensor<8192x2xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x2xf16>) outs(%1 : tensor<8192x2xf16>) -> tensor<8192x2xf16> - return %2 : tensor<8192x2xf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_1024_bf16_tA.mlir deleted file mode 100644 index 852e767..0000000 --- a/gemm/mlir/gemm_8192_32_1024_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x32xbf16>) -> tensor<8192x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - return %2 : tensor<8192x32xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_32_1024_f16_tA.mlir deleted file mode 100644 index c5f5846..0000000 --- a/gemm/mlir/gemm_8192_32_1024_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x32xf16>) -> tensor<8192x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - return %2 : tensor<8192x32xf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_14336_bf16_tA.mlir deleted file mode 100644 index 7f3f684..0000000 --- a/gemm/mlir/gemm_8192_32_14336_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x32xbf16>) -> tensor<8192x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : 
tensor<14336x8192xbf16>, tensor<14336x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - return %2 : tensor<8192x32xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_32_14336_f16_tA.mlir deleted file mode 100644 index 4670ddd..0000000 --- a/gemm/mlir/gemm_8192_32_14336_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x32xf16>) -> tensor<8192x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - return %2 : tensor<8192x32xf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_2048_bf16_tA.mlir deleted file mode 100644 index 9b7cb18..0000000 --- a/gemm/mlir/gemm_8192_32_2048_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x32xbf16>) -> tensor<8192x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - return %2 : tensor<8192x32xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_32_2048_f16_tA.mlir deleted file mode 100644 index ad5ff98..0000000 --- a/gemm/mlir/gemm_8192_32_2048_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x32xf16>) -> tensor<8192x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - return %2 : tensor<8192x32xf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_28672_bf16_tA.mlir deleted file mode 100644 index 0e75daf..0000000 --- a/gemm/mlir/gemm_8192_32_28672_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x32xbf16>) -> tensor<8192x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - return %2 : tensor<8192x32xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_32_28672_f16_tA.mlir deleted file mode 100644 index 1809761..0000000 --- a/gemm/mlir/gemm_8192_32_28672_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x32xf16>) -> tensor<8192x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, 
%arg1 : tensor<28672x8192xf16>, tensor<28672x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - return %2 : tensor<8192x32xf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_3584_bf16_tA.mlir deleted file mode 100644 index ddbba11..0000000 --- a/gemm/mlir/gemm_8192_32_3584_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x32xbf16>) -> tensor<8192x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - return %2 : tensor<8192x32xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_32_3584_f16_tA.mlir deleted file mode 100644 index 45b7ca4..0000000 --- a/gemm/mlir/gemm_8192_32_3584_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x32xf16>) -> tensor<8192x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - return %2 : tensor<8192x32xf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_4096_bf16_tA.mlir deleted file mode 100644 index 7134984..0000000 --- a/gemm/mlir/gemm_8192_32_4096_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x32xbf16>) -> tensor<8192x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - return %2 : tensor<8192x32xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_32_4096_f16_tA.mlir deleted file mode 100644 index 7df2c92..0000000 --- a/gemm/mlir/gemm_8192_32_4096_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x32xf16>) -> tensor<8192x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - return %2 : tensor<8192x32xf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_7168_bf16_tA.mlir deleted file mode 100644 index 672f613..0000000 --- a/gemm/mlir/gemm_8192_32_7168_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x32xbf16>) -> tensor<8192x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : 
tensor<7168x8192xbf16>, tensor<7168x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - return %2 : tensor<8192x32xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_32_7168_f16_tA.mlir deleted file mode 100644 index aa39da1..0000000 --- a/gemm/mlir/gemm_8192_32_7168_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x32xf16>) -> tensor<8192x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - return %2 : tensor<8192x32xf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_32_8192_bf16_tA.mlir deleted file mode 100644 index beeb9f6..0000000 --- a/gemm/mlir/gemm_8192_32_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x32xbf16>) -> tensor<8192x32xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x32xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x32xbf16>) outs(%1 : tensor<8192x32xbf16>) -> tensor<8192x32xbf16> - return %2 : tensor<8192x32xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_32_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_32_8192_f16_tA.mlir deleted file mode 100644 index 538b2a5..0000000 --- a/gemm/mlir/gemm_8192_32_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x32xf16>) -> tensor<8192x32xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x32xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x32xf16>) outs(%1 : tensor<8192x32xf16>) -> tensor<8192x32xf16> - return %2 : tensor<8192x32xf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_1024_bf16_tA.mlir deleted file mode 100644 index dadcc8c..0000000 --- a/gemm/mlir/gemm_8192_4_1024_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x4xbf16>) -> tensor<8192x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - return %2 : tensor<8192x4xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_4_1024_f16_tA.mlir deleted file mode 100644 index ae01271..0000000 --- a/gemm/mlir/gemm_8192_4_1024_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x4xf16>) -> tensor<8192x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, 
tensor<1024x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - return %2 : tensor<8192x4xf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_14336_bf16_tA.mlir deleted file mode 100644 index a91f9bf..0000000 --- a/gemm/mlir/gemm_8192_4_14336_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x4xbf16>) -> tensor<8192x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - return %2 : tensor<8192x4xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_4_14336_f16_tA.mlir deleted file mode 100644 index 925676f..0000000 --- a/gemm/mlir/gemm_8192_4_14336_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x4xf16>) -> tensor<8192x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - return %2 : tensor<8192x4xf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_2048_bf16_tA.mlir deleted file mode 100644 index 63f589c..0000000 --- a/gemm/mlir/gemm_8192_4_2048_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x4xbf16>) -> tensor<8192x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - return %2 : tensor<8192x4xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_4_2048_f16_tA.mlir deleted file mode 100644 index 043dba2..0000000 --- a/gemm/mlir/gemm_8192_4_2048_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x4xf16>) -> tensor<8192x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - return %2 : tensor<8192x4xf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_28672_bf16_tA.mlir deleted file mode 100644 index ffcc49d..0000000 --- a/gemm/mlir/gemm_8192_4_28672_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x4xbf16>) -> tensor<8192x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x4xbf16>) outs(%1 : 
tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - return %2 : tensor<8192x4xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_4_28672_f16_tA.mlir deleted file mode 100644 index 3e66079..0000000 --- a/gemm/mlir/gemm_8192_4_28672_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x4xf16>) -> tensor<8192x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - return %2 : tensor<8192x4xf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_3584_bf16_tA.mlir deleted file mode 100644 index 13ea765..0000000 --- a/gemm/mlir/gemm_8192_4_3584_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x4xbf16>) -> tensor<8192x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - return %2 : tensor<8192x4xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_4_3584_f16_tA.mlir deleted file mode 100644 index b3a4aca..0000000 --- a/gemm/mlir/gemm_8192_4_3584_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x4xf16>) -> tensor<8192x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - return %2 : tensor<8192x4xf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_4096_bf16_tA.mlir deleted file mode 100644 index 111e1b9..0000000 --- a/gemm/mlir/gemm_8192_4_4096_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x4xbf16>) -> tensor<8192x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - return %2 : tensor<8192x4xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_4_4096_f16_tA.mlir deleted file mode 100644 index e9059bf..0000000 --- a/gemm/mlir/gemm_8192_4_4096_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x4xf16>) -> tensor<8192x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - return %2 : 
tensor<8192x4xf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_7168_bf16_tA.mlir deleted file mode 100644 index 7c140f3..0000000 --- a/gemm/mlir/gemm_8192_4_7168_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x4xbf16>) -> tensor<8192x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - return %2 : tensor<8192x4xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_4_7168_f16_tA.mlir deleted file mode 100644 index 81e98c6..0000000 --- a/gemm/mlir/gemm_8192_4_7168_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x4xf16>) -> tensor<8192x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - return %2 : tensor<8192x4xf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_4_8192_bf16_tA.mlir deleted file mode 100644 index 7ae31d7..0000000 --- a/gemm/mlir/gemm_8192_4_8192_bf16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x4xbf16>) -> tensor<8192x4xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x4xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x4xbf16>) outs(%1 : tensor<8192x4xbf16>) -> tensor<8192x4xbf16> - return %2 : tensor<8192x4xbf16> - } -} diff --git a/gemm/mlir/gemm_8192_4_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_4_8192_f16_tA.mlir deleted file mode 100644 index 2378c0f..0000000 --- a/gemm/mlir/gemm_8192_4_8192_f16_tA.mlir +++ /dev/null @@ -1,10 +0,0 @@ - -module { - func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x4xf16>) -> tensor<8192x4xf16> { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<8192x4xf16> - %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x4xf16>) outs(%1 : tensor<8192x4xf16>) -> tensor<8192x4xf16> - return %2 : tensor<8192x4xf16> - } -} diff --git a/gemm/mlir/gemm_8192_5120_640_bf16.mlir b/gemm/mlir/gemm_8192_5120_640_bf16.mlir deleted file mode 100644 index 5f59098..0000000 --- a/gemm/mlir/gemm_8192_5120_640_bf16.mlir +++ /dev/null @@ -1,9 +0,0 @@ -module { - func.func @main(%arg0: tensor<8192x640xbf16>, %arg1: tensor<640x5120xbf16>) -> tensor<8192x5120xbf16> { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<8192x5120xbf16> - %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16> - %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x640xbf16>, tensor<640x5120xbf16>) outs(%1 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16> - return %2 : tensor<8192x5120xbf16> - } -} diff --git 
deleted file mode 100644
index 177684d..0000000
--- a/gemm/mlir/gemm_8192_5120_640_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<640x8192xbf16>, %arg1: tensor<640x5120xbf16>) -> tensor<8192x5120xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x5120xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x8192xbf16>, tensor<640x5120xbf16>) outs(%1 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
-    return %2 : tensor<8192x5120xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_5120_640_bf16_tB.mlir b/gemm/mlir/gemm_8192_5120_640_bf16_tB.mlir
deleted file mode 100644
index 629f56f..0000000
--- a/gemm/mlir/gemm_8192_5120_640_bf16_tB.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x640xbf16>, %arg1: tensor<5120x640xbf16>) -> tensor<8192x5120xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x5120xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
-    %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<8192x640xbf16>, tensor<5120x640xbf16>) outs(%1 : tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16>
-    return %2 : tensor<8192x5120xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_5120_640_f16.mlir b/gemm/mlir/gemm_8192_5120_640_f16.mlir
deleted file mode 100644
index 52be98c..0000000
--- a/gemm/mlir/gemm_8192_5120_640_f16.mlir
+++ /dev/null
@@ -1,9 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<8192x640xf16>, %arg1: tensor<640x5120xf16>) -> tensor<8192x5120xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x5120xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
-    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x640xf16>, tensor<640x5120xf16>) outs(%1 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
-    return %2 : tensor<8192x5120xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_5120_640_f16_tA.mlir b/gemm/mlir/gemm_8192_5120_640_f16_tA.mlir
deleted file mode 100644
index 97875f6..0000000
--- a/gemm/mlir/gemm_8192_5120_640_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<640x8192xf16>, %arg1: tensor<640x5120xf16>) -> tensor<8192x5120xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x5120xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<640x8192xf16>, tensor<640x5120xf16>) outs(%1 : tensor<8192x5120xf16>) -> tensor<8192x5120xf16>
-    return %2 : tensor<8192x5120xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_5120_640_f16_tB.mlir b/gemm/mlir/gemm_8192_5120_640_f16_tB.mlir
deleted file mode 100644
index c512215..0000000
--- a/gemm/mlir/gemm_8192_5120_640_f16_tB.mlir
+++ /dev/null
@@ -1,145 +0,0 @@
-#translation = #iree_codegen.translation_info
-module attributes {transform.with_named_sequence} {
-  stream.executable private @gemm {
-    stream.executable.export public @gemm workgroups() -> (index, index, index) {
-      %c128 = arith.constant 128 : index
-      %c80 = arith.constant 80 : index
-      %c1 = arith.constant 1 : index
-      stream.return %c128, %c80, %c1 : index, index, index
-    }
-    builtin.module {
-      func.func @gemm(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) attributes {translation_info = #translation} {
-        %c19 = arith.constant 19 : index
-        %c18 = arith.constant 18 : index
-        %c17 = arith.constant 17 : index
-        %c3 = arith.constant 3 : index
-        %c2 = arith.constant 2 : index
-        %c16 = arith.constant 16 : index
-        %c8 = arith.constant 8 : index
-        %c4 = arith.constant 4 : index
-        %c32 = arith.constant 32 : index
-        %c64 = arith.constant 64 : index
-        %c1 = arith.constant 1 : index
-        %c20 = arith.constant 20 : index
-        %c0 = arith.constant 0 : index
-        %cst = arith.constant dense<0.000000e+00> : vector<4xf32>
-        %workgroup_id_0 = stream.dispatch.workgroup.id[0] : index
-        %workgroup_id_1 = stream.dispatch.workgroup.id[1] : index
-        %thread_id_x = gpu.thread_id x
-        %thread_id_y = gpu.thread_id y
-        %alloc = memref.alloc() : memref<64x32xf16, #gpu.address_space<workgroup>>
-        %alloc_0 = memref.alloc() : memref<64x32xf16, #gpu.address_space<workgroup>>
-        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> memref<8192x640xf16, strided<[640, 1], offset: ?>>
-        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> memref<5120x640xf16, strided<[640, 1], offset: ?>>
-        %2 = arith.muli %workgroup_id_0, %c64 : index
-        %3 = arith.muli %thread_id_y, %c32 : index
-        %4 = arith.divsi %thread_id_x, %c4 : index
-        %5 = arith.addi %4, %3 : index
-        %6 = arith.remsi %5, %c64 : index
-        %7 = arith.addi %6, %2 : index
-        %8 = arith.remsi %thread_id_x, %c4 : index
-        %9 = arith.muli %8, %c8 : index
-        %10 = arith.divsi %thread_id_x, %c64 : index
-        %11 = arith.muli %10, %c32 : index
-        %12 = arith.remsi %thread_id_x, %c16 : index
-        %13 = arith.addi %12, %11 : index
-        %14 = arith.remsi %thread_id_x, %c64 : index
-        %15 = arith.divsi %14, %c16 : index
-        %16 = arith.muli %15, %c4 : index
-        %17 = arith.addi %16, %c16 : index
-        %18 = arith.addi %13, %c16 : index
-        %19 = arith.muli %workgroup_id_1, %c64 : index
-        %20 = arith.addi %6, %19 : index
-        %21 = arith.addi %12, %3 : index
-        %22 = arith.addi %21, %c16 : index
-        %23:4 = scf.for %arg3 = %c0 to %c20 step %c1 iter_args(%arg4 = %cst, %arg5 = %cst, %arg6 = %cst, %arg7 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
-          %62 = arith.muli %arg3, %c32 : index
-          %63 = arith.addi %62, %9 : index
-          %64 = vector.load %0[%7, %63] : memref<8192x640xf16, strided<[640, 1], offset: ?>>, vector<8xf16>
-          vector.store %64, %alloc[%6, %9] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<8xf16>
-          amdgpu.lds_barrier
-          %65 = vector.load %alloc[%13, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16>
-          %66 = vector.load %alloc[%13, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16>
-          %67 = vector.load %alloc[%18, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16>
-          %68 = vector.load %alloc[%18, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16>
-          %69 = vector.load %1[%20, %63] : memref<5120x640xf16, strided<[640, 1], offset: ?>>, vector<8xf16>
-          amdgpu.lds_barrier
-          vector.store %69, %alloc_0[%6, %9] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<8xf16>
-          amdgpu.lds_barrier
-          %70 = vector.load %alloc_0[%21, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16>
-          %71 = vector.load %alloc_0[%21, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16>
-          %72 = vector.load %alloc_0[%22, %16] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16>
-          %73 = vector.load %alloc_0[%22, %17] : memref<64x32xf16, #gpu.address_space<workgroup>>, vector<4xf16>
-          %74 = amdgpu.mfma %65 * %70 + %arg4 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %75 = amdgpu.mfma %66 * %71 + %74 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %76 = amdgpu.mfma %67 * %72 + %arg7 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %77 = amdgpu.mfma %68 * %73 + %76 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %78 = amdgpu.mfma %67 * %70 + %arg6 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %79 = amdgpu.mfma %68 * %71 + %78 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %80 = amdgpu.mfma %65 * %72 + %arg5 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          %81 = amdgpu.mfma %66 * %73 + %80 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
-          scf.yield %75, %81, %79, %77 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
-        }
-        %24 = vector.extract_strided_slice %23#0 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %25 = stream.binding.subspan %arg2[%c0] : !stream.binding -> memref<8192x5120xf32, strided<[5120, 1], offset: ?>>
-        %26 = arith.remsi %thread_id_x, %c64 : index
-        %27 = arith.divsi %26, %c16 : index
-        %28 = arith.muli %27, %c4 : index
-        %29 = arith.divsi %thread_id_x, %c64 : index
-        %30 = arith.muli %29, %c32 : index
-        %31 = arith.muli %workgroup_id_0, %c64 : index
-        %32 = arith.addi %31, %30 : index
-        %33 = arith.addi %32, %28 : index
-        %34 = arith.muli %thread_id_y, %c32 : index
-        %35 = arith.muli %workgroup_id_1, %c64 : index
-        %36 = arith.remsi %thread_id_x, %c16 : index
-        %37 = arith.addi %36, %35 : index
-        %38 = arith.addi %37, %34 : index
-        vector.store %24, %25[%33, %38] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %39 = vector.extract_strided_slice %23#0 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %40 = arith.addi %33, %c1 : index
-        vector.store %39, %25[%40, %38] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %41 = vector.extract_strided_slice %23#0 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %42 = arith.addi %33, %c2 : index
-        vector.store %41, %25[%42, %38] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %43 = vector.extract_strided_slice %23#0 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %44 = arith.addi %33, %c3 : index
-        vector.store %43, %25[%44, %38] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %45 = vector.extract_strided_slice %23#3 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %46 = arith.addi %33, %c16 : index
-        %47 = arith.addi %38, %c16 : index
-        vector.store %45, %25[%46, %47] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %48 = vector.extract_strided_slice %23#3 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %49 = arith.addi %33, %c17 : index
-        vector.store %48, %25[%49, %47] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %50 = vector.extract_strided_slice %23#3 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %51 = arith.addi %33, %c18 : index
-        vector.store %50, %25[%51, %47] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %52 = vector.extract_strided_slice %23#3 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        %53 = arith.addi %33, %c19 : index
-        vector.store %52, %25[%53, %47] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %54 = vector.extract_strided_slice %23#2 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %54, %25[%46, %38] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %55 = vector.extract_strided_slice %23#2 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %55, %25[%49, %38] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %56 = vector.extract_strided_slice %23#2 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %56, %25[%51, %38] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %57 = vector.extract_strided_slice %23#2 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %57, %25[%53, %38] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %58 = vector.extract_strided_slice %23#1 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %58, %25[%33, %47] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %59 = vector.extract_strided_slice %23#1 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %59, %25[%40, %47] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %60 = vector.extract_strided_slice %23#1 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %60, %25[%42, %47] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        %61 = vector.extract_strided_slice %23#1 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
-        vector.store %61, %25[%44, %47] : memref<8192x5120xf32, strided<[5120, 1], offset: ?>>, vector<1xf32>
-        return
-      }
-    }
-  }
-  func.func @isolated_benchmark(%arg0: tensor<8192x640xf16>, %arg1: tensor<5120x640xf16>) -> tensor<8192x5120xf32> {
-    %0 = flow.dispatch @gemm::@gemm(%arg0, %arg1) : (tensor<8192x640xf16>, tensor<5120x640xf16>) -> tensor<8192x5120xf32>
-    return %0 : tensor<8192x5120xf32>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8192_1024_f16.mlir b/gemm/mlir/gemm_8192_8192_1024_f16.mlir
deleted file mode 100644
index 8c5ec54..0000000
--- a/gemm/mlir/gemm_8192_8192_1024_f16.mlir
+++ /dev/null
@@ -1,9 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<8192x1024xf16>, %arg1: tensor<1024x8192xf16>) -> tensor<8192x8192xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8192xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
-    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x1024xf16>, tensor<1024x8192xf16>) outs(%1 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
-    return %2 : tensor<8192x8192xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8192_65536_f16.mlir b/gemm/mlir/gemm_8192_8192_65536_f16.mlir
deleted file mode 100644
index 04bdc92..0000000
--- a/gemm/mlir/gemm_8192_8192_65536_f16.mlir
+++ /dev/null
@@ -1,9 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<8192x65536xf16>, %arg1: tensor<65536x8192xf16>) -> tensor<8192x8192xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8192xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
-    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x65536xf16>, tensor<65536x8192xf16>) outs(%1 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
-    return %2 : tensor<8192x8192xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8192_8192_f16.mlir b/gemm/mlir/gemm_8192_8192_8192_f16.mlir
deleted file mode 100644
index 232fdb7..0000000
--- a/gemm/mlir/gemm_8192_8192_8192_f16.mlir
+++ /dev/null
@@ -1,9 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x8192xf16>) -> tensor<8192x8192xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8192xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
-    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x8192xf16>) outs(%1 : tensor<8192x8192xf16>) -> tensor<8192x8192xf16>
-    return %2 : tensor<8192x8192xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_1024_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_1024_bf16_tA.mlir
deleted file mode 100644
index 90fbed3..0000000
--- a/gemm/mlir/gemm_8192_8_1024_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<1024x8192xbf16>, %arg1: tensor<1024x8xbf16>) -> tensor<8192x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xbf16>, tensor<1024x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    return %2 : tensor<8192x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_1024_f16_tA.mlir b/gemm/mlir/gemm_8192_8_1024_f16_tA.mlir
deleted file mode 100644
index b3d0f26..0000000
--- a/gemm/mlir/gemm_8192_8_1024_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<1024x8192xf16>, %arg1: tensor<1024x8xf16>) -> tensor<8192x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<1024x8192xf16>, tensor<1024x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    return %2 : tensor<8192x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_14336_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_14336_bf16_tA.mlir
deleted file mode 100644
index 17fe727..0000000
--- a/gemm/mlir/gemm_8192_8_14336_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<14336x8192xbf16>, %arg1: tensor<14336x8xbf16>) -> tensor<8192x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xbf16>, tensor<14336x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    return %2 : tensor<8192x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_14336_f16_tA.mlir b/gemm/mlir/gemm_8192_8_14336_f16_tA.mlir
deleted file mode 100644
index bbf21b1..0000000
--- a/gemm/mlir/gemm_8192_8_14336_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<14336x8192xf16>, %arg1: tensor<14336x8xf16>) -> tensor<8192x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<14336x8192xf16>, tensor<14336x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    return %2 : tensor<8192x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_2048_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_2048_bf16_tA.mlir
deleted file mode 100644
index d46ec59..0000000
--- a/gemm/mlir/gemm_8192_8_2048_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x8xbf16>) -> tensor<8192x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xbf16>, tensor<2048x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    return %2 : tensor<8192x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_2048_f16_tA.mlir b/gemm/mlir/gemm_8192_8_2048_f16_tA.mlir
deleted file mode 100644
index 30f757f..0000000
--- a/gemm/mlir/gemm_8192_8_2048_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<2048x8192xf16>, %arg1: tensor<2048x8xf16>) -> tensor<8192x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<2048x8192xf16>, tensor<2048x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    return %2 : tensor<8192x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_28672_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_28672_bf16_tA.mlir
deleted file mode 100644
index 7fdd508..0000000
--- a/gemm/mlir/gemm_8192_8_28672_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<28672x8192xbf16>, %arg1: tensor<28672x8xbf16>) -> tensor<8192x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xbf16>, tensor<28672x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    return %2 : tensor<8192x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_28672_f16_tA.mlir b/gemm/mlir/gemm_8192_8_28672_f16_tA.mlir
deleted file mode 100644
index aafb576..0000000
--- a/gemm/mlir/gemm_8192_8_28672_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<28672x8192xf16>, %arg1: tensor<28672x8xf16>) -> tensor<8192x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<28672x8192xf16>, tensor<28672x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    return %2 : tensor<8192x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_3584_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_3584_bf16_tA.mlir
deleted file mode 100644
index caa3522..0000000
--- a/gemm/mlir/gemm_8192_8_3584_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<3584x8192xbf16>, %arg1: tensor<3584x8xbf16>) -> tensor<8192x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xbf16>, tensor<3584x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    return %2 : tensor<8192x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_3584_f16_tA.mlir b/gemm/mlir/gemm_8192_8_3584_f16_tA.mlir
deleted file mode 100644
index 9964378..0000000
--- a/gemm/mlir/gemm_8192_8_3584_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<3584x8192xf16>, %arg1: tensor<3584x8xf16>) -> tensor<8192x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<3584x8192xf16>, tensor<3584x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    return %2 : tensor<8192x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_4096_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_4096_bf16_tA.mlir
deleted file mode 100644
index ed9262d..0000000
--- a/gemm/mlir/gemm_8192_8_4096_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<4096x8192xbf16>, %arg1: tensor<4096x8xbf16>) -> tensor<8192x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xbf16>, tensor<4096x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    return %2 : tensor<8192x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_4096_f16_tA.mlir b/gemm/mlir/gemm_8192_8_4096_f16_tA.mlir
deleted file mode 100644
index 0fca3dc..0000000
--- a/gemm/mlir/gemm_8192_8_4096_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<4096x8192xf16>, %arg1: tensor<4096x8xf16>) -> tensor<8192x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<4096x8192xf16>, tensor<4096x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    return %2 : tensor<8192x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_7168_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_7168_bf16_tA.mlir
deleted file mode 100644
index b7f68ff..0000000
--- a/gemm/mlir/gemm_8192_8_7168_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<7168x8192xbf16>, %arg1: tensor<7168x8xbf16>) -> tensor<8192x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xbf16>, tensor<7168x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    return %2 : tensor<8192x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_7168_f16_tA.mlir b/gemm/mlir/gemm_8192_8_7168_f16_tA.mlir
deleted file mode 100644
index c143d7f..0000000
--- a/gemm/mlir/gemm_8192_8_7168_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<7168x8192xf16>, %arg1: tensor<7168x8xf16>) -> tensor<8192x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<7168x8192xf16>, tensor<7168x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    return %2 : tensor<8192x8xf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_8192_bf16_tA.mlir b/gemm/mlir/gemm_8192_8_8192_bf16_tA.mlir
deleted file mode 100644
index be2e86d..0000000
--- a/gemm/mlir/gemm_8192_8_8192_bf16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8xbf16>) -> tensor<8192x8xbf16> {
-    %cst = arith.constant 0.000000e+00 : bf16
-    %0 = tensor.empty() : tensor<8192x8xbf16>
-    %1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xbf16>, tensor<8192x8xbf16>) outs(%1 : tensor<8192x8xbf16>) -> tensor<8192x8xbf16>
-    return %2 : tensor<8192x8xbf16>
-  }
-}
diff --git a/gemm/mlir/gemm_8192_8_8192_f16_tA.mlir b/gemm/mlir/gemm_8192_8_8192_f16_tA.mlir
deleted file mode 100644
index 62431ce..0000000
--- a/gemm/mlir/gemm_8192_8_8192_f16_tA.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-
-module {
-  func.func @main(%arg0: tensor<8192x8192xf16>, %arg1: tensor<8192x8xf16>) -> tensor<8192x8xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<8192x8xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<8192x8192xf16>, tensor<8192x8xf16>) outs(%1 : tensor<8192x8xf16>) -> tensor<8192x8xf16>
-    return %2 : tensor<8192x8xf16>
-  }
-}