[Executorch][llama] Renamed quantized_kv_cache to custom_kv_cache #10061

Merged · 2 commits · Apr 10, 2025
10 changes: 5 additions & 5 deletions examples/models/llama/TARGETS
@@ -108,7 +108,7 @@ runtime.python_library(
"source_transformation/pre_quantization.py",
"source_transformation/prune_vocab.py",
"source_transformation/quantize.py",
"source_transformation/quantized_kv_cache.py",
"source_transformation/custom_kv_cache.py",
"source_transformation/rms_norm.py",
"source_transformation/rope.py",
"source_transformation/sdpa.py",
@@ -208,9 +208,9 @@ runtime.python_library(
)

runtime.python_library(
name = "quantized_kv_cache",
name = "custom_kv_cache",
srcs = [
"source_transformation/quantized_kv_cache.py",
"source_transformation/custom_kv_cache.py",
],
_is_external_target = True,
visibility = ["//executorch/..."],
@@ -240,7 +240,7 @@ runtime.python_test(
"//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
],
deps = [
":quantized_kv_cache",
":custom_kv_cache",
"//caffe2:torch",
"//executorch/examples/models/llama:llama_transformer",
],
@@ -255,7 +255,7 @@ runtime.python_test(
"//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
],
deps = [
":quantized_kv_cache",
":custom_kv_cache",
":sdpa",
"//caffe2:torch",
"//executorch/examples/models/llama:llama_transformer",
8 changes: 4 additions & 4 deletions examples/models/llama/export_llama_lib.py
@@ -59,14 +59,14 @@
)

from .source_transformation.attention import replace_attention_to_attention_sha
from .source_transformation.custom_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
replace_kv_cache_with_quantized_kv_cache,
)
from .source_transformation.quantize import (
get_quant_embedding_transform,
get_quant_weight_transform,
)
from .source_transformation.quantized_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
replace_kv_cache_with_quantized_kv_cache,
)
from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm

from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis
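Note: only the import location changes in this file; `replace_kv_cache_with_custom_kv_cache` and `replace_kv_cache_with_quantized_kv_cache` keep their names. A minimal usage sketch, under the assumption that each helper takes the model module and returns the transformed module (the pattern the other source transformations in this directory follow):

```python
import torch.nn as nn

from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
    replace_kv_cache_with_custom_kv_cache,
    replace_kv_cache_with_quantized_kv_cache,
)


def apply_kv_cache_transform(model: nn.Module, quantize: bool = True) -> nn.Module:
    # Swap the eager KVCache modules for the custom (optionally int8-quantized)
    # variants. The take-model/return-model signature is an assumption here.
    if quantize:
        return replace_kv_cache_with_quantized_kv_cache(model)
    return replace_kv_cache_with_custom_kv_cache(model)
```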
@@ -10,7 +10,7 @@

from executorch.examples.models.llama.attention import KVCache

from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
QuantizedCacheType,
QuantizedKVCache,
)
@@ -10,7 +10,7 @@

from executorch.examples.models.llama.attention import KVCache

from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
CustomKVCache,
QuantizedCacheType,
QuantizedKVCache,
6 changes: 3 additions & 3 deletions examples/models/llava/export_llava.py
@@ -20,13 +20,13 @@
build_args_parser,
get_quantizer_and_quant_params,
)
from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
)
from executorch.examples.models.llama.source_transformation.quantize import (
EmbeddingQuantHandler,
get_quant_weight_transform,
)
from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
)
from executorch.examples.models.llama.source_transformation.sdpa import (
replace_sdpa_with_custom_op,
)
2 changes: 1 addition & 1 deletion examples/models/llava/model.py
@@ -15,7 +15,7 @@
from executorch.examples.models.llama.llama_transformer import Transformer
from executorch.examples.models.llama.model_args import ModelArgs

from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
)
from executorch.examples.models.llama.source_transformation.sdpa import (
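The llava changes above are the same mechanical rename of the import path. Code that has to work on checkouts from both sides of this rename could fall back on the old module name; the module paths and symbols below are taken from this diff, the try/except shim itself is only a suggestion:

```python
try:
    # Module name after this PR.
    from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
        QuantizedKVCache,
        replace_kv_cache_with_custom_kv_cache,
    )
except ImportError:
    # Older checkouts still expose the pre-rename module.
    from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
        QuantizedKVCache,
        replace_kv_cache_with_custom_kv_cache,
    )
```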
35 changes: 20 additions & 15 deletions extension/llm/custom_ops/op_sdpa.cpp
@@ -264,14 +264,14 @@ Tensor& flash_attention_kernel_out(
InvalidArgument,
output);

auto q_seq_len = query.size(2);
auto seq_len = query.size(2);

ET_SWITCH_FLOAT_TYPES(
query.scalar_type(), ctx, "flash_attention", CTYPE, [&] {
// TODO we need to re-evaluate this for ARM CPUs
// And there can be many so instead of templatizing
// we might consider another appraoch
if (q_seq_len >= 768) {
if (seq_len >= 768) {
sdpa::impl::cpu_flash_attention<CTYPE, 256, 512>(
output,
query,
@@ -287,7 +287,7 @@ Tensor& flash_attention_kernel_out(
nullopt,
nullopt,
nullopt);
} else if (q_seq_len >= 192) {
} else if (seq_len >= 192) {
sdpa::impl::cpu_flash_attention<CTYPE, 64, 512>(
output,
query,
@@ -341,7 +341,8 @@ Tensor& custom_sdpa_out_impl(
const optional<Tensor>& k_zero_points = nullopt,
const optional<Tensor>& k_scales = nullopt,
const optional<Tensor>& v_zero_points = nullopt,
const optional<Tensor>& v_scales = nullopt) {
const optional<Tensor>& v_scales = nullopt,
bool is_seq_at_dim_2 = false) {
ET_KERNEL_CHECK_MSG(
ctx,
!attn_mask.has_value() || !is_causal,
@@ -357,13 +358,15 @@
"Invalid arguments");

int64_t seq_len = q.size(1);
auto q_seq_len = q.size(1);
SeqDim seq_dim{SeqDim::TWO};
if (!is_seq_at_dim_2) {
seq_dim = SeqDim::ONE;
}

bool is_seq_at_dim_1{true};
if (q.scalar_type() == ScalarType::Char) {
is_seq_at_dim_1 = false;
seq_len = q.size(2);
q_seq_len = q.size(2);
if (seq_dim == SeqDim::TWO) {
seq_len = q.size(2);
}
ET_KERNEL_CHECK_MSG(
ctx,
q_scales.has_value() && q_zero_points.has_value() &&
@@ -412,7 +415,7 @@
// TODO we need to re-evaluate this for ARM CPUs
// And there can be many so instead of templatizing
// we might consider another appraoch
if (q_seq_len >= 768) {
if (seq_len >= 768) {
sdpa::impl::cpu_flash_attention<CTYPE, 256, 512>(
output,
q,
@@ -428,10 +431,10 @@
k_scales, // k_scales
v_zero_points, // v_zero_points
v_scales, // v_scales
is_seq_at_dim_1, /* is_seq_at_dim_1 */
seq_dim, /* seq_dim */
start_pos,
num_keys_for_causal_attention);
} else if (q_seq_len >= 192) {
} else if (seq_len >= 192) {
sdpa::impl::cpu_flash_attention<CTYPE, 64, 512>(
output,
q,
@@ -447,7 +450,7 @@
k_scales, // k_scales
v_zero_points, // v_zero_points
v_scales, // v_scales
is_seq_at_dim_1, /* is_seq_at_dim_1 */
seq_dim, /* seq_dim */
start_pos,
num_keys_for_causal_attention);
} else {
@@ -466,7 +469,7 @@
k_scales, // k_scales
v_zero_points, // v_zero_points
v_scales, // v_scales
is_seq_at_dim_1, /* is_seq_at_dim_1 */
seq_dim, /* seq_dim */
start_pos,
num_keys_for_causal_attention);
}
@@ -492,6 +495,7 @@ Tensor& custom_quantized_sdpa_out(
const optional<Tensor>& k_scales,
const optional<Tensor>& v_zero_points,
const optional<Tensor>& v_scales,
const bool is_seq_at_dim_2,
Tensor& output) {
return custom_sdpa_out_impl(
ctx,
@@ -509,7 +513,8 @@
k_zero_points,
k_scales,
v_zero_points,
v_scales);
v_scales,
is_seq_at_dim_2);
}
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA

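Beyond the renaming, this file replaces the `is_seq_at_dim_1` bool that was threaded into `cpu_flash_attention` with a `SeqDim` enum, and `custom_sdpa_out_impl` gains an `is_seq_at_dim_2` parameter (default false). A rough Python paraphrase of how the effective sequence length appears to be chosen after this change, as read from the hunks above rather than from the literal C++:

```python
import torch


def effective_seq_len(q: torch.Tensor, is_seq_at_dim_2: bool) -> int:
    # Paraphrase (assumption): float inputs keep the sequence at dim 1, while
    # int8 (quantized) inputs read it from dim 2 only when the new
    # is_seq_at_dim_2 flag is set; otherwise dim 1 is used there too.
    if q.dtype == torch.int8 and is_seq_at_dim_2:
        return q.shape[2]
    return q.shape[1]
```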
1 change: 1 addition & 0 deletions extension/llm/custom_ops/op_sdpa.h
@@ -74,6 +74,7 @@ Tensor& custom_quantized_sdpa_out(
const optional<Tensor>& k_scales,
const optional<Tensor>& v_zero_points,
const optional<Tensor>& v_scales,
const bool is_seq_at_dim_1,
Tensor& output);
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA
} // namespace native
18 changes: 12 additions & 6 deletions extension/llm/custom_ops/op_sdpa_aot.cpp
@@ -96,6 +96,7 @@ Tensor& custom_quantized_sdpa_out_no_context(
const optional<Tensor> k_scales,
const optional<Tensor> v_zero_points,
const optional<Tensor> v_scales,
const bool is_seq_at_dim_2,
Tensor& output);

at::Tensor custom_quantized_sdpa_aten(
@@ -115,7 +116,8 @@
const std::optional<at::Tensor>& k_zero_points,
const std::optional<at::Tensor>& k_scales,
const std::optional<at::Tensor>& v_zero_points,
const std::optional<at::Tensor>& v_scales);
const std::optional<at::Tensor>& v_scales,
const bool is_seq_at_dim_2);
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA

Tensor& update_cache_out_no_context(
@@ -258,6 +260,7 @@ Tensor& custom_quantized_sdpa_out_no_context(
const optional<Tensor> k_scales,
const optional<Tensor> v_zero_points,
const optional<Tensor> v_scales,
const bool is_seq_at_dim_2,
Tensor& output) {
executorch::aten::RuntimeContext context{};
return torch::executor::native::custom_quantized_sdpa_out(
@@ -276,6 +279,7 @@
k_scales,
v_zero_points,
v_scales,
is_seq_at_dim_2,
output);
}

@@ -296,9 +300,10 @@ at::Tensor custom_quantized_sdpa_aten(
const std::optional<at::Tensor>& k_zero_points,
const std::optional<at::Tensor>& k_scales,
const std::optional<at::Tensor>& v_zero_points,
const std::optional<at::Tensor>& v_scales) {
const std::optional<at::Tensor>& v_scales,
const bool is_seq_at_dim_2) {
auto output = at::empty(q.sizes());
WRAP_TO_ATEN(custom_quantized_sdpa_out_no_context, 14)
WRAP_TO_ATEN(custom_quantized_sdpa_out_no_context, 15)
(q,
k,
v,
Expand All @@ -313,6 +318,7 @@ at::Tensor custom_quantized_sdpa_aten(
k_scales,
v_zero_points,
v_scales,
is_seq_at_dim_2,
output);
return output;
}
@@ -371,13 +377,13 @@ TORCH_LIBRARY_FRAGMENT(llama, m) {
"Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, "
"float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, "
"Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, "
"Tensor? v_scales=None) -> Tensor");
"Tensor? v_scales=None, bool is_seq_at_dim_2=False) -> Tensor");
m.def(
"custom_quantized_sdpa.out(Tensor query, Tensor key, Tensor value, SymInt start_pos, "
"Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, "
"float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, "
"Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, "
"Tensor? v_scales=None, *, Tensor(a!) out) -> Tensor(a!)");
"Tensor? v_scales=None, bool is_seq_at_dim_2=False, *, Tensor(a!) out) -> Tensor(a!)");
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA
}

@@ -404,6 +410,6 @@ TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) {
m.impl(
"custom_quantized_sdpa.out",
WRAP_TO_ATEN(
torch::executor::native::custom_quantized_sdpa_out_no_context, 14));
torch::executor::native::custom_quantized_sdpa_out_no_context, 15));
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA
}
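With the flag appended to both schemas, the `WRAP_TO_ATEN` arity moves from 14 to 15 to match. A hypothetical call from Python that follows the registered schema; loading the AOT library, the int8 inputs, and the scale/zero-point layout are all assumptions made for illustration:

```python
import torch

# Assumption: the custom-ops AOT library has been built and loaded, e.g.
# torch.ops.load_library("<path to custom_ops_aot_lib>").


def quantized_sdpa(q, k, v, start_pos, q_zp, q_s, k_zp, k_s, v_zp, v_s):
    # Argument order follows the custom_quantized_sdpa schema registered above;
    # q/k/v are expected to be int8 tensors with matching scales/zero points.
    return torch.ops.llama.custom_quantized_sdpa(
        q, k, v, start_pos,
        None,  # attn_mask
        0.0,   # dropout (spelled drpout_p in the schema)
        True,  # is_causal
        None,  # scale
        q_zp, q_s,
        k_zp, k_s,
        v_zp, v_s,
        is_seq_at_dim_2=True,  # the flag added by this PR
    )
```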