pytorch · facebook-github-bot · Apr 10, 2025 · Apr 7, 2025 · Apr 8, 2025 · Apr 9, 2025
@@ -108,7 +108,7 @@ runtime.python_library(
         "source_transformation/pre_quantization.py",
         "source_transformation/prune_vocab.py",
         "source_transformation/quantize.py",
-        "source_transformation/quantized_kv_cache.py",
+        "source_transformation/custom_kv_cache.py",
         "source_transformation/rms_norm.py",
         "source_transformation/rope.py",
         "source_transformation/sdpa.py",
@@ -208,9 +208,9 @@ runtime.python_library(
 )
 
 runtime.python_library(
-    name = "quantized_kv_cache",
+    name = "custom_kv_cache",
     srcs = [
-        "source_transformation/quantized_kv_cache.py",
+        "source_transformation/custom_kv_cache.py",
     ],
     _is_external_target = True,
     visibility = ["//executorch/..."],
@@ -240,7 +240,7 @@ runtime.python_test(
         "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
     ],
     deps = [
-        ":quantized_kv_cache",
+        ":custom_kv_cache",
         "//caffe2:torch",
         "//executorch/examples/models/llama:llama_transformer",
     ],
@@ -255,7 +255,7 @@ runtime.python_test(
         "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
     ],
     deps = [
-        ":quantized_kv_cache",
+        ":custom_kv_cache",
         ":sdpa",
         "//caffe2:torch",
         "//executorch/examples/models/llama:llama_transformer",

@@ -59,14 +59,14 @@
 )
 
 from .source_transformation.attention import replace_attention_to_attention_sha
+from .source_transformation.custom_kv_cache import (
+    replace_kv_cache_with_custom_kv_cache,
+    replace_kv_cache_with_quantized_kv_cache,
+)
 from .source_transformation.quantize import (
     get_quant_embedding_transform,
     get_quant_weight_transform,
 )
-from .source_transformation.quantized_kv_cache import (
-    replace_kv_cache_with_custom_kv_cache,
-    replace_kv_cache_with_quantized_kv_cache,
-)
 from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm
 
 from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis

@@ -10,7 +10,7 @@
 
 from executorch.examples.models.llama.attention import KVCache
 
-from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
+from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
     QuantizedCacheType,
     QuantizedKVCache,
 )

@@ -10,7 +10,7 @@
 
 from executorch.examples.models.llama.attention import KVCache
 
-from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
+from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
     CustomKVCache,
     QuantizedCacheType,
     QuantizedKVCache,

@@ -20,13 +20,13 @@
     build_args_parser,
     get_quantizer_and_quant_params,
 )
+from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
+    replace_kv_cache_with_custom_kv_cache,
+)
 from executorch.examples.models.llama.source_transformation.quantize import (
     EmbeddingQuantHandler,
     get_quant_weight_transform,
 )
-from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
-    replace_kv_cache_with_custom_kv_cache,
-)
 from executorch.examples.models.llama.source_transformation.sdpa import (
     replace_sdpa_with_custom_op,
 )

@@ -15,7 +15,7 @@
 from executorch.examples.models.llama.llama_transformer import Transformer
 from executorch.examples.models.llama.model_args import ModelArgs
 
-from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
+from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
     replace_kv_cache_with_custom_kv_cache,
 )
 from executorch.examples.models.llama.source_transformation.sdpa import (