[Executorch][llama] Renamed quantized_kv_cache to custom_kv_cache #10061

Merged · 2 commits · Apr 10, 2025
10 changes: 5 additions & 5 deletions examples/models/llama/TARGETS
@@ -108,7 +108,7 @@ runtime.python_library(
"source_transformation/pre_quantization.py",
"source_transformation/prune_vocab.py",
"source_transformation/quantize.py",
"source_transformation/quantized_kv_cache.py",
"source_transformation/custom_kv_cache.py",
"source_transformation/rms_norm.py",
"source_transformation/rope.py",
"source_transformation/sdpa.py",
@@ -208,9 +208,9 @@ runtime.python_library(
)

runtime.python_library(
name = "quantized_kv_cache",
name = "custom_kv_cache",
srcs = [
"source_transformation/quantized_kv_cache.py",
"source_transformation/custom_kv_cache.py",
],
_is_external_target = True,
visibility = ["//executorch/..."],
@@ -240,7 +240,7 @@ runtime.python_test(
"//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
],
deps = [
":quantized_kv_cache",
":custom_kv_cache",
"//caffe2:torch",
"//executorch/examples/models/llama:llama_transformer",
],
@@ -255,7 +255,7 @@ runtime.python_test(
"//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
],
deps = [
":quantized_kv_cache",
":custom_kv_cache",
":sdpa",
"//caffe2:torch",
"//executorch/examples/models/llama:llama_transformer",
8 changes: 4 additions & 4 deletions examples/models/llama/export_llama_lib.py
@@ -59,14 +59,14 @@
)

from .source_transformation.attention import replace_attention_to_attention_sha
from .source_transformation.custom_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
replace_kv_cache_with_quantized_kv_cache,
)
from .source_transformation.quantize import (
get_quant_embedding_transform,
get_quant_weight_transform,
)
from .source_transformation.quantized_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
replace_kv_cache_with_quantized_kv_cache,
)
from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm

from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis
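Note: only the import location changes in this file; `replace_kv_cache_with_custom_kv_cache` and `replace_kv_cache_with_quantized_kv_cache` keep their names. A minimal usage sketch, under the assumption that each helper takes the model module and returns the transformed module (the pattern the other source transformations in this directory follow):

```python
import torch.nn as nn

from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
    replace_kv_cache_with_custom_kv_cache,
    replace_kv_cache_with_quantized_kv_cache,
)


def apply_kv_cache_transform(model: nn.Module, quantize: bool = True) -> nn.Module:
    # Swap the eager KVCache modules for the custom (optionally int8-quantized)
    # variants. The take-model/return-model signature is an assumption here.
    if quantize:
        return replace_kv_cache_with_quantized_kv_cache(model)
    return replace_kv_cache_with_custom_kv_cache(model)
```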
@@ -10,7 +10,7 @@

from executorch.examples.models.llama.attention import KVCache

from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
QuantizedCacheType,
QuantizedKVCache,
)
@@ -10,7 +10,7 @@

from executorch.examples.models.llama.attention import KVCache

from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
CustomKVCache,
QuantizedCacheType,
QuantizedKVCache,
6 changes: 3 additions & 3 deletions examples/models/llava/export_llava.py
@@ -20,13 +20,13 @@
build_args_parser,
get_quantizer_and_quant_params,
)
from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
)
from executorch.examples.models.llama.source_transformation.quantize import (
EmbeddingQuantHandler,
get_quant_weight_transform,
)
from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
)
from executorch.examples.models.llama.source_transformation.sdpa import (
replace_sdpa_with_custom_op,
)
2 changes: 1 addition & 1 deletion examples/models/llava/model.py
@@ -15,7 +15,7 @@
from executorch.examples.models.llama.llama_transformer import Transformer
from executorch.examples.models.llama.model_args import ModelArgs

from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
)
from executorch.examples.models.llama.source_transformation.sdpa import (
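The llava changes above are the same mechanical rename of the import path. Code that has to work on checkouts from both sides of this rename could fall back on the old module name; the module paths and symbols below are taken from this diff, the try/except shim itself is only a suggestion:

```python
try:
    # Module name after this PR.
    from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
        QuantizedKVCache,
        replace_kv_cache_with_custom_kv_cache,
    )
except ImportError:
    # Older checkouts still expose the pre-rename module.
    from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
        QuantizedKVCache,
        replace_kv_cache_with_custom_kv_cache,
    )
```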
35 changes: 20 additions & 15 deletions extension/llm/custom_ops/op_sdpa.cpp
@@ -264,14 +264,14 @@ Tensor& flash_attention_kernel_out(
InvalidArgument,
output);

auto q_seq_len = query.size(2);
auto seq_len = query.size(2);

ET_SWITCH_FLOAT_TYPES(
query.scalar_type(), ctx, "flash_attention", CTYPE, [&] {
// TODO we need to re-evaluate this for ARM CPUs
// And there can be many so instead of templatizing
// we might consider another appraoch
if (q_seq_len >= 768) {
if (seq_len >= 768) {
sdpa::impl::cpu_flash_attention<CTYPE, 256, 512>(
output,
query,
@@ -287,7 +287,7 @@ Tensor& flash_attention_kernel_out(
nullopt,
nullopt,
nullopt);
} else if (q_seq_len >= 192) {
} else if (seq_len >= 192) {
sdpa::impl::cpu_flash_attention<CTYPE, 64, 512>(
output,
query,
@@ -341,7 +341,8 @@ Tensor& custom_sdpa_out_impl(
const optional<Tensor>& k_zero_points = nullopt,
const optional<Tensor>& k_scales = nullopt,
const optional<Tensor>& v_zero_points = nullopt,
const optional<Tensor>& v_scales = nullopt) {
const optional<Tensor>& v_scales = nullopt,
bool is_seq_at_dim_2 = false) {
ET_KERNEL_CHECK_MSG(
ctx,
!attn_mask.has_value() || !is_causal,
@@ -357,13 +358,15 @@
"Invalid arguments");

int64_t seq_len = q.size(1);
auto q_seq_len = q.size(1);
SeqDim seq_dim{SeqDim::TWO};
if (!is_seq_at_dim_2) {
seq_dim = SeqDim::ONE;
}

bool is_seq_at_dim_1{true};
if (q.scalar_type() == ScalarType::Char) {
is_seq_at_dim_1 = false;
seq_len = q.size(2);
q_seq_len = q.size(2);
if (seq_dim == SeqDim::TWO) {
seq_len = q.size(2);
}
ET_KERNEL_CHECK_MSG(
ctx,
q_scales.has_value() && q_zero_points.has_value() &&
@@ -412,7 +415,7 @@
// TODO we need to re-evaluate this for ARM CPUs
// And there can be many so instead of templatizing
// we might consider another appraoch
if (q_seq_len >= 768) {
if (seq_len >= 768) {
sdpa::impl::cpu_flash_attention<CTYPE, 256, 512>(
output,
q,
@@ -428,10 +431,10 @@
k_scales, // k_scales
v_zero_points, // v_zero_points
v_scales, // v_scales
is_seq_at_dim_1, /* is_seq_at_dim_1 */
seq_dim, /* seq_dim */
start_pos,
num_keys_for_causal_attention);
} else if (q_seq_len >= 192) {
} else if (seq_len >= 192) {
sdpa::impl::cpu_flash_attention<CTYPE, 64, 512>(
output,
q,
@@ -447,7 +450,7 @@
k_scales, // k_scales
v_zero_points, // v_zero_points
v_scales, // v_scales
is_seq_at_dim_1, /* is_seq_at_dim_1 */
seq_dim, /* seq_dim */
start_pos,
num_keys_for_causal_attention);
} else {
@@ -466,7 +469,7 @@
k_scales, // k_scales
v_zero_points, // v_zero_points
v_scales, // v_scales
is_seq_at_dim_1, /* is_seq_at_dim_1 */
seq_dim, /* seq_dim */
start_pos,
num_keys_for_causal_attention);
}
@@ -492,6 +495,7 @@ Tensor& custom_quantized_sdpa_out(
const optional<Tensor>& k_scales,
const optional<Tensor>& v_zero_points,
const optional<Tensor>& v_scales,
const bool is_seq_at_dim_2,
Tensor& output) {
return custom_sdpa_out_impl(
ctx,
@@ -509,7 +513,8 @@
k_zero_points,
k_scales,
v_zero_points,
v_scales);
v_scales,
is_seq_at_dim_2);
}
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA

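Beyond the renaming, this file replaces the `is_seq_at_dim_1` bool that was threaded into `cpu_flash_attention` with a `SeqDim` enum, and `custom_sdpa_out_impl` gains an `is_seq_at_dim_2` parameter (default false). A rough Python paraphrase of how the effective sequence length appears to be chosen after this change, as read from the hunks above rather than from the literal C++:

```python
import torch


def effective_seq_len(q: torch.Tensor, is_seq_at_dim_2: bool) -> int:
    # Paraphrase (assumption): float inputs keep the sequence at dim 1, while
    # int8 (quantized) inputs read it from dim 2 only when the new
    # is_seq_at_dim_2 flag is set; otherwise dim 1 is used there too.
    if q.dtype == torch.int8 and is_seq_at_dim_2:
        return q.shape[2]
    return q.shape[1]
```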
1 change: 1 addition & 0 deletions extension/llm/custom_ops/op_sdpa.h
@@ -74,6 +74,7 @@ Tensor& custom_quantized_sdpa_out(
const optional<Tensor>& k_scales,
const optional<Tensor>& v_zero_points,
const optional<Tensor>& v_scales,
const bool is_seq_at_dim_1,
Tensor& output);
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA
} // namespace native
18 changes: 12 additions & 6 deletions extension/llm/custom_ops/op_sdpa_aot.cpp
@@ -96,6 +96,7 @@ Tensor& custom_quantized_sdpa_out_no_context(
const optional<Tensor> k_scales,
const optional<Tensor> v_zero_points,
const optional<Tensor> v_scales,
const bool is_seq_at_dim_2,
Tensor& output);

at::Tensor custom_quantized_sdpa_aten(
@@ -115,7 +116,8 @@
const std::optional<at::Tensor>& k_zero_points,
const std::optional<at::Tensor>& k_scales,
const std::optional<at::Tensor>& v_zero_points,
const std::optional<at::Tensor>& v_scales);
const std::optional<at::Tensor>& v_scales,
const bool is_seq_at_dim_2);
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA

Tensor& update_cache_out_no_context(
@@ -258,6 +260,7 @@ Tensor& custom_quantized_sdpa_out_no_context(
const optional<Tensor> k_scales,
const optional<Tensor> v_zero_points,
const optional<Tensor> v_scales,
const bool is_seq_at_dim_2,
Tensor& output) {
executorch::aten::RuntimeContext context{};
return torch::executor::native::custom_quantized_sdpa_out(
@@ -276,6 +279,7 @@
k_scales,
v_zero_points,
v_scales,
is_seq_at_dim_2,
output);
}

@@ -296,9 +300,10 @@ at::Tensor custom_quantized_sdpa_aten(
const std::optional<at::Tensor>& k_zero_points,
const std::optional<at::Tensor>& k_scales,
const std::optional<at::Tensor>& v_zero_points,
const std::optional<at::Tensor>& v_scales) {
const std::optional<at::Tensor>& v_scales,
const bool is_seq_at_dim_2) {
auto output = at::empty(q.sizes());
WRAP_TO_ATEN(custom_quantized_sdpa_out_no_context, 14)
WRAP_TO_ATEN(custom_quantized_sdpa_out_no_context, 15)
(q,
k,
v,
Expand All @@ -313,6 +318,7 @@ at::Tensor custom_quantized_sdpa_aten(
k_scales,
v_zero_points,
v_scales,
is_seq_at_dim_2,
output);
return output;
}
@@ -371,13 +377,13 @@ TORCH_LIBRARY_FRAGMENT(llama, m) {
"Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, "
"float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, "
"Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, "
"Tensor? v_scales=None) -> Tensor");
"Tensor? v_scales=None, bool is_seq_at_dim_2=False) -> Tensor");
m.def(
"custom_quantized_sdpa.out(Tensor query, Tensor key, Tensor value, SymInt start_pos, "
"Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, "
"float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, "
"Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, "
"Tensor? v_scales=None, *, Tensor(a!) out) -> Tensor(a!)");
"Tensor? v_scales=None, bool is_seq_at_dim_2=False, *, Tensor(a!) out) -> Tensor(a!)");
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA
}

@@ -404,6 +410,6 @@ TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) {
m.impl(
"custom_quantized_sdpa.out",
WRAP_TO_ATEN(
torch::executor::native::custom_quantized_sdpa_out_no_context, 14));
torch::executor::native::custom_quantized_sdpa_out_no_context, 15));
#endif // ENABLE_CUSTOM_QUANTIZED_SDPA
}
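With the flag appended to both schemas, the `WRAP_TO_ATEN` arity moves from 14 to 15 to match. A hypothetical call from Python that follows the registered schema; loading the AOT library, the int8 inputs, and the scale/zero-point layout are all assumptions made for illustration:

```python
import torch

# Assumption: the custom-ops AOT library has been built and loaded, e.g.
# torch.ops.load_library("<path to custom_ops_aot_lib>").


def quantized_sdpa(q, k, v, start_pos, q_zp, q_s, k_zp, k_s, v_zp, v_s):
    # Argument order follows the custom_quantized_sdpa schema registered above;
    # q/k/v are expected to be int8 tensors with matching scales/zero points.
    return torch.ops.llama.custom_quantized_sdpa(
        q, k, v, start_pos,
        None,  # attn_mask
        0.0,   # dropout (spelled drpout_p in the schema)
        True,  # is_causal
        None,  # scale
        q_zp, q_s,
        k_zp, k_s,
        v_zp, v_s,
        is_seq_at_dim_2=True,  # the flag added by this PR
    )
```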