
Commit f4a68f9
import neuron_attention earlier (#38)
apoorvtintin authored Dec 19, 2024
1 parent 68c6ee9 commit f4a68f9
Showing 1 changed file with 1 addition and 4 deletions.
5 changes: 1 addition & 4 deletions axlearn/common/flash_attention/utils.py
@@ -11,6 +11,7 @@
 from axlearn.common.attention import NEG_INF, MaskFn, causal_mask, softmax_with_biases
 from axlearn.common.flash_attention.gpu_attention import cudnn_dot_product_attention
 from axlearn.common.flash_attention.gpu_attention import flash_attention as gpu_flash_attention
+from axlearn.common.flash_attention.neuron_attention import flash_attention as neuron_flash_attention
 from axlearn.common.flash_attention.tpu_attention import tpu_flash_attention
 from axlearn.common.utils import Tensor

@@ -160,10 +161,6 @@ def jit_attn(query, key, value, bias, segment_ids):
         return jit_attn

     elif backend == "neuron":
-        from axlearn.common.flash_attention.neuron_attention import (
-            flash_attention as neuron_flash_attention,
-        )
-
         # shard_map-decorated function needs to be jitted.
         @jax.jit
        def jit_attn(query, key, value, bias, segment_ids):
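
For context, here is a rough sketch of how the neuron branch of the backend dispatch reads after this change, with the kernel imported at module scope rather than inside the branch. The enclosing function name and the keyword arguments passed to the kernel are illustrative assumptions; only the neuron-branch lines mirror the hunks above.

# Minimal sketch, assuming the shape of the dispatch function; not the actual file contents.
import jax

from axlearn.common.flash_attention.neuron_attention import (
    flash_attention as neuron_flash_attention,  # now a module-level import
)


def flash_attention_implementation(backend: str, *, causal: bool = True, softmax_scale: float = 1.0):
    """Returns a jitted attention callable for the requested backend (sketch only)."""
    if backend == "neuron":
        # shard_map-decorated function needs to be jitted.
        @jax.jit
        def jit_attn(query, key, value, bias, segment_ids):
            del segment_ids  # unused in this sketch
            # Hypothetical call; the real kernel's keyword names may differ.
            return neuron_flash_attention(
                query, key, value, bias, causal=causal, softmax_scale=softmax_scale
            )

        return jit_attn

    raise NotImplementedError(f"Backend not covered in this sketch: {backend}")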
