Revert PR 235 as flash attention is not really enabled for Gemma (#239)
tthakkal authored Oct 23, 2024
1 parent c5e3881 commit b126bf4
Showing 1 changed file with 2 additions and 3 deletions.
server/text_generation_server/models/causal_lm.py
@@ -694,13 +694,12 @@ def __init__(
             "return_dict": True,
         }
 
-        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon", "gemma"]:
+        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
 
             if model.config.model_type not in ["falcon"]:
                 kwargs["attn_softmax_bf16"] = True
 
-            if model.config.model_type not in ["gemma"]:
-                kwargs["trim_logits"] = True
+            kwargs["trim_logits"] = True
 
             if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
                 kwargs["use_flash_attention"] = True
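For context, below is a minimal, self-contained sketch of the gating logic as it stands after this revert. `build_generation_kwargs` is a hypothetical helper written for illustration only; the repository inlines this logic in an `__init__` in causal_lm.py, and the `"return_dict": True` seed value is taken from the diff context above.

import os
from typing import Any, Dict

def build_generation_kwargs(model_type: str) -> Dict[str, Any]:
    # `model_type` stands in for model.config.model_type in the real code.
    kwargs: Dict[str, Any] = {"return_dict": True}

    if model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
        # falcon is the one listed model type that does not get the
        # bf16 attention-softmax flag
        if model_type not in ["falcon"]:
            kwargs["attn_softmax_bf16"] = True

        # trim_logits now applies to every model type in the list above;
        # the per-gemma exclusion was removed along with gemma itself
        kwargs["trim_logits"] = True

        # flash attention remains opt-in via an environment variable
        if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
            kwargs["use_flash_attention"] = True

    return kwargs

# After the revert, gemma falls through the outer check entirely,
# so none of these optional flags are set for it:
assert "trim_logits" not in build_generation_kwargs("gemma")
assert build_generation_kwargs("llama")["trim_logits"] is True

The net effect of the revert is that gemma no longer receives attn_softmax_bf16, trim_logits, or use_flash_attention, since flash attention was not actually functional for that model type.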
