This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

Commit

Fix generate cache of GLM models.
mikecovlee committed Jul 22, 2024
1 parent 23f112f commit 0e64974
Showing 3 changed files with 3 additions and 10 deletions.
1 change: 0 additions & 1 deletion README.md
@@ -110,7 +110,6 @@ For users with NVIDIA Ampere or newer GPU architectures, the `--tf32` option can
 + Quantization with Qwen2 has no effect (same with transformers).
 + Applying quantization with DoRA will result in higher memory and computation cost (same with PEFT).
 + Sliding window attention with generate cache may produce abnormal output.
-+ ChatGLM models with generate cache may produce abnormal output.
 
 ## Installation
 
7 changes: 2 additions & 5 deletions mlora/generator.py
@@ -425,17 +425,14 @@ def generate(
             )
         )
 
-    if cache_implementation is None:
+    if use_cache and cache_implementation is None:
         cache_implementation = model.model_.cache_implementation()
-    if use_cache and cache_implementation is None:
+    if cache_implementation is None:
         logging.warn(
             "Cache disabled by model, use cache_implementation to force enable."
         )
         use_cache = False
 
-    if use_cache is None and cache_implementation is not None:
-        use_cache = True
-
     packed_outputs: Dict[str, List] = {}
 
     while True:
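The reordered checks above look up the model's default cache implementation only when caching is requested, and disable caching (with a warning) only when no implementation is available at all; the separate `use_cache is None` fix-up is dropped. A minimal standalone sketch of the resulting control flow; the helper name, its signature, and the `model_default` argument (standing in for `model.model_.cache_implementation()`) are illustrative and not part of the repository:

# Illustrative sketch of the cache-selection flow after this commit; the helper
# name and the `model_default` argument are assumptions for this example.
import logging
from typing import Optional, Tuple


def resolve_cache(
    use_cache: bool,
    cache_implementation: Optional[str],
    model_default: Optional[str],
) -> Tuple[bool, Optional[str]]:
    # Only consult the model's default when caching is requested and the
    # caller did not force a specific implementation.
    if use_cache and cache_implementation is None:
        cache_implementation = model_default
    # No implementation available at all: warn and fall back to uncached
    # generation, mirroring the behaviour in the diff above.
    if cache_implementation is None:
        logging.warning(
            "Cache disabled by model, use cache_implementation to force enable."
        )
        use_cache = False
    return use_cache, cache_implementation


# A model that reports no cache implementation of its own vs. one that does.
print(resolve_cache(True, None, None))       # (False, None)   -> cache disabled
print(resolve_cache(True, None, "dynamic"))  # (True, 'dynamic')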
5 changes: 1 addition & 4 deletions mlora/models/modeling_chatglm.py
@@ -407,7 +407,7 @@ def forward(
 
         # apply relative positional encoding (rotary embedding)
         if self.rotary_pos_emb is not None:
-            rotary_pos_emb = self.rotary_pos_emb[None, : hidden_states.shape[1]]
+            rotary_pos_emb = self.rotary_pos_emb[None, cache_position]
             query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
             key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)

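Indexing the rotary table with `cache_position` selects the embeddings for the tokens' absolute positions. This matters once the generate cache is enabled: during cached decoding only the newly generated tokens pass through `forward`, so slicing by `hidden_states.shape[1]` would always pick positions starting at 0. A small sketch of the difference, using a made-up rotary table and shapes (none of the names below come from the repository):

# Minimal sketch: why indexing by cache_position matters with a KV cache.
import torch

max_positions, rot_dim = 64, 8
rotary_table = torch.randn(max_positions, rot_dim)  # one row per absolute position

# Cached decoding step: only the single new token reaches forward().
hidden_states = torch.randn(1, 1, 32)   # (batch, new_tokens, hidden)
cache_position = torch.tensor([5])      # absolute position of the new token

old_style = rotary_table[None, : hidden_states.shape[1]]  # always rows 0..new_tokens-1
new_style = rotary_table[None, cache_position]            # rows for the true positions

print(old_style.shape, new_style.shape)  # torch.Size([1, 1, 8]) torch.Size([1, 1, 8])
print(torch.equal(old_style, new_style))  # False once cache_position moves past 0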
@@ -748,9 +748,6 @@ def causal_mask(
     ) -> torch.Tensor:
         return self.get_masks(input_tensor, past_key_values, attention_mask)
 
-    def cache_implementation(self) -> str:
-        return None
-
     def model_config(self) -> GLMConfig:
         return self.config_

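Removing the `cache_implementation` override means the ChatGLM model no longer reports `None`, which (per the generator change above) forced caching off; it now inherits whatever default the base model class provides. A schematic illustration of the pattern; the base-class name and its default return value are assumptions, not taken from the repository:

# Schematic only: the base-class name and its default value are assumed.
from typing import Optional


class BaseModel:
    def cache_implementation(self) -> Optional[str]:
        return "dynamic"  # assumed base-class default


class ChatGLMBefore(BaseModel):
    def cache_implementation(self) -> Optional[str]:
        return None  # old override: generator warns and disables the cache


class ChatGLMAfter(BaseModel):
    pass  # no override: the inherited default keeps the generate cache enabled


print(ChatGLMBefore().cache_implementation())  # None
print(ChatGLMAfter().cache_implementation())   # dynamic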
