Changes for 65B and 70B runs #414
@@ -243,6 +243,14 @@ class ModelConfig(BaseConfig):

    The number of self-attention heads.
    """

    n_kv_heads: Optional[int] = None
    """
    The number of heads to use for keys and values.
    Set this to ``None`` or ``n_heads`` for normal multi-head attention.
    Set this to 1 for multi-query attention.
    Set it to some in-between value for Llama2-style grouped query attention.
    """

    n_layers: int = 12
    """
    The number of layers/blocks.
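The docstring above describes the three attention regimes purely in terms of `n_kv_heads`. As a rough illustration (not OLMo's attention implementation; the shapes, names, and the `repeat_interleave` expansion are assumptions about how grouped-query attention is typically wired), the key/value heads are shared across groups of query heads:

```python
import torch

# Illustrative sketch of what n_kv_heads controls; not OLMo's actual attention code.
batch, seq, d_model = 2, 16, 512
n_heads = 8
n_kv_heads = 2            # 8 (= n_heads) -> multi-head, 1 -> multi-query, 2 -> grouped-query
head_dim = d_model // n_heads

q = torch.randn(batch, seq, n_heads, head_dim)
k = torch.randn(batch, seq, n_kv_heads, head_dim)  # fewer key/value heads than query heads
v = torch.randn(batch, seq, n_kv_heads, head_dim)

# Each group of n_heads // n_kv_heads query heads shares one key/value head.
k = k.repeat_interleave(n_heads // n_kv_heads, dim=2)
v = v.repeat_interleave(n_heads // n_kv_heads, dim=2)

assert q.shape == k.shape == v.shape  # attention proceeds as usual from here
```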
@@ -309,8 +317,7 @@ class ModelConfig(BaseConfig):

    multi_query_attention: bool = False
    """
    Use the Multi-Query formulation of attention used in PaLM. This reduces the number of parameters
    and is more efficient during inference.
    Deprecated. Use n_kv_heads instead.
    """

    attention_layer_norm: bool = False

Review comment (on multi_query_attention): Make this
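With the flag deprecated, the intent it used to express is written through `n_kv_heads` instead. A small sketch, assuming `ModelConfig` is importable from `olmo.config` and that the remaining fields keep their defaults:

```python
from olmo.config import ModelConfig

mha = ModelConfig(n_heads=32)                # n_kv_heads left as None -> multi-head attention
mqa = ModelConfig(n_heads=32, n_kv_heads=1)  # what multi_query_attention=True used to mean
gqa = ModelConfig(n_heads=32, n_kv_heads=8)  # Llama2-style grouped-query attention
```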
@@ -428,6 +435,29 @@ class ModelConfig(BaseConfig):

    See :data:`TrainConfig.precision` instead.
    """

    def __post_init__(self):
        if self.n_kv_heads is None:
            self.n_kv_heads = self.n_heads

Review comment: Then here we could do this:

    if self.multi_query_attention:
        self.n_kv_heads = 1
    elif self.n_kv_heads is None:
        self.n_kv_heads = self.n_heads
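Folding that suggestion into the new method would give something like the sketch below; the divisibility assert is an extra assumption for grouped-query attention, not something shown in the diff:

```python
def __post_init__(self):
    # Resolve the deprecated flag first, then fall back to n_heads.
    if self.multi_query_attention:
        self.n_kv_heads = 1
    elif self.n_kv_heads is None:
        self.n_kv_heads = self.n_heads
    # Assumption: query heads must split evenly across key/value heads.
    assert self.n_heads % self.n_kv_heads == 0
```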

    @classmethod
    def update_legacy_settings(cls, config: D) -> D:

Review comment: Then this won't be needed.
        new_config = config.copy()
        if om.is_dict(new_config):
            assert isinstance(new_config, DictConfig)

            if hasattr(new_config, "multi_query_attention"):
                if hasattr(new_config, "n_kv_heads") and new_config.n_kv_heads is not None:
                    raise OlmoConfigurationError("You can't specify both `multi_query_attention` and `n_kv_heads`. Specify only `n_kv_heads`.")
                if new_config.multi_query_attention:
                    new_config.n_kv_heads = 1
                else:
                    new_config.n_kv_heads = new_config.n_heads

            if hasattr(new_config, "optimizer"):
                new_config.optimizer = OptimizerConfig.update_legacy_settings(new_config.optimizer)

Review comment: I don't think we need this here
Review comment (reply): I am learning that

        return new_config

class OptimizerType(StrEnum):
    lionw = "lionw"
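As a usage sketch of the migration path (assuming `om` is `omegaconf.OmegaConf` as in the module's imports, and that `ModelConfig` is importable from `olmo.config`), a legacy model section carrying the deprecated flag would be rewritten like this:

```python
from omegaconf import OmegaConf as om
from olmo.config import ModelConfig

# Hypothetical legacy model section that still uses the deprecated flag.
legacy = om.create({"n_heads": 32, "multi_query_attention": True})

migrated = ModelConfig.update_legacy_settings(legacy)
print(migrated.n_kv_heads)  # -> 1, the multi-query setting expressed via n_kv_heads
```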
@@ -1036,4 +1066,7 @@ def update_legacy_settings(cls, config: D) -> D:

            if hasattr(new_config, "optimizer"):
                new_config.optimizer = OptimizerConfig.update_legacy_settings(new_config.optimizer)

            if hasattr(new_config, "model"):
                new_config.model = ModelConfig.update_legacy_settings(new_config.model)

        return new_config
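This last hunk wires the model migration into the top-level legacy pass. In a sketch that assumes the enclosing class is the top-level TrainConfig (an assumption, since this hunk's class isn't shown), a full legacy config gets its nested model section rewritten in the same call:

```python
from omegaconf import OmegaConf as om
from olmo.config import TrainConfig

full_legacy = om.create({"model": {"n_heads": 32, "multi_query_attention": True}})
migrated = TrainConfig.update_legacy_settings(full_legacy)
print(migrated.model.n_kv_heads)  # -> 1; the nested ModelConfig migration ran as well
```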
Review comment (reply): Done, I just can't click "commit" here for some reason.