
Commit fde5f68: fix merge conflicts
epwalsh committed Jan 21, 2025
2 parents: b1ad693 + 7633461
Showing 18 changed files with 249 additions and 120 deletions.
10 changes: 2 additions & 8 deletions .github/RELEASE_PROCESS.md
@@ -7,18 +7,12 @@
3. Run the release script:

```bash
-./src/scripts/release.sh
+./src/scripts/release/release.sh
```

This will commit the changes to the CHANGELOG and `version.py` files and then create a new tag in git
which will trigger a workflow on GitHub Actions that handles the rest.

## Fixing a failed release

-If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete both the tag and corresponding release from GitHub. After you've pushed a fix, delete the tag from your local clone with
-
-```bash
-git tag -l | xargs git tag -d && git fetch -t
-```
-
-Then repeat the steps above.
+If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete the tag on GitHub. Once you've pushed a fix you can simply repeat the steps above.
13 changes: 10 additions & 3 deletions .github/workflows/main.yml
@@ -184,11 +184,12 @@ jobs:
      cluster:
        # H100 clusters
        - ai2/jupiter-cirrascale-2
-       - ai2/pluto-cirrascale
+       - ai2/augusta-google-1
        - ai2/allennlp-elara-cirrascale
+       - ai2/ganymede-cirrascale
+       - ai2/ceres-cirrascale
        # A100 clusters
        - ai2/saturn-cirrascale
-       - ai2/allennlp-cirrascale
        # - ai2/allennlp-elanding-a100-40g
      envVars:
        - name: CUBLAS_WORKSPACE_CONFIG
@@ -238,7 +239,7 @@ jobs:
      - name: Generate release notes
        run: |
          . .venv/bin/activate
-         python src/scripts/release_notes.py > ${{ github.workspace }}-RELEASE_NOTES.md
+         python src/scripts/release/release_notes.py > ${{ github.workspace }}-RELEASE_NOTES.md
      - name: Publish package to PyPI
        run: |
@@ -254,3 +255,9 @@ jobs:
          prerelease: ${{ contains(env.TAG, 'rc') }}
          files: |
            dist/*
+     - name: Add PR comments on release
+       env:
+         GH_TOKEN: ${{ github.token }}
+       run: |
+         ./src/scripts/release/add_pr_comments_on_release.sh
5 changes: 3 additions & 2 deletions CHANGELOG.md
@@ -16,16 +16,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added option to pre-download checkpoint files from remote storage before trying to load a checkpoint.
- Added a callback for sending Slack notifications.
- Added `SkipStepAdamW` optimizer.
+- The trainer can load model-only checkpoints now.

### Changed

- Changed storage of shared shard state in sharded checkpoints from smallest shard to lowest rank (normally 0).
- Changed underlying AdamW implementation.
+- Changed how the trainer handles loading a checkpoint when `load_path` is provided. Now `load_path` is only used if no checkpoint is found in the `save_folder`.

### Fixed

- Added missing `weights_only=False` argument to fix loading train checkpoints with newer versions of PyTorch.
-- Fixed bug where GCS upload does not retry on transient failures.
+- Fixed bug where GCS upload does not retry on transient failures.

## [v1.7.0](https://github.com/allenai/OLMo-core/releases/tag/v1.7.0) - 2024-11-27

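For context on the `load_path` entry above: the new behavior is a simple resolution order in which the `save_folder` always wins. A minimal sketch, with hypothetical helper names (the trainer's actual implementation is not part of this diff):

```python
# Hedged sketch of the checkpoint-resolution order described in the changelog entry.
# `find_latest_checkpoint` and the argument names here are hypothetical.
import os
import re
from typing import Optional


def find_latest_checkpoint(save_folder: str) -> Optional[str]:
    """Stand-in for the trainer's real checkpoint discovery."""
    if not os.path.isdir(save_folder):
        return None
    steps = [
        (int(m.group(1)), name)
        for name in os.listdir(save_folder)
        if (m := re.fullmatch(r"step(\d+)", name))
    ]
    return os.path.join(save_folder, max(steps)[1]) if steps else None


def resolve_checkpoint_to_load(save_folder: str, load_path: Optional[str]) -> Optional[str]:
    checkpoint = find_latest_checkpoint(save_folder)  # the save folder always wins
    if checkpoint is None and load_path is not None:
        checkpoint = load_path  # load_path is only used when save_folder has no checkpoints
    return checkpoint
```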
8 changes: 4 additions & 4 deletions README.md
@@ -78,11 +78,11 @@ Code checks:
## Citing

```bibtex
-@article{OLMo,
-  title={OLMo: Accelerating the Science of Language Models},
-  author={Dirk Groeneveld and Iz Beltagy and Pete Walsh and Akshita Bhagia and Rodney Kinney and Oyvind Tafjord and A. Jha and Hamish Ivison and Ian Magnusson and Yizhong Wang and Shane Arora and David Atkinson and Russell Authur and Khyathi Raghavi Chandu and Arman Cohan and Jennifer Dumas and Yanai Elazar and Yuling Gu and Jack Hessel and Tushar Khot and William Merrill and Jacob Daniel Morrison and Niklas Muennighoff and Aakanksha Naik and Crystal Nam and Matthew E. Peters and Valentina Pyatkin and Abhilasha Ravichander and Dustin Schwenk and Saurabh Shah and Will Smith and Emma Strubell and Nishant Subramani and Mitchell Wortsman and Pradeep Dasigi and Nathan Lambert and Kyle Richardson and Luke Zettlemoyer and Jesse Dodge and Kyle Lo and Luca Soldaini and Noah A. Smith and Hanna Hajishirzi},
+@article{OLMo2,
+  title={2 OLMo 2 Furious},
+  author={Team OLMo and Pete Walsh and Luca Soldaini and Dirk Groeneveld and Kyle Lo and Shane Arora and Akshita Bhagia and Yuling Gu and Shengyi Huang and Matt Jordan and Nathan Lambert and Dustin Schwenk and Oyvind Tafjord and Taira Anderson and David Atkinson and Faeze Brahman and Christopher Clark and Pradeep Dasigi and Nouha Dziri and Michal Guerquin and Hamish Ivison and Pang Wei Koh and Jiacheng Liu and Saumya Malik and William Merrill and Lester James Validad Miranda and Jacob Daniel Morrison and Tyler C. Murray and Crystal Nam and Valentina Pyatkin and Aman Rangapur and Michael Schmitz and Sam Skjonsberg and David Wadden and Chris Wilhelm and Michael Wilson and Luke S. Zettlemoyer and Ali Farhadi and Noah A. Smith and Hanna Hajishirzi},
   year={2024},
-  url={https://api.semanticscholar.org/CorpusID:267365485},
+  url={https://api.semanticscholar.org/CorpusID:275213098},
   journal={arXiv preprint},
}
```
10 changes: 8 additions & 2 deletions src/olmo_core/io.py
@@ -590,7 +590,10 @@ def _gcs_get_bytes_range(bucket_name: str, key: str, bytes_start: int, num_bytes
    except NotFound:
        raise FileNotFoundError(f"gs://{bucket_name}/{key}")
    return blob.download_as_bytes(
-       start=bytes_start, end=bytes_start + num_bytes - 1, retry=_get_gcs_retry(), checksum=None,  # type: ignore
+       start=bytes_start,
+       end=bytes_start + num_bytes - 1,
+       retry=_get_gcs_retry(),
+       checksum=None,  # type: ignore
    )


@@ -612,7 +615,10 @@ def _gcs_upload(source: Path, bucket_name: str, key: str, save_overwrite: bool =
    generation = blob.generation

    blob.upload_from_filename(
-       source, if_generation_match=generation, retry=_get_gcs_conditional_retry(), checksum=None,
+       source,
+       if_generation_match=generation,
+       retry=_get_gcs_conditional_retry(),
+       checksum=None,
    )


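The `_get_gcs_retry` and `_get_gcs_conditional_retry` helpers referenced above are not shown in this diff. As a rough sketch of what such a helper could look like with `google-cloud-storage`'s retry machinery (the predicate and timeouts here are assumptions, not the repo's actual values):

```python
# Hedged sketch only; the real _get_gcs_retry in olmo_core.io is not shown in this diff.
from google.api_core import exceptions, retry


def _get_gcs_retry() -> retry.Retry:
    # Retry typical transient GCS failures with exponential backoff.
    return retry.Retry(
        predicate=retry.if_exception_type(
            exceptions.TooManyRequests,  # HTTP 429
            exceptions.InternalServerError,  # HTTP 500
            exceptions.ServiceUnavailable,  # HTTP 503
        ),
        initial=1.0,  # seconds before the first retry
        maximum=60.0,  # cap on the backoff interval
        timeout=300.0,  # give up after 5 minutes in total
    )
```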
70 changes: 70 additions & 0 deletions src/olmo_core/launch/reorder_ranks_in_gcp.py
@@ -0,0 +1,70 @@
import argparse
import sys

import requests
import torch.distributed as dist
from urllib3.exceptions import MaxRetryError, NameResolutionError


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("rank", type=int, help="Worker number")
    parser.add_argument("world_size", type=int, help="Total number of workers")
    parser.add_argument("master_addr", help="Hostname of worker 0")
    parser.add_argument("--master_port", type=int, default=29501, help="Port for TCPStore")
    parser.add_argument("--debug", action="store_true", help="Enable debug mode (outside of GCP)")
    args = parser.parse_args()

    # Create or connect to the store
    store = dist.TCPStore(
        host_name=args.master_addr,
        port=args.master_port,
        world_size=args.world_size,
        is_master=(args.rank == 0),
    )

    # Get our own host id
    if args.debug:
        import socket

        host_id = f"{socket.gethostname()}_{args.rank}"
    else:
        try:
            response = requests.get(
                "http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host",
                headers={"Metadata-Flavor": "Google"},
            )
            assert response.status_code == 200
            host_id = response.text.strip()
        except requests.exceptions.ConnectionError as e:
            # Unwrap the exception
            e = e.args[0]
            if not isinstance(e, MaxRetryError):
                raise
            e = e.reason
            if not isinstance(e, NameResolutionError):
                raise
            # Seems we called this outside of GCP, so we do nothing and just print our original rank.
            print(args.rank)
            sys.exit(0)

    # Find the index of our host id
    store.set(f"node_{args.rank}_hostid", host_id)
    store.wait([f"node_{i}_hostid" for i in range(args.world_size)])
    all_host_ids = [store.get(f"node_{i}_hostid").decode("UTF-8") for i in range(args.world_size)]
    assert len(set(all_host_ids)) == len(all_host_ids)
    assert host_id in all_host_ids
    rank0_host_id = all_host_ids[0]
    all_host_ids.sort()
    # Rank 0 needs to remain rank 0, so we reshuffle around it
    rank0_index = all_host_ids.index(rank0_host_id)
    all_host_ids = all_host_ids[rank0_index:] + all_host_ids[:rank0_index]
    print(all_host_ids.index(host_id))

    # Make sure we're all done before exiting
    store.set(f"node_{args.rank}_done", host_id)
    store.wait([f"node_{i}_done" for i in range(args.world_size)])


if __name__ == "__main__":
    main()
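The sort-and-rotate step near the end is the heart of this script: ranks are reassigned by sorted physical host ID, except that rank 0's host is rotated back to position 0. A small standalone illustration with made-up host IDs:

```python
# Toy illustration of the sort-and-rotate logic above (host IDs are made up).
all_host_ids = ["host-b", "host-a", "host-c"]  # gathered in current rank order
rank0_host_id = all_host_ids[0]  # "host-b" must stay rank 0

all_host_ids.sort()  # ["host-a", "host-b", "host-c"]
rank0_index = all_host_ids.index(rank0_host_id)  # 1
# Rotate the sorted list so rank 0's host comes first:
all_host_ids = all_host_ids[rank0_index:] + all_host_ids[:rank0_index]
print(all_host_ids)  # ["host-b", "host-c", "host-a"]

# Every node computes the same list, so each can read off its new rank:
assert all_host_ids.index(rank0_host_id) == 0
```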
5 changes: 3 additions & 2 deletions src/olmo_core/optim/__init__.py
@@ -1,5 +1,5 @@
from .adam import AdamConfig
-from .adamw import AdamW, AdamWConfig, SkipStepAdamW, SkipStepAdamWConfig
+from .adamw import AdamWConfig, SkipStepAdamW, SkipStepAdamWConfig
from .config import OptimConfig, OptimGroupOverride
from .lion import Lion, LionConfig, SkipStepLion, SkipStepLionConfig
from .scheduler import (
@@ -18,8 +18,9 @@
"OptimGroupOverride",
"SkipStepOptimizer",
"AdamWConfig",
"SkipStepAdamWConfig",
"SkipStepAdamW",
"AdamConfig",
"AdamW",
"SkipStepAdamWConfig",
"SkipStepAdamW",
"LionConfig",
63 changes: 4 additions & 59 deletions src/olmo_core/optim/adamw.py
@@ -3,7 +3,6 @@

import torch
import torch.nn as nn
-from torch.optim.optimizer import Optimizer

from .config import OptimConfig
from .skip_step_optimizer import SkipStepOptimizer
@@ -34,8 +33,8 @@ def adamw_step(
    exp_avg_sq.mul_(1 - step_factor * (1 - beta2))
    exp_avg_sq.add_(step_factor * p.grad * p.grad, alpha=1 - beta2)

-   bias_correction1 = 1 - beta1**(step + 1)
-   bias_correction2 = 1 - beta2**(step + 1)
+   bias_correction1 = 1 - beta1 ** (step + 1)
+   bias_correction2 = 1 - beta2 ** (step + 1)

    step_size = lr / bias_correction1

@@ -46,58 +45,6 @@
    p.add_(update)


-class AdamW(Optimizer):
-    """
-    An implementation of the AdamW optimizer.
-    """
-
-    def __init__(
-        self,
-        params,
-        lr: float = 1e-3,
-        betas: Tuple[float, float] = (0.9, 0.999),
-        eps: float = 1e-8,
-        weight_decay: float = 1e-2,
-        foreach: Optional[bool] = None,
-        fused: Optional[bool] = None,
-    ):
-        assert lr > 0.0
-        assert all([0.0 <= beta <= 1.0 for beta in betas])
-        defaults = dict(
-            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, foreach=foreach, fused=fused
-        )
-        super().__init__(params, defaults)
-
-    @torch.no_grad()
-    def step(self, closure=None) -> None:
-        if closure is not None:
-            with torch.enable_grad():
-                closure()
-
-        for group in self.param_groups:
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-
-                state = self.state[p]
-                if len(state) == 0:
-                    state["step"] = torch.tensor(0.0, dtype=torch.float32, device=p.device)
-                    state["exp_avg"] = torch.zeros_like(p)
-                    state["exp_avg_sq"] = torch.zeros_like(p)
-
-                adamw_step(
-                    p,
-                    lr=group["lr"],
-                    betas=group["betas"],
-                    eps=group["eps"],
-                    weight_decay=group["weight_decay"],
-                    exp_avg=state["exp_avg"],
-                    exp_avg_sq=state["exp_avg_sq"],
-                    step=state["step"],
-                    step_factor=torch.tensor(1.0, device=p.device),
-                )
-
-
class SkipStepAdamW(SkipStepOptimizer):
    """
    A "skip step" version of :class:`AdamW`.
@@ -181,8 +128,8 @@ class AdamWConfig(OptimConfig): # NOTE: omagaconf doesn't like "OptimConfig[tor
    fused: Optional[bool] = None

    @classmethod
-   def optimizer(cls) -> Type[AdamW]:
-       return AdamW
+   def optimizer(cls) -> Type[torch.optim.AdamW]:
+       return torch.optim.AdamW


@dataclass
@@ -195,8 +142,6 @@ class SkipStepAdamWConfig(OptimConfig):
    betas: Tuple[float, float] = (0.9, 0.999)
    eps: float = 1e-8
    weight_decay: float = 1e-2
-   foreach: Optional[bool] = None
-   fused: Optional[bool] = None
    rolling_interval_length: int = 128
    sigma_factor: int = 6

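With this change, `AdamWConfig.optimizer()` returns PyTorch's own `torch.optim.AdamW` rather than the removed custom class. A hedged usage sketch (assuming `AdamWConfig` mirrors the fields shown for `SkipStepAdamWConfig` above; the full `OptimConfig` build API is not shown in this diff):

```python
# Hedged sketch; config field names are assumed to mirror SkipStepAdamWConfig above.
import torch.nn as nn

from olmo_core.optim import AdamWConfig

model = nn.Linear(16, 16)
config = AdamWConfig(lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2)

optimizer_cls = AdamWConfig.optimizer()  # now torch.optim.AdamW, per the change above
optimizer = optimizer_cls(
    model.parameters(),
    lr=config.lr,
    betas=config.betas,
    eps=config.eps,
    weight_decay=config.weight_decay,
)
```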
5 changes: 4 additions & 1 deletion src/olmo_core/train/callbacks/checkpointer.py
@@ -216,7 +216,10 @@ def pre_train(self):
            path for _, path in sorted(ephemeral_checkpoints, key=lambda x: x[0])
        ]
        for path in self._ephemeral_checkpoints:
-           log.info(f"Collected existing ephemeral checkpoint at '{path}'")
+           log.info(
+               f"Found existing ephemeral checkpoint at '{path}' which will "
+               "be removed when the next checkpoint is saved"
+           )

    def post_train_batch(self):
        self._await_last_checkpoint(blocking=False)
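The reworded log message spells out the contract for ephemeral checkpoints: each one lives only until the next checkpoint lands. A purely illustrative sketch of that rotation (hypothetical names; the callback's real pruning logic is not shown in this diff):

```python
# Hypothetical sketch of ephemeral-checkpoint rotation; not the actual callback code.
from typing import List


class EphemeralRotation:
    def __init__(self) -> None:
        self._ephemeral_checkpoints: List[str] = []

    def on_checkpoint_saved(self, path: str, ephemeral: bool) -> None:
        # A newer checkpoint now exists, so older ephemeral ones are safe to drop.
        for old in self._ephemeral_checkpoints:
            self._remove(old)
        self._ephemeral_checkpoints = [path] if ephemeral else []

    def _remove(self, path: str) -> None:
        print(f"Removing ephemeral checkpoint at '{path}'")
```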
8 changes: 7 additions & 1 deletion src/olmo_core/train/callbacks/slack_notifier.py
@@ -12,6 +12,7 @@

SLACK_WEBHOOK_URL_ENV_VAR = "SLACK_WEBHOOK_URL"
BEAKER_JOB_ID_ENV_VAR = "BEAKER_JOB_ID"
+EXC_LINE_LIMIT = 30


class SlackNotificationSetting(StrEnum):
@@ -98,7 +99,12 @@ def on_error(self, exc: BaseException):
            SlackNotificationSetting.end_only,
            SlackNotificationSetting.failure_only,
        ):
-           self._post_message(f"failed with error:\n{exc}")
+           exc_lines = str(exc).rstrip("\n").split("\n")
+           if len(exc_lines) > EXC_LINE_LIMIT:
+               exc_lines = exc_lines[:EXC_LINE_LIMIT]
+               exc_lines.append("...")
+           exc_str = "\n".join(exc_lines)
+           self._post_message(f"failed with error:\n```\n{exc_str}\n```")

    def _post_message(self, msg: str):
        webhook_url = self.webhook_url or os.environ.get(SLACK_WEBHOOK_URL_ENV_VAR)
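`_post_message` is truncated in this hunk. For context, posting to a Slack incoming webhook is a single JSON POST; a minimal sketch (the callback's full implementation is not shown here):

```python
# Hedged sketch of a Slack incoming-webhook post; the callback's real
# _post_message may differ beyond the env-var lookup shown above.
import os
from typing import Optional

import requests

SLACK_WEBHOOK_URL_ENV_VAR = "SLACK_WEBHOOK_URL"


def post_message(msg: str, webhook_url: Optional[str] = None) -> None:
    webhook_url = webhook_url or os.environ.get(SLACK_WEBHOOK_URL_ENV_VAR)
    if webhook_url is None:
        raise RuntimeError("Slack webhook URL not configured")
    response = requests.post(webhook_url, json={"text": msg}, timeout=10)
    response.raise_for_status()
```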