update cluster list #135

Closed. Wants to merge 100 commits.

Commits
695b61c
Add `distributed.checkpoint.load_state_dict()` function
epwalsh Dec 4, 2024
6135725
Move MoE callback logic to a new `MoEHandler` class
epwalsh Dec 4, 2024
cdf13c1
Add the Float8Handler to the Transformer model
epwalsh Dec 4, 2024
ce50477
Add `namespace` arg to `Trainer.record_metric()`
epwalsh Dec 4, 2024
3cdc667
Add `TrainModule` abstraction
epwalsh Dec 4, 2024
1a68888
fix
epwalsh Dec 4, 2024
24a1382
fix
epwalsh Dec 4, 2024
1473564
fix
epwalsh Dec 4, 2024
638b372
clean up
epwalsh Dec 4, 2024
cafd0d1
fix evaluator callback
epwalsh Dec 5, 2024
f8bdf84
train config
epwalsh Dec 5, 2024
8d33250
prepare for pipeline parallel
epwalsh Dec 5, 2024
d04dd66
more progress towards pipeline parallel
epwalsh Dec 5, 2024
a40248c
fix
epwalsh Dec 5, 2024
cd634a6
Merge branch 'main' into epwalsh/train-module
epwalsh Dec 9, 2024
e0d66c4
Merge branch 'epwalsh/train-module' into epwalsh/train-module2
epwalsh Dec 9, 2024
64a32fc
update docker image tags
epwalsh Dec 9, 2024
cdb1fa1
Merge branch 'main' into epwalsh/train-module
epwalsh Dec 9, 2024
2254dc3
fix merge conflicts
epwalsh Dec 9, 2024
9eee883
clean up
epwalsh Dec 9, 2024
f153cf9
Merge branch 'main' into epwalsh/train-module
epwalsh Dec 10, 2024
ca915e3
fix merge conflicts
epwalsh Dec 10, 2024
ecb7b67
Move more model configs stuff to TransformerTrainModule
epwalsh Dec 10, 2024
d568fc2
log callback order
epwalsh Dec 11, 2024
4dbdf80
increase priority of checkpointer callback
epwalsh Dec 11, 2024
ea3241c
also sort by order added
epwalsh Dec 11, 2024
87c81ce
load strict
epwalsh Dec 11, 2024
14ee57b
fix
epwalsh Dec 11, 2024
dbe6105
do not allow compiling fused loss
epwalsh Dec 11, 2024
bc9f2df
Merge branch 'main' into epwalsh/train-module
epwalsh Dec 11, 2024
acc8b4a
Merge branch 'epwalsh/train-module' into epwalsh/train-module2
epwalsh Dec 11, 2024
14c59a1
Finish implemented pipeline parallelism, I think
epwalsh Dec 11, 2024
6f77813
add another barrier
epwalsh Dec 11, 2024
a3f60fe
update some callback priorities
epwalsh Dec 11, 2024
e279fd8
update docs
epwalsh Dec 11, 2024
4fca439
fix building optimizer for PP
epwalsh Dec 12, 2024
f87158e
logging improvements
epwalsh Dec 12, 2024
5550a9b
log memory usage before training
epwalsh Dec 12, 2024
627796f
log data parallel world size
epwalsh Dec 12, 2024
442c084
reformat logging
epwalsh Dec 12, 2024
e4fb54c
clean up
epwalsh Dec 12, 2024
29a0dc9
broadcast loss
epwalsh Dec 13, 2024
1dbcb32
fix
epwalsh Dec 13, 2024
498ce26
clean up
epwalsh Dec 13, 2024
115a835
log more info about param group
epwalsh Dec 13, 2024
5b964c4
improve logging
epwalsh Dec 13, 2024
8ba1009
more logging
epwalsh Dec 13, 2024
cd0f4d8
more logging
epwalsh Dec 13, 2024
4bc329f
fix
epwalsh Dec 13, 2024
f432c5b
fix?
epwalsh Dec 13, 2024
d5345eb
fix
epwalsh Dec 13, 2024
795106a
flatten
epwalsh Dec 13, 2024
4e652a7
allow different opts for save/load
epwalsh Dec 13, 2024
9577a0c
clean up
epwalsh Dec 13, 2024
995c83e
add link to upgrade guide
epwalsh Dec 13, 2024
e0149ab
fix eval batch size guess
epwalsh Dec 13, 2024
1fd4262
use seperate schedule for eval
epwalsh Dec 13, 2024
b4931e2
fix
epwalsh Dec 13, 2024
6da7f29
fix
epwalsh Dec 13, 2024
7df1a22
fix
epwalsh Dec 13, 2024
f6eb6b6
fix?
epwalsh Dec 13, 2024
6fad112
fix?
epwalsh Dec 13, 2024
4cfed66
fix
epwalsh Dec 13, 2024
1fd78b1
reduce logs from google clients
epwalsh Dec 13, 2024
b942df1
fix
epwalsh Dec 14, 2024
c067af0
avoid broadcast
epwalsh Dec 14, 2024
9d77350
fix merge conflict
epwalsh Dec 16, 2024
fb3019b
Merge branch 'epwalsh/train-module' into epwalsh/train-module2
epwalsh Dec 16, 2024
bba1fe4
train module specifies eval batch size
epwalsh Dec 18, 2024
0bd6b50
Fix eval seq length when PP enabled
epwalsh Dec 18, 2024
41a055e
add a default downstream evaluator
epwalsh Dec 18, 2024
3dd3f7e
drop last incomplete batch
epwalsh Dec 18, 2024
c2d86f8
install debug branch
epwalsh Dec 18, 2024
dadea11
fix?
epwalsh Dec 18, 2024
42d0124
ha fix
epwalsh Dec 18, 2024
8153e81
asserts
epwalsh Dec 18, 2024
fb89bf2
print
epwalsh Dec 18, 2024
eb5f4fe
fix?
epwalsh Dec 18, 2024
616687a
fix
epwalsh Dec 19, 2024
9752ec7
fix
epwalsh Dec 19, 2024
77bfab8
clean up
epwalsh Dec 19, 2024
7435cb8
Merge branch 'main' into epwalsh/train-module
epwalsh Dec 19, 2024
54fc9be
Merge branch 'epwalsh/train-module' into epwalsh/train-module2
epwalsh Dec 19, 2024
c44a87a
fix merge conflict
epwalsh Dec 19, 2024
bf95b88
Merge branch 'epwalsh/train-module' into epwalsh/train-module2
epwalsh Dec 19, 2024
d94e6ee
Merge branch 'main' into epwalsh/train-module
epwalsh Dec 19, 2024
88bd158
Merge branch 'epwalsh/train-module' into epwalsh/train-module2
epwalsh Dec 19, 2024
e67727d
Merge branch 'main' into epwalsh/train-module
epwalsh Dec 20, 2024
da773ca
fix merge conflict
epwalsh Dec 20, 2024
ce83447
fix typo
epwalsh Dec 21, 2024
009119b
Merge branch 'main' into epwalsh/train-module
epwalsh Jan 8, 2025
8882a3c
Merge branch 'epwalsh/train-module' into epwalsh/train-module2
epwalsh Jan 8, 2025
533f016
clean up, fix total grad norm reporting
epwalsh Jan 8, 2025
a51907e
trigger workflows for v2 branch
epwalsh Jan 8, 2025
803d306
Merge branch 'main' into v2
epwalsh Jan 9, 2025
3a40345
Fix table formatting
epwalsh Jan 9, 2025
cc84cfb
clean up changelog
epwalsh Jan 9, 2025
9d0c622
support load key mapping, auto determine flat optim
epwalsh Jan 9, 2025
933300a
Merge branch 'main' into v2
epwalsh Jan 9, 2025
8ef02d7
update cluster list
epwalsh Jan 10, 2025
6 changes: 5 additions & 1 deletion .github/workflows/main.yml
@@ -8,9 +8,11 @@ on:
pull_request:
branches:
- main
- v2
push:
branches:
- main
- v2
tags:
- 'v*.*.*'

@@ -185,9 +187,11 @@ jobs:
# H100 clusters
- ai2/jupiter-cirrascale-2
- ai2/augusta-google-1
- ai2/allennlp-elara-cirrascale
- ai2/ganymede-cirrascale
- ai2/ceres-cirrascale
# A100 clusters
- ai2/saturn-cirrascale
- ai2/allennlp-cirrascale
# - ai2/allennlp-elanding-a100-40g
envVars:
- name: CUBLAS_WORKSPACE_CONFIG
1 change: 1 addition & 0 deletions .github/workflows/pr_checks.yml
@@ -8,6 +8,7 @@ on:
pull_request:
branches:
- main
- v2
paths:
- 'src/**'

20 changes: 20 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,26 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v2

This major release introduces a few breaking changes. As such, we've provided an upgrade guide here: [OLMo-core upgrade guide](https://docs.google.com/document/d/1LvANhNzA-MdtiD2pLniLTqB9wxSSuqY435WuJIADeFM/edit?usp=sharing).

### Added

- Added `TrainModule` abstraction with `TransformerTrainModule` implementation, which encapsulates both a model and optimizer.
- Added `namespace` argument to `Trainer.record_metric()`.

### Changed

- The `Trainer` now takes a `TrainModule` instead of a model and optimizer, and several configuration options have been moved to `TransformerTrainModule`, including `rank_microbatch_size`, `fused_loss`, `compile_loss`, `z_loss_multiplier`, and `autocast_precision`.
- Several `TransformerModelConfig` options have been moved to `TransformerTrainModule` / `TransformerTrainModuleConfig`, including `dp_config`, `tp_config`, `float8_config`, and `compile`.

### Removed

- Removed the following callbacks: `MoEHandlerCallback`, `SchedulerCallback`, `MatrixNormalizerCallback`, `GradClipperCallback`, and `Float8HandlerCallback`.
The functionality from all of those callbacks has been moved to the `TransformerTrainModule` class.
- Removed the callback methods `.pre_eval_batch()` and `.post_eval_batch()`.

## Unreleased

### Added
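
Taken together, the v2 entries above amount to moving the optimizer, scheduler, gradient clipping, and loss settings off the `Trainer` and its callbacks and onto the new train module. A rough before/after sketch, with config fields taken from the updated example script further down this PR; everything else (variable setup, the exact `record_metric()` call shape) is assumed rather than confirmed:

```python
from olmo_core.optim import AdamWConfig, CosWithWarmup
from olmo_core.train.train_module import TransformerTrainModuleConfig

# v1 (before this PR): the Trainer was built from a model and an optimizer,
# with the scheduler and grad clipper attached as callbacks:
#   trainer = trainer_config.build(model, optim, data_loader)

# v2: a TransformerTrainModule wraps the model, optimizer, scheduler, and
# grad clipping, and the Trainer is built from it plus a data loader.
train_module_config = TransformerTrainModuleConfig(
    rank_microbatch_size=16 * 1024,             # moved off TrainerConfig
    max_sequence_length=1024,
    optim=AdamWConfig(lr=1e-3),
    compile_model=True,
    max_grad_norm=1.0,                          # replaces GradClipperCallback
    scheduler=CosWithWarmup(warmup_steps=100),  # replaces SchedulerCallback
)
train_module = train_module_config.build(model)
trainer = trainer_config.build(train_module, data_loader)

# New `namespace` argument on Trainer.record_metric(), typically called from a
# callback during training (exact signature assumed):
trainer.record_metric("my_metric", 1.0, namespace="custom")
```
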
10 changes: 5 additions & 5 deletions README.md
@@ -45,16 +45,16 @@ To see the exact usage for each script, run the script without any arguments.

Throughput numbers from these scripts with various different configuration settings are reported below, measured on a cluster with NVIDIA H100 GPUs.

| Model size | Model arch.   | Context length | Precision | Throughput[^1] | Training   script | Commandline overrides                                    |
| Model size | Model arch.   | Context length | Precision | Throughput[^1] | Training   script | Commandline overrides                                                   |
| :--------: | :--------: | :------------: | :-------: | -----------: | :----------- | :-------- |
| **1B** | OLMo-1124 | 4096 | BF16 | 55,000 TPS | `OLMo2-1B.py` | |
| | | 4096 | BF16/FP8[^2] | 65,000 TPS | `OLMo2-1B.py` | `--model.float8_config.enabled=true` |
| | | 4096 | BF16/FP8[^2] | 65,000 TPS | `OLMo2-1B.py` | `--train_module.float8_config.enabled=true` |
| **7B** | OLMo-1124 | 4096 | BF16 | 10,000 TPS | `OLMo2-7B.py` | |
| | | 4096 | BF16/FP8 | 13,000 TPS | `OLMo2-7B.py` | `--model.float8_config.enabled=true` |
| | | 4096 | BF16/FP8 | 13,000 TPS | `OLMo2-7B.py` | `--train_module.float8_config.enabled=true` |
| **8B** | Llama | 4096 | BF16 | 9,500 TPS | `Llama3-8B.py` | |
| | | 4096 | BF16/FP8 | 12,500 TPS | `Llama3-8B.py` | `--model.float8_config.enabled=true` |
| | | 4096 | BF16/FP8 | 12,500 TPS | `Llama3-8B.py` | `--train_module.float8_config.enabled=true` |
| **13B** | OLMo-1124 | 4096 | BF16 | 4,600 TPS | `OLMo2-13B.py` | |
| | | 4096 | BF16/FP8 | 5,500 TPS | `OLMo2-13B.py` | `--model.float8_config.enabled=true` |
| | | 4096 | BF16/FP8 | 5,500 TPS | `OLMo2-13B.py` | `--train_module.float8_config.enabled=true` |

[^1]: Throughput reported in tokens per second per device.
[^2]: In this setup most matrix multiplications are computed in `float8`, everything else is in `bfloat16`.
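
The changed commandline overrides in the table reflect the same move: the Float8 toggle now lives under the train module config instead of the model config. A minimal sketch of what such an override does, based on the `ExperimentConfig.merge(overrides)` pattern in the example script below; the exact override string format is an assumption:

```python
# Dotted overrides are merged into the experiment config, so enabling float8
# now targets `train_module.float8_config.enabled` rather than
# `model.float8_config.enabled` (field names per the changelog above).
config = ExperimentConfig(
    model=model_config,
    dataset=dataset_config,
    data_loader=data_loader_config,
    train_module=train_module_config,
    trainer=trainer_config,
).merge(["train_module.float8_config.enabled=true"])
```
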
23 changes: 16 additions & 7 deletions docs/source/conf.py
@@ -135,13 +135,22 @@ def autodoc_skip_member(app, what, name, obj, skip, options):

module = inspect.getmodule(obj)
module_name = None if module is None else module.__name__
if (
what == "class"
and module_name is not None
and module_name.startswith("olmo_core.train.callbacks")
and module_name != "olmo_core.train.callbacks.callback"
):
if inspect.isfunction(obj) or inspect.ismethod(obj):

if what == "class" and module_name is not None:
# Skip documenting callback subclass methods.
if (
module_name.startswith("olmo_core.train.callbacks.")
and module_name != "olmo_core.train.callbacks.callback"
and (inspect.isfunction(obj) or inspect.ismethod(obj))
):
return True

# Skip documenting train module subclass methods.
if (
module_name.startswith("olmo_core.train.train_module.")
and module_name != "olmo_core.train.train_module.train_module"
and (inspect.isfunction(obj) or inspect.ismethod(obj) or isinstance(obj, property))
):
return True

return skip
1 change: 1 addition & 0 deletions docs/source/nn/transformer.rst
@@ -3,3 +3,4 @@

.. automodule:: olmo_core.nn.transformer
:members:
:exclude-members: TransformerDataParallelWrappingStrategy,TransformerActivationCheckpointingMode
11 changes: 6 additions & 5 deletions docs/source/overview/introduction.rst
@@ -17,18 +17,19 @@ Most users will likely follow a workflow that looks like this:
For example::

model_config = TransformerConfig.llama2_7B(...)
optim_config = AdamWConfig(lr=1e-3, ...)
train_module_config = TransformerTrainModuleConfig(...)
data_config = NumpyDatasetConfig(...)
data_loader_config = NumpyDataLoaderConfig(...)
trainer_config = TrainerConfig(...)

2. Build the corresponding components within a ``main()`` function at runtime and then call :meth:`Trainer.fit() <olmo_core.train.Trainer.fit>`.
For example::

def main(model_config, optim_config, data_config, trainer_config):
def main():
model = model_config.build()
optim = optim_config.build()
dataset = data_config.build()
trainer = trainer_config.build(model, optim, dataset)
train_module = train_module_config.build(model)
data_loader = data_loader_config.build(data_config.build(), dp_process_group=train_module.dp_process_group)
trainer = trainer_config.build(train_module, data_loader)

trainer.fit()

1 change: 1 addition & 0 deletions docs/source/train/index.rst
@@ -9,3 +9,4 @@
:caption: Submodules

callbacks
train_module
5 changes: 5 additions & 0 deletions docs/source/train/train_module.rst
@@ -0,0 +1,5 @@
``train.train_module``
======================

.. automodule:: olmo_core.train.train_module
:members:
4 changes: 3 additions & 1 deletion src/examples/huggingface/convert_checkpoint.py
@@ -144,7 +144,9 @@ def validate_conversion(hf_model):

del hf_model

model = MODEL_CONFIG.build(device=device, max_seq_len=131072).eval()
model = MODEL_CONFIG.build()
model.init_weights(device=device, max_seq_len=131072)
model.eval()

log.info("Loading converted checkpoint for validation...")
load_model_and_optim_state(SAVE_PATH, model)
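
This hunk reflects the broader v2 change that model construction and weight initialization are now separate steps: `build()` no longer takes `device` or `max_seq_len`, and `init_weights()` does (the same pattern appears in the updated `src/examples/llama/train.py` below). A standalone sketch, with `MODEL_CONFIG` and `SAVE_PATH` assumed to be defined earlier in the script and the import path of the load helper an assumption:

```python
import torch

from olmo_core.distributed.checkpoint import load_model_and_optim_state

device = torch.device("cuda")

# Build the model first, then materialize its weights on the target device
# with an explicit maximum sequence length, then switch to eval mode.
model = MODEL_CONFIG.build()
model.init_weights(device=device, max_seq_len=131072)
model.eval()

# Load the converted checkpoint into the initialized model for validation.
load_model_and_optim_state(SAVE_PATH, model)
```
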
82 changes: 30 additions & 52 deletions src/examples/llama/train.py
@@ -18,7 +18,7 @@
TokenizerConfig,
)
from olmo_core.distributed.parallel import DataParallelType
from olmo_core.nn.transformer import TransformerConfig, TransformerDataParallelConfig
from olmo_core.nn.transformer import TransformerConfig
from olmo_core.optim import AdamWConfig, CosWithWarmup, OptimGroupOverride
from olmo_core.train import (
Duration,
@@ -32,22 +32,23 @@
ConfigSaverCallback,
DownstreamEvaluatorCallbackConfig,
GPUMemoryMonitorCallback,
GradClipperCallback,
LMEvaluatorCallbackConfig,
ProfilerCallback,
SchedulerCallback,
SequenceLengthSchedulerCallback,
WandBCallback,
)
from olmo_core.utils import get_default_device, seed_all
from olmo_core.train.train_module import (
TransformerDataParallelConfig,
TransformerTrainModuleConfig,
)
from olmo_core.utils import seed_all


@dataclass
class ExperimentConfig(Config):
model: TransformerConfig
optim: AdamWConfig
dataset: NumpyDatasetConfig
data_loader: NumpyDataLoaderConfig
train_module: TransformerTrainModuleConfig
trainer: TrainerConfig
init_seed: int = 12536

@@ -57,30 +58,13 @@ def build_config(run_name: str, overrides: List[str]) -> ExperimentConfig:

model_config = TransformerConfig.llama2_271M(
vocab_size=tokenizer_config.padded_vocab_size(), # a little bigger than actual vocab size to make it a multiple of 128
compile=True,
fused_ops=False,
use_flash=False,
dp_config=TransformerDataParallelConfig(
name=DataParallelType.fsdp, param_dtype=DType.bfloat16, reduce_dtype=DType.float32
),
)

optim_config = AdamWConfig(
lr=1e-3,
group_overrides=[
OptimGroupOverride(params=["embeddings.weight"], opts=dict(weight_decay=0.0))
],
)

dataset_config = NumpyDatasetConfig.glob(
"/net/nfs/allennlp/llm-data/c4/en/c4-train.*.npy", # can be globs
name=NumpyDatasetType.fsl,
sequence_length=1024,
max_target_sequence_length=8192,
# name=NumpyDatasetType.vsl,
# max_sequence_length=2048,
# min_sequence_length=256,
# vsl_curriculum=VSLCurriculumConfig(name=VSLCurriculumType.grow_p2, num_cycles=4),
tokenizer=tokenizer_config,
work_dir="/tmp/dataset-cache",
)
@@ -91,28 +75,32 @@ def build_config(run_name: str, overrides: List[str]) -> ExperimentConfig:
num_workers=4,
)

train_module_config = TransformerTrainModuleConfig(
rank_microbatch_size=16 * 1024,
max_sequence_length=dataset_config.effective_sequence_length,
optim=AdamWConfig(
lr=1e-3,
group_overrides=[
OptimGroupOverride(params=["embeddings.weight"], opts=dict(weight_decay=0.0))
],
),
compile_model=True,
dp_config=TransformerDataParallelConfig(
name=DataParallelType.fsdp, param_dtype=DType.bfloat16, reduce_dtype=DType.float32
),
compile_loss=True,
max_grad_norm=1.0,
scheduler=CosWithWarmup(warmup_steps=100),
)

trainer_config = (
TrainerConfig(
save_folder=f"/tmp/{run_name}",
rank_microbatch_size=16 * 1024,
save_overwrite=True,
metrics_collect_interval=5,
cancel_check_interval=5,
load_key_mapping={
# For backwards compatibility when loading older checkpoints.
"lm_head.w_out.weight": "w_out.weight",
"lm_head.norm.weight": "norm.weight",
},
)
.with_callback("lr_scheduler", SchedulerCallback(scheduler=CosWithWarmup(warmup_steps=100)))
.with_callback(
"seq_len_scheduler",
SequenceLengthSchedulerCallback(
min_sequence_length=128, warmup_steps=100, enabled=False
),
)
.with_callback("gpu_monitor", GPUMemoryMonitorCallback())
.with_callback("grad_clipper", GradClipperCallback(max_grad_norm=1.0))
.with_callback(
"checkpointer",
CheckpointerCallback(
@@ -166,9 +154,9 @@ def build_config(run_name: str, overrides: List[str]) -> ExperimentConfig:

return ExperimentConfig(
model=model_config,
optim=optim_config,
dataset=dataset_config,
data_loader=data_loader_config,
train_module=train_module_config,
trainer=trainer_config,
).merge(overrides)

@@ -179,22 +167,12 @@ def main(run_name: str, overrides: List[str]):
# Set RNG states on all devices.
seed_all(config.init_seed)

device = get_default_device()

# Build the world mesh, if needed.
world_mesh = config.model.build_mesh(device=device)

# Build components.
model = config.model.build(
init_device="meta",
device=device,
max_seq_len=config.dataset.sequence_length,
mesh=world_mesh,
)
optim = config.optim.build(model)
model = config.model.build(init_device="meta")
train_module = config.train_module.build(model)
dataset = config.dataset.build()
data_loader = config.data_loader.build(dataset, mesh=world_mesh)
trainer = config.trainer.build(model, optim, data_loader, mesh=world_mesh)
data_loader = config.data_loader.build(dataset, dp_process_group=train_module.dp_process_group)
trainer = config.trainer.build(train_module, data_loader)

# Save config to W&B and each checkpoint dir.
config_dict = config.as_config_dict()