diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 05d1a4b9c..77ad24c4d 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -17,7 +17,7 @@ body: ```python # All necessary imports at the beginning - import dolma + import olmo # A succinct reproducing example trimmed down to the essential parts: assert False is True, "Oh no!" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ab3089822..346098cdf 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -101,7 +101,7 @@ jobs: if: always() run: | . .venv/bin/activate - pip uninstall -y dolma + pip uninstall -y olmo gpu_tests: name: GPU Tests @@ -109,7 +109,7 @@ jobs: timeout-minutes: 15 env: BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }} - BEAKER_IMAGE: dolma-torch2-test + BEAKER_IMAGE: olmo-torch2-test BEAKER_WORKSPACE: ai2/llm-testing steps: - name: Determine current commit SHA (pull request) diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml index f853fb446..12f918aaf 100644 --- a/.github/workflows/pr_checks.yml +++ b/.github/workflows/pr_checks.yml @@ -9,7 +9,7 @@ on: branches: - main paths: - - 'dolma/**' + - 'olmo/**' jobs: changelog: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5f8294129..f790ef6ac 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -77,8 +77,8 @@ When you're ready to contribute code to address an open issue, please follow the Then you can create and activate a new Python environment by running: - conda create -n dolma python=3.9 - conda activate dolma + conda create -n olmo python=3.9 + conda activate olmo Once your virtual environment is activated, you can install your local clone in "editable mode" with @@ -139,13 +139,13 @@ When you're ready to contribute code to address an open issue, please follow the We also strive to maintain high test coverage, so most contributions should include additions to [the unit 
tests](https://github.com/allenai/LLM/tree/main/tests). These tests are run with [`pytest`](https://docs.pytest.org/en/latest/), which you can use to locally run any test modules that you've added or changed. - For example, if you've fixed a bug in `dolma/a/b.py`, you can run the tests specific to that module with + For example, if you've fixed a bug in `olmo/a/b.py`, you can run the tests specific to that module with pytest -v tests/a/b_test.py To check the code coverage locally in this example, you could run - pytest -v --cov dolma.a.b tests/a/b_test.py + pytest -v --cov olmo.a.b tests/a/b_test.py If your contribution involves additions to any public part of the API, we require that you write docstrings for each function, method, class, or module that you add. diff --git a/Makefile b/Makefile index 3ac3be0f5..d6e37708f 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # If you update this, also update BEAKER_IMAGE in .github/workflows/main.yml -IMAGE_NAME_BASE = dolma-torch2 +IMAGE_NAME_BASE = olmo-torch2 # If you update this, also update BEAKER_WORKSPACE in .github/workflows/main.yml BEAKER_WORKSPACE = ai2/llm-testing @@ -24,7 +24,7 @@ beaker-info : .PHONY : images images : gantry-image test-image -PHONY : base-image +.PHONY : base-image base-image : docker build -f docker/Dockerfile.base -t $(IMAGE_NAME_BASE)-base . 
@@ -91,7 +91,7 @@ gantry-run-ib : --env NCCL_DEBUG=INFO \ --env SCRATCH_DIR=/tmp/scratch \ --env FLASH_DIR=/tmp/flash \ - --env WANDB_PROJECT=dolma-beaker-ib \ + --env WANDB_PROJECT=olmo-beaker-ib \ --env-secret WANDB_API_KEY=WANDB_API_KEY \ --replicas 4 \ --leader-selection \ @@ -103,8 +103,8 @@ gantry-run-ib : .PHONY : check-cpu-install check-cpu-install : - @python -c 'from dolma import check_install; check_install(cuda=False)' + @python -c 'from olmo import check_install; check_install(cuda=False)' .PHONY : check-cuda-install check-cuda-install : - @python -c 'from dolma import check_install; check_install(cuda=True)' + @python -c 'from olmo import check_install; check_install(cuda=True)' diff --git a/README.md b/README.md index 17d9c557a..64bd8a22d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# DOLMA: Delightful Open Language Model from AI2 +# OLMo: Delightful Open Language Model from AI2 ## Setup @@ -24,7 +24,7 @@ gantry run \ --nfs \ --priority preemptible \ --gpus 8 \ - --beaker-image dolma-torch2-gantry \ + --beaker-image olmo-torch2-gantry \ --cluster 'ai2/*-cirrascale' \ --allow-dirty \ -- composer scripts/train.py configs/1.2b-c4.yaml @@ -36,7 +36,7 @@ Train the 70B model on c4 with gantry across multiple nodes: gantry run \ --workspace ai2/llm-testing \ --priority "high" \ - --beaker-image dolma-torch2-gantry \ + --beaker-image olmo-torch2-gantry \ --cluster ai2/general-cirrascale-a100-80g-ib \ --gpus 8 \ --nfs \ @@ -45,7 +45,7 @@ gantry run \ --env NCCL_DEBUG=INFO \ --env SCRATCH_DIR=/tmp/scratch \ --env FLASH_DIR=/tmp/flash \ - --env WANDB_PROJECT=dolma-beaker-ib \ + --env WANDB_PROJECT=olmo-beaker-ib \ --env-secret WANDB_API_KEY=WANDB_API_KEY \ --replicas 4 \ --leader-selection \ diff --git a/RELEASE_PROCESS.md b/RELEASE_PROCESS.md index 03bb4ba80..dc1eb9c94 100644 --- a/RELEASE_PROCESS.md +++ b/RELEASE_PROCESS.md @@ -2,7 +2,7 @@ ## Steps -1. Update the version in `dolma/version.py`. +1. Update the version in `olmo/version.py`. 3. 
Run the release script: diff --git a/conftest.py b/conftest.py index ea8c522bb..3aa87ca93 100644 --- a/conftest.py +++ b/conftest.py @@ -2,7 +2,7 @@ import pytest -from dolma.config import ( +from olmo.config import ( DataConfig, ModelConfig, OptimizerConfig, @@ -11,7 +11,7 @@ TokenizerConfig, TrainConfig, ) -from dolma.tokenizer import Tokenizer +from olmo.tokenizer import Tokenizer TEST_MODEL = "gpt2" diff --git a/docker/Dockerfile.gantry b/docker/Dockerfile.gantry index 14a902870..18fd894b0 100644 --- a/docker/Dockerfile.gantry +++ b/docker/Dockerfile.gantry @@ -4,11 +4,11 @@ # To build and push the image to Beaker, run 'make gantry-image'. # To test the image after pushing to Beaker, run 'make gantry-test'. -FROM dolma-torch2-base +FROM olmo-torch2-base WORKDIR /stage COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -WORKDIR /app/dolma +WORKDIR /app/olmo diff --git a/docker/Dockerfile.test b/docker/Dockerfile.test index 35614df8b..e4589f964 100644 --- a/docker/Dockerfile.test +++ b/docker/Dockerfile.test @@ -4,7 +4,7 @@ # # To build and push the image to Beaker, run 'make test-image'. -FROM dolma-torch2-base +FROM olmo-torch2-base COPY scripts/test_entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh diff --git a/dolma/exceptions.py b/dolma/exceptions.py deleted file mode 100644 index 6f3383d7d..000000000 --- a/dolma/exceptions.py +++ /dev/null @@ -1,19 +0,0 @@ -__all__ = ["DolmaError", "DolmaConfigurationError", "DolmaCliError"] - - -class DolmaError(Exception): - """ - Base class for all custom DOLMA exceptions. - """ - - -class DolmaConfigurationError(DolmaError): - """ - An error with a configuration file. - """ - - -class DolmaCliError(DolmaError): - """ - An error from incorrect CLI usage. 
- """ diff --git a/dolma/__init__.py b/olmo/__init__.py similarity index 86% rename from dolma/__init__.py rename to olmo/__init__.py index 3d9043e83..e14934765 100644 --- a/dolma/__init__.py +++ b/olmo/__init__.py @@ -12,4 +12,4 @@ def check_install(cuda: bool = False): assert torch.cuda.is_available(), "CUDA is not available!" print("CUDA available") - print(f"DOLMA v{VERSION} installed") + print(f"OLMo v{VERSION} installed") diff --git a/dolma/aliases.py b/olmo/aliases.py similarity index 100% rename from dolma/aliases.py rename to olmo/aliases.py diff --git a/dolma/beam_search.py b/olmo/beam_search.py similarity index 100% rename from dolma/beam_search.py rename to olmo/beam_search.py diff --git a/dolma/composer.py b/olmo/composer.py similarity index 95% rename from dolma/composer.py rename to olmo/composer.py index 9f737551e..0952871c5 100644 --- a/dolma/composer.py +++ b/olmo/composer.py @@ -23,16 +23,16 @@ TrainConfig, ) from .data import DataCollator, MemMapDataset -from .exceptions import DolmaConfigurationError -from .model import Dolma, LayerNormBase +from .exceptions import OlmoConfigurationError +from .model import LayerNormBase, Olmo from .optim import DecoupledLionW log = logging.getLogger(__name__) __all__ = [ "TrainBatchPerplexity", - "ComposerDolmaLM", - "DolmaConsoleLogger", + "ComposerOlmoLM", + "OlmoConsoleLogger", "build_dataloader", "build_optimizer", "build_scheduler", @@ -76,10 +76,10 @@ def compute(self) -> torch.Tensor: return torch.exp(self.loss) -class ComposerDolmaLM(ComposerModel): - def __init__(self, model_or_config: Union[Dolma, ModelConfig]): +class ComposerOlmoLM(ComposerModel): + def __init__(self, model_or_config: Union[Olmo, ModelConfig]): super().__init__() - self.model = Dolma(model_or_config) if isinstance(model_or_config, ModelConfig) else model_or_config + self.model = Olmo(model_or_config) if isinstance(model_or_config, ModelConfig) else model_or_config self.config = self.model.config self.num_fwd_flops = 
self.model.num_fwd_flops @@ -131,7 +131,7 @@ def flops_per_batch(self, batch: BatchDict): return self.num_fwd_flops * 3 * batch["input_ids"].shape[0] -class DolmaConsoleLogger(ConsoleLogger): +class OlmoConsoleLogger(ConsoleLogger): metrics_to_log: Set[str] = {"loss/train/total", "trainer/global_step", "metrics/*"} def log_metrics(self, metrics: dict[str, float], step: Optional[int] = None) -> None: @@ -285,7 +285,7 @@ def calculate_batch_size_info( global_batch_size: int, device_microbatch_size: Union[int, str] ) -> Tuple[int, Union[str, int], Union[str, int]]: if global_batch_size % dist.get_world_size() != 0: - raise DolmaConfigurationError( + raise OlmoConfigurationError( f"Global batch size {global_batch_size} is not divisible by {dist.get_world_size()} " "as a result, the batch size would be truncated, please adjust `global_batch_size` " f"to be divisible by world size, {dist.get_world_size()}." @@ -303,7 +303,7 @@ def calculate_batch_size_info( device_microbatch_size = device_batch_size device_grad_accum = math.ceil(device_batch_size / device_microbatch_size) else: - raise DolmaConfigurationError(f"Not sure how to parse {device_microbatch_size=}") + raise OlmoConfigurationError(f"Not sure how to parse {device_microbatch_size=}") return device_batch_size, device_microbatch_size, device_grad_accum @@ -324,7 +324,7 @@ def update_batch_size_info(cfg: TrainConfig): elif isinstance(cfg.device_train_microbatch_size, int): cfg.device_eval_batch_size = cfg.device_train_microbatch_size else: - raise DolmaConfigurationError( + raise OlmoConfigurationError( f"Not sure how to parse device_train_microbatch_size={cfg.device_train_microbatch_size}" ) return cfg diff --git a/dolma/config.py b/olmo/config.py similarity index 98% rename from dolma/config.py rename to olmo/config.py index 1aabacc23..079da5486 100644 --- a/dolma/config.py +++ b/olmo/config.py @@ -22,7 +22,7 @@ from omegaconf.errors import OmegaConfBaseException from .aliases import PathOrStr -from .exceptions 
import DolmaConfigurationError +from .exceptions import OlmoConfigurationError __all__ = [ "ActivationType", @@ -90,7 +90,7 @@ def new(cls: Type[C], **kwargs) -> C: conf = om.merge(conf, kwargs) return cast(C, om.to_object(conf)) except OmegaConfBaseException as e: - raise DolmaConfigurationError(str(e)) + raise OlmoConfigurationError(str(e)) @classmethod def load(cls: Type[C], path: PathOrStr, overrides: Optional[List[str]] = None) -> C: @@ -103,7 +103,7 @@ def load(cls: Type[C], path: PathOrStr, overrides: Optional[List[str]] = None) - conf = om.merge(conf, om.from_dotlist(overrides)) return cast(C, om.to_object(conf)) except OmegaConfBaseException as e: - raise DolmaConfigurationError(str(e)) + raise OlmoConfigurationError(str(e)) def save(self, path: PathOrStr) -> None: """Save to a YAML file.""" @@ -155,7 +155,7 @@ class BlockType(StrEnum): @dataclass class ModelConfig(BaseConfig): """ - DOLMA (model) configuration. + OLMo (model) configuration. """ # Note that the defaults for these attributes are equivalent to the base GPT2 model. @@ -401,7 +401,7 @@ class CompilerConfig(BaseConfig): @dataclass class TrainConfig(BaseConfig): """ - DOLMA training configuration. + OLMo training configuration. 
""" run_name: Optional[str] = None diff --git a/dolma/data/__init__.py b/olmo/data/__init__.py similarity index 100% rename from dolma/data/__init__.py rename to olmo/data/__init__.py diff --git a/dolma/data/collator.py b/olmo/data/collator.py similarity index 100% rename from dolma/data/collator.py rename to olmo/data/collator.py diff --git a/dolma/data/memmap_dataset.py b/olmo/data/memmap_dataset.py similarity index 100% rename from dolma/data/memmap_dataset.py rename to olmo/data/memmap_dataset.py diff --git a/olmo/exceptions.py b/olmo/exceptions.py new file mode 100644 index 000000000..9b46995a5 --- /dev/null +++ b/olmo/exceptions.py @@ -0,0 +1,19 @@ +__all__ = ["OlmoError", "OlmoConfigurationError", "OlmoCliError"] + + +class OlmoError(Exception): + """ + Base class for all custom OLMo exceptions. + """ + + +class OlmoConfigurationError(OlmoError): + """ + An error with a configuration file. + """ + + +class OlmoCliError(OlmoError): + """ + An error from incorrect CLI usage. + """ diff --git a/dolma/model.py b/olmo/model.py similarity index 96% rename from dolma/model.py rename to olmo/model.py index 98c69f841..577d72928 100644 --- a/dolma/model.py +++ b/olmo/model.py @@ -18,7 +18,7 @@ from .beam_search import BeamSearch, Constraint, FinalSequenceScorer, Sampler from .config import ActivationType, BlockType, LayerNormType, ModelConfig -from .exceptions import DolmaConfigurationError +from .exceptions import OlmoConfigurationError __all__ = [ "LayerNormBase", @@ -29,12 +29,12 @@ "GELU", "ReLU", "SwiGLU", - "DolmaBlock", - "DolmaSequentialBlock", - "DolmaParallelBlock", - "Dolma", - "DolmaOutput", - "DolmaGenerateOutput", + "OlmoBlock", + "OlmoSequentialBlock", + "OlmoParallelBlock", + "Olmo", + "OlmoOutput", + "OlmoGenerateOutput", ] @@ -215,7 +215,7 @@ def output_multiplier(self) -> float: return 0.5 -class DolmaBlock(nn.Module): +class OlmoBlock(nn.Module): """ A base class for transformer block implementations. 
""" @@ -317,16 +317,16 @@ def forward( raise NotImplementedError @classmethod - def build(cls, config: ModelConfig) -> DolmaBlock: + def build(cls, config: ModelConfig) -> OlmoBlock: if config.block_type == BlockType.sequential: - return DolmaSequentialBlock(config) + return OlmoSequentialBlock(config) elif config.block_type == BlockType.parallel: - return DolmaParallelBlock(config) + return OlmoParallelBlock(config) else: raise NotImplementedError(f"not sure how to handle block type '{config.block_type}'") -class DolmaSequentialBlock(DolmaBlock): +class OlmoSequentialBlock(OlmoBlock): """ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). @@ -364,11 +364,11 @@ def forward( return x -class DolmaParallelBlock(DolmaBlock): +class OlmoParallelBlock(OlmoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x)) + Attention(LN(x))`` as in the PaLM architecture, as opposed to the typical ``MLP(LN(x + Attention(LN(x))))`` - as in :class:`DolmaSequentialBlock` (ignoring some skip connections). + as in :class:`OlmoSequentialBlock` (ignoring some skip connections). The decoupling of the MLP and Attention functions allow us to fuse the separate input projections into a single linear layer to increase throughput. In this configuration it's also straight-forward @@ -408,7 +408,7 @@ def forward( return x + self.dropout(self.ff_out(self.act(ff))) + self.dropout(att) -class DolmaOutput(NamedTuple): +class OlmoOutput(NamedTuple): logits: torch.FloatTensor """ A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities @@ -416,7 +416,7 @@ class DolmaOutput(NamedTuple): """ -class DolmaGenerateOutput(NamedTuple): +class OlmoGenerateOutput(NamedTuple): token_ids: torch.LongTensor """ The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`. 
@@ -429,21 +429,21 @@ class DolmaGenerateOutput(NamedTuple): """ -class Dolma(nn.Module): +class Olmo(nn.Module): def __init__(self, config: ModelConfig, init_params: bool = True): super().__init__() self.config = config # Validate config. if self.config.alibi and self.config.flash_attention: - raise DolmaConfigurationError("ALiBi is currently not supported with FlashAttention") + raise OlmoConfigurationError("ALiBi is currently not supported with FlashAttention") if self.config.alibi and self.config.rope: - raise DolmaConfigurationError("ALiBi and RoPE are mutually exclusive") + raise OlmoConfigurationError("ALiBi and RoPE are mutually exclusive") if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size: if self.config.embedding_size < self.config.vocab_size: - raise DolmaConfigurationError("embedding size should be at least as big as vocab size") + raise OlmoConfigurationError("embedding size should be at least as big as vocab size") elif self.config.embedding_size % 128 != 0: import warnings @@ -460,7 +460,7 @@ def __init__(self, config: ModelConfig, init_params: bool = True): config.embedding_size or config.vocab_size, config.d_model, device=config.init_device ), emb_drop=nn.Dropout(config.embedding_dropout), - blocks=nn.ModuleList([DolmaBlock.build(config) for _ in range(config.n_layers)]), + blocks=nn.ModuleList([OlmoBlock.build(config) for _ in range(config.n_layers)]), ln_f=LayerNorm.build(config), ) ) @@ -541,7 +541,7 @@ def forward( input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None, attention_bias: Optional[torch.Tensor] = None, - ) -> DolmaOutput: + ) -> OlmoOutput: """ :param input_ids: A tensor of shape `(batch_size, seq_len)`. 
:param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates @@ -625,13 +625,13 @@ def forward( # shape: (batch_size, seq_len, vocab_size) logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore - return DolmaOutput(logits=logits) # type: ignore[arg-type] + return OlmoOutput(logits=logits) # type: ignore[arg-type] def fsdp_wrap_fn(self, module): - return isinstance(module, DolmaBlock) + return isinstance(module, OlmoBlock) def activation_checkpointing_fn(self, module): - return isinstance(module, DolmaBlock) + return isinstance(module, OlmoBlock) def param_init_fn(self, module): from functools import partial @@ -723,7 +723,7 @@ def generate( min_steps: Optional[int] = None, final_sequence_scorer: Optional[FinalSequenceScorer] = None, constraints: Optional[List[Constraint]] = None, - ) -> DolmaGenerateOutput: + ) -> OlmoGenerateOutput: """ Generate token IDs using beam search. @@ -803,7 +803,7 @@ def step( state["attention_bias"] = attention_bias token_ids, scores = beam_search.search(initial_preds, state, step) - return DolmaGenerateOutput( + return OlmoGenerateOutput( token_ids=token_ids, # type: ignore[arg-type] scores=scores, # type: ignore[arg-type] ) diff --git a/dolma/optim.py b/olmo/optim.py similarity index 100% rename from dolma/optim.py rename to olmo/optim.py diff --git a/dolma/py.typed b/olmo/py.typed similarity index 100% rename from dolma/py.typed rename to olmo/py.typed diff --git a/dolma/tokenizer.py b/olmo/tokenizer.py similarity index 97% rename from dolma/tokenizer.py rename to olmo/tokenizer.py index 41b896151..f22243ab6 100644 --- a/dolma/tokenizer.py +++ b/olmo/tokenizer.py @@ -6,7 +6,7 @@ from tokenizers import Tokenizer as BaseTokenizer from .config import TrainConfig, TruncationDirection -from .exceptions import DolmaConfigurationError +from .exceptions import OlmoConfigurationError __all__ = ["Tokenizer"] @@ -43,7 +43,7 @@ def vocab_size(self) -> int: def from_train_config(cls, config: TrainConfig) -> 
Tokenizer: tokenizer = cls.from_pretrained(config.tokenizer.identifier, eos_token_id=config.model.eos_token_id) if config.model.vocab_size != tokenizer.vocab_size: - raise DolmaConfigurationError("vocab size mismatch between config and tokenizer") + raise OlmoConfigurationError("vocab size mismatch between config and tokenizer") return tokenizer @classmethod diff --git a/dolma/util.py b/olmo/util.py similarity index 97% rename from dolma/util.py rename to olmo/util.py index 68977d1fc..5979de2fb 100644 --- a/dolma/util.py +++ b/olmo/util.py @@ -13,7 +13,7 @@ from rich.text import Text from rich.traceback import Traceback -from .exceptions import DolmaCliError, DolmaError +from .exceptions import OlmoCliError, OlmoError _log_extra_fields: Dict[str, Any] = {} @@ -43,7 +43,7 @@ def log_record_factory(*args, **kwargs) -> logging.LogRecord: handler: logging.Handler if ( - os.environ.get("DOLMA_NONINTERACTIVE", False) + os.environ.get("OLMo_NONINTERACTIVE", False) or os.environ.get("DEBIAN_FRONTEND", None) == "noninteractive" or not sys.stdout.isatty() ): @@ -74,9 +74,9 @@ def excepthook(exctype, value, traceback): """ if issubclass(exctype, KeyboardInterrupt): sys.__excepthook__(exctype, value, traceback) - elif issubclass(exctype, DolmaCliError): + elif issubclass(exctype, OlmoCliError): rich.get_console().print(f"[yellow]{value}[/]", highlight=False) - elif issubclass(exctype, DolmaError): + elif issubclass(exctype, OlmoError): rich.get_console().print(Text(f"{exctype.__name__}:", style="red"), value, highlight=False) else: logging.getLogger().critical( diff --git a/dolma/version.py b/olmo/version.py similarity index 100% rename from dolma/version.py rename to olmo/version.py diff --git a/scripts/init_config.py b/scripts/init_config.py index 05740b7da..22143d401 100644 --- a/scripts/init_config.py +++ b/scripts/init_config.py @@ -6,9 +6,9 @@ from pathlib import Path from typing import List -from dolma import TrainConfig -from dolma.exceptions import DolmaCliError -from 
dolma.util import clean_opt, prepare_cli_environment +from olmo import TrainConfig +from olmo.exceptions import OlmoCliError +from olmo.util import clean_opt, prepare_cli_environment log = logging.getLogger(__name__) @@ -27,6 +27,6 @@ def main(save_path: Path, args_list: List[str]) -> None: try: save_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise DolmaCliError(f"Usage: {sys.argv[0]} [SAVE_PATH] [OPTIONS]") + raise OlmoCliError(f"Usage: {sys.argv[0]} [SAVE_PATH] [OPTIONS]") main(Path(save_path), [clean_opt(s) for s in args_list]) diff --git a/scripts/prepare_changelog.py b/scripts/prepare_changelog.py index 768fb5caf..6a286df82 100644 --- a/scripts/prepare_changelog.py +++ b/scripts/prepare_changelog.py @@ -1,7 +1,7 @@ from datetime import datetime from pathlib import Path -from dolma.version import VERSION +from olmo.version import VERSION def main() -> None: diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index 04dfb40f9..6d38f20d2 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -29,8 +29,8 @@ TimeElapsedColumn, ) -from dolma import Tokenizer -from dolma.util import prepare_cli_environment +from olmo import Tokenizer +from olmo.util import prepare_cli_environment log = logging.getLogger(__name__) diff --git a/scripts/release.sh b/scripts/release.sh index 6577df6ff..7b61bfe26 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -2,7 +2,7 @@ set -e -TAG=$(python -c 'from dolma.version import VERSION; print("v" + VERSION)') +TAG=$(python -c 'from olmo.version import VERSION; print("v" + VERSION)') read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt diff --git a/scripts/train.py b/scripts/train.py index 3b276db4b..df1b9ffa3 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -1,5 +1,5 @@ """ -This is the script used to train DOLMA. +This is the script used to train OLMo. 
There is one required positional argument, the path to a YAML :class:`TrainConfig`. Following the YAML path, you could pass any number of options to override @@ -25,9 +25,9 @@ import torch -from dolma import Dolma, TrainConfig -from dolma.exceptions import DolmaCliError -from dolma.util import clean_opt, log_extra_field, prepare_cli_environment +from olmo import Olmo, TrainConfig +from olmo.exceptions import OlmoCliError +from olmo.util import clean_opt, log_extra_field, prepare_cli_environment log = logging.getLogger(__name__) @@ -41,9 +41,9 @@ def main(cfg: TrainConfig) -> None: from composer.utils import dist, get_device, reproducibility from composer.utils.dist import get_node_rank - from dolma.composer import ( - ComposerDolmaLM, - DolmaConsoleLogger, + from olmo.composer import ( + ComposerOlmoLM, + OlmoConsoleLogger, build_algorithm, build_dataloader, build_optimizer, @@ -78,11 +78,11 @@ def main(cfg: TrainConfig) -> None: ) # Initialize the model. - dolma_model = Dolma(cfg.model) + olmo_model = Olmo(cfg.model) if get_node_rank() == 0: - log.info(f"Total number of parameters: {dolma_model.num_params():,d}") + log.info(f"Total number of parameters: {olmo_model.num_params():,d}") log.info( - f"Number of non-embedding parameters: {dolma_model.num_params(include_embedding=False):,d}", + f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}", ) # Compile it if necessary. @@ -90,11 +90,11 @@ def main(cfg: TrainConfig) -> None: compile_kwargs = cfg.compile.asdict() if compile_kwargs.get("fullgraph") is None: compile_kwargs["fullgraph"] = cfg.fsdp_config is None - # As far as duck typing is concerned, this is still a Dolma object. - dolma_model = cast(Dolma, torch.compile(dolma_model, **compile_kwargs)) + # As far as duck typing is concerned, this is still an Olmo object. + olmo_model = cast(Olmo, torch.compile(olmo_model, **compile_kwargs)) # Optimizer. 
- optimizer = build_optimizer(dolma_model, **cfg.optimizer.asdict()) + optimizer = build_optimizer(olmo_model, **cfg.optimizer.asdict()) # Scheduler. scheduler = build_scheduler(cfg.scheduler) @@ -117,13 +117,13 @@ def main(cfg: TrainConfig) -> None: ] # Loggers. - loggers: List[LoggerDestination] = [DolmaConsoleLogger(log_interval=cfg.console_log_interval)] + loggers: List[LoggerDestination] = [OlmoConsoleLogger(log_interval=cfg.console_log_interval)] if cfg.wandb is not None: loggers.append(WandBLogger(init_kwargs={"config": cfg.asdict(exclude=["wandb"])}, **cfg.wandb.asdict())) # Wrap model into composer model. - composer_model = ComposerDolmaLM(dolma_model) - del dolma_model + composer_model = ComposerOlmoLM(olmo_model) + del olmo_model # Trainer. trainer = Trainer( @@ -181,7 +181,7 @@ def main(cfg: TrainConfig) -> None: try: yaml_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise DolmaCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") + raise OlmoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") cfg = TrainConfig.load(yaml_path, [clean_opt(s) for s in args_list]) main(cfg) diff --git a/scripts/upload_artifact.py b/scripts/upload_artifact.py index ee41f5a6b..e53dbcfd3 100644 --- a/scripts/upload_artifact.py +++ b/scripts/upload_artifact.py @@ -6,7 +6,7 @@ from google.cloud import storage from tqdm import tqdm -from dolma.util import prepare_cli_environment +from olmo.util import prepare_cli_environment log = logging.getLogger(__name__) diff --git a/setup.py b/setup.py index 9ca12cb45..7951268c2 100644 --- a/setup.py +++ b/setup.py @@ -15,11 +15,11 @@ def read_requirements(filename: str): # version.py defines the VERSION and VERSION_SHORT variables. # We use exec here so we don't import cached_path whilst setting up. 
VERSION = {} # type: ignore -with open("dolma/version.py", "r") as version_file: +with open("olmo/version.py", "r") as version_file: exec(version_file.read(), VERSION) setup( - name="dolma", + name="olmo", version=VERSION["VERSION"], description="", long_description=open("README.md").read(), @@ -39,7 +39,7 @@ def read_requirements(filename: str): packages=find_packages( exclude=["*.tests", "*.tests.*", "tests.*", "tests"], ), - package_data={"dolma": ["py.typed"]}, + package_data={"olmo": ["py.typed"]}, install_requires=read_requirements("requirements.txt"), extras_require={"dev": read_requirements("dev-requirements.txt")}, python_requires=">=3.8", diff --git a/tests/beam_search_test.py b/tests/beam_search_test.py index 872517c8a..1e6d4156b 100644 --- a/tests/beam_search_test.py +++ b/tests/beam_search_test.py @@ -4,7 +4,7 @@ import pytest import torch -from dolma.beam_search import ( +from olmo.beam_search import ( BeamSearch, GumbelSampler, LengthNormalizedSequenceLogProbabilityScorer, diff --git a/tests/config_test.py b/tests/config_test.py index 3a5607a1e..a0aedc5a3 100644 --- a/tests/config_test.py +++ b/tests/config_test.py @@ -1,6 +1,6 @@ from pathlib import Path -from dolma.config import StrEnum, TrainConfig +from olmo.config import StrEnum, TrainConfig def test_str_enum(): diff --git a/tests/data/collator_test.py b/tests/data/collator_test.py index 0e906caeb..2279570d5 100644 --- a/tests/data/collator_test.py +++ b/tests/data/collator_test.py @@ -1,7 +1,7 @@ import pytest import torch -from dolma.data.collator import DataCollator, PaddingDirection +from olmo.data.collator import DataCollator, PaddingDirection @pytest.mark.parametrize( diff --git a/tests/data/memmap_dataset_test.py b/tests/data/memmap_dataset_test.py index e3c97c862..bb1b1e3d4 100644 --- a/tests/data/memmap_dataset_test.py +++ b/tests/data/memmap_dataset_test.py @@ -3,8 +3,8 @@ import numpy as np -from dolma.data.memmap_dataset import MemMapDataset -from dolma.tokenizer import Tokenizer 
+from olmo.data.memmap_dataset import MemMapDataset +from olmo.tokenizer import Tokenizer def test_mmap_dataset(tokenizer: Tokenizer, tmp_path: Path, lorem_ipsum_docs: List[str]): diff --git a/tests/model_test.py b/tests/model_test.py index a952adb01..a84d4bb2c 100644 --- a/tests/model_test.py +++ b/tests/model_test.py @@ -2,10 +2,10 @@ import torch from torch.nn import CrossEntropyLoss -from dolma import BlockType, Dolma, ModelConfig, Tokenizer, TrainConfig -from dolma.composer import build_optimizer -from dolma.config import PaddingDirection -from dolma.data import DataCollator +from olmo import BlockType, ModelConfig, Olmo, Tokenizer, TrainConfig +from olmo.composer import build_optimizer +from olmo.config import PaddingDirection +from olmo.data import DataCollator @pytest.mark.parametrize( @@ -126,9 +126,9 @@ def test_forward( use_amp = dtype in {torch.float16, torch.bfloat16} - model = Dolma(train_config.model).eval() + model = Olmo(train_config.model).eval() - input1 = tokenizer.encode("My name is DOLMA!") + input1 = tokenizer.encode("My name is OLMo!") input2 = tokenizer.encode("I'm a delightful large open language model :)") batch_inputs = DataCollator.from_train_config(train_config)( [ # type: ignore @@ -223,13 +223,13 @@ def test_backward( else: train_config.model.init_device = "cpu" - model = Dolma(train_config.model).train() + model = Olmo(train_config.model).train() with torch.autocast( device_type="cuda" if cuda else "cpu", enabled=use_amp, dtype=None if not use_amp else dtype ): # Forward pass to get logits. - input_ids = torch.tensor(tokenizer.encode("My name is DOLMA!"), device=train_config.device).unsqueeze(0) + input_ids = torch.tensor(tokenizer.encode("My name is OLMo!"), device=train_config.device).unsqueeze(0) logits = model(input_ids).logits # Compute loss. 
@@ -255,7 +255,7 @@ def test_backward( def test_build_optimizer(model_config: ModelConfig): - build_optimizer(Dolma(model_config)) + build_optimizer(Olmo(model_config)) @pytest.mark.parametrize( @@ -297,9 +297,9 @@ def test_generate( train_config.model.init_device = "cpu" use_amp = dtype in {torch.float16, torch.bfloat16} - model = Dolma(train_config.model).eval() + model = Olmo(train_config.model).eval() - input1 = tokenizer.encode("My name is DOLMA! ", add_special_tokens=False) + input1 = tokenizer.encode("My name is OLMo! ", add_special_tokens=False) input2 = tokenizer.encode("I'm a delightful large open language model :) ", add_special_tokens=False) batch_inputs = DataCollator.from_train_config(train_config)( [ # type: ignore diff --git a/tests/tokenizer_test.py b/tests/tokenizer_test.py index 1e1110c07..a3c761413 100644 --- a/tests/tokenizer_test.py +++ b/tests/tokenizer_test.py @@ -2,7 +2,7 @@ import pytest -from dolma.tokenizer import Tokenizer +from olmo.tokenizer import Tokenizer @pytest.mark.parametrize("add_special_tokens", [pytest.param(x, id=f"specials={x}") for x in (True, False)])