From f8791665cda4e4e54f9c4221efc95fcf8c44839a Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 15 Oct 2024 21:46:59 +0000 Subject: [PATCH 01/58] Eval-only script --- configs/peteish7-weka.yaml | 284 +++++++++++++++-- olmo/config.py | 1 + scripts/beaker/peteish/peteish7-launch.sh | 12 +- scripts/beaker/peteish/peteish7.sh | 6 +- scripts/eval.py | 366 ++++++++++++++++++++++ 5 files changed, 626 insertions(+), 43 deletions(-) create mode 100644 scripts/eval.py diff --git a/configs/peteish7-weka.yaml b/configs/peteish7-weka.yaml index a7dc9d66c..9bd92029e 100644 --- a/configs/peteish7-weka.yaml +++ b/configs/peteish7-weka.yaml @@ -1,4 +1,4 @@ -run_name: peteish7-run001 +run_name: peteish7-backfill seed: 6198 dry_run: false @@ -107,35 +107,35 @@ eval_interval: 1000 eval_subset_num_batches: -1 device_eval_batch_size: ${device_train_microbatch_size} evaluators: - # - label: all-small-ppl-validation - # data: - # num_workers: 0 - # drop_last: true - # # generate_doc_lengths: true - # memmap_dtype: uint32 - # datasets: - # c4_en-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy - # dolma_books-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy - # dolma_common-crawl-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy - # dolma_pes2o-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy - # dolma_reddit-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy - # dolma_stack-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy - # dolma_wiki-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy - # ice-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy - # m2d2_s2orc-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy - # pile-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy - # wikitext_103-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + # generate_doc_lengths: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy ########################## # Downstream evaluations # @@ -154,7 +154,7 @@ evaluators: - label: boolq type: downstream - + - label: sciq type: downstream @@ -230,6 +230,228 @@ evaluators: - label: arc_easy_ppl type: downstream + - label: piqa_rc_0shot + type: downstream + + - label: piqa_rc_0shot_bpb + type: downstream + + - label: piqa_rc_5shot + type: downstream + + - label: piqa_rc_5shot_bpb + type: downstream + + - label: piqa_mc_5shot + type: downstream + + - label: piqa_mc_5shot_bpb + type: downstream + + - label: hellaswag_rc_0shot + type: downstream + + - label: hellaswag_rc_0shot_bpb + type: downstream + + - label: hellaswag_rc_5shot + type: downstream + + - label: hellaswag_rc_5shot_bpb + type: downstream + + - label: hellaswag_mc_5shot + type: downstream + + - label: hellaswag_mc_5shot_bpb + type: downstream + + - label: winogrande_rc_0shot + type: downstream + + - label: winogrande_rc_0shot_bpb + type: downstream + + - label: winogrande_rc_5shot + type: downstream + + - label: winogrande_rc_5shot_bpb + type: downstream + + - label: winogrande_mc_5shot + type: downstream + + - label: winogrande_mc_5shot_bpb + type: downstream + + - label: openbookqa_rc_0shot + type: downstream + + - label: openbookqa_rc_0shot_bpb + type: downstream + + - label: openbookqa_rc_5shot + type: downstream + + - label: openbookqa_rc_5shot_bpb + type: downstream + + - label: openbookqa_mc_5shot + type: downstream + + - label: openbookqa_mc_5shot_bpb + type: downstream + + - label: boolq_rc_0shot + type: downstream + + - label: boolq_rc_0shot_bpb + type: downstream + + - label: boolq_rc_5shot + type: downstream + + - label: boolq_rc_5shot_bpb + type: downstream + + - label: boolq_mc_5shot + type: downstream + + - label: boolq_mc_5shot_bpb + type: downstream + + - label: sciq_rc_0shot + type: downstream + + - label: sciq_rc_0shot_bpb + type: downstream + + # - label: sciq_rc_5shot + # type: downstream + + # - label: sciq_rc_5shot_bpb + # type: downstream + + # - label: sciq_mc_5shot + # type: downstream + + # - label: sciq_mc_5shot_bpb + # type: downstream + + - label: arc_easy_rc_0shot + type: downstream + + - label: arc_easy_rc_0shot_bpb + type: downstream + + - label: arc_easy_rc_5shot + type: downstream + + - label: arc_easy_rc_5shot_bpb + type: downstream + + - label: arc_easy_mc_5shot + type: downstream + + - label: arc_easy_mc_5shot_bpb + type: downstream + + - label: arc_challenge_rc_0shot + type: downstream + + - label: arc_challenge_rc_0shot_bpb + type: downstream + + - label: arc_challenge_rc_5shot + type: downstream + + - label: arc_challenge_rc_5shot_bpb + type: downstream + + - label: arc_challenge_mc_5shot + type: downstream + + - label: arc_challenge_mc_5shot_bpb + type: downstream + + - label: 
copa_rc_0shot + type: downstream + + - label: copa_rc_0shot_bpb + type: downstream + + # - label: copa_rc_5shot + # type: downstream + + # - label: copa_rc_5shot_bpb + # type: downstream + + # - label: copa_mc_5shot + # type: downstream + + # - label: copa_mc_5shot_bpb + # type: downstream + + - label: csqa_rc_0shot + type: downstream + + - label: csqa_rc_0shot_bpb + type: downstream + + - label: csqa_rc_5shot + type: downstream + + - label: csqa_rc_5shot_bpb + type: downstream + + - label: csqa_mc_5shot + type: downstream + + - label: csqa_mc_5shot_bpb + type: downstream + + - label: socialiqa_rc_0shot + type: downstream + + - label: socialiqa_rc_0shot_bpb + type: downstream + + - label: socialiqa_rc_5shot + type: downstream + + - label: socialiqa_rc_5shot_bpb + type: downstream + + - label: socialiqa_mc_5shot + type: downstream + + - label: socialiqa_mc_5shot_bpb + type: downstream + + - label: mmlu_stem_var_bpb + type: downstream + + - label: mmlu_humanities_var_bpb + type: downstream + + - label: mmlu_social_sciences_var_bpb + type: downstream + + - label: mmlu_other_var_bpb + type: downstream + + - label: mmlu_stem_bpb + type: downstream + + - label: mmlu_humanities_bpb + type: downstream + + - label: mmlu_social_sciences_bpb + type: downstream + + - label: mmlu_other_bpb + type: downstream + data: pad_direction: right # generate_doc_lengths: true diff --git a/olmo/config.py b/olmo/config.py index 94e5103d2..a370937ee 100644 --- a/olmo/config.py +++ b/olmo/config.py @@ -667,6 +667,7 @@ class WandbConfig(BaseConfig): log_artifacts: bool = False rank_zero_only: bool = True log_interval: int = 1 + id: Optional[str] = None @dataclass diff --git a/scripts/beaker/peteish/peteish7-launch.sh b/scripts/beaker/peteish/peteish7-launch.sh index 4180dee67..92d316866 100755 --- a/scripts/beaker/peteish/peteish7-launch.sh +++ b/scripts/beaker/peteish/peteish7-launch.sh @@ -2,13 +2,13 @@ set -ex -NUM_NODES=16 +NUM_NODES=8 gantry run \ - --workspace ai2/OLMo-pretraining-stability \ + --workspace ai2/hb-wolf-olmo \ --task-name peteish7 \ --description "Pete-ish 7B" \ - --priority urgent \ + --priority high \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \ @@ -29,11 +29,7 @@ gantry run \ --env R2_PROFILE=R2 \ --env S3_PROFILE=S3 \ --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ + --env-secret WANDB_API_KEY=WANDB_API_KEY \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ diff --git a/scripts/beaker/peteish/peteish7.sh b/scripts/beaker/peteish/peteish7.sh index 11166f700..fe6622c55 100755 --- a/scripts/beaker/peteish/peteish7.sh +++ b/scripts/beaker/peteish/peteish7.sh @@ -48,10 +48,8 @@ torchrun \ --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ --node_rank "${BEAKER_REPLICA_RANK}" \ --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ + scripts/eval.py \ configs/peteish7-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=500 \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ + '--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/eval.py b/scripts/eval.py new file mode 100644 index 000000000..b68175213 --- /dev/null +++ b/scripts/eval.py @@ -0,0 +1,366 @@ +"""Run this script with 'torchrun'.""" + +import gzip +import logging +import sys +from 
datetime import timedelta +from pathlib import Path +from typing import Optional, TextIO + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import wandb +from packaging import version +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import ShardingStrategy +from torch.nn.parallel import DistributedDataParallel as DDP + +from olmo.config import ( + CheckpointType, + DDPGradSyncMode, + DistributedStrategy, + TrainConfig, +) +from olmo.data import build_train_dataloader +from olmo.eval import build_evaluators +from olmo.exceptions import OLMoCliError, OLMoConfigurationError +from olmo.model import OLMo +from olmo.optim import BoltOnWarmupScheduler, build_optimizer, build_scheduler +from olmo.torch_util import ( + barrier, + get_default_device, + get_global_rank, + get_local_rank, + get_local_world_size, + get_world_size, + peak_gpu_memory, + seed_all, +) +from olmo.train import Trainer +from olmo.util import ( + add_cached_path_clients, + clean_opt, + find_latest_checkpoint, + log_extra_field, + prepare_cli_environment, +) + +log = logging.getLogger("train") + + +def main(cfg: TrainConfig) -> None: + # Ensure run name set. + if cfg.run_name is None: + raise OLMoConfigurationError("--run_name is required") + log_extra_field("run_name", cfg.run_name) + + # Sanity check + if (cfg.reset_optimizer_state or cfg.reset_trainer_state) and cfg.load_path is None: + log.warning( + "You want to reset the optimizer or trainer state, but we're not loading from the checkpoint. The" + "setting has no effect." + ) + + barrier() + + device = torch.device("cuda") + + # Fill some configuration options. + cfg.model.precision = cfg.precision + cfg.device_train_batch_size = cfg.global_train_batch_size // get_world_size() + assert cfg.device_train_batch_size is not None # for mypy + cfg.device_train_grad_accum = cfg.device_train_batch_size // cfg.device_train_microbatch_size + if cfg.optimizer.no_decay_norm_and_bias is not None: + log.warning( + "You set the deprecated config option `no_decay_norm_and_bias`. For compatibility, this" + "setting will take precedence over all other weight decay configurations. Please change" + "your config to use `decay_norm_and_bias` and `decay_embeddings` instead." + ) + cfg.optimizer.decay_norm_and_bias = not cfg.optimizer.no_decay_norm_and_bias + cfg.optimizer.decay_embeddings = not cfg.optimizer.no_decay_norm_and_bias + cfg.optimizer.no_decay_norm_and_bias = None # So nobody uses this by accident. + + # # Display and save configuration. + # if get_global_rank() == 0: + # if cfg.data.paths is not None and len(cfg.data.paths) < 50: + # log.info("Configuration:") + # log.info(cfg) + # if not cfg.dry_run and (cfg.load_path is None or Path(cfg.load_path).parent != Path(cfg.save_folder)): + # # Save config. + # save_path = Path(cfg.save_folder) / "config.yaml" + # if save_path.is_file() and not cfg.save_overwrite: + # raise OLMoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite") + # else: + # log.info(f"Saving config to {save_path}") + # save_path.parent.mkdir(exist_ok=True, parents=True) + # cfg.save(save_path) + # del save_path + + barrier() + + # Maybe start W&B run. 
+ if cfg.wandb is not None and (get_global_rank() == 0 or not cfg.wandb.rank_zero_only): + wandb_dir = Path(cfg.save_folder) / "wandb" + wandb_dir.mkdir(parents=True, exist_ok=True) + wandb.init( + dir=wandb_dir, + project=cfg.wandb.project, + entity=cfg.wandb.entity, + group=cfg.wandb.group, + name=cfg.wandb.name, + tags=cfg.wandb.tags, + config=cfg.asdict(exclude=["wandb"]), + id=cfg.wandb.id, + resume="allow", + ) + + barrier() + + # Set seed. + seed_all(cfg.seed) + + # # Construct data loader. + # train_loader = build_train_dataloader(cfg) + train_loader = None + + # Construct evaluators. + evaluators = build_evaluators(cfg, device) + barrier() + + # Initialize the model. + log.info("Building model...") + olmo_model = OLMo(cfg.model) + log.info(f"Total number of parameters: {olmo_model.num_params():,d}") + log.info(f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}") + log.info(f"Peak GPU Memory (MB) before {cfg.distributed_strategy}: {int(peak_gpu_memory() or 0)}") + + olmo_model.set_activation_checkpointing(cfg.activation_checkpointing) + + if cfg.distributed_strategy == DistributedStrategy.ddp: + log.info("Wrapping model with DDP...") + assert cfg.ddp is not None, "DistributedStrategy ddp needs cfg.ddp to be set!" + + if cfg.model.init_device != "cuda": + raise OLMoConfigurationError("DDP does not work with init_device set to anything other than `cuda`.") + + if cfg.ddp.find_unused_params is True and cfg.ddp.grad_sync_mode != DDPGradSyncMode.micro_batch: + raise OLMoConfigurationError( + "`find_unused_params` is set to True. DDP needs to synchronize gradients for every micro-batch to avoid errors. Set `grad_sync_mode` to `micro_batch`." + ) + + param_init_fn = None + + # move to cuda before calling ddp + dist_model = DDP(olmo_model.to(device), find_unused_parameters=cfg.ddp.find_unused_params) + elif cfg.distributed_strategy == DistributedStrategy.fsdp: + # Wrap the model in FSDP. + log.info("Wrapping model with FSDP...") + assert cfg.fsdp is not None, "DistributedStrategy fsdp needs cfg.fsdp to be set!" 
+ wrap_policy = olmo_model.get_fsdp_wrap_policy(cfg.fsdp.wrapping_strategy) + + if version.parse(torch.__version__) >= version.parse("2.1.0"): + # This prevents any parameters from being initialized twice + def dummy_init_fn(module: torch.nn.Module) -> None: + module.to_empty(device=get_default_device()) + + param_init_fn = dummy_init_fn + else: + param_init_fn = None + + # Set up device mesh for hybrid sharding in order to specify which nodes are assoicated to a given model replica + device_mesh = None + hybrid_sharding_fsdp_kwargs = {} + if cfg.fsdp.sharding_strategy in (ShardingStrategy.HYBRID_SHARD, ShardingStrategy._HYBRID_SHARD_ZERO2): + if version.parse(torch.__version__) < version.parse("2.2.0"): + # Device mesh was not added to PyTorch until v2.2.0 + raise OLMoConfigurationError( + "OLMo training does not correctly support hybrid sharding before torch 2.2.0" + ) + + from torch.distributed.device_mesh import init_device_mesh + + num_model_replicas = cfg.fsdp.hybrid_sharding_num_model_replicas or ( + get_world_size() // get_local_world_size() + ) + + if num_model_replicas <= 0: + raise OLMoConfigurationError("fsdp.hybrid_sharding_num_model_replicas must be a positive integer") + + if get_world_size() % num_model_replicas != 0: + raise OLMoConfigurationError("fsdp.hybrid_sharding_num_model_replicas must divide world size") + + device_mesh = init_device_mesh("cuda", (num_model_replicas, get_world_size() // num_model_replicas)) + hybrid_sharding_fsdp_kwargs["device_mesh"] = device_mesh + + dist_model = FSDP( + olmo_model, + sharding_strategy=cfg.fsdp.sharding_strategy, + mixed_precision=cfg.fsdp_precision, + auto_wrap_policy=wrap_policy, + use_orig_params=cfg.fsdp.use_orig_params, # needed for compile and some of our optimizer/parameter metrics + limit_all_gathers=True, + device_id=get_local_rank(), + param_init_fn=param_init_fn, + **hybrid_sharding_fsdp_kwargs, + ) + elif cfg.distributed_strategy is None: + raise NotImplementedError("Single accelerator training not implemented yet!") + + # when param_init_fn is None, FSDP will call reset_parameters() automatically + if param_init_fn is not None or cfg.distributed_strategy == DistributedStrategy.ddp: + olmo_model.reset_parameters() + + log.info(f"Peak GPU Memory (MB) after {cfg.distributed_strategy}: {int(peak_gpu_memory() or 0)}") + log.info("Model:") + log.info(dist_model) + + # Construct optimizer and learning rate scheduler. + optim = build_optimizer(cfg, dist_model) + scheduler = build_scheduler(cfg) + + # Data indices file. + indices_file: Optional[TextIO] = None + if cfg.save_data_indices: + indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz" + if indices_file_path.exists() and not cfg.save_overwrite: + raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") + indices_file_path.parent.mkdir(exist_ok=True, parents=True) + indices_file = gzip.open(indices_file_path, "wt") + + # Consolidate components into `Trainer` object. 
+ with Trainer( + cfg=cfg, + epoch=cfg.epoch, + model=olmo_model, + dist_model=dist_model, + optim=optim, + scheduler=scheduler, + train_loader=train_loader, + device=device, + evaluators=evaluators, + indices_file=indices_file, + ) as trainer: + # if cfg.try_load_latest_save: + # if ( + # cfg.save_folder is not None + # and (checkpoint_dir := find_latest_checkpoint(cfg.save_folder)) is not None + # ): + # log.info("Setting load path to local checkpoint %s", checkpoint_dir) + # cfg.load_path = str(checkpoint_dir) + # elif ( + # cfg.remote_save_folder is not None + # and (checkpoint_dir := find_latest_checkpoint(cfg.remote_save_folder)) is not None + # ): + # log.info("Setting load path to remote checkpoint %s", checkpoint_dir) + # cfg.load_path = str(checkpoint_dir) + + # if not cfg.dry_run and not cfg.no_pre_train_checkpoint and cfg.load_path is None: + # if cfg.distributed_strategy == DistributedStrategy.ddp: + # checkpoint_type = CheckpointType.unsharded + + # if cfg.save_interval_unsharded is None: + # log.warning( + # "DDP requires setting `save_interval_unsharded`. Using the value set for `save_interval`." + # ) + # cfg.save_interval_unsharded = cfg.save_interval + + # if cfg.save_num_unsharded_checkpoints_to_keep == 0: + # log.warning( + # "DDP requires setting `save_num_unsharded_checkpoints_to_keep`. Using the value set for `save_num_checkpoints_to_keep`." + # ) + # cfg.save_num_unsharded_checkpoints_to_keep = cfg.save_num_checkpoints_to_keep + # elif cfg.distributed_strategy == DistributedStrategy.fsdp: + # checkpoint_type = ( + # CheckpointType.sharded if cfg.save_num_checkpoints_to_keep != 0 else CheckpointType.unsharded + # ) + # else: + # raise NotImplementedError(f"Distributed strategy {cfg.distributed_strategy} not supported yet!") + + # # We save a checkpoint up-front to make sure this won't fail (due to disk space or whatever). + # log.info("Saving pre-train checkpoint...") + # checkpoint_path, local_checkpoint_cache = trainer.save_checkpoint(checkpoint_type=checkpoint_type) + # log.info(f"Checkpoint saved to {checkpoint_path}") + + # # And they we verify that we can load it. 
+ # log.info("Attempting to load pre-train checkpoint...") + # trainer.restore_checkpoint( + # checkpoint_path, checkpoint_type=checkpoint_type, local_cache=local_checkpoint_cache + # ) + # log.info("Checkpoint successfully loaded") + + # # NOTE: https://github.com/allenai/LLM/issues/233 + # # log.info("Removing pre-train checkpoint...") + # # trainer.remove_checkpoint(checkpoint_type=checkpoint_type) + # # log.info("Successfully removed checkpoint") + + if cfg.load_path is not None: + log.info(f"Loading checkpoint from {cfg.load_path}...") + trainer.restore_checkpoint( + cfg.load_path, + load_optimizer_state=not cfg.reset_optimizer_state, + load_trainer_state=not cfg.reset_trainer_state, + sharded_checkpointer=cfg.load_path_sharded_checkpointer, + ) + log.info("Checkpoint successfully loaded") + + # If we have to, set a new scheduler: + if cfg.reset_optimizer_state and not cfg.reset_trainer_state: + trainer.scheduler = BoltOnWarmupScheduler.wrap( + trainer.scheduler, + trainer.global_step, + int(trainer.global_step + cfg.scheduler.t_warmup), + ) + + # if cfg.force_save_unsharded and cfg.distributed_strategy != DistributedStrategy.ddp: + # log.info("Saving unsharded checkpoint...") + # checkpoint_path, _ = trainer.save_checkpoint(checkpoint_type=CheckpointType.unsharded) + # log.info(f"Unsharded checkpoint saved to {checkpoint_path}") + + if cfg.compile is not None: + # TODO (epwalsh): trying to compile the whole train step results in a compile-time error from within + # the optimizer. We should investigate this further at some point. + # trainer.train_step = torch.compile(trainer.train_step, **cfg.compile.asdict()) + trainer.train_batch = torch.compile(trainer.train_batch, **cfg.compile.asdict()) # type: ignore + # TODO (epwalsh): compiling the `eval_batch()` method is a little sketchy since the inputs will look + # different for different eval tasks. That might be okay, but it might not be. + # trainer.eval_batch = torch.compile(trainer.eval_batch, **cfg.compile.asdict()) # type: ignore + # Alternatively, could just do this: + # trainer.fsdp_model = torch.compile(trainer.fsdp_model, **cfg.compile.asdict()) + + if not cfg.dry_run: + log.info("Starting evaluating...") + eval_metrics = trainer.eval() + if wandb.run is not None: + wandb.log(eval_metrics, step=trainer.global_step) + log.info("Evaluating complete") + else: + log.info("Dry run complete") + + +if __name__ == "__main__": + try: + mp.set_start_method("spawn", force=True) + except RuntimeError as e: + print(f"failed to set multiprocessing start method: {e}") + log.info(f"Multiprocessing start method set to '{mp.get_start_method()}'") + + # Set CUDA device. + torch.cuda.set_device(f"cuda:{get_local_rank()}") + + # Initialize process group. 
+ dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30)) + log.info("Process group initialized") + + prepare_cli_environment() + log.info("CLI environment prepared") + + add_cached_path_clients() + + try: + yaml_path, args_list = sys.argv[1], sys.argv[2:] + except IndexError: + raise OLMoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") + + cfg = TrainConfig.load(yaml_path, [clean_opt(s) for s in args_list]) + main(cfg) From 7619ad764c04bcc4277740174505c045debc0ead Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 15 Oct 2024 21:51:40 +0000 Subject: [PATCH 02/58] Fix env --- scripts/beaker/peteish/peteish7.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/beaker/peteish/peteish7.sh b/scripts/beaker/peteish/peteish7.sh index fe6622c55..480e37876 100755 --- a/scripts/beaker/peteish/peteish7.sh +++ b/scripts/beaker/peteish/peteish7.sh @@ -26,8 +26,8 @@ pip freeze # Move AWS credentials from env to relevant files mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials +# printenv AWS_CONFIG > ~/.aws/config +# printenv AWS_CREDENTIALS > ~/.aws/credentials # Force processes to synchronize at init_process_group export TORCH_DIST_INIT_BARRIER=1 From 2b877579970099c2b342d06ff971994a3c90a73e Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 15 Oct 2024 22:04:49 +0000 Subject: [PATCH 03/58] Disable saving data indices --- scripts/beaker/peteish/peteish7-launch.sh | 6 +++--- scripts/beaker/peteish/peteish7.sh | 2 +- scripts/eval.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/beaker/peteish/peteish7-launch.sh b/scripts/beaker/peteish/peteish7-launch.sh index 92d316866..c8db28f03 100755 --- a/scripts/beaker/peteish/peteish7-launch.sh +++ b/scripts/beaker/peteish/peteish7-launch.sh @@ -5,6 +5,7 @@ set -ex NUM_NODES=8 gantry run \ + --allow-dirty \ --workspace ai2/hb-wolf-olmo \ --task-name peteish7 \ --description "Pete-ish 7B" \ @@ -26,10 +27,9 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ --env-secret WANDB_API_KEY=WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ diff --git a/scripts/beaker/peteish/peteish7.sh b/scripts/beaker/peteish/peteish7.sh index 480e37876..98d419338 100755 --- a/scripts/beaker/peteish/peteish7.sh +++ b/scripts/beaker/peteish/peteish7.sh @@ -25,7 +25,7 @@ pip install '.[train]' pip freeze # Move AWS credentials from env to relevant files -mkdir -p ~/.aws +# mkdir -p ~/.aws # printenv AWS_CONFIG > ~/.aws/config # printenv AWS_CREDENTIALS > ~/.aws/credentials diff --git a/scripts/eval.py b/scripts/eval.py index b68175213..f54b3cafc 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -221,12 +221,12 @@ def dummy_init_fn(module: torch.nn.Module) -> None: # Data indices file. 
indices_file: Optional[TextIO] = None - if cfg.save_data_indices: - indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz" - if indices_file_path.exists() and not cfg.save_overwrite: - raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") - indices_file_path.parent.mkdir(exist_ok=True, parents=True) - indices_file = gzip.open(indices_file_path, "wt") + # if cfg.save_data_indices: + # indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz" + # if indices_file_path.exists() and not cfg.save_overwrite: + # raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") + # indices_file_path.parent.mkdir(exist_ok=True, parents=True) + # indices_file = gzip.open(indices_file_path, "wt") # Consolidate components into `Trainer` object. with Trainer( From aeabd027d77d226c867b9a00a3e9c61147f53bd7 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 15 Oct 2024 22:10:52 +0000 Subject: [PATCH 04/58] Restore train dataloader --- scripts/eval.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/eval.py b/scripts/eval.py index f54b3cafc..508e01ede 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -121,8 +121,7 @@ def main(cfg: TrainConfig) -> None: seed_all(cfg.seed) # # Construct data loader. - # train_loader = build_train_dataloader(cfg) - train_loader = None + train_loader = build_train_dataloader(cfg) # Construct evaluators. evaluators = build_evaluators(cfg, device) From 414277be1a1c5749019c14a1d918206534b014eb Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 15 Oct 2024 22:16:54 +0000 Subject: [PATCH 05/58] Do not load train state --- scripts/eval.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/eval.py b/scripts/eval.py index 508e01ede..e62d048e6 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -121,7 +121,8 @@ def main(cfg: TrainConfig) -> None: seed_all(cfg.seed) # # Construct data loader. - train_loader = build_train_dataloader(cfg) + # train_loader = build_train_dataloader(cfg) + train_loader = None # Construct evaluators. evaluators = build_evaluators(cfg, device) @@ -295,11 +296,16 @@ def dummy_init_fn(module: torch.nn.Module) -> None: if cfg.load_path is not None: log.info(f"Loading checkpoint from {cfg.load_path}...") - trainer.restore_checkpoint( + # trainer.restore_checkpoint( + # cfg.load_path, + # load_optimizer_state=not cfg.reset_optimizer_state, + # load_trainer_state=not cfg.reset_trainer_state, + # sharded_checkpointer=cfg.load_path_sharded_checkpointer, + # ) + trainer.restore_unsharded_checkpoint( cfg.load_path, - load_optimizer_state=not cfg.reset_optimizer_state, - load_trainer_state=not cfg.reset_trainer_state, - sharded_checkpointer=cfg.load_path_sharded_checkpointer, + load_optimizer_state=False, + load_trainer_state=False, ) log.info("Checkpoint successfully loaded") From b27a822338eec233cc715a72fe73076462e98c25 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 15 Oct 2024 23:19:29 +0000 Subject: [PATCH 06/58] Bypass trainer state --- olmo/checkpoint.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/olmo/checkpoint.py b/olmo/checkpoint.py index 544441450..c30a3ad2f 100644 --- a/olmo/checkpoint.py +++ b/olmo/checkpoint.py @@ -781,11 +781,12 @@ def restore_checkpoint( ) # Load other state. 
- try: - trainer_state = load_state_dict(load_path, "train.pt", local_cache=local_cache) - except FileNotFoundError: - # for backwards compatibility - trainer_state = load_state_dict(load_path, "other.pt", local_cache=local_cache) + trainer_state = None + # try: + # trainer_state = load_state_dict(load_path, "train.pt", local_cache=local_cache) + # except FileNotFoundError: + # # for backwards compatibility + # trainer_state = load_state_dict(load_path, "other.pt", local_cache=local_cache) barrier() return trainer_state From ea0cf0746dec3874c4ebbd6c1a66269999afafef Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Wed, 16 Oct 2024 16:42:59 +0000 Subject: [PATCH 07/58] Fix save folder --- scripts/beaker/peteish/peteish7-launch.sh | 2 +- scripts/beaker/peteish/peteish7.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/beaker/peteish/peteish7-launch.sh b/scripts/beaker/peteish/peteish7-launch.sh index c8db28f03..ec2eb5515 100755 --- a/scripts/beaker/peteish/peteish7-launch.sh +++ b/scripts/beaker/peteish/peteish7-launch.sh @@ -7,7 +7,7 @@ NUM_NODES=8 gantry run \ --allow-dirty \ --workspace ai2/hb-wolf-olmo \ - --task-name peteish7 \ + --task-name peteish7-eval \ --description "Pete-ish 7B" \ --priority high \ --preemptible \ diff --git a/scripts/beaker/peteish/peteish7.sh b/scripts/beaker/peteish/peteish7.sh index 98d419338..4cc5d5640 100755 --- a/scripts/beaker/peteish/peteish7.sh +++ b/scripts/beaker/peteish/peteish7.sh @@ -52,4 +52,5 @@ torchrun \ configs/peteish7-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=500 \ + --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7" \ '--load_path=${path.last_checkpoint:${save_folder}}' \ From 746c6748d2ea29653e106b286520ef8f125a51ca Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 03:59:07 +0000 Subject: [PATCH 08/58] Switch to loading sharded ckpt --- configs/peteish1-weka.yaml | 282 +++++++++++++++++++++++++++++++++---- configs/peteish7-weka.yaml | 2 +- olmo/checkpoint.py | 11 +- scripts/eval.py | 18 +-- 4 files changed, 267 insertions(+), 46 deletions(-) diff --git a/configs/peteish1-weka.yaml b/configs/peteish1-weka.yaml index 071c5399b..896500244 100644 --- a/configs/peteish1-weka.yaml +++ b/configs/peteish1-weka.yaml @@ -108,35 +108,35 @@ eval_interval: 1000 eval_subset_num_batches: -1 device_eval_batch_size: ${device_train_microbatch_size} evaluators: - # - label: all-small-ppl-validation - # data: - # num_workers: 0 - # drop_last: true - # # generate_doc_lengths: true - # memmap_dtype: uint32 - # datasets: - # c4_en-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy - # dolma_books-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy - # dolma_common-crawl-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy - # dolma_pes2o-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy - # dolma_reddit-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy - # dolma_stack-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy - # dolma_wiki-validation: - # - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy - # ice-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy - # m2d2_s2orc-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy - # pile-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy - # wikitext_103-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + # generate_doc_lengths: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy ########################## # Downstream evaluations # @@ -155,7 +155,7 @@ evaluators: - label: boolq type: downstream - + - label: sciq type: downstream @@ -231,6 +231,228 @@ evaluators: - label: arc_easy_ppl type: downstream + - label: piqa_rc_0shot + type: downstream + + - label: piqa_rc_0shot_bpb + type: downstream + + - label: piqa_rc_5shot + type: downstream + + - label: piqa_rc_5shot_bpb + type: downstream + + - label: piqa_mc_5shot + type: downstream + + - label: piqa_mc_5shot_bpb + type: downstream + + - label: hellaswag_rc_0shot + type: downstream + + - label: hellaswag_rc_0shot_bpb + type: downstream + + - label: hellaswag_rc_5shot + type: downstream + + - label: hellaswag_rc_5shot_bpb + type: downstream + + - label: hellaswag_mc_5shot + type: downstream + + - label: hellaswag_mc_5shot_bpb + type: downstream + + - label: winogrande_rc_0shot + type: downstream + + - label: winogrande_rc_0shot_bpb + type: downstream + + - label: winogrande_rc_5shot + type: downstream + + - label: winogrande_rc_5shot_bpb + type: downstream + + - label: winogrande_mc_5shot + type: downstream + + - label: winogrande_mc_5shot_bpb + type: downstream + + - label: 
openbookqa_rc_0shot + type: downstream + + - label: openbookqa_rc_0shot_bpb + type: downstream + + - label: openbookqa_rc_5shot + type: downstream + + - label: openbookqa_rc_5shot_bpb + type: downstream + + - label: openbookqa_mc_5shot + type: downstream + + - label: openbookqa_mc_5shot_bpb + type: downstream + + - label: boolq_rc_0shot + type: downstream + + - label: boolq_rc_0shot_bpb + type: downstream + + - label: boolq_rc_5shot + type: downstream + + - label: boolq_rc_5shot_bpb + type: downstream + + - label: boolq_mc_5shot + type: downstream + + - label: boolq_mc_5shot_bpb + type: downstream + + - label: sciq_rc_0shot + type: downstream + + - label: sciq_rc_0shot_bpb + type: downstream + + # - label: sciq_rc_5shot + # type: downstream + + # - label: sciq_rc_5shot_bpb + # type: downstream + + # - label: sciq_mc_5shot + # type: downstream + + # - label: sciq_mc_5shot_bpb + # type: downstream + + - label: arc_easy_rc_0shot + type: downstream + + - label: arc_easy_rc_0shot_bpb + type: downstream + + - label: arc_easy_rc_5shot + type: downstream + + - label: arc_easy_rc_5shot_bpb + type: downstream + + - label: arc_easy_mc_5shot + type: downstream + + - label: arc_easy_mc_5shot_bpb + type: downstream + + - label: arc_challenge_rc_0shot + type: downstream + + - label: arc_challenge_rc_0shot_bpb + type: downstream + + - label: arc_challenge_rc_5shot + type: downstream + + - label: arc_challenge_rc_5shot_bpb + type: downstream + + - label: arc_challenge_mc_5shot + type: downstream + + - label: arc_challenge_mc_5shot_bpb + type: downstream + + - label: copa_rc_0shot + type: downstream + + - label: copa_rc_0shot_bpb + type: downstream + + # - label: copa_rc_5shot + # type: downstream + + # - label: copa_rc_5shot_bpb + # type: downstream + + # - label: copa_mc_5shot + # type: downstream + + # - label: copa_mc_5shot_bpb + # type: downstream + + - label: csqa_rc_0shot + type: downstream + + - label: csqa_rc_0shot_bpb + type: downstream + + - label: csqa_rc_5shot + type: downstream + + - label: csqa_rc_5shot_bpb + type: downstream + + - label: csqa_mc_5shot + type: downstream + + - label: csqa_mc_5shot_bpb + type: downstream + + - label: socialiqa_rc_0shot + type: downstream + + - label: socialiqa_rc_0shot_bpb + type: downstream + + - label: socialiqa_rc_5shot + type: downstream + + - label: socialiqa_rc_5shot_bpb + type: downstream + + - label: socialiqa_mc_5shot + type: downstream + + - label: socialiqa_mc_5shot_bpb + type: downstream + + - label: mmlu_stem_var_bpb + type: downstream + + - label: mmlu_humanities_var_bpb + type: downstream + + - label: mmlu_social_sciences_var_bpb + type: downstream + + - label: mmlu_other_var_bpb + type: downstream + + - label: mmlu_stem_bpb + type: downstream + + - label: mmlu_humanities_bpb + type: downstream + + - label: mmlu_social_sciences_bpb + type: downstream + + - label: mmlu_other_bpb + type: downstream + data: pad_direction: right # generate_doc_lengths: true diff --git a/configs/peteish7-weka.yaml b/configs/peteish7-weka.yaml index 9bd92029e..1e96ffbe2 100644 --- a/configs/peteish7-weka.yaml +++ b/configs/peteish7-weka.yaml @@ -1,4 +1,4 @@ -run_name: peteish7-backfill +run_name: peteish7-run001 seed: 6198 dry_run: false diff --git a/olmo/checkpoint.py b/olmo/checkpoint.py index c30a3ad2f..544441450 100644 --- a/olmo/checkpoint.py +++ b/olmo/checkpoint.py @@ -781,12 +781,11 @@ def restore_checkpoint( ) # Load other state. 
- trainer_state = None - # try: - # trainer_state = load_state_dict(load_path, "train.pt", local_cache=local_cache) - # except FileNotFoundError: - # # for backwards compatibility - # trainer_state = load_state_dict(load_path, "other.pt", local_cache=local_cache) + try: + trainer_state = load_state_dict(load_path, "train.pt", local_cache=local_cache) + except FileNotFoundError: + # for backwards compatibility + trainer_state = load_state_dict(load_path, "other.pt", local_cache=local_cache) barrier() return trainer_state diff --git a/scripts/eval.py b/scripts/eval.py index e62d048e6..2033febc7 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -296,17 +296,17 @@ def dummy_init_fn(module: torch.nn.Module) -> None: if cfg.load_path is not None: log.info(f"Loading checkpoint from {cfg.load_path}...") - # trainer.restore_checkpoint( - # cfg.load_path, - # load_optimizer_state=not cfg.reset_optimizer_state, - # load_trainer_state=not cfg.reset_trainer_state, - # sharded_checkpointer=cfg.load_path_sharded_checkpointer, - # ) - trainer.restore_unsharded_checkpoint( + trainer.restore_checkpoint( cfg.load_path, - load_optimizer_state=False, - load_trainer_state=False, + load_optimizer_state=not cfg.reset_optimizer_state, + load_trainer_state=not cfg.reset_trainer_state, + sharded_checkpointer=cfg.load_path_sharded_checkpointer, ) + # trainer.restore_unsharded_checkpoint( + # cfg.load_path, + # load_optimizer_state=False, + # load_trainer_state=False, + # ) log.info("Checkpoint successfully loaded") # If we have to, set a new scheduler: From cb54f80cfb19631136d9d4e55517019e64557789 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 04:02:55 +0000 Subject: [PATCH 09/58] Eval peteish1 --- scripts/beaker/peteish/peteish1-launch.sh | 18 +++++++----------- scripts/beaker/peteish/peteish1.sh | 13 ++++++------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/scripts/beaker/peteish/peteish1-launch.sh b/scripts/beaker/peteish/peteish1-launch.sh index ffab0df65..9540b9005 100755 --- a/scripts/beaker/peteish/peteish1-launch.sh +++ b/scripts/beaker/peteish/peteish1-launch.sh @@ -5,10 +5,11 @@ set -ex NUM_NODES=16 gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name peteish1 \ + --allow-dirty \ + --workspace ai2/hb-wolf-olmo \ + --task-name peteish1-eval \ --description "Pete-ish 1B" \ - --priority urgent \ + --priority high \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \ @@ -26,14 +27,9 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ + --env-secret WANDB_API_KEY=WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index 270f3e4b5..289d33b15 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -25,9 +25,9 @@ pip install '.[train]' pip freeze # Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > 
~/.aws/credentials +# mkdir -p ~/.aws +# printenv AWS_CONFIG > ~/.aws/config +# printenv AWS_CREDENTIALS > ~/.aws/credentials # Force processes to synchronize at init_process_group export TORCH_DIST_INIT_BARRIER=1 @@ -48,10 +48,9 @@ torchrun \ --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ --node_rank "${BEAKER_REPLICA_RANK}" \ --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ + scripts/eval.py \ configs/peteish1-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=null \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ + --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1" \ + '--load_path=${path.last_checkpoint:${save_folder}}' \ From 7b403102d82a582c5d93d1defc7470047766b684 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 04:11:49 +0000 Subject: [PATCH 10/58] Switch to 1 node --- scripts/beaker/peteish/peteish1-launch.sh | 13 +++++++------ scripts/beaker/peteish/peteish1.sh | 6 ++++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/beaker/peteish/peteish1-launch.sh b/scripts/beaker/peteish/peteish1-launch.sh index 9540b9005..bfbbe8d72 100755 --- a/scripts/beaker/peteish/peteish1-launch.sh +++ b/scripts/beaker/peteish/peteish1-launch.sh @@ -2,7 +2,7 @@ set -ex -NUM_NODES=16 +NUM_NODES=1 gantry run \ --allow-dirty \ @@ -15,14 +15,9 @@ gantry run \ --cluster ai2/jupiter-cirrascale-2 \ --gpus 8 \ --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ --budget ai2/oe-training \ --no-nfs \ --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ --no-python \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ @@ -34,3 +29,9 @@ gantry run \ --yes \ --timeout=-1 \ -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" + + # --leader-selection \ + # --host-networking \ + # --propagate-failure \ + # --propagate-preemption \ + # --synchronized-start-timeout 90m \ diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index 289d33b15..44f3ba198 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -52,5 +52,7 @@ torchrun \ configs/peteish1-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=null \ - --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1" \ - '--load_path=${path.last_checkpoint:${save_folder}}' \ + --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1-eval" \ + --load_path="/weka/oe-training-default/wolf/v3.0_v2.7_peteish/step12212" + + # --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1" \ \ No newline at end of file From 7f994fe20754d915ca85aa93d851be6f18d3f5d7 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 04:24:00 +0000 Subject: [PATCH 11/58] Make things work for single node --- scripts/beaker/peteish/peteish1-launch.sh | 3 ++- scripts/beaker/peteish/peteish1.sh | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/beaker/peteish/peteish1-launch.sh b/scripts/beaker/peteish/peteish1-launch.sh index bfbbe8d72..a33f36bd8 100755 --- a/scripts/beaker/peteish/peteish1-launch.sh +++ b/scripts/beaker/peteish/peteish1-launch.sh @@ -28,7 +28,8 @@ gantry run \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ - -- /bin/bash -c 
"scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" + -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh ${NUM_NODES}" + # -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" # --leader-selection \ # --host-networking \ diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index 44f3ba198..064b12c0f 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -3,14 +3,14 @@ set -exuo pipefail IFS=$'\n\t' -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift +# BEAKER_LEADER_REPLICA_HOSTNAME=$1 +# shift NUM_NODES=$1 shift -BEAKER_REPLICA_RANK=$1 -shift +# BEAKER_REPLICA_RANK=$1 +# shift # Setup Python environment. conda shell.bash activate base From 9a5f0766bbbca17626d621e8643f9f9788a76973 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 04:26:56 +0000 Subject: [PATCH 12/58] Make things work for single node --- scripts/beaker/peteish/peteish1.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index 064b12c0f..1635a4af4 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -43,11 +43,6 @@ export NCCL_SOCKET_IFNAME=ib torchrun \ --nnodes "${NUM_NODES}:${NUM_NODES}" \ --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ scripts/eval.py \ configs/peteish1-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ @@ -55,4 +50,9 @@ torchrun \ --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1-eval" \ --load_path="/weka/oe-training-default/wolf/v3.0_v2.7_peteish/step12212" + # --rdzv_id 12347 \ + # --rdzv_backend static \ + # --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + # --node_rank "${BEAKER_REPLICA_RANK}" \ + # --rdzv_conf 'read_timeout=420' \ # --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1" \ \ No newline at end of file From d1e05fdc06d699a0ca0df276e619d197c48d43b7 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 05:01:05 +0000 Subject: [PATCH 13/58] Make things work for single node --- scripts/beaker/peteish/peteish1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index 1635a4af4..291c74278 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -37,7 +37,7 @@ export OLMO_SHARED_FS=1 export NCCL_DEBUG=INFO export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib +# export NCCL_SOCKET_IFNAME=ib # export NCCL_IB_GID_INDEX=0 torchrun \ From b455b996ba2c676fc3b6c74746ab18fc5bfd1be2 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 05:23:26 +0000 Subject: [PATCH 14/58] Make things work for single node --- scripts/beaker/peteish/peteish1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index 291c74278..a63741f2a 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -48,7 +48,7 @@ torchrun \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=null \ --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1-eval" \ - 
--load_path="/weka/oe-training-default/wolf/v3.0_v2.7_peteish/step12212" + --load_path="/weka/oe-training-default/wolf/ckpt/v3.0_v2.7_peteish/step12212" # --rdzv_id 12347 \ # --rdzv_backend static \ From 2f4d252ce5a76f23f22c6b8e5e905d2aa6653117 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 05:30:22 +0000 Subject: [PATCH 15/58] Load train_dataloader --- scripts/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval.py b/scripts/eval.py index 2033febc7..1baf23239 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -121,8 +121,8 @@ def main(cfg: TrainConfig) -> None: seed_all(cfg.seed) # # Construct data loader. - # train_loader = build_train_dataloader(cfg) - train_loader = None + train_loader = build_train_dataloader(cfg) + # train_loader = None # Construct evaluators. evaluators = build_evaluators(cfg, device) From 331b0add77fca52b5fca37bbdbedf55e04c0521e Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 05:52:00 +0000 Subject: [PATCH 16/58] Change to another ckpt --- scripts/beaker/peteish/peteish1.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index a63741f2a..05bd2443d 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -47,8 +47,9 @@ torchrun \ configs/peteish1-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=null \ + --wandb.id="2vsp82zs" \ --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1-eval" \ - --load_path="/weka/oe-training-default/wolf/ckpt/v3.0_v2.7_peteish/step12212" + --load_path="/weka/oe-training-default/wolf/ckpt/v3.0_v2.7_peteish/step10000" # --rdzv_id 12347 \ # --rdzv_backend static \ From 6acabf3ee8f39c3c29fa27d70c0b7a3541875879 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 17 Oct 2024 06:41:42 +0000 Subject: [PATCH 17/58] Do not load train_dataloader and trainer_state --- scripts/eval.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/eval.py b/scripts/eval.py index 1baf23239..0e3688b5d 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -121,8 +121,8 @@ def main(cfg: TrainConfig) -> None: seed_all(cfg.seed) # # Construct data loader. - train_loader = build_train_dataloader(cfg) - # train_loader = None + # train_loader = build_train_dataloader(cfg) + train_loader = None # Construct evaluators. 
evaluators = build_evaluators(cfg, device) @@ -298,8 +298,8 @@ def dummy_init_fn(module: torch.nn.Module) -> None: log.info(f"Loading checkpoint from {cfg.load_path}...") trainer.restore_checkpoint( cfg.load_path, - load_optimizer_state=not cfg.reset_optimizer_state, - load_trainer_state=not cfg.reset_trainer_state, + load_optimizer_state=False, + load_trainer_state=False, sharded_checkpointer=cfg.load_path_sharded_checkpointer, ) # trainer.restore_unsharded_checkpoint( From 24157196a313f8d3aa3e0c14ad4010df0e0f011a Mon Sep 17 00:00:00 2001 From: Akshita Bhagia Date: Thu, 17 Oct 2024 22:57:23 -0600 Subject: [PATCH 18/58] run for annealed model --- scripts/beaker/peteish/peteish7-launch.sh | 10 +++++----- scripts/beaker/peteish/peteish7.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/beaker/peteish/peteish7-launch.sh b/scripts/beaker/peteish/peteish7-launch.sh index ec2eb5515..ef74bce67 100755 --- a/scripts/beaker/peteish/peteish7-launch.sh +++ b/scripts/beaker/peteish/peteish7-launch.sh @@ -6,8 +6,8 @@ NUM_NODES=8 gantry run \ --allow-dirty \ - --workspace ai2/hb-wolf-olmo \ - --task-name peteish7-eval \ + --workspace ai2/OLMo-tiny \ + --task-name peteish7-anneal-eval \ --description "Pete-ish 7B" \ --priority high \ --preemptible \ @@ -27,9 +27,9 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ diff --git a/scripts/beaker/peteish/peteish7.sh b/scripts/beaker/peteish/peteish7.sh index 4cc5d5640..179690f91 100755 --- a/scripts/beaker/peteish/peteish7.sh +++ b/scripts/beaker/peteish/peteish7.sh @@ -52,5 +52,5 @@ torchrun \ configs/peteish7-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=500 \ - --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7" \ + --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-no-warmup" \ '--load_path=${path.last_checkpoint:${save_folder}}' \ From 788b3972dab4436c34a7c25c221bbfe720e6e81a Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Sun, 20 Oct 2024 22:15:23 +0000 Subject: [PATCH 19/58] Backfill does not seem possible; Evaluating multiple ckpts --- olmo/config.py | 1 - scripts/beaker/peteish/peteish1.sh | 3 +- scripts/eval.py | 102 ++++------------------------- 3 files changed, 14 insertions(+), 92 deletions(-) diff --git a/olmo/config.py b/olmo/config.py index a370937ee..94e5103d2 100644 --- a/olmo/config.py +++ b/olmo/config.py @@ -667,7 +667,6 @@ class WandbConfig(BaseConfig): log_artifacts: bool = False rank_zero_only: bool = True log_interval: int = 1 - id: Optional[str] = None @dataclass diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index 05bd2443d..0a0ee1562 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -47,9 +47,8 @@ torchrun \ configs/peteish1-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=null \ - --wandb.id="2vsp82zs" \ --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1-eval" \ - 
--load_path="/weka/oe-training-default/wolf/ckpt/v3.0_v2.7_peteish/step10000" + --load_path="/weka/oe-training-default/wolf/ckpt/v3.0_v2.7_peteish" # --rdzv_id 12347 \ # --rdzv_backend static \ diff --git a/scripts/eval.py b/scripts/eval.py index 0e3688b5d..72d315f2d 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -111,8 +111,6 @@ def main(cfg: TrainConfig) -> None: name=cfg.wandb.name, tags=cfg.wandb.tags, config=cfg.asdict(exclude=["wandb"]), - id=cfg.wandb.id, - resume="allow", ) barrier() @@ -241,60 +239,18 @@ def dummy_init_fn(module: torch.nn.Module) -> None: evaluators=evaluators, indices_file=indices_file, ) as trainer: - # if cfg.try_load_latest_save: - # if ( - # cfg.save_folder is not None - # and (checkpoint_dir := find_latest_checkpoint(cfg.save_folder)) is not None - # ): - # log.info("Setting load path to local checkpoint %s", checkpoint_dir) - # cfg.load_path = str(checkpoint_dir) - # elif ( - # cfg.remote_save_folder is not None - # and (checkpoint_dir := find_latest_checkpoint(cfg.remote_save_folder)) is not None - # ): - # log.info("Setting load path to remote checkpoint %s", checkpoint_dir) - # cfg.load_path = str(checkpoint_dir) - - # if not cfg.dry_run and not cfg.no_pre_train_checkpoint and cfg.load_path is None: - # if cfg.distributed_strategy == DistributedStrategy.ddp: - # checkpoint_type = CheckpointType.unsharded - - # if cfg.save_interval_unsharded is None: - # log.warning( - # "DDP requires setting `save_interval_unsharded`. Using the value set for `save_interval`." - # ) - # cfg.save_interval_unsharded = cfg.save_interval - - # if cfg.save_num_unsharded_checkpoints_to_keep == 0: - # log.warning( - # "DDP requires setting `save_num_unsharded_checkpoints_to_keep`. Using the value set for `save_num_checkpoints_to_keep`." - # ) - # cfg.save_num_unsharded_checkpoints_to_keep = cfg.save_num_checkpoints_to_keep - # elif cfg.distributed_strategy == DistributedStrategy.fsdp: - # checkpoint_type = ( - # CheckpointType.sharded if cfg.save_num_checkpoints_to_keep != 0 else CheckpointType.unsharded - # ) - # else: - # raise NotImplementedError(f"Distributed strategy {cfg.distributed_strategy} not supported yet!") - - # # We save a checkpoint up-front to make sure this won't fail (due to disk space or whatever). - # log.info("Saving pre-train checkpoint...") - # checkpoint_path, local_checkpoint_cache = trainer.save_checkpoint(checkpoint_type=checkpoint_type) - # log.info(f"Checkpoint saved to {checkpoint_path}") - - # # And they we verify that we can load it. - # log.info("Attempting to load pre-train checkpoint...") - # trainer.restore_checkpoint( - # checkpoint_path, checkpoint_type=checkpoint_type, local_cache=local_checkpoint_cache - # ) - # log.info("Checkpoint successfully loaded") - - # # NOTE: https://github.com/allenai/LLM/issues/233 - # # log.info("Removing pre-train checkpoint...") - # # trainer.remove_checkpoint(checkpoint_type=checkpoint_type) - # # log.info("Successfully removed checkpoint") - - if cfg.load_path is not None: + + if cfg.load_path is None: + raise OLMoConfigurationError("To run eval you must provide a load_path") + if 'step' in cfg.load_path.split('/')[-1]: + load_paths = [cfg.load_path] + else: + # This globbing does not work with remote paths. 
+ load_paths = list(sorted(glob.glob(Path(cfg.save_folder) / f"step*"), key=lambda x: int(x.split('/')[-1].split('step')[-1]))) + + for load_path in load_paths: + step = int(load_path.split('/')[-1].split('step')[-1]) + log.info(f"Loading checkpoint from {cfg.load_path}...") trainer.restore_checkpoint( cfg.load_path, @@ -302,45 +258,13 @@ def dummy_init_fn(module: torch.nn.Module) -> None: load_trainer_state=False, sharded_checkpointer=cfg.load_path_sharded_checkpointer, ) - # trainer.restore_unsharded_checkpoint( - # cfg.load_path, - # load_optimizer_state=False, - # load_trainer_state=False, - # ) log.info("Checkpoint successfully loaded") - # If we have to, set a new scheduler: - if cfg.reset_optimizer_state and not cfg.reset_trainer_state: - trainer.scheduler = BoltOnWarmupScheduler.wrap( - trainer.scheduler, - trainer.global_step, - int(trainer.global_step + cfg.scheduler.t_warmup), - ) - - # if cfg.force_save_unsharded and cfg.distributed_strategy != DistributedStrategy.ddp: - # log.info("Saving unsharded checkpoint...") - # checkpoint_path, _ = trainer.save_checkpoint(checkpoint_type=CheckpointType.unsharded) - # log.info(f"Unsharded checkpoint saved to {checkpoint_path}") - - if cfg.compile is not None: - # TODO (epwalsh): trying to compile the whole train step results in a compile-time error from within - # the optimizer. We should investigate this further at some point. - # trainer.train_step = torch.compile(trainer.train_step, **cfg.compile.asdict()) - trainer.train_batch = torch.compile(trainer.train_batch, **cfg.compile.asdict()) # type: ignore - # TODO (epwalsh): compiling the `eval_batch()` method is a little sketchy since the inputs will look - # different for different eval tasks. That might be okay, but it might not be. - # trainer.eval_batch = torch.compile(trainer.eval_batch, **cfg.compile.asdict()) # type: ignore - # Alternatively, could just do this: - # trainer.fsdp_model = torch.compile(trainer.fsdp_model, **cfg.compile.asdict()) - - if not cfg.dry_run: log.info("Starting evaluating...") eval_metrics = trainer.eval() if wandb.run is not None: - wandb.log(eval_metrics, step=trainer.global_step) + wandb.log(eval_metrics, step=step) log.info("Evaluating complete") - else: - log.info("Dry run complete") if __name__ == "__main__": From ed074da74f9b9dea530823e2210d0640eb9d3267 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Sun, 20 Oct 2024 22:20:12 +0000 Subject: [PATCH 20/58] Fix import --- scripts/eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/eval.py b/scripts/eval.py index 72d315f2d..148927fed 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -6,6 +6,7 @@ from datetime import timedelta from pathlib import Path from typing import Optional, TextIO +import glob import torch import torch.distributed as dist From eb628a83794974bb729fbfc3abec81e025bf599c Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Sun, 20 Oct 2024 22:25:33 +0000 Subject: [PATCH 21/58] Fix glob --- scripts/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval.py b/scripts/eval.py index 148927fed..71383eed4 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -247,7 +247,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: load_paths = [cfg.load_path] else: # This globbing does not work with remote paths. 
- load_paths = list(sorted(glob.glob(Path(cfg.save_folder) / f"step*"), key=lambda x: int(x.split('/')[-1].split('step')[-1]))) + load_paths = list(sorted(glob.glob(f"{cfg.save_folder}/step*"), key=lambda x: int(x.split('/')[-1].split('step')[-1]))) for load_path in load_paths: step = int(load_path.split('/')[-1].split('step')[-1]) From ee6d55f0e54006fedc50d0fab29a457b12531e30 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Sun, 20 Oct 2024 22:33:26 +0000 Subject: [PATCH 22/58] Fix glob --- scripts/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval.py b/scripts/eval.py index 71383eed4..2a812f01c 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -247,7 +247,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: load_paths = [cfg.load_path] else: # This globbing does not work with remote paths. - load_paths = list(sorted(glob.glob(f"{cfg.save_folder}/step*"), key=lambda x: int(x.split('/')[-1].split('step')[-1]))) + load_paths = list(sorted(glob.glob(f"{cfg.load_path}/step*"), key=lambda x: int(x.split('/')[-1].split('step')[-1]))) for load_path in load_paths: step = int(load_path.split('/')[-1].split('step')[-1]) From 901ed16d2c46df8537df4154493ed527e01acb05 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Sun, 20 Oct 2024 22:38:49 +0000 Subject: [PATCH 23/58] Fix load --- scripts/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval.py b/scripts/eval.py index 2a812f01c..5bdb68088 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -252,9 +252,9 @@ def dummy_init_fn(module: torch.nn.Module) -> None: for load_path in load_paths: step = int(load_path.split('/')[-1].split('step')[-1]) - log.info(f"Loading checkpoint from {cfg.load_path}...") + log.info(f"Loading checkpoint from {load_path}...") trainer.restore_checkpoint( - cfg.load_path, + load_path, load_optimizer_state=False, load_trainer_state=False, sharded_checkpointer=cfg.load_path_sharded_checkpointer, From 90e1f93246f242f7c677a6661fd9e1a5847796e2 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Sun, 20 Oct 2024 22:52:01 +0000 Subject: [PATCH 24/58] Switch back to the real peteish1 --- scripts/beaker/peteish/peteish1-launch.sh | 34 +++++++++++----------- scripts/beaker/peteish/peteish1.sh | 35 +++++++++++------------ 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/scripts/beaker/peteish/peteish1-launch.sh b/scripts/beaker/peteish/peteish1-launch.sh index a33f36bd8..d034ce3a0 100755 --- a/scripts/beaker/peteish/peteish1-launch.sh +++ b/scripts/beaker/peteish/peteish1-launch.sh @@ -2,37 +2,39 @@ set -ex -NUM_NODES=1 +NUM_NODES=16 gantry run \ - --allow-dirty \ - --workspace ai2/hb-wolf-olmo \ - --task-name peteish1-eval \ + --workspace ai2/OLMo-pretraining-stability \ + --task-name peteish1 \ --description "Pete-ish 1B" \ - --priority high \ + --priority urgent \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \ --gpus 8 \ --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ --budget ai2/oe-training \ --no-nfs \ --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --propagate-preemption \ + --synchronized-start-timeout 90m \ --no-python \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env 
WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ + --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ + --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ + --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh ${NUM_NODES}" - # -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" - - # --leader-selection \ - # --host-networking \ - # --propagate-failure \ - # --propagate-preemption \ - # --synchronized-start-timeout 90m \ + -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" \ No newline at end of file diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index 0a0ee1562..6c55bafc2 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -3,14 +3,14 @@ set -exuo pipefail IFS=$'\n\t' -# BEAKER_LEADER_REPLICA_HOSTNAME=$1 -# shift +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift NUM_NODES=$1 shift -# BEAKER_REPLICA_RANK=$1 -# shift +BEAKER_REPLICA_RANK=$1 +shift # Setup Python environment. conda shell.bash activate base @@ -25,9 +25,9 @@ pip install '.[train]' pip freeze # Move AWS credentials from env to relevant files -# mkdir -p ~/.aws -# printenv AWS_CONFIG > ~/.aws/config -# printenv AWS_CREDENTIALS > ~/.aws/credentials +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials # Force processes to synchronize at init_process_group export TORCH_DIST_INIT_BARRIER=1 @@ -37,22 +37,21 @@ export OLMO_SHARED_FS=1 export NCCL_DEBUG=INFO export NCCL_IB_HCA="^=mlx5_bond_0" -# export NCCL_SOCKET_IFNAME=ib +export NCCL_SOCKET_IFNAME=ib # export NCCL_IB_GID_INDEX=0 torchrun \ --nnodes "${NUM_NODES}:${NUM_NODES}" \ --nproc-per-node 8 \ - scripts/eval.py \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/train.py \ configs/peteish1-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=null \ - --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1-eval" \ - --load_path="/weka/oe-training-default/wolf/ckpt/v3.0_v2.7_peteish" - - # --rdzv_id 12347 \ - # --rdzv_backend static \ - # --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - # --node_rank "${BEAKER_REPLICA_RANK}" \ - # --rdzv_conf 'read_timeout=420' \ - # --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1" \ \ No newline at end of file + --save_overwrite + + # '--load_path=${path.last_checkpoint:${save_folder}}' \ \ No newline at end of file From 003cd29bbac4c6c051a800585da54a7be6cbe306 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Sun, 20 Oct 2024 23:12:26 +0000 Subject: [PATCH 25/58] Fix ckpt loading --- .../beaker/peteish/peteish1-eval-launch.sh | 38 ++++++++++++ scripts/beaker/peteish/peteish1-eval.sh | 59 +++++++++++++++++++ scripts/eval.py | 6 +- 3 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 scripts/beaker/peteish/peteish1-eval-launch.sh create mode 100755 scripts/beaker/peteish/peteish1-eval.sh diff --git a/scripts/beaker/peteish/peteish1-eval-launch.sh b/scripts/beaker/peteish/peteish1-eval-launch.sh new file mode 100644 index 000000000..a33f36bd8 --- /dev/null +++ 
b/scripts/beaker/peteish/peteish1-eval-launch.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --allow-dirty \ + --workspace ai2/hb-wolf-olmo \ + --task-name peteish1-eval \ + --description "Pete-ish 1B" \ + --priority high \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env-secret WANDB_API_KEY=WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh ${NUM_NODES}" + # -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" + + # --leader-selection \ + # --host-networking \ + # --propagate-failure \ + # --propagate-preemption \ + # --synchronized-start-timeout 90m \ diff --git a/scripts/beaker/peteish/peteish1-eval.sh b/scripts/beaker/peteish/peteish1-eval.sh new file mode 100755 index 000000000..f6a44b27a --- /dev/null +++ b/scripts/beaker/peteish/peteish1-eval.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +# BEAKER_LEADER_REPLICA_HOSTNAME=$1 +# shift + +NUM_NODES=$1 +shift + +# BEAKER_REPLICA_RANK=$1 +# shift + +# Setup Python environment. +conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +# mkdir -p ~/.aws +# printenv AWS_CONFIG > ~/.aws/config +# printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +# export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + scripts/eval.py \ + configs/peteish1-weka.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=null \ + --save_overwrite \ + --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1-eval" \ + --load_path="/weka/oe-training-default/wolf/ckpt/v3.0_v2.7_peteish" + + # --rdzv_id 12347 \ + # --rdzv_backend static \ + # --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + # --node_rank "${BEAKER_REPLICA_RANK}" \ + # --rdzv_conf 'read_timeout=420' \ + # --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1" \ \ No newline at end of file diff --git a/scripts/eval.py b/scripts/eval.py index 5bdb68088..e6069d708 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -120,8 +120,8 @@ def main(cfg: TrainConfig) -> None: seed_all(cfg.seed) # # Construct data loader. - # train_loader = build_train_dataloader(cfg) - train_loader = None + train_loader = build_train_dataloader(cfg) + # train_loader = None # Construct evaluators. 
evaluators = build_evaluators(cfg, device) @@ -256,7 +256,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: trainer.restore_checkpoint( load_path, load_optimizer_state=False, - load_trainer_state=False, + load_trainer_state=True, sharded_checkpointer=cfg.load_path_sharded_checkpointer, ) log.info("Checkpoint successfully loaded") From e67812cfb765b15bd1ef9b6e93112c3ae9e9ecd5 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 21 Oct 2024 22:54:10 +0000 Subject: [PATCH 26/58] Print sum of params --- scripts/beaker/peteish/peteish1-eval-launch.sh | 2 +- scripts/eval.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/beaker/peteish/peteish1-eval-launch.sh b/scripts/beaker/peteish/peteish1-eval-launch.sh index a33f36bd8..ac0488ead 100644 --- a/scripts/beaker/peteish/peteish1-eval-launch.sh +++ b/scripts/beaker/peteish/peteish1-eval-launch.sh @@ -28,7 +28,7 @@ gantry run \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh ${NUM_NODES}" + -- /bin/bash -c "scripts/beaker/peteish/peteish1-eval.sh ${NUM_NODES}" # -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" # --leader-selection \ diff --git a/scripts/eval.py b/scripts/eval.py index e6069d708..b24071b60 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -260,6 +260,9 @@ def dummy_init_fn(module: torch.nn.Module) -> None: sharded_checkpointer=cfg.load_path_sharded_checkpointer, ) log.info("Checkpoint successfully loaded") + # compute and print the sum of the value of all parameters in the model + log.info(f"Sum of all parameters: {sum(p.sum().item() for p in olmo_model.parameters())}") + continue log.info("Starting evaluating...") eval_metrics = trainer.eval() From c27037be8602b934609452308f3267519b1aa5f7 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 21 Oct 2024 23:14:40 +0000 Subject: [PATCH 27/58] Skip step0 --- scripts/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval.py b/scripts/eval.py index b24071b60..e928299e7 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -248,6 +248,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: else: # This globbing does not work with remote paths. 
load_paths = list(sorted(glob.glob(f"{cfg.load_path}/step*"), key=lambda x: int(x.split('/')[-1].split('step')[-1]))) + load_paths = load_paths[1:] # Skip step0 for load_path in load_paths: step = int(load_path.split('/')[-1].split('step')[-1]) @@ -261,8 +262,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: ) log.info("Checkpoint successfully loaded") # compute and print the sum of the value of all parameters in the model - log.info(f"Sum of all parameters: {sum(p.sum().item() for p in olmo_model.parameters())}") - continue + log.info(f"Sum of all parameters: {sum(p.sum().item() for p in dist_model.parameters())}") log.info("Starting evaluating...") eval_metrics = trainer.eval() From 1996d04071d6c6c056bfd058d2786c4c1f6bf138 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 21 Oct 2024 23:28:07 +0000 Subject: [PATCH 28/58] Print param sum of dist_model --- scripts/eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/eval.py b/scripts/eval.py index e928299e7..79282708e 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -263,6 +263,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: log.info("Checkpoint successfully loaded") # compute and print the sum of the value of all parameters in the model log.info(f"Sum of all parameters: {sum(p.sum().item() for p in dist_model.parameters())}") + continue log.info("Starting evaluating...") eval_metrics = trainer.eval() From d1c528de15ba01718ae37363ef6518fd3449d2c3 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 21 Oct 2024 23:43:46 +0000 Subject: [PATCH 29/58] Print per-batch ce loss --- olmo/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/olmo/train.py b/olmo/train.py index 341055003..e563d4d39 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -910,6 +910,7 @@ def eval_step(self, batch: Dict[str, Any], evaluator: Evaluator) -> None: # Run forward pass. with torch.no_grad(): # NOTE: 'torch.inference_mode()' doesn't work with 'torch.compile()'. ce_loss, logits = self.eval_batch(batch) + log.info(f"ce_loss={ce_loss.mean().item()}") # Update metrics. evaluator.update_metrics( From 01b1dc4f0ec89c2c08ed3eab6ca75a00fcb44034 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 22 Oct 2024 00:24:09 +0000 Subject: [PATCH 30/58] Update --- scripts/eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/eval.py b/scripts/eval.py index 79282708e..e928299e7 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -263,7 +263,6 @@ def dummy_init_fn(module: torch.nn.Module) -> None: log.info("Checkpoint successfully loaded") # compute and print the sum of the value of all parameters in the model log.info(f"Sum of all parameters: {sum(p.sum().item() for p in dist_model.parameters())}") - continue log.info("Starting evaluating...") eval_metrics = trainer.eval() From 35f2186d3a3b7c979ce3a1630943db96b8374559 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Wed, 23 Oct 2024 19:44:27 +0000 Subject: [PATCH 31/58] Reconstruct models when iterating ckpts --- olmo/train.py | 1 - scripts/eval.py | 271 ++++++++++++++++++++++++------------------------ 2 files changed, 135 insertions(+), 137 deletions(-) diff --git a/olmo/train.py b/olmo/train.py index e563d4d39..341055003 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -910,7 +910,6 @@ def eval_step(self, batch: Dict[str, Any], evaluator: Evaluator) -> None: # Run forward pass. with torch.no_grad(): # NOTE: 'torch.inference_mode()' doesn't work with 'torch.compile()'. 
ce_loss, logits = self.eval_batch(batch) - log.info(f"ce_loss={ce_loss.mean().item()}") # Update metrics. evaluator.update_metrics( diff --git a/scripts/eval.py b/scripts/eval.py index e928299e7..eebbc7f35 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -127,148 +127,147 @@ def main(cfg: TrainConfig) -> None: evaluators = build_evaluators(cfg, device) barrier() - # Initialize the model. - log.info("Building model...") - olmo_model = OLMo(cfg.model) - log.info(f"Total number of parameters: {olmo_model.num_params():,d}") - log.info(f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}") - log.info(f"Peak GPU Memory (MB) before {cfg.distributed_strategy}: {int(peak_gpu_memory() or 0)}") - - olmo_model.set_activation_checkpointing(cfg.activation_checkpointing) - - if cfg.distributed_strategy == DistributedStrategy.ddp: - log.info("Wrapping model with DDP...") - assert cfg.ddp is not None, "DistributedStrategy ddp needs cfg.ddp to be set!" - - if cfg.model.init_device != "cuda": - raise OLMoConfigurationError("DDP does not work with init_device set to anything other than `cuda`.") - - if cfg.ddp.find_unused_params is True and cfg.ddp.grad_sync_mode != DDPGradSyncMode.micro_batch: - raise OLMoConfigurationError( - "`find_unused_params` is set to True. DDP needs to synchronize gradients for every micro-batch to avoid errors. Set `grad_sync_mode` to `micro_batch`." - ) - - param_init_fn = None - - # move to cuda before calling ddp - dist_model = DDP(olmo_model.to(device), find_unused_parameters=cfg.ddp.find_unused_params) - elif cfg.distributed_strategy == DistributedStrategy.fsdp: - # Wrap the model in FSDP. - log.info("Wrapping model with FSDP...") - assert cfg.fsdp is not None, "DistributedStrategy fsdp needs cfg.fsdp to be set!" - wrap_policy = olmo_model.get_fsdp_wrap_policy(cfg.fsdp.wrapping_strategy) - - if version.parse(torch.__version__) >= version.parse("2.1.0"): - # This prevents any parameters from being initialized twice - def dummy_init_fn(module: torch.nn.Module) -> None: - module.to_empty(device=get_default_device()) - - param_init_fn = dummy_init_fn - else: - param_init_fn = None - - # Set up device mesh for hybrid sharding in order to specify which nodes are assoicated to a given model replica - device_mesh = None - hybrid_sharding_fsdp_kwargs = {} - if cfg.fsdp.sharding_strategy in (ShardingStrategy.HYBRID_SHARD, ShardingStrategy._HYBRID_SHARD_ZERO2): - if version.parse(torch.__version__) < version.parse("2.2.0"): - # Device mesh was not added to PyTorch until v2.2.0 + if cfg.load_path is None: + raise OLMoConfigurationError("To run eval you must provide a load_path") + if 'step' in cfg.load_path.split('/')[-1]: + load_paths = [cfg.load_path] + else: + # This globbing does not work with remote paths. + load_paths = list(sorted(glob.glob(f"{cfg.load_path}/step*"), key=lambda x: int(x.split('/')[-1].split('step')[-1]))) + + for load_path in load_paths: + step = int(load_path.split('/')[-1].split('step')[-1]) + + # Initialize the model. 
+ log.info("Building model...") + olmo_model = OLMo(cfg.model) + log.info(f"Total number of parameters: {olmo_model.num_params():,d}") + log.info(f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}") + log.info(f"Peak GPU Memory (MB) before {cfg.distributed_strategy}: {int(peak_gpu_memory() or 0)}") + + olmo_model.set_activation_checkpointing(cfg.activation_checkpointing) + + if cfg.distributed_strategy == DistributedStrategy.ddp: + log.info("Wrapping model with DDP...") + assert cfg.ddp is not None, "DistributedStrategy ddp needs cfg.ddp to be set!" + + if cfg.model.init_device != "cuda": + raise OLMoConfigurationError("DDP does not work with init_device set to anything other than `cuda`.") + + if cfg.ddp.find_unused_params is True and cfg.ddp.grad_sync_mode != DDPGradSyncMode.micro_batch: raise OLMoConfigurationError( - "OLMo training does not correctly support hybrid sharding before torch 2.2.0" + "`find_unused_params` is set to True. DDP needs to synchronize gradients for every micro-batch to avoid errors. Set `grad_sync_mode` to `micro_batch`." ) - from torch.distributed.device_mesh import init_device_mesh + param_init_fn = None - num_model_replicas = cfg.fsdp.hybrid_sharding_num_model_replicas or ( - get_world_size() // get_local_world_size() - ) + # move to cuda before calling ddp + dist_model = DDP(olmo_model.to(device), find_unused_parameters=cfg.ddp.find_unused_params) + elif cfg.distributed_strategy == DistributedStrategy.fsdp: + # Wrap the model in FSDP. + log.info("Wrapping model with FSDP...") + assert cfg.fsdp is not None, "DistributedStrategy fsdp needs cfg.fsdp to be set!" + wrap_policy = olmo_model.get_fsdp_wrap_policy(cfg.fsdp.wrapping_strategy) + + if version.parse(torch.__version__) >= version.parse("2.1.0"): + # This prevents any parameters from being initialized twice + def dummy_init_fn(module: torch.nn.Module) -> None: + module.to_empty(device=get_default_device()) + + param_init_fn = dummy_init_fn + else: + param_init_fn = None + + # Set up device mesh for hybrid sharding in order to specify which nodes are assoicated to a given model replica + device_mesh = None + hybrid_sharding_fsdp_kwargs = {} + if cfg.fsdp.sharding_strategy in (ShardingStrategy.HYBRID_SHARD, ShardingStrategy._HYBRID_SHARD_ZERO2): + if version.parse(torch.__version__) < version.parse("2.2.0"): + # Device mesh was not added to PyTorch until v2.2.0 + raise OLMoConfigurationError( + "OLMo training does not correctly support hybrid sharding before torch 2.2.0" + ) + + from torch.distributed.device_mesh import init_device_mesh + + num_model_replicas = cfg.fsdp.hybrid_sharding_num_model_replicas or ( + get_world_size() // get_local_world_size() + ) - if num_model_replicas <= 0: - raise OLMoConfigurationError("fsdp.hybrid_sharding_num_model_replicas must be a positive integer") - - if get_world_size() % num_model_replicas != 0: - raise OLMoConfigurationError("fsdp.hybrid_sharding_num_model_replicas must divide world size") - - device_mesh = init_device_mesh("cuda", (num_model_replicas, get_world_size() // num_model_replicas)) - hybrid_sharding_fsdp_kwargs["device_mesh"] = device_mesh - - dist_model = FSDP( - olmo_model, - sharding_strategy=cfg.fsdp.sharding_strategy, - mixed_precision=cfg.fsdp_precision, - auto_wrap_policy=wrap_policy, - use_orig_params=cfg.fsdp.use_orig_params, # needed for compile and some of our optimizer/parameter metrics - limit_all_gathers=True, - device_id=get_local_rank(), - param_init_fn=param_init_fn, - **hybrid_sharding_fsdp_kwargs, - 
) - elif cfg.distributed_strategy is None: - raise NotImplementedError("Single accelerator training not implemented yet!") - - # when param_init_fn is None, FSDP will call reset_parameters() automatically - if param_init_fn is not None or cfg.distributed_strategy == DistributedStrategy.ddp: - olmo_model.reset_parameters() - - log.info(f"Peak GPU Memory (MB) after {cfg.distributed_strategy}: {int(peak_gpu_memory() or 0)}") - log.info("Model:") - log.info(dist_model) - - # Construct optimizer and learning rate scheduler. - optim = build_optimizer(cfg, dist_model) - scheduler = build_scheduler(cfg) - - # Data indices file. - indices_file: Optional[TextIO] = None - # if cfg.save_data_indices: - # indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz" - # if indices_file_path.exists() and not cfg.save_overwrite: - # raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") - # indices_file_path.parent.mkdir(exist_ok=True, parents=True) - # indices_file = gzip.open(indices_file_path, "wt") - - # Consolidate components into `Trainer` object. - with Trainer( - cfg=cfg, - epoch=cfg.epoch, - model=olmo_model, - dist_model=dist_model, - optim=optim, - scheduler=scheduler, - train_loader=train_loader, - device=device, - evaluators=evaluators, - indices_file=indices_file, - ) as trainer: - - if cfg.load_path is None: - raise OLMoConfigurationError("To run eval you must provide a load_path") - if 'step' in cfg.load_path.split('/')[-1]: - load_paths = [cfg.load_path] - else: - # This globbing does not work with remote paths. - load_paths = list(sorted(glob.glob(f"{cfg.load_path}/step*"), key=lambda x: int(x.split('/')[-1].split('step')[-1]))) - load_paths = load_paths[1:] # Skip step0 - - for load_path in load_paths: - step = int(load_path.split('/')[-1].split('step')[-1]) - - log.info(f"Loading checkpoint from {load_path}...") - trainer.restore_checkpoint( - load_path, - load_optimizer_state=False, - load_trainer_state=True, - sharded_checkpointer=cfg.load_path_sharded_checkpointer, + if num_model_replicas <= 0: + raise OLMoConfigurationError("fsdp.hybrid_sharding_num_model_replicas must be a positive integer") + + if get_world_size() % num_model_replicas != 0: + raise OLMoConfigurationError("fsdp.hybrid_sharding_num_model_replicas must divide world size") + + device_mesh = init_device_mesh("cuda", (num_model_replicas, get_world_size() // num_model_replicas)) + hybrid_sharding_fsdp_kwargs["device_mesh"] = device_mesh + + dist_model = FSDP( + olmo_model, + sharding_strategy=cfg.fsdp.sharding_strategy, + mixed_precision=cfg.fsdp_precision, + auto_wrap_policy=wrap_policy, + use_orig_params=cfg.fsdp.use_orig_params, # needed for compile and some of our optimizer/parameter metrics + limit_all_gathers=True, + device_id=get_local_rank(), + param_init_fn=param_init_fn, + **hybrid_sharding_fsdp_kwargs, ) - log.info("Checkpoint successfully loaded") - # compute and print the sum of the value of all parameters in the model - log.info(f"Sum of all parameters: {sum(p.sum().item() for p in dist_model.parameters())}") - - log.info("Starting evaluating...") - eval_metrics = trainer.eval() - if wandb.run is not None: - wandb.log(eval_metrics, step=step) - log.info("Evaluating complete") + elif cfg.distributed_strategy is None: + raise NotImplementedError("Single accelerator training not implemented yet!") + + # when param_init_fn is None, FSDP will call reset_parameters() automatically + if param_init_fn is not None or 
cfg.distributed_strategy == DistributedStrategy.ddp: + olmo_model.reset_parameters() + + log.info(f"Peak GPU Memory (MB) after {cfg.distributed_strategy}: {int(peak_gpu_memory() or 0)}") + log.info("Model:") + log.info(dist_model) + + # Construct optimizer and learning rate scheduler. + optim = build_optimizer(cfg, dist_model) + scheduler = build_scheduler(cfg) + + # Data indices file. + indices_file: Optional[TextIO] = None + # if cfg.save_data_indices: + # indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz" + # if indices_file_path.exists() and not cfg.save_overwrite: + # raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") + # indices_file_path.parent.mkdir(exist_ok=True, parents=True) + # indices_file = gzip.open(indices_file_path, "wt") + + # Consolidate components into `Trainer` object. + with Trainer( + cfg=cfg, + epoch=cfg.epoch, + model=olmo_model, + dist_model=dist_model, + optim=optim, + scheduler=scheduler, + train_loader=train_loader, + device=device, + evaluators=evaluators, + indices_file=indices_file, + ) as trainer: + + log.info(f"Loading checkpoint from {load_path}...") + trainer.restore_checkpoint( + load_path, + load_optimizer_state=False, + load_trainer_state=True, + sharded_checkpointer=cfg.load_path_sharded_checkpointer, + ) + log.info("Checkpoint successfully loaded") + # compute and print the sum of the value of all parameters in the model + log.info(f"Sum of all parameters: {sum(p.sum().item() for p in dist_model.parameters())}") + + log.info("Starting evaluating...") + eval_metrics = trainer.eval() + if wandb.run is not None: + wandb.log(eval_metrics, step=step) + log.info("Evaluating complete") if __name__ == "__main__": From 504fb2a0e2a30444ebe916c698d68f1f6f918d24 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Wed, 23 Oct 2024 20:55:25 +0000 Subject: [PATCH 32/58] Do not quit wandb; Do not create train_loader --- olmo/train.py | 4 ++-- scripts/beaker/peteish/peteish1-eval-launch.sh | 10 +++++----- scripts/eval.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/olmo/train.py b/olmo/train.py index 341055003..b7f778bfb 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -1368,8 +1368,8 @@ def close(self, exit_code: int = 0) -> None: gc.enable() else: gc.disable() - if wandb.run is not None: - wandb.finish(exit_code=exit_code, quiet=True) + # if wandb.run is not None: + # wandb.finish(exit_code=exit_code, quiet=True) def __enter__(self) -> Trainer: return self diff --git a/scripts/beaker/peteish/peteish1-eval-launch.sh b/scripts/beaker/peteish/peteish1-eval-launch.sh index ac0488ead..452528c45 100644 --- a/scripts/beaker/peteish/peteish1-eval-launch.sh +++ b/scripts/beaker/peteish/peteish1-eval-launch.sh @@ -6,9 +6,9 @@ NUM_NODES=1 gantry run \ --allow-dirty \ - --workspace ai2/hb-wolf-olmo \ + --workspace ai2/OLMo-tiny \ --task-name peteish1-eval \ - --description "Pete-ish 1B" \ + --description "Pete-ish 1B eval" \ --priority high \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ @@ -22,9 +22,9 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ + --env-secret WANDB_API_KEY=JIACHENGL_WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY 
\ --shared-memory 10GiB \ --yes \ --timeout=-1 \ diff --git a/scripts/eval.py b/scripts/eval.py index eebbc7f35..03eff960c 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -120,8 +120,8 @@ def main(cfg: TrainConfig) -> None: seed_all(cfg.seed) # # Construct data loader. - train_loader = build_train_dataloader(cfg) - # train_loader = None + # train_loader = build_train_dataloader(cfg) + train_loader = None # Construct evaluators. evaluators = build_evaluators(cfg, device) @@ -256,7 +256,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None: trainer.restore_checkpoint( load_path, load_optimizer_state=False, - load_trainer_state=True, + load_trainer_state=False, sharded_checkpointer=cfg.load_path_sharded_checkpointer, ) log.info("Checkpoint successfully loaded") From a51a63b1136a9cb38c77dd632e74520c88d782f4 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 24 Oct 2024 17:56:28 +0000 Subject: [PATCH 33/58] Switch to the real peteish1 --- .../beaker/peteish/peteish1-eval-launch.sh | 31 ++++++++------- scripts/beaker/peteish/peteish1-eval.sh | 31 ++++++++------- scripts/eval.py | 38 ++++++++----------- 3 files changed, 47 insertions(+), 53 deletions(-) diff --git a/scripts/beaker/peteish/peteish1-eval-launch.sh b/scripts/beaker/peteish/peteish1-eval-launch.sh index 452528c45..0baf16e92 100644 --- a/scripts/beaker/peteish/peteish1-eval-launch.sh +++ b/scripts/beaker/peteish/peteish1-eval-launch.sh @@ -2,12 +2,12 @@ set -ex -NUM_NODES=1 +NUM_NODES=16 gantry run \ --allow-dirty \ - --workspace ai2/OLMo-tiny \ - --task-name peteish1-eval \ + --workspace ai2/OLMo-pretraining-stability \ + --task-name peteish1 \ --description "Pete-ish 1B eval" \ --priority high \ --preemptible \ @@ -15,6 +15,11 @@ gantry run \ --cluster ai2/jupiter-cirrascale-2 \ --gpus 8 \ --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --propagate-failure \ + --propagate-preemption \ + --synchronized-start-timeout 90m \ --budget ai2/oe-training \ --no-nfs \ --weka oe-training-default:/weka/oe-training-default \ @@ -22,17 +27,15 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=JIACHENGL_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ + --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ + --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ + --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/peteish/peteish1-eval.sh ${NUM_NODES}" - # -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" - - # --leader-selection \ - # --host-networking \ - # --propagate-failure \ - # --propagate-preemption \ - # --synchronized-start-timeout 90m \ + -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" \ No newline at end of file diff --git a/scripts/beaker/peteish/peteish1-eval.sh b/scripts/beaker/peteish/peteish1-eval.sh index f6a44b27a..74cc0520d 100755 --- a/scripts/beaker/peteish/peteish1-eval.sh +++ b/scripts/beaker/peteish/peteish1-eval.sh @@ -3,14 +3,14 @@ set -exuo pipefail IFS=$'\n\t' -# BEAKER_LEADER_REPLICA_HOSTNAME=$1 -# shift 
+BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift NUM_NODES=$1 shift -# BEAKER_REPLICA_RANK=$1 -# shift +BEAKER_REPLICA_RANK=$1 +shift # Setup Python environment. conda shell.bash activate base @@ -25,9 +25,9 @@ pip install '.[train]' pip freeze # Move AWS credentials from env to relevant files -# mkdir -p ~/.aws -# printenv AWS_CONFIG > ~/.aws/config -# printenv AWS_CREDENTIALS > ~/.aws/credentials +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials # Force processes to synchronize at init_process_group export TORCH_DIST_INIT_BARRIER=1 @@ -37,23 +37,22 @@ export OLMO_SHARED_FS=1 export NCCL_DEBUG=INFO export NCCL_IB_HCA="^=mlx5_bond_0" -# export NCCL_SOCKET_IFNAME=ib +export NCCL_SOCKET_IFNAME=ib # export NCCL_IB_GID_INDEX=0 torchrun \ --nnodes "${NUM_NODES}:${NUM_NODES}" \ --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ scripts/eval.py \ configs/peteish1-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=null \ --save_overwrite \ + --wandb.name="${GANTRY_TASK_NAME}-eval" \ --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1-eval" \ - --load_path="/weka/oe-training-default/wolf/ckpt/v3.0_v2.7_peteish" - - # --rdzv_id 12347 \ - # --rdzv_backend static \ - # --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - # --node_rank "${BEAKER_REPLICA_RANK}" \ - # --rdzv_conf 'read_timeout=420' \ - # --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1" \ \ No newline at end of file + --load_path="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1" diff --git a/scripts/eval.py b/scripts/eval.py index 03eff960c..061555dae 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -82,21 +82,21 @@ def main(cfg: TrainConfig) -> None: cfg.optimizer.decay_embeddings = not cfg.optimizer.no_decay_norm_and_bias cfg.optimizer.no_decay_norm_and_bias = None # So nobody uses this by accident. - # # Display and save configuration. - # if get_global_rank() == 0: - # if cfg.data.paths is not None and len(cfg.data.paths) < 50: - # log.info("Configuration:") - # log.info(cfg) - # if not cfg.dry_run and (cfg.load_path is None or Path(cfg.load_path).parent != Path(cfg.save_folder)): - # # Save config. - # save_path = Path(cfg.save_folder) / "config.yaml" - # if save_path.is_file() and not cfg.save_overwrite: - # raise OLMoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite") - # else: - # log.info(f"Saving config to {save_path}") - # save_path.parent.mkdir(exist_ok=True, parents=True) - # cfg.save(save_path) - # del save_path + # Display and save configuration. + if get_global_rank() == 0: + if cfg.data.paths is not None and len(cfg.data.paths) < 50: + log.info("Configuration:") + log.info(cfg) + if not cfg.dry_run and (cfg.load_path is None or Path(cfg.load_path).parent != Path(cfg.save_folder)): + # Save config. + save_path = Path(cfg.save_folder) / "config.yaml" + if save_path.is_file() and not cfg.save_overwrite: + raise OLMoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite") + else: + log.info(f"Saving config to {save_path}") + save_path.parent.mkdir(exist_ok=True, parents=True) + cfg.save(save_path) + del save_path barrier() @@ -231,12 +231,6 @@ def dummy_init_fn(module: torch.nn.Module) -> None: # Data indices file. 
indices_file: Optional[TextIO] = None - # if cfg.save_data_indices: - # indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz" - # if indices_file_path.exists() and not cfg.save_overwrite: - # raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite") - # indices_file_path.parent.mkdir(exist_ok=True, parents=True) - # indices_file = gzip.open(indices_file_path, "wt") # Consolidate components into `Trainer` object. with Trainer( @@ -260,8 +254,6 @@ def dummy_init_fn(module: torch.nn.Module) -> None: sharded_checkpointer=cfg.load_path_sharded_checkpointer, ) log.info("Checkpoint successfully loaded") - # compute and print the sum of the value of all parameters in the model - log.info(f"Sum of all parameters: {sum(p.sum().item() for p in dist_model.parameters())}") log.info("Starting evaluating...") eval_metrics = trainer.eval() From c362030246b6782166b1c2e343c267de235e7842 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 24 Oct 2024 18:08:08 +0000 Subject: [PATCH 34/58] Massage the group --- scripts/beaker/peteish/peteish1-eval-launch.sh | 4 ++-- scripts/beaker/peteish/peteish1-eval.sh | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/beaker/peteish/peteish1-eval-launch.sh b/scripts/beaker/peteish/peteish1-eval-launch.sh index 0baf16e92..317dd190b 100644 --- a/scripts/beaker/peteish/peteish1-eval-launch.sh +++ b/scripts/beaker/peteish/peteish1-eval-launch.sh @@ -7,7 +7,7 @@ NUM_NODES=16 gantry run \ --allow-dirty \ --workspace ai2/OLMo-pretraining-stability \ - --task-name peteish1 \ + --task-name peteish1-eval \ --description "Pete-ish 1B eval" \ --priority high \ --preemptible \ @@ -34,7 +34,7 @@ gantry run \ --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ + --env-secret WANDB_API_KEY=JIACHENGL_WANDB_API_KEY \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ diff --git a/scripts/beaker/peteish/peteish1-eval.sh b/scripts/beaker/peteish/peteish1-eval.sh index 74cc0520d..954043b0d 100755 --- a/scripts/beaker/peteish/peteish1-eval.sh +++ b/scripts/beaker/peteish/peteish1-eval.sh @@ -53,6 +53,5 @@ torchrun \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=null \ --save_overwrite \ - --wandb.name="${GANTRY_TASK_NAME}-eval" \ - --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1-eval" \ + --wandb.group="peteish1" \ --load_path="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/peteish1" From a36b9e0a637aa0e4ebc6fcccf83875caa4138660 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 24 Oct 2024 18:11:10 +0000 Subject: [PATCH 35/58] Fix bug --- scripts/beaker/peteish/peteish1-eval-launch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/beaker/peteish/peteish1-eval-launch.sh b/scripts/beaker/peteish/peteish1-eval-launch.sh index 317dd190b..09ee396cc 100644 --- a/scripts/beaker/peteish/peteish1-eval-launch.sh +++ b/scripts/beaker/peteish/peteish1-eval-launch.sh @@ -38,4 +38,4 @@ gantry run \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" \ No newline at end of file + -- /bin/bash -c "scripts/beaker/peteish/peteish1-eval.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" \ No newline at end of file From 
97d78ede5d712c9491e0c8d0ae3398ed3e2626af Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 24 Oct 2024 18:25:26 +0000 Subject: [PATCH 36/58] Revert Peteish7 changes --- configs/peteish7-weka.yaml | 280 +++------------------- scripts/beaker/peteish/peteish1-launch.sh | 2 +- scripts/beaker/peteish/peteish1.sh | 2 +- scripts/beaker/peteish/peteish7-launch.sh | 20 +- scripts/beaker/peteish/peteish7.sh | 13 +- 5 files changed, 50 insertions(+), 267 deletions(-) diff --git a/configs/peteish7-weka.yaml b/configs/peteish7-weka.yaml index 1e96ffbe2..5980de319 100644 --- a/configs/peteish7-weka.yaml +++ b/configs/peteish7-weka.yaml @@ -107,35 +107,35 @@ eval_interval: 1000 eval_subset_num_batches: -1 device_eval_batch_size: ${device_train_microbatch_size} evaluators: - - label: all-small-ppl-validation - data: - num_workers: 0 - drop_last: true - # generate_doc_lengths: true - memmap_dtype: uint32 - datasets: - c4_en-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy - dolma_books-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy - dolma_common-crawl-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy - dolma_pes2o-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy - dolma_reddit-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy - dolma_stack-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy - dolma_wiki-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy - ice-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy - m2d2_s2orc-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy - pile-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy - wikitext_103-validation: - - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # # generate_doc_lengths: true + # memmap_dtype: uint32 + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + 
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy ########################## # Downstream evaluations # @@ -230,228 +230,6 @@ evaluators: - label: arc_easy_ppl type: downstream - - label: piqa_rc_0shot - type: downstream - - - label: piqa_rc_0shot_bpb - type: downstream - - - label: piqa_rc_5shot - type: downstream - - - label: piqa_rc_5shot_bpb - type: downstream - - - label: piqa_mc_5shot - type: downstream - - - label: piqa_mc_5shot_bpb - type: downstream - - - label: hellaswag_rc_0shot - type: downstream - - - label: hellaswag_rc_0shot_bpb - type: downstream - - - label: hellaswag_rc_5shot - type: downstream - - - label: hellaswag_rc_5shot_bpb - type: downstream - - - label: hellaswag_mc_5shot - type: downstream - - - label: hellaswag_mc_5shot_bpb - type: downstream - - - label: winogrande_rc_0shot - type: downstream - - - label: winogrande_rc_0shot_bpb - type: downstream - - - label: winogrande_rc_5shot - type: downstream - - - label: winogrande_rc_5shot_bpb - type: downstream - - - label: winogrande_mc_5shot - type: downstream - - - label: winogrande_mc_5shot_bpb - type: downstream - - - label: openbookqa_rc_0shot - type: downstream - - - label: openbookqa_rc_0shot_bpb - type: downstream - - - label: openbookqa_rc_5shot - type: downstream - - - label: openbookqa_rc_5shot_bpb - type: downstream - - - label: openbookqa_mc_5shot - type: downstream - - - label: openbookqa_mc_5shot_bpb - type: downstream - - - label: boolq_rc_0shot - type: downstream - - - label: boolq_rc_0shot_bpb - type: downstream - - - label: boolq_rc_5shot - type: downstream - - - label: boolq_rc_5shot_bpb - type: downstream - - - label: boolq_mc_5shot - type: downstream - - - label: boolq_mc_5shot_bpb - type: downstream - - - label: sciq_rc_0shot - type: downstream - - - label: sciq_rc_0shot_bpb - type: downstream - - # - label: sciq_rc_5shot - # type: downstream - - # - label: sciq_rc_5shot_bpb - # type: downstream - - # - label: sciq_mc_5shot - # type: downstream - - # - label: sciq_mc_5shot_bpb - # type: downstream - - - label: arc_easy_rc_0shot - type: downstream - - - label: arc_easy_rc_0shot_bpb - type: downstream - - - label: arc_easy_rc_5shot - type: downstream - - - label: arc_easy_rc_5shot_bpb - type: downstream - - - label: arc_easy_mc_5shot - type: downstream - - - label: arc_easy_mc_5shot_bpb - type: downstream - - - label: arc_challenge_rc_0shot - type: downstream - - - label: arc_challenge_rc_0shot_bpb - type: downstream - - - label: arc_challenge_rc_5shot - type: downstream - - - label: arc_challenge_rc_5shot_bpb - type: downstream - - - label: arc_challenge_mc_5shot - type: downstream - - - label: arc_challenge_mc_5shot_bpb - type: downstream - - - label: copa_rc_0shot - type: downstream - - - label: copa_rc_0shot_bpb - type: downstream - - # - label: copa_rc_5shot - # type: downstream - - # - label: copa_rc_5shot_bpb - # type: downstream - - # - label: copa_mc_5shot - # type: 
downstream - - # - label: copa_mc_5shot_bpb - # type: downstream - - - label: csqa_rc_0shot - type: downstream - - - label: csqa_rc_0shot_bpb - type: downstream - - - label: csqa_rc_5shot - type: downstream - - - label: csqa_rc_5shot_bpb - type: downstream - - - label: csqa_mc_5shot - type: downstream - - - label: csqa_mc_5shot_bpb - type: downstream - - - label: socialiqa_rc_0shot - type: downstream - - - label: socialiqa_rc_0shot_bpb - type: downstream - - - label: socialiqa_rc_5shot - type: downstream - - - label: socialiqa_rc_5shot_bpb - type: downstream - - - label: socialiqa_mc_5shot - type: downstream - - - label: socialiqa_mc_5shot_bpb - type: downstream - - - label: mmlu_stem_var_bpb - type: downstream - - - label: mmlu_humanities_var_bpb - type: downstream - - - label: mmlu_social_sciences_var_bpb - type: downstream - - - label: mmlu_other_var_bpb - type: downstream - - - label: mmlu_stem_bpb - type: downstream - - - label: mmlu_humanities_bpb - type: downstream - - - label: mmlu_social_sciences_bpb - type: downstream - - - label: mmlu_other_bpb - type: downstream - data: pad_direction: right # generate_doc_lengths: true diff --git a/scripts/beaker/peteish/peteish1-launch.sh b/scripts/beaker/peteish/peteish1-launch.sh index d034ce3a0..ffab0df65 100755 --- a/scripts/beaker/peteish/peteish1-launch.sh +++ b/scripts/beaker/peteish/peteish1-launch.sh @@ -37,4 +37,4 @@ gantry run \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" \ No newline at end of file + -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh index 6c55bafc2..270f3e4b5 100755 --- a/scripts/beaker/peteish/peteish1.sh +++ b/scripts/beaker/peteish/peteish1.sh @@ -54,4 +54,4 @@ torchrun \ --save_interval_ephemeral=null \ --save_overwrite - # '--load_path=${path.last_checkpoint:${save_folder}}' \ \ No newline at end of file + # '--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/beaker/peteish/peteish7-launch.sh b/scripts/beaker/peteish/peteish7-launch.sh index ef74bce67..4180dee67 100755 --- a/scripts/beaker/peteish/peteish7-launch.sh +++ b/scripts/beaker/peteish/peteish7-launch.sh @@ -2,14 +2,13 @@ set -ex -NUM_NODES=8 +NUM_NODES=16 gantry run \ - --allow-dirty \ - --workspace ai2/OLMo-tiny \ - --task-name peteish7-anneal-eval \ + --workspace ai2/OLMo-pretraining-stability \ + --task-name peteish7 \ --description "Pete-ish 7B" \ - --priority high \ + --priority urgent \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ --cluster ai2/jupiter-cirrascale-2 \ @@ -27,9 +26,14 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ + --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ + --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ + --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ --shared-memory 10GiB \ --yes \ --timeout=-1 \ diff --git a/scripts/beaker/peteish/peteish7.sh b/scripts/beaker/peteish/peteish7.sh index 
179690f91..11166f700 100755 --- a/scripts/beaker/peteish/peteish7.sh +++ b/scripts/beaker/peteish/peteish7.sh @@ -25,9 +25,9 @@ pip install '.[train]' pip freeze # Move AWS credentials from env to relevant files -# mkdir -p ~/.aws -# printenv AWS_CONFIG > ~/.aws/config -# printenv AWS_CREDENTIALS > ~/.aws/credentials +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials # Force processes to synchronize at init_process_group export TORCH_DIST_INIT_BARRIER=1 @@ -48,9 +48,10 @@ torchrun \ --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ --node_rank "${BEAKER_REPLICA_RANK}" \ --rdzv_conf 'read_timeout=420' \ - scripts/eval.py \ + scripts/train.py \ configs/peteish7-weka.yaml \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=500 \ - --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-no-warmup" \ - '--load_path=${path.last_checkpoint:${save_folder}}' \ + --save_overwrite + + # '--load_path=${path.last_checkpoint:${save_folder}}' \ From 909762495f8900147c79e0a25aecade7ffdb60e0 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 24 Oct 2024 22:44:46 +0000 Subject: [PATCH 37/58] Fix lint --- scripts/eval.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/eval.py b/scripts/eval.py index 061555dae..39990ed67 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -1,6 +1,5 @@ """Run this script with 'torchrun'.""" -import gzip import logging import sys from datetime import timedelta @@ -18,16 +17,14 @@ from torch.nn.parallel import DistributedDataParallel as DDP from olmo.config import ( - CheckpointType, DDPGradSyncMode, DistributedStrategy, TrainConfig, ) -from olmo.data import build_train_dataloader from olmo.eval import build_evaluators from olmo.exceptions import OLMoCliError, OLMoConfigurationError from olmo.model import OLMo -from olmo.optim import BoltOnWarmupScheduler, build_optimizer, build_scheduler +from olmo.optim import build_optimizer, build_scheduler from olmo.torch_util import ( barrier, get_default_device, @@ -42,7 +39,6 @@ from olmo.util import ( add_cached_path_clients, clean_opt, - find_latest_checkpoint, log_extra_field, prepare_cli_environment, ) From 752b9cc973dab9ffcd01d24ecd0b5b87eededdcf Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Thu, 24 Oct 2024 22:47:50 +0000 Subject: [PATCH 38/58] Update CHANGELOG --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9752a733..29968953d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`. - Added support for flash attention and gradient checkpointing to `hf_olmo`. +- Added an eval-only script that evaluates existing checkpoints on specified tasks. ## [v0.5.0](https://github.com/allenai/OLMo/releases/tag/v0.5.0) - 2024-08-26 @@ -42,7 +43,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Swapped in correct flan data mix. - Fix bug where the attention norm, when applied before the attention block, was modifying the residual stream. - Fixed `OLMo.from_checkpoint()` so that it correctly loads `olmo_core` and `torch_new` style checkpoints. 
-- Fixed `preserve_rng_state` being incorrectly set to False when doing gradient checkpointing with dropout +- Fixed `preserve_rng_state` being incorrectly set to False when doing gradient checkpointing with dropout ## [v0.4.0](https://github.com/allenai/OLMo/releases/tag/v0.4.0) - 2024-07-11 From 6f60eddc31a65f1205a50f8bbbeb842e66dc509b Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 28 Oct 2024 19:56:29 +0000 Subject: [PATCH 39/58] Separate into TrainerForEval --- olmo/train.py | 25 ++++++++++++++++++++----- scripts/eval.py | 47 +++++++++++++++-------------------------------- 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/olmo/train.py b/olmo/train.py index b7f778bfb..89ee7fafe 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -209,11 +209,11 @@ class Trainer: cfg: TrainConfig model: OLMo dist_model: Union[DDP, FSDP] - optim: Optimizer - scheduler: Scheduler - train_loader: DataLoader device: torch.device evaluators: List[Evaluator] + optim: Optional[Optimizer] = None + scheduler: Optional[Scheduler] = None + train_loader: Optional[DataLoader] = None epoch: Optional[int] = None global_step: int = 0 global_train_examples_seen_this_epoch: int = 0 @@ -1368,8 +1368,8 @@ def close(self, exit_code: int = 0) -> None: gc.enable() else: gc.disable() - # if wandb.run is not None: - # wandb.finish(exit_code=exit_code, quiet=True) + if wandb.run is not None: + wandb.finish(exit_code=exit_code, quiet=True) def __enter__(self) -> Trainer: return self @@ -1377,3 +1377,18 @@ def __enter__(self) -> Trainer: def __exit__(self, exc_type, exc_val, exc_tb) -> None: del exc_val, exc_tb self.close(0 if exc_type is None else 1) + + +@dataclass +class TrainerForEval(Trainer): + + def close(self, exit_code: int = 0) -> None: + gc_cuda() + + if self.indices_file is not None: + self.indices_file.flush() + self.indices_file.close() + if self._gc_init_state: + gc.enable() + else: + gc.disable() diff --git a/scripts/eval.py b/scripts/eval.py index 39990ed67..a3774911a 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -35,7 +35,7 @@ peak_gpu_memory, seed_all, ) -from olmo.train import Trainer +from olmo.train import TrainerForEval from olmo.util import ( add_cached_path_clients, clean_opt, @@ -115,10 +115,6 @@ def main(cfg: TrainConfig) -> None: # Set seed. seed_all(cfg.seed) - # # Construct data loader. - # train_loader = build_train_dataloader(cfg) - train_loader = None - # Construct evaluators. evaluators = build_evaluators(cfg, device) barrier() @@ -141,8 +137,6 @@ def main(cfg: TrainConfig) -> None: log.info(f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}") log.info(f"Peak GPU Memory (MB) before {cfg.distributed_strategy}: {int(peak_gpu_memory() or 0)}") - olmo_model.set_activation_checkpointing(cfg.activation_checkpointing) - if cfg.distributed_strategy == DistributedStrategy.ddp: log.info("Wrapping model with DDP...") assert cfg.ddp is not None, "DistributedStrategy ddp needs cfg.ddp to be set!" @@ -221,41 +215,30 @@ def dummy_init_fn(module: torch.nn.Module) -> None: log.info("Model:") log.info(dist_model) - # Construct optimizer and learning rate scheduler. - optim = build_optimizer(cfg, dist_model) - scheduler = build_scheduler(cfg) - - # Data indices file. - indices_file: Optional[TextIO] = None - # Consolidate components into `Trainer` object. 
- with Trainer( + with TrainerForEval( cfg=cfg, epoch=cfg.epoch, model=olmo_model, dist_model=dist_model, - optim=optim, - scheduler=scheduler, - train_loader=train_loader, device=device, evaluators=evaluators, - indices_file=indices_file, ) as trainer: - log.info(f"Loading checkpoint from {load_path}...") - trainer.restore_checkpoint( - load_path, - load_optimizer_state=False, - load_trainer_state=False, - sharded_checkpointer=cfg.load_path_sharded_checkpointer, - ) - log.info("Checkpoint successfully loaded") + log.info(f"Loading checkpoint from {load_path}...") + trainer.restore_checkpoint( + load_path, + load_optimizer_state=False, + load_trainer_state=False, + sharded_checkpointer=cfg.load_path_sharded_checkpointer, + ) + log.info("Checkpoint successfully loaded") - log.info("Starting evaluating...") - eval_metrics = trainer.eval() - if wandb.run is not None: - wandb.log(eval_metrics, step=step) - log.info("Evaluating complete") + log.info("Starting evaluating...") + eval_metrics = trainer.eval() + if wandb.run is not None: + wandb.log(eval_metrics, step=step) + log.info("Evaluating complete") if __name__ == "__main__": From 99c0d80ee5124564016bc1c8782582f0444b8685 Mon Sep 17 00:00:00 2001 From: CodeCreator Date: Fri, 1 Nov 2024 11:24:27 -0700 Subject: [PATCH 40/58] Implement soft versions of accuracies --- olmo/eval/downstream.py | 26 ++++++++++++++++++++------ olmo/eval/evaluator.py | 15 +++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index 2a9d1d365..fb06ca90f 100644 --- a/olmo/eval/downstream.py +++ b/olmo/eval/downstream.py @@ -95,7 +95,9 @@ def update(self, batch: Dict[str, Any], lm_logits: torch.Tensor, dc_lm_logits=No torch.LongTensor((doc_id, cont_id, batch["label_id"][idx])).to(batch["label_id"][idx].device) ) - def compute(self) -> torch.Tensor: + def compute(self) -> Dict[str, torch.Tensor]: + # Task "suffix" -> tensor + # states should have been synced from all accelerators at this point # account for duplicates here because of DistributedSampler compensating for drop_last=False loglikelihood_dict: Dict[int, Dict[int, float]] = {} @@ -116,6 +118,9 @@ def compute(self) -> torch.Tensor: # compute acc correct = [] + soft_scores = [] + soft_log_scores = [] + preds: Optional[List[float]] = None labels: Optional[List[int]] = None if self.metric_type == "f1": @@ -140,14 +145,15 @@ def compute(self) -> torch.Tensor: continue if self.metric_type in ["ce_loss", "bpb"]: correct.append(loglikelihoods[0]) # Only one answer is scored - else: - correct.append(1.0 if torch.argmax(loglikelihoods).item() == label_dict[doc_id] else 0.0) - - if self.metric_type == "f1": + elif self.metric_type == "f1": assert preds is not None assert labels is not None preds.append(torch.argmax(loglikelihoods).item()) labels.append(label_dict[doc_id]) + else: + correct.append(1.0 if torch.argmax(loglikelihoods).item() == label_dict[doc_id] else 0.0) + soft_scores.append(torch.softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item()) + soft_log_scores.append(torch.log_softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item()) if self.metric_type == "f1": assert preds is not None @@ -157,7 +163,15 @@ def compute(self) -> torch.Tensor: else: score = sum(correct) / len(correct) - return torch.tensor(score) + outputs = { + "": torch.tensor(score), + } + + if soft_scores: + outputs["_soft"] = torch.tensor(sum(soft_scores) / len(soft_scores)) + outputs["_soft_log"] = torch.tensor(sum(soft_log_scores) / len(soft_log_scores)) + + 
return outputs class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta): diff --git a/olmo/eval/evaluator.py b/olmo/eval/evaluator.py index 29ef049d7..e71a7d859 100644 --- a/olmo/eval/evaluator.py +++ b/olmo/eval/evaluator.py @@ -29,11 +29,14 @@ def reset_metrics(self) -> None: def compute_metrics(self) -> Dict[str, float]: if self.type == EvaluatorType.downstream: assert isinstance(self.eval_metric, ICLMetric) - value = self.eval_metric.compute().item() - key = f"eval/downstream/{self.label}_{self.eval_metric.metric_type}" - if self.eval_metric.metric_type in ["ce_loss", "bpb"]: - key = key.replace("/downstream/", f"/downstream_{self.eval_metric.metric_type}/") - return {key: value} + suffix_to_value = self.eval_metric.compute() + outputs = {} + for suffix, value in suffix_to_value.items(): + key = f"eval/downstream/{self.label}_{self.eval_metric.metric_type}{suffix}" + if self.eval_metric.metric_type in ["ce_loss", "bpb"]: + key = key.replace("/downstream/", f"/downstream_{self.eval_metric.metric_type}/") + outputs[key] = value.item() + return outputs elif self.type == EvaluatorType.lm: # Metric(s) = cross entropy loss metrics: Dict[str, Metric] @@ -52,7 +55,7 @@ def compute_metrics(self) -> Dict[str, float]: # This can happen when the evaluator contains multiple tasks/datasets and we didn't # get to this one within the current evaluation loop. metric.update(0.0, 0.0) - loss = metric.compute() + loss = metric.compute()[""] # always no suffix if loss.isnan().item(): # This can happen when the evaluator contains multiple tasks/datasets and we didn't # get to this one within the current evaluation loop. From 9ae7fccf767ba4cf17b817ef5f1e843c64db974a Mon Sep 17 00:00:00 2001 From: CodeCreator Date: Fri, 1 Nov 2024 11:33:56 -0700 Subject: [PATCH 41/58] Revert "Implement soft versions of accuracies" This reverts commit 99c0d80ee5124564016bc1c8782582f0444b8685. 
--- olmo/eval/downstream.py | 26 ++++++-------------------- olmo/eval/evaluator.py | 15 ++++++--------- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index fb06ca90f..2a9d1d365 100644 --- a/olmo/eval/downstream.py +++ b/olmo/eval/downstream.py @@ -95,9 +95,7 @@ def update(self, batch: Dict[str, Any], lm_logits: torch.Tensor, dc_lm_logits=No torch.LongTensor((doc_id, cont_id, batch["label_id"][idx])).to(batch["label_id"][idx].device) ) - def compute(self) -> Dict[str, torch.Tensor]: - # Task "suffix" -> tensor - + def compute(self) -> torch.Tensor: # states should have been synced from all accelerators at this point # account for duplicates here because of DistributedSampler compensating for drop_last=False loglikelihood_dict: Dict[int, Dict[int, float]] = {} @@ -118,9 +116,6 @@ def compute(self) -> Dict[str, torch.Tensor]: # compute acc correct = [] - soft_scores = [] - soft_log_scores = [] - preds: Optional[List[float]] = None labels: Optional[List[int]] = None if self.metric_type == "f1": @@ -145,15 +140,14 @@ def compute(self) -> Dict[str, torch.Tensor]: continue if self.metric_type in ["ce_loss", "bpb"]: correct.append(loglikelihoods[0]) # Only one answer is scored - elif self.metric_type == "f1": + else: + correct.append(1.0 if torch.argmax(loglikelihoods).item() == label_dict[doc_id] else 0.0) + + if self.metric_type == "f1": assert preds is not None assert labels is not None preds.append(torch.argmax(loglikelihoods).item()) labels.append(label_dict[doc_id]) - else: - correct.append(1.0 if torch.argmax(loglikelihoods).item() == label_dict[doc_id] else 0.0) - soft_scores.append(torch.softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item()) - soft_log_scores.append(torch.log_softmax(loglikelihoods, dim=0)[label_dict[doc_id]].item()) if self.metric_type == "f1": assert preds is not None @@ -163,15 +157,7 @@ def compute(self) -> Dict[str, torch.Tensor]: else: score = sum(correct) / len(correct) - outputs = { - "": torch.tensor(score), - } - - if soft_scores: - outputs["_soft"] = torch.tensor(sum(soft_scores) / len(soft_scores)) - outputs["_soft_log"] = torch.tensor(sum(soft_log_scores) / len(soft_log_scores)) - - return outputs + return torch.tensor(score) class ICLMultiChoiceTaskDataset(metaclass=abc.ABCMeta): diff --git a/olmo/eval/evaluator.py b/olmo/eval/evaluator.py index e71a7d859..29ef049d7 100644 --- a/olmo/eval/evaluator.py +++ b/olmo/eval/evaluator.py @@ -29,14 +29,11 @@ def reset_metrics(self) -> None: def compute_metrics(self) -> Dict[str, float]: if self.type == EvaluatorType.downstream: assert isinstance(self.eval_metric, ICLMetric) - suffix_to_value = self.eval_metric.compute() - outputs = {} - for suffix, value in suffix_to_value.items(): - key = f"eval/downstream/{self.label}_{self.eval_metric.metric_type}{suffix}" - if self.eval_metric.metric_type in ["ce_loss", "bpb"]: - key = key.replace("/downstream/", f"/downstream_{self.eval_metric.metric_type}/") - outputs[key] = value.item() - return outputs + value = self.eval_metric.compute().item() + key = f"eval/downstream/{self.label}_{self.eval_metric.metric_type}" + if self.eval_metric.metric_type in ["ce_loss", "bpb"]: + key = key.replace("/downstream/", f"/downstream_{self.eval_metric.metric_type}/") + return {key: value} elif self.type == EvaluatorType.lm: # Metric(s) = cross entropy loss metrics: Dict[str, Metric] @@ -55,7 +52,7 @@ def compute_metrics(self) -> Dict[str, float]: # This can happen when the evaluator contains multiple tasks/datasets and 
we didn't # get to this one within the current evaluation loop. metric.update(0.0, 0.0) - loss = metric.compute()[""] # always no suffix + loss = metric.compute() if loss.isnan().item(): # This can happen when the evaluator contains multiple tasks/datasets and we didn't # get to this one within the current evaluation loop. From 54248d18bb746daa2ff035588eb8e11cabce3030 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Fri, 1 Nov 2024 22:18:47 +0000 Subject: [PATCH 42/58] Eval peteish7 --- configs/peteish7-weka.yaml | 280 ++++++++++++++++-- .../beaker/peteish/peteish7-eval-launch.sh | 40 +++ scripts/beaker/peteish/peteish7-eval.sh | 59 ++++ scripts/eval.py | 4 +- 4 files changed, 353 insertions(+), 30 deletions(-) create mode 100755 scripts/beaker/peteish/peteish7-eval-launch.sh create mode 100755 scripts/beaker/peteish/peteish7-eval.sh diff --git a/configs/peteish7-weka.yaml b/configs/peteish7-weka.yaml index 5980de319..1e96ffbe2 100644 --- a/configs/peteish7-weka.yaml +++ b/configs/peteish7-weka.yaml @@ -107,35 +107,35 @@ eval_interval: 1000 eval_subset_num_batches: -1 device_eval_batch_size: ${device_train_microbatch_size} evaluators: - # - label: all-small-ppl-validation - # data: - # num_workers: 0 - # drop_last: true - # # generate_doc_lengths: true - # memmap_dtype: uint32 - # datasets: - # c4_en-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy - # dolma_books-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy - # dolma_common-crawl-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy - # dolma_pes2o-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy - # dolma_reddit-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy - # dolma_stack-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy - # dolma_wiki-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy - # ice-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy - # m2d2_s2orc-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy - # pile-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy - # wikitext_103-validation: - # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + # generate_doc_lengths: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - 
/weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy ########################## # Downstream evaluations # @@ -230,6 +230,228 @@ evaluators: - label: arc_easy_ppl type: downstream + - label: piqa_rc_0shot + type: downstream + + - label: piqa_rc_0shot_bpb + type: downstream + + - label: piqa_rc_5shot + type: downstream + + - label: piqa_rc_5shot_bpb + type: downstream + + - label: piqa_mc_5shot + type: downstream + + - label: piqa_mc_5shot_bpb + type: downstream + + - label: hellaswag_rc_0shot + type: downstream + + - label: hellaswag_rc_0shot_bpb + type: downstream + + - label: hellaswag_rc_5shot + type: downstream + + - label: hellaswag_rc_5shot_bpb + type: downstream + + - label: hellaswag_mc_5shot + type: downstream + + - label: hellaswag_mc_5shot_bpb + type: downstream + + - label: winogrande_rc_0shot + type: downstream + + - label: winogrande_rc_0shot_bpb + type: downstream + + - label: winogrande_rc_5shot + type: downstream + + - label: winogrande_rc_5shot_bpb + type: downstream + + - label: winogrande_mc_5shot + type: downstream + + - label: winogrande_mc_5shot_bpb + type: downstream + + - label: openbookqa_rc_0shot + type: downstream + + - label: openbookqa_rc_0shot_bpb + type: downstream + + - label: openbookqa_rc_5shot + type: downstream + + - label: openbookqa_rc_5shot_bpb + type: downstream + + - label: openbookqa_mc_5shot + type: downstream + + - label: openbookqa_mc_5shot_bpb + type: downstream + + - label: boolq_rc_0shot + type: downstream + + - label: boolq_rc_0shot_bpb + type: downstream + + - label: boolq_rc_5shot + type: downstream + + - label: boolq_rc_5shot_bpb + type: downstream + + - label: boolq_mc_5shot + type: downstream + + - label: boolq_mc_5shot_bpb + type: downstream + + - label: sciq_rc_0shot + type: downstream + + - label: sciq_rc_0shot_bpb + type: downstream + + # - label: sciq_rc_5shot + # type: downstream + + # - label: sciq_rc_5shot_bpb + # type: downstream + + # - label: sciq_mc_5shot + # type: downstream + + # - label: sciq_mc_5shot_bpb + # type: downstream + + - label: arc_easy_rc_0shot + type: downstream + + - label: arc_easy_rc_0shot_bpb + type: downstream + + - label: arc_easy_rc_5shot + type: downstream + + - label: arc_easy_rc_5shot_bpb + type: downstream + + - label: arc_easy_mc_5shot + type: downstream + + - label: arc_easy_mc_5shot_bpb + type: downstream + + - label: arc_challenge_rc_0shot + type: downstream + + - label: arc_challenge_rc_0shot_bpb + type: downstream + + - label: arc_challenge_rc_5shot + type: downstream + + 
- label: arc_challenge_rc_5shot_bpb + type: downstream + + - label: arc_challenge_mc_5shot + type: downstream + + - label: arc_challenge_mc_5shot_bpb + type: downstream + + - label: copa_rc_0shot + type: downstream + + - label: copa_rc_0shot_bpb + type: downstream + + # - label: copa_rc_5shot + # type: downstream + + # - label: copa_rc_5shot_bpb + # type: downstream + + # - label: copa_mc_5shot + # type: downstream + + # - label: copa_mc_5shot_bpb + # type: downstream + + - label: csqa_rc_0shot + type: downstream + + - label: csqa_rc_0shot_bpb + type: downstream + + - label: csqa_rc_5shot + type: downstream + + - label: csqa_rc_5shot_bpb + type: downstream + + - label: csqa_mc_5shot + type: downstream + + - label: csqa_mc_5shot_bpb + type: downstream + + - label: socialiqa_rc_0shot + type: downstream + + - label: socialiqa_rc_0shot_bpb + type: downstream + + - label: socialiqa_rc_5shot + type: downstream + + - label: socialiqa_rc_5shot_bpb + type: downstream + + - label: socialiqa_mc_5shot + type: downstream + + - label: socialiqa_mc_5shot_bpb + type: downstream + + - label: mmlu_stem_var_bpb + type: downstream + + - label: mmlu_humanities_var_bpb + type: downstream + + - label: mmlu_social_sciences_var_bpb + type: downstream + + - label: mmlu_other_var_bpb + type: downstream + + - label: mmlu_stem_bpb + type: downstream + + - label: mmlu_humanities_bpb + type: downstream + + - label: mmlu_social_sciences_bpb + type: downstream + + - label: mmlu_other_bpb + type: downstream + data: pad_direction: right # generate_doc_lengths: true diff --git a/scripts/beaker/peteish/peteish7-eval-launch.sh b/scripts/beaker/peteish/peteish7-eval-launch.sh new file mode 100755 index 000000000..bb318651e --- /dev/null +++ b/scripts/beaker/peteish/peteish7-eval-launch.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=1 + +gantry run \ + --workspace ai2/OLMo-tiny \ + --task-name peteish7-eval \ + --description "Pete-ish 7B eval" \ + --priority high \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --propagate-failure \ + --propagate-preemption \ + --synchronized-start-timeout 90m \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env R2_PROFILE=R2 \ + --env S3_PROFILE=S3 \ + --env WEKA_PROFILE=WEKA \ + --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ + --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ + --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ + --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ + --env-secret WANDB_API_KEY=JIACHENGL_WANDB_API_KEY \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/peteish/peteish7-eval.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/peteish/peteish7-eval.sh b/scripts/beaker/peteish/peteish7-eval.sh new file mode 100755 index 000000000..ca623d8aa --- /dev/null +++ b/scripts/beaker/peteish/peteish7-eval.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Setup Python environment. 
+conda shell.bash activate base + +# Install flash-attn +#conda install -y -c nvidia cuda-python +pip install packaging ninja +export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE +pip install flash-attn==2.5.9.post1 --no-build-isolation +# pip install awscli +pip install '.[train]' +pip freeze + +# Move AWS credentials from env to relevant files +mkdir -p ~/.aws +printenv AWS_CONFIG > ~/.aws/config +printenv AWS_CREDENTIALS > ~/.aws/credentials + +# Force processes to synchronize at init_process_group +export TORCH_DIST_INIT_BARRIER=1 + +# Tell OLMo all ranks share the same filesystem for checkpoints. +export OLMO_SHARED_FS=1 + +export NCCL_DEBUG=INFO +export NCCL_IB_HCA="^=mlx5_bond_0" +export NCCL_SOCKET_IFNAME=ib +# export NCCL_IB_GID_INDEX=0 + +torchrun \ + --nnodes "${NUM_NODES}:${NUM_NODES}" \ + --nproc-per-node 8 \ + --rdzv_id 12347 \ + --rdzv_backend static \ + --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ + --node_rank "${BEAKER_REPLICA_RANK}" \ + --rdzv_conf 'read_timeout=420' \ + scripts/eval.py \ + configs/peteish7-weka.yaml \ + --run_name="${GANTRY_TASK_NAME}" \ + --save_interval_ephemeral=500 \ + --save_overwrite \ + --wandb.group="peteish7" \ + --load_path="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7" + + # '--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/eval.py b/scripts/eval.py index a3774911a..9c55bff0f 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -125,7 +125,9 @@ def main(cfg: TrainConfig) -> None: load_paths = [cfg.load_path] else: # This globbing does not work with remote paths. - load_paths = list(sorted(glob.glob(f"{cfg.load_path}/step*"), key=lambda x: int(x.split('/')[-1].split('step')[-1]))) + load_paths = list(glob.glob(f"{cfg.load_path}/step*")) + load_paths = [x for x in load_paths if x[-1].isdigit()] + load_paths = list(sorted(load_paths, key=lambda x: int(x.split('/')[-1].split('step')[-1]))) for load_path in load_paths: step = int(load_path.split('/')[-1].split('step')[-1]) From dfdd5dce672e092fd9a5d177bf9c73c50fdbbff3 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Fri, 1 Nov 2024 22:26:34 +0000 Subject: [PATCH 43/58] Add back optim and scheduler --- scripts/eval.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/eval.py b/scripts/eval.py index 9c55bff0f..af0a39981 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -217,6 +217,10 @@ def dummy_init_fn(module: torch.nn.Module) -> None: log.info("Model:") log.info(dist_model) + # Construct optimizer and learning rate scheduler. + optim = build_optimizer(cfg, dist_model) + scheduler = build_scheduler(cfg) + # Consolidate components into `Trainer` object. 
with TrainerForEval( cfg=cfg, @@ -225,6 +229,8 @@ def dummy_init_fn(module: torch.nn.Module) -> None: dist_model=dist_model, device=device, evaluators=evaluators, + optim=optim, + scheduler=scheduler, ) as trainer: log.info(f"Loading checkpoint from {load_path}...") From 52347a0af175a02d4424ced155e9f067bc054a04 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Fri, 1 Nov 2024 22:49:08 +0000 Subject: [PATCH 44/58] Set eval_batch_size=16 --- scripts/beaker/peteish/peteish7-eval-launch.sh | 3 ++- scripts/beaker/peteish/peteish7-eval.sh | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/beaker/peteish/peteish7-eval-launch.sh b/scripts/beaker/peteish/peteish7-eval-launch.sh index bb318651e..0c56d96c3 100755 --- a/scripts/beaker/peteish/peteish7-eval-launch.sh +++ b/scripts/beaker/peteish/peteish7-eval-launch.sh @@ -2,9 +2,10 @@ set -ex -NUM_NODES=1 +NUM_NODES=2 gantry run \ + --allow-dirty \ --workspace ai2/OLMo-tiny \ --task-name peteish7-eval \ --description "Pete-ish 7B eval" \ diff --git a/scripts/beaker/peteish/peteish7-eval.sh b/scripts/beaker/peteish/peteish7-eval.sh index ca623d8aa..7a79ae9c0 100755 --- a/scripts/beaker/peteish/peteish7-eval.sh +++ b/scripts/beaker/peteish/peteish7-eval.sh @@ -53,6 +53,7 @@ torchrun \ --run_name="${GANTRY_TASK_NAME}" \ --save_interval_ephemeral=500 \ --save_overwrite \ + --device_eval_batch_size=16 \ --wandb.group="peteish7" \ --load_path="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7" From 7826a92191e9b446490f560595d745c7d1e81e7c Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 4 Nov 2024 04:39:59 +0000 Subject: [PATCH 45/58] Temporary: resume from step712000 --- scripts/beaker/peteish/peteish7-eval-launch.sh | 2 +- scripts/eval.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/beaker/peteish/peteish7-eval-launch.sh b/scripts/beaker/peteish/peteish7-eval-launch.sh index 0c56d96c3..7cd60ba69 100755 --- a/scripts/beaker/peteish/peteish7-eval-launch.sh +++ b/scripts/beaker/peteish/peteish7-eval-launch.sh @@ -2,7 +2,7 @@ set -ex -NUM_NODES=2 +NUM_NODES=8 gantry run \ --allow-dirty \ diff --git a/scripts/eval.py b/scripts/eval.py index af0a39981..da0b97fd2 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -127,6 +127,7 @@ def main(cfg: TrainConfig) -> None: # This globbing does not work with remote paths. load_paths = list(glob.glob(f"{cfg.load_path}/step*")) load_paths = [x for x in load_paths if x[-1].isdigit()] + load_paths = [x for x in load_paths if int(x.split('/')[-1].split('step')[-1]) >= 712000] # TODO: delete this load_paths = list(sorted(load_paths, key=lambda x: int(x.split('/')[-1].split('step')[-1]))) for load_path in load_paths: From a4fd271e7efe2c8b4188beeb9b4fae5336439433 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 5 Nov 2024 21:22:23 +0000 Subject: [PATCH 46/58] Revert 712000 --- scripts/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval.py b/scripts/eval.py index da0b97fd2..613eff935 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -127,7 +127,7 @@ def main(cfg: TrainConfig) -> None: # This globbing does not work with remote paths. 
load_paths = list(glob.glob(f"{cfg.load_path}/step*")) load_paths = [x for x in load_paths if x[-1].isdigit()] - load_paths = [x for x in load_paths if int(x.split('/')[-1].split('step')[-1]) >= 712000] # TODO: delete this + # load_paths = [x for x in load_paths if int(x.split('/')[-1].split('step')[-1]) >= 712000] # TODO: delete this load_paths = list(sorted(load_paths, key=lambda x: int(x.split('/')[-1].split('step')[-1]))) for load_path in load_paths: From 90523d0c9fad7e3320e04bbdb72ff93ea349dffe Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Fri, 8 Nov 2024 21:25:02 +0000 Subject: [PATCH 47/58] Support unsharded ckpt; Throw exception for remote load_path --- scripts/eval.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/eval.py b/scripts/eval.py index 613eff935..08a60067a 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -121,17 +121,18 @@ def main(cfg: TrainConfig) -> None: if cfg.load_path is None: raise OLMoConfigurationError("To run eval you must provide a load_path") + elif "://" in cfg.load_path: + raise OLMoConfigurationError("Eval does not support remote paths. Please specify a local path or WEKA mounted path.") if 'step' in cfg.load_path.split('/')[-1]: load_paths = [cfg.load_path] else: - # This globbing does not work with remote paths. + # This globbing only works with local paths load_paths = list(glob.glob(f"{cfg.load_path}/step*")) load_paths = [x for x in load_paths if x[-1].isdigit()] - # load_paths = [x for x in load_paths if int(x.split('/')[-1].split('step')[-1]) >= 712000] # TODO: delete this - load_paths = list(sorted(load_paths, key=lambda x: int(x.split('/')[-1].split('step')[-1]))) + load_paths = list(sorted(load_paths, key=lambda x: int(x.split('/')[-1].replace('-unsharded', '').split('step')[-1]))) for load_path in load_paths: - step = int(load_path.split('/')[-1].split('step')[-1]) + step = int(load_path.split('/')[-1].replace('-unsharded', '').split('step')[-1]) # Initialize the model. log.info("Building model...") From 6610f1bf4d0f65516a95a8a7856db1c54d9c1c1b Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 11 Nov 2024 21:56:26 +0000 Subject: [PATCH 48/58] Evaluate HF models --- scripts/eval_hf.py | 192 +++++++++++++++++++++++++++++++++++++++++++++ scripts/eval_hf.sh | 30 +++++++ 2 files changed, 222 insertions(+) create mode 100644 scripts/eval_hf.py create mode 100644 scripts/eval_hf.sh diff --git a/scripts/eval_hf.py b/scripts/eval_hf.py new file mode 100644 index 000000000..9974eafad --- /dev/null +++ b/scripts/eval_hf.py @@ -0,0 +1,192 @@ +from itertools import islice +import json +import os +import sys +from tqdm import tqdm +from typing import Any, Dict +import torch +import torch.nn.functional as F +import transformers +from olmo.config import TrainConfig, EvaluatorConfig, EvaluatorType +from olmo.eval import build_evaluator +from olmo.torch_util import move_to_device +from olmo.exceptions import OLMoCliError + + +def get_labels(batch: Dict[str, Any]) -> torch.Tensor: + # Labels are just input IDs shifted to the left (first item is ignored). 
+ labels, label_mask, attention_mask, instance_mask = ( + batch["input_ids"].clone(), + batch.get("label_mask"), + batch.get("attention_mask"), + batch.get("instance_mask"), + ) + if label_mask is not None: + labels.masked_fill_(~label_mask, -100) + if attention_mask is not None: + labels.masked_fill_(attention_mask == 0.0, -100) + if instance_mask is not None: + labels.masked_fill_(~instance_mask.unsqueeze(-1), value=-100) + return labels[..., 1:].contiguous() + +def main(cfg: TrainConfig, model_name: str): + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + model = transformers.AutoModelForCausalLM.from_pretrained(model_name) + model.to(device) + model.eval() + + cfg.evaluators = [ + EvaluatorConfig(label="piqa_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="piqa_rc_0shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="piqa_rc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="piqa_rc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="piqa_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="piqa_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="hellaswag_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="hellaswag_rc_0shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="hellaswag_rc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="hellaswag_rc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="hellaswag_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="hellaswag_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="winogrande_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="winogrande_rc_0shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="winogrande_rc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="winogrande_rc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="winogrande_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="winogrande_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="openbookqa_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="openbookqa_rc_0shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="openbookqa_rc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="openbookqa_rc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="openbookqa_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="openbookqa_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="boolq_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="boolq_rc_0shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="boolq_rc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="boolq_rc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="boolq_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="boolq_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="sciq_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="sciq_rc_0shot_bpb", type=EvaluatorType.downstream), + # EvaluatorConfig(label="sciq_rc_5shot", type=EvaluatorType.downstream), + # EvaluatorConfig(label="sciq_rc_5shot_bpb", type=EvaluatorType.downstream), + # EvaluatorConfig(label="sciq_mc_5shot", type=EvaluatorType.downstream), + # 
EvaluatorConfig(label="sciq_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_easy_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_easy_rc_0shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_easy_rc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_easy_rc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_easy_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_easy_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_challenge_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_challenge_rc_0shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_challenge_rc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_challenge_rc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_challenge_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="arc_challenge_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="copa_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="copa_rc_0shot_bpb", type=EvaluatorType.downstream), + # EvaluatorConfig(label="copa_rc_5shot", type=EvaluatorType.downstream), + # EvaluatorConfig(label="copa_rc_5shot_bpb", type=EvaluatorType.downstream), + # EvaluatorConfig(label="copa_mc_5shot", type=EvaluatorType.downstream), + # EvaluatorConfig(label="copa_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="csqa_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="csqa_rc_0shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="csqa_rc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="csqa_rc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="csqa_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="csqa_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="socialiqa_rc_0shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="socialiqa_rc_0shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="socialiqa_rc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="socialiqa_rc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="socialiqa_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="socialiqa_mc_5shot_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_stem_var", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_humanities_var", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_social_sciences_var", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_other_var", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_stem_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_humanities_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_social_sciences_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_other_mc_5shot", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_stem_var_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_humanities_var_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_social_sciences_var_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_other_var_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_stem_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_humanities_bpb", 
type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_social_sciences_bpb", type=EvaluatorType.downstream), + EvaluatorConfig(label="mmlu_other_bpb", type=EvaluatorType.downstream), + ] + + evaluators = [] + for eval_cfg in cfg.evaluators: + evaluators.append(build_evaluator(cfg, eval_cfg, tokenizer, device)) + + eval_metrics = {} + for evaluator in tqdm(evaluators): + # Reset metrics. + evaluator.reset_metrics() + + # Initialize data loader iterator. + eval_batches = iter(evaluator.eval_loader) + + # Adjust how many batches to evaluate on. + num_eval_batches = ( + evaluator.subset_num_batches + if evaluator.subset_num_batches is not None + else cfg.eval_subset_num_batches + ) + if num_eval_batches > 0: + num_eval_batches = min(num_eval_batches, len(evaluator.eval_loader)) + eval_batches = islice(eval_batches, num_eval_batches) + + # Run model over batches. + for eval_step, eval_batch in enumerate(eval_batches): + batch = move_to_device(eval_batch, device) + with torch.no_grad(): + with torch.autocast("cuda", enabled=True, dtype=cfg.autocast_precision): + logits = model( + input_ids=batch["input_ids"], + attention_mask=batch.get("attention_mask"), + ).logits + logits_for_loss = logits[..., :-1, :].contiguous() + # shape: (batch_size * seq_len, vocab_size) + logits_for_loss = logits_for_loss.view(-1, logits_for_loss.size(-1)) + # shape: (batch_size, seq_len) + labels = get_labels(batch) + # shape: (batch_size * seq_len,) + labels = labels.view(-1) + ce_loss = F.cross_entropy(logits_for_loss, labels, ignore_index=-100, reduction="none") + # Reshape (batch_size * seq_len,) -> (batch_size, seq_len) + ce_loss = ce_loss.view(batch["input_ids"].shape[0], -1) + ce_loss = ce_loss.mean(dim=-1) + evaluator.update_metrics(batch, ce_loss, logits) + + # Get final metrics. 
+ metrics = evaluator.compute_metrics() + eval_metrics.update(metrics) + print(eval_metrics) + + save_folder = f'/weka/oe-training-default/jiachengl/hc-law/eval_bpb_mc' + if not os.path.exists(save_folder): + os.makedirs(save_folder) + with open(f'{save_folder}/{model_name.replace("/", "_")}.json', 'w') as f: + json.dump(eval_metrics, f) + + del eval_batches + + +if __name__ == "__main__": + + try: + yaml_path, model_name = sys.argv[1], sys.argv[2] + except IndexError: + raise OLMoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [MODEL_NAME]") + + cfg = TrainConfig.load(yaml_path) + main(cfg, model_name) diff --git a/scripts/eval_hf.sh b/scripts/eval_hf.sh new file mode 100644 index 000000000..c8f55f2d5 --- /dev/null +++ b/scripts/eval_hf.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -ex + +gantry run \ + --allow-dirty \ + --workspace ai2/OLMo-tiny \ + --task-name eval-bpb-mc \ + --description "Evaluate open-weight models" \ + --priority high \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 1 \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --no-python \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --shared-memory 10GiB \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "\ + set -exuo pipefail; \ + IFS=$'\n\t'; \ + conda shell.bash activate base; \ + torchrun --nproc-per-node 1 scripts/eval_hf.py configs/peteish1-weka.yaml allenai/OLMo-1B-0724-hf; \ + " From 1fe0a3618354ab95d2d27b4b699fae4456e6080e Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 11 Nov 2024 22:26:52 +0000 Subject: [PATCH 49/58] Update --- scripts/eval_hf.py | 17 ++++++++++------- scripts/eval_hf.sh | 7 +++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/scripts/eval_hf.py b/scripts/eval_hf.py index 9974eafad..5435e7a34 100644 --- a/scripts/eval_hf.py +++ b/scripts/eval_hf.py @@ -38,6 +38,7 @@ def main(cfg: TrainConfig, model_name: str): model.to(device) model.eval() + cfg.device_eval_batch_size = 64 cfg.evaluators = [ EvaluatorConfig(label="piqa_rc_0shot", type=EvaluatorType.downstream), EvaluatorConfig(label="piqa_rc_0shot_bpb", type=EvaluatorType.downstream), @@ -170,16 +171,18 @@ def main(cfg: TrainConfig, model_name: str): # Get final metrics. 
metrics = evaluator.compute_metrics() eval_metrics.update(metrics) - print(eval_metrics) - - save_folder = f'/weka/oe-training-default/jiachengl/hc-law/eval_bpb_mc' - if not os.path.exists(save_folder): - os.makedirs(save_folder) - with open(f'{save_folder}/{model_name.replace("/", "_")}.json', 'w') as f: - json.dump(eval_metrics, f) + print(metrics) del eval_batches + print(eval_metrics) + + save_folder = f'/weka/oe-training-default/jiachengl/hc-law/eval_bpb_mc' + if not os.path.exists(save_folder): + os.makedirs(save_folder) + with open(f'{save_folder}/{model_name.replace("/", "_")}.json', 'w') as f: + json.dump(eval_metrics, f) + if __name__ == "__main__": diff --git a/scripts/eval_hf.sh b/scripts/eval_hf.sh index c8f55f2d5..45426284f 100644 --- a/scripts/eval_hf.sh +++ b/scripts/eval_hf.sh @@ -2,11 +2,13 @@ set -ex +MODEL_NAME=$1 + gantry run \ --allow-dirty \ --workspace ai2/OLMo-tiny \ --task-name eval-bpb-mc \ - --description "Evaluate open-weight models" \ + --description "Evaluate bpb and mc for ${MODEL_NAME}" \ --priority high \ --preemptible \ --beaker-image petew/olmo-torch23-gantry \ @@ -26,5 +28,6 @@ gantry run \ set -exuo pipefail; \ IFS=$'\n\t'; \ conda shell.bash activate base; \ - torchrun --nproc-per-node 1 scripts/eval_hf.py configs/peteish1-weka.yaml allenai/OLMo-1B-0724-hf; \ + pip install '.[train]'; \ + torchrun --nproc-per-node 1 scripts/eval_hf.py configs/peteish1-weka.yaml ${MODEL_NAME}; \ " From ae054cf4fbd3f0627a39a0ade70704e35abbe63e Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 11 Nov 2024 22:38:20 +0000 Subject: [PATCH 50/58] Update --- scripts/eval_hf.py | 2 +- scripts/eval_hf.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval_hf.py b/scripts/eval_hf.py index 5435e7a34..ed5f898fc 100644 --- a/scripts/eval_hf.py +++ b/scripts/eval_hf.py @@ -38,7 +38,7 @@ def main(cfg: TrainConfig, model_name: str): model.to(device) model.eval() - cfg.device_eval_batch_size = 64 + cfg.device_eval_batch_size = 4 cfg.evaluators = [ EvaluatorConfig(label="piqa_rc_0shot", type=EvaluatorType.downstream), EvaluatorConfig(label="piqa_rc_0shot_bpb", type=EvaluatorType.downstream), diff --git a/scripts/eval_hf.sh b/scripts/eval_hf.sh index 45426284f..cedf43937 100644 --- a/scripts/eval_hf.sh +++ b/scripts/eval_hf.sh @@ -23,7 +23,7 @@ gantry run \ --env OLMO_TASK=model \ --shared-memory 10GiB \ --yes \ - --timeout=-1 \ + --timeout=0 \ -- /bin/bash -c "\ set -exuo pipefail; \ IFS=$'\n\t'; \ From 4a4a5b1fe0d54592c972832e301bb6d44f74d5d4 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 11 Nov 2024 22:42:26 +0000 Subject: [PATCH 51/58] Update --- scripts/eval_hf.py | 4 ++-- scripts/eval_hf.sh | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/eval_hf.py b/scripts/eval_hf.py index ed5f898fc..a31ec94d2 100644 --- a/scripts/eval_hf.py +++ b/scripts/eval_hf.py @@ -33,8 +33,8 @@ def main(cfg: TrainConfig, model_name: str): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - model = transformers.AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, token=os.environ.get("HF_TOKEN", None)) + model = transformers.AutoModelForCausalLM.from_pretrained(model_name, token=os.environ.get("HF_TOKEN", None)) model.to(device) model.eval() diff --git a/scripts/eval_hf.sh b/scripts/eval_hf.sh index cedf43937..65837f97a 100644 --- a/scripts/eval_hf.sh +++ b/scripts/eval_hf.sh @@ -21,6 
+21,7 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ + --env HF_TOKEN=JIACHENGL_HF_TOKEN \ --shared-memory 10GiB \ --yes \ --timeout=0 \ From 47a290733a24690ba08a07378d1615854058c80a Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 11 Nov 2024 22:47:47 +0000 Subject: [PATCH 52/58] Update --- scripts/eval_hf.py | 4 ++-- scripts/eval_hf.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/eval_hf.py b/scripts/eval_hf.py index a31ec94d2..5e3478f30 100644 --- a/scripts/eval_hf.py +++ b/scripts/eval_hf.py @@ -33,8 +33,8 @@ def main(cfg: TrainConfig, model_name: str): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, token=os.environ.get("HF_TOKEN", None)) - model = transformers.AutoModelForCausalLM.from_pretrained(model_name, token=os.environ.get("HF_TOKEN", None)) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, token=os.environ.get("HF_TOKEN_DOWNLOAD", None)) + model = transformers.AutoModelForCausalLM.from_pretrained(model_name, token=os.environ.get("HF_TOKEN_DOWNLOAD", None)) model.to(device) model.eval() diff --git a/scripts/eval_hf.sh b/scripts/eval_hf.sh index 65837f97a..14020917c 100644 --- a/scripts/eval_hf.sh +++ b/scripts/eval_hf.sh @@ -21,7 +21,7 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env HF_TOKEN=JIACHENGL_HF_TOKEN \ + --env HF_TOKEN_DOWNLOAD=JIACHENGL_HF_TOKEN \ --shared-memory 10GiB \ --yes \ --timeout=0 \ From a44cd797d3a6b852583f9a0b032b24e3e1e233bd Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 11 Nov 2024 23:47:12 +0000 Subject: [PATCH 53/58] Debug None in ctx --- olmo/eval/downstream.py | 5 +++++ scripts/eval_hf.py | 29 ++++++++++++++--------------- scripts/eval_hf.sh | 4 +++- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index 2a9d1d365..cbd33749f 100644 --- a/olmo/eval/downstream.py +++ b/olmo/eval/downstream.py @@ -331,6 +331,11 @@ def collate_fn(self, data): doc_ids.append(sample["doc_id"]) cont_ids.append(sample["cont_id"]) + if None in sample["ctx"]: + log.info(f'None in sample["ctx"]: {sample}') + if None in sample["continuation"]: + log.info(f'None in sample["continuation"]: {sample}') + ctxs.append(torch.LongTensor(self.pad_tokens_until_max(sample["ctx"], max_len=max_ctx_len))) continuations.append( torch.LongTensor(self.pad_tokens_until_max(sample["continuation"], max_len=max_cont_len)) diff --git a/scripts/eval_hf.py b/scripts/eval_hf.py index 5e3478f30..1c521389c 100644 --- a/scripts/eval_hf.py +++ b/scripts/eval_hf.py @@ -150,21 +150,20 @@ def main(cfg: TrainConfig, model_name: str): for eval_step, eval_batch in enumerate(eval_batches): batch = move_to_device(eval_batch, device) with torch.no_grad(): - with torch.autocast("cuda", enabled=True, dtype=cfg.autocast_precision): - logits = model( - input_ids=batch["input_ids"], - attention_mask=batch.get("attention_mask"), - ).logits - logits_for_loss = logits[..., :-1, :].contiguous() - # shape: (batch_size * seq_len, vocab_size) - logits_for_loss = logits_for_loss.view(-1, logits_for_loss.size(-1)) - # shape: (batch_size, seq_len) - labels = get_labels(batch) - # shape: (batch_size * seq_len,) - labels = labels.view(-1) - ce_loss = F.cross_entropy(logits_for_loss, labels, ignore_index=-100, reduction="none") - # Reshape (batch_size * seq_len,) -> (batch_size, seq_len) - 
ce_loss = ce_loss.view(batch["input_ids"].shape[0], -1) + logits = model( + input_ids=batch["input_ids"], + attention_mask=batch.get("attention_mask"), + ).logits + logits_for_loss = logits[..., :-1, :].contiguous() + # shape: (batch_size * seq_len, vocab_size) + logits_for_loss = logits_for_loss.view(-1, logits_for_loss.size(-1)) + # shape: (batch_size, seq_len) + labels = get_labels(batch) + # shape: (batch_size * seq_len,) + labels = labels.view(-1) + ce_loss = F.cross_entropy(logits_for_loss, labels, ignore_index=-100, reduction="none") + # Reshape (batch_size * seq_len,) -> (batch_size, seq_len) + ce_loss = ce_loss.view(batch["input_ids"].shape[0], -1) ce_loss = ce_loss.mean(dim=-1) evaluator.update_metrics(batch, ce_loss, logits) diff --git a/scripts/eval_hf.sh b/scripts/eval_hf.sh index 14020917c..dcc38f3d0 100644 --- a/scripts/eval_hf.sh +++ b/scripts/eval_hf.sh @@ -21,7 +21,7 @@ gantry run \ --env LOG_FILTER_TYPE=local_rank0_only \ --env OMP_NUM_THREADS=8 \ --env OLMO_TASK=model \ - --env HF_TOKEN_DOWNLOAD=JIACHENGL_HF_TOKEN \ + --env-secret HF_TOKEN_DOWNLOAD=JIACHENGL_HF_TOKEN \ --shared-memory 10GiB \ --yes \ --timeout=0 \ @@ -30,5 +30,7 @@ gantry run \ IFS=$'\n\t'; \ conda shell.bash activate base; \ pip install '.[train]'; \ + pip install -U transformers==4.46.2; \ + pip install -U sentencepiece; \ torchrun --nproc-per-node 1 scripts/eval_hf.py configs/peteish1-weka.yaml ${MODEL_NAME}; \ " From 2a6bed8cd7bc1daeaef48be4ed098baaaf64f673 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Mon, 11 Nov 2024 23:58:22 +0000 Subject: [PATCH 54/58] Debug None in ctx --- olmo/eval/downstream.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index cbd33749f..6c033c999 100644 --- a/olmo/eval/downstream.py +++ b/olmo/eval/downstream.py @@ -331,6 +331,8 @@ def collate_fn(self, data): doc_ids.append(sample["doc_id"]) cont_ids.append(sample["cont_id"]) + log.info(f'max_ctx_len = {max_ctx_len}') + log.info(sample) if None in sample["ctx"]: log.info(f'None in sample["ctx"]: {sample}') if None in sample["continuation"]: From a868d876d9eeeae9188a497bcaae691eba964377 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 12 Nov 2024 00:03:20 +0000 Subject: [PATCH 55/58] Debug None in ctx --- olmo/eval/downstream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index 6c033c999..ed9e3c20d 100644 --- a/olmo/eval/downstream.py +++ b/olmo/eval/downstream.py @@ -331,12 +331,12 @@ def collate_fn(self, data): doc_ids.append(sample["doc_id"]) cont_ids.append(sample["cont_id"]) - log.info(f'max_ctx_len = {max_ctx_len}') - log.info(sample) + print(f'max_ctx_len = {max_ctx_len}') + print(sample) if None in sample["ctx"]: - log.info(f'None in sample["ctx"]: {sample}') + print(f'None in sample["ctx"]: {sample}') if None in sample["continuation"]: - log.info(f'None in sample["continuation"]: {sample}') + print(f'None in sample["continuation"]: {sample}') ctxs.append(torch.LongTensor(self.pad_tokens_until_max(sample["ctx"], max_len=max_ctx_len))) continuations.append( From 85088d6c0c6e0f320fba57cd7d535664a1a8173e Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 12 Nov 2024 00:20:42 +0000 Subject: [PATCH 56/58] Debug None in ctx --- olmo/eval/downstream.py | 7 ------- scripts/eval_hf.py | 2 ++ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index ed9e3c20d..2a9d1d365 100644 --- a/olmo/eval/downstream.py +++ 
b/olmo/eval/downstream.py @@ -331,13 +331,6 @@ def collate_fn(self, data): doc_ids.append(sample["doc_id"]) cont_ids.append(sample["cont_id"]) - print(f'max_ctx_len = {max_ctx_len}') - print(sample) - if None in sample["ctx"]: - print(f'None in sample["ctx"]: {sample}') - if None in sample["continuation"]: - print(f'None in sample["continuation"]: {sample}') - ctxs.append(torch.LongTensor(self.pad_tokens_until_max(sample["ctx"], max_len=max_ctx_len))) continuations.append( torch.LongTensor(self.pad_tokens_until_max(sample["continuation"], max_len=max_cont_len)) diff --git a/scripts/eval_hf.py b/scripts/eval_hf.py index 1c521389c..9aca63f2a 100644 --- a/scripts/eval_hf.py +++ b/scripts/eval_hf.py @@ -34,6 +34,8 @@ def main(cfg: TrainConfig, model_name: str): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, token=os.environ.get("HF_TOKEN_DOWNLOAD", None)) + if tokenizer.pad_token_id is None: # This is to prevent the NoneType error in collate_fn() + tokenizer.pad_token = 0 model = transformers.AutoModelForCausalLM.from_pretrained(model_name, token=os.environ.get("HF_TOKEN_DOWNLOAD", None)) model.to(device) model.eval() From 780d97ebb6a84f7f51fda485b2e9b519aeb82fa6 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Tue, 12 Nov 2024 00:32:17 +0000 Subject: [PATCH 57/58] Debug None in ctx --- scripts/eval_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval_hf.py b/scripts/eval_hf.py index 9aca63f2a..29dd3f091 100644 --- a/scripts/eval_hf.py +++ b/scripts/eval_hf.py @@ -35,7 +35,7 @@ def main(cfg: TrainConfig, model_name: str): tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, token=os.environ.get("HF_TOKEN_DOWNLOAD", None)) if tokenizer.pad_token_id is None: # This is to prevent the NoneType error in collate_fn() - tokenizer.pad_token = 0 + tokenizer.pad_token_id = 0 model = transformers.AutoModelForCausalLM.from_pretrained(model_name, token=os.environ.get("HF_TOKEN_DOWNLOAD", None)) model.to(device) model.eval() From 32d36aa7a44bf78e6af70802cd6543567731ae64 Mon Sep 17 00:00:00 2001 From: Jiacheng Liu Date: Sat, 16 Nov 2024 22:50:28 +0000 Subject: [PATCH 58/58] Eval peteish13 --- configs/peteish13-weka.yaml | 1602 +++++++++++++++++ .../beaker/peteish/peteish13-eval-launch.sh | 41 + scripts/beaker/peteish/peteish13-eval.sh | 60 + 3 files changed, 1703 insertions(+) create mode 100644 configs/peteish13-weka.yaml create mode 100755 scripts/beaker/peteish/peteish13-eval-launch.sh create mode 100755 scripts/beaker/peteish/peteish13-eval.sh diff --git a/configs/peteish13-weka.yaml b/configs/peteish13-weka.yaml new file mode 100644 index 000000000..5937d1c12 --- /dev/null +++ b/configs/peteish13-weka.yaml @@ -0,0 +1,1602 @@ +run_name: peteish13-s3-run001 +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-medium + group: ${run_name} + +model: + d_model: 5120 + n_heads: 40 + n_layers: 40 + mlp_hidden_size: 27648 + weight_tying: false + alibi: false + rope: true + rope_theta: 500000 + flash_attention: true + attention_dropout: 0.0 + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: true + layer_norm_eps: 1e-6 + bias_for_layer_norm: false + attention_layer_norm: true + attention_layer_norm_with_affine: true + norm_after: true + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 4096 + vocab_size: 100278 + embedding_size: 100352 + eos_token_id: 100257 + 
pad_token_id: 100277 + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +softmax_auxiliary_loss: true +auxiliary_loss_multiplier: 1e-5 +fused_loss: true + +compile: null + +optimizer: + name: adamw + learning_rate: 3.0e-4 + weight_decay: 0.1 + eps: 1e-8 + decay_norm_and_bias: true + decay_embeddings: false + betas: + - 0.9 + - 0.95 + metrics_log_interval: 1 + +scheduler: + name: cosine_with_warmup + units: tokens + t_warmup: 8388608000 + t_max: 5e12 + alpha_f: 0.1 + warmup_min_lr: 0.0 + +tokenizer: + identifier: tokenizers/allenai_dolma2.json + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/${run_name} +save_overwrite: false + +save_interval: 1000 +save_interval_ephemeral: 250 +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 1ep +global_train_batch_size: 2048 +device_train_microbatch_size: 2 + +precision: amp_bf16 + +fsdp: + wrapping_strategy: by_block_and_size + precision: mixed + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +gen1_gc_interval: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} +evaluators: + - label: all-small-ppl-validation + data: + num_workers: 0 + drop_last: true + # generate_doc_lengths: true + memmap_dtype: uint32 + datasets: + c4_en-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy + dolma_books-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy + dolma_common-crawl-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy + dolma_pes2o-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy + dolma_reddit-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy + dolma_stack-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy + dolma_wiki-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy + ice-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy + m2d2_s2orc-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy + pile-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy + wikitext_103-validation: + - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations # + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: 
commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + + - label: piqa_rc_0shot + type: downstream + + - label: piqa_rc_0shot_bpb + type: downstream + + - label: piqa_rc_5shot + type: downstream + + - label: piqa_rc_5shot_bpb + type: downstream + + - label: piqa_mc_5shot + type: downstream + + - label: piqa_mc_5shot_bpb + type: downstream + + - label: hellaswag_rc_0shot + type: downstream + + - label: hellaswag_rc_0shot_bpb + type: downstream + + - label: hellaswag_rc_5shot + type: downstream + + - label: hellaswag_rc_5shot_bpb + type: downstream + + - label: hellaswag_mc_5shot + type: downstream + + - label: hellaswag_mc_5shot_bpb + type: downstream + + - label: winogrande_rc_0shot + type: downstream + + - label: winogrande_rc_0shot_bpb + type: downstream + + - label: winogrande_rc_5shot + type: downstream + + - label: winogrande_rc_5shot_bpb + type: downstream + + - label: winogrande_mc_5shot + type: downstream + + - label: winogrande_mc_5shot_bpb + type: downstream + + - label: openbookqa_rc_0shot + type: downstream + + - label: openbookqa_rc_0shot_bpb + type: downstream + + - label: openbookqa_rc_5shot + type: downstream + + - label: openbookqa_rc_5shot_bpb + type: downstream + + - label: openbookqa_mc_5shot + type: downstream + + - label: openbookqa_mc_5shot_bpb + type: downstream + + - label: boolq_rc_0shot + type: downstream + + - label: boolq_rc_0shot_bpb + type: downstream + + - label: boolq_rc_5shot + type: downstream + + - label: boolq_rc_5shot_bpb + type: downstream + + - label: boolq_mc_5shot + type: downstream + + - label: boolq_mc_5shot_bpb + type: downstream + + - label: sciq_rc_0shot + type: downstream + + - label: sciq_rc_0shot_bpb + type: downstream + + # - label: sciq_rc_5shot + # type: downstream + + # - label: sciq_rc_5shot_bpb + # type: downstream + + # - label: sciq_mc_5shot + # type: downstream + + # - label: sciq_mc_5shot_bpb + # type: downstream + + - label: arc_easy_rc_0shot + type: downstream + + - label: arc_easy_rc_0shot_bpb + type: downstream + + - label: arc_easy_rc_5shot + type: downstream + + - label: arc_easy_rc_5shot_bpb + type: downstream + + - label: arc_easy_mc_5shot + type: downstream + + - label: arc_easy_mc_5shot_bpb + type: downstream + + - label: arc_challenge_rc_0shot + type: downstream + + - label: arc_challenge_rc_0shot_bpb + type: downstream + + - label: arc_challenge_rc_5shot + type: downstream + + - label: arc_challenge_rc_5shot_bpb + type: downstream + + - label: arc_challenge_mc_5shot + type: downstream + + - label: arc_challenge_mc_5shot_bpb + type: downstream + + - label: 
copa_rc_0shot + type: downstream + + - label: copa_rc_0shot_bpb + type: downstream + + # - label: copa_rc_5shot + # type: downstream + + # - label: copa_rc_5shot_bpb + # type: downstream + + # - label: copa_mc_5shot + # type: downstream + + # - label: copa_mc_5shot_bpb + # type: downstream + + - label: csqa_rc_0shot + type: downstream + + - label: csqa_rc_0shot_bpb + type: downstream + + - label: csqa_rc_5shot + type: downstream + + - label: csqa_rc_5shot_bpb + type: downstream + + - label: csqa_mc_5shot + type: downstream + + - label: csqa_mc_5shot_bpb + type: downstream + + - label: socialiqa_rc_0shot + type: downstream + + - label: socialiqa_rc_0shot_bpb + type: downstream + + - label: socialiqa_rc_5shot + type: downstream + + - label: socialiqa_rc_5shot_bpb + type: downstream + + - label: socialiqa_mc_5shot + type: downstream + + - label: socialiqa_mc_5shot_bpb + type: downstream + + - label: mmlu_stem_var_bpb + type: downstream + + - label: mmlu_humanities_var_bpb + type: downstream + + - label: mmlu_social_sciences_var_bpb + type: downstream + + - label: mmlu_other_var_bpb + type: downstream + + - label: mmlu_stem_bpb + type: downstream + + - label: mmlu_humanities_bpb + type: downstream + + - label: mmlu_social_sciences_bpb + type: downstream + + - label: mmlu_other_bpb + type: downstream + +data: + pad_direction: right + # generate_doc_lengths: true + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + memmap_dtype: uint32 + timeout: 0 + instance_filter: + repetition_max_period: 13 + repetition_min_period: 1 + repetition_max_count: 32 + paths: + # ProofPile 2: Algebraic Stack Data + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-11-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-15-00000.npy + + # ProofPile 2: Arxiv Data + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-17-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-19-00000.npy + + # ProofPile 2: Open Web Math Data + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-00-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-12-00000.npy + + # Pes2o Data + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + 
- /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + + # Starcoder Data (fixed!) + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-000-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-001-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-002-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-003-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-004-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-005-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-006-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-007-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-008-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-009-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-010-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-011-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-012-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-013-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-014-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-015-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-016-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-017-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-018-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-019-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-020-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-021-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-022-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-023-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-024-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-025-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-026-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-027-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-028-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-029-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-030-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-031-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-032-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-033-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-034-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-035-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-036-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-037-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-038-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-039-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-040-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-041-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-042-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-043-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-044-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-045-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-046-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-047-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-048-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-049-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-050-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-051-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-052-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-053-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-054-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-055-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-056-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-057-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-058-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-059-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-060-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-061-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-062-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-063-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-064-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-065-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-066-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-067-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-068-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-069-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-070-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-071-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-072-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-073-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-074-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-075-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-076-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-077-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-078-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-079-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-080-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-081-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-082-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-083-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-084-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-085-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-086-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-087-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-088-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-089-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-090-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-091-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-092-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-093-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-094-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-095-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-096-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-097-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-098-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-099-00000.npy + + # DCLM Data + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00002.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00004.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00003.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00002.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00004.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00003.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00002.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00004.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00003.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00002.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00004.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00003.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00002.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00004.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00003.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00002.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00004.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00003.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00002.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00004.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00003.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00002.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00004.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00003.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00000.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00001.npy + - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00002.npy + - 
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00004.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00001.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00002.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00003.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00004.npy
+
+    # Wikipedia
+    - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy
+    - /weka/oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy
diff --git a/scripts/beaker/peteish/peteish13-eval-launch.sh b/scripts/beaker/peteish/peteish13-eval-launch.sh
new file mode 100755
index 000000000..c8c657393
--- /dev/null
+++ b/scripts/beaker/peteish/peteish13-eval-launch.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+set -ex
+
+NUM_NODES=8
+
+gantry run \
+  --allow-dirty \
+  --workspace ai2/OLMo-tiny \
+  --task-name peteish13-eval \
+  --description "Pete-ish 13B eval" \
+  --priority high \
+  --preemptible \
+  --beaker-image petew/olmo-torch23-gantry \
+  --cluster ai2/jupiter-cirrascale-2 \
+  --gpus 8 \
+  --replicas "${NUM_NODES}" \
+  --leader-selection \
+  --host-networking \
+  --propagate-failure \
+  --propagate-preemption \
+  --synchronized-start-timeout 90m \
+  --budget ai2/oe-training \
+  --no-nfs \
+  --weka oe-training-default:/weka/oe-training-default \
+  --no-python \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --env OLMO_TASK=model \
+  --env R2_PROFILE=R2 \
+  --env S3_PROFILE=S3 \
+  --env WEKA_PROFILE=WEKA \
+  --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \
+  --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \
+  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
+  --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \
+  --env-secret WANDB_API_KEY=JIACHENGL_WANDB_API_KEY \
+  --shared-memory 10GiB \
+  --yes \
+  --timeout=-1 \
+  -- /bin/bash -c "scripts/beaker/peteish/peteish13-eval.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
diff --git a/scripts/beaker/peteish/peteish13-eval.sh b/scripts/beaker/peteish/peteish13-eval.sh
new file mode 100755
index 000000000..b4914eb6e
--- /dev/null
+++ b/scripts/beaker/peteish/peteish13-eval.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+set -exuo pipefail
+IFS=$'\n\t'
+
+BEAKER_LEADER_REPLICA_HOSTNAME=$1
+shift
+
+NUM_NODES=$1
+shift
+
+BEAKER_REPLICA_RANK=$1
+shift
+
+# Setup Python environment.
+conda shell.bash activate base
+
+# Install flash-attn
+#conda install -y -c nvidia cuda-python
+pip install packaging ninja
+export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
+pip install flash-attn==2.5.9.post1 --no-build-isolation
+# pip install awscli
+pip install '.[train]'
+pip freeze
+
+# Move AWS credentials from env to relevant files
+mkdir -p ~/.aws
+printenv AWS_CONFIG > ~/.aws/config
+printenv AWS_CREDENTIALS > ~/.aws/credentials
+
+# Force processes to synchronize at init_process_group
+export TORCH_DIST_INIT_BARRIER=1
+
+# Tell OLMo all ranks share the same filesystem for checkpoints.
+export OLMO_SHARED_FS=1
+
+export NCCL_DEBUG=INFO
+export NCCL_IB_HCA="^=mlx5_bond_0"
+export NCCL_SOCKET_IFNAME=ib
+# export NCCL_IB_GID_INDEX=0
+
+torchrun \
+  --nnodes "${NUM_NODES}:${NUM_NODES}" \
+  --nproc-per-node 8 \
+  --rdzv_id 12347 \
+  --rdzv_backend static \
+  --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \
+  --node_rank "${BEAKER_REPLICA_RANK}" \
+  --rdzv_conf 'read_timeout=420' \
+  scripts/eval.py \
+    configs/peteish13-weka.yaml \
+      --run_name="${GANTRY_TASK_NAME}" \
+      --save_interval_ephemeral=500 \
+      --save_overwrite \
+      --device_eval_batch_size=16 \
+      --wandb.group="peteish13-highlr" \
+      --load_path="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish13-highlr-zlossfix/step596057"
+
+  # '--load_path=${path.last_checkpoint:${save_folder}}' \