From 46cfccefe5b52f9a3793919402220da409062ebe Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Mon, 25 Nov 2024 22:48:58 -0800 Subject: [PATCH] Delete a bunch of unused scripts --- scripts/augusta/peteish1-muplr-launch.sh | 37 --- scripts/augusta/peteish1-muplr.sh | 87 ------ .../amberish/amberish1-8k-cham-launch.sh | 40 --- scripts/beaker/amberish/amberish1-8k-cham.sh | 64 ----- .../amberish1-8k-doc-mask-cham-launch.sh | 40 --- ...mberish1-8k-doc-mask-cham-rtheta-launch.sh | 40 --- .../amberish1-8k-doc-mask-cham-rtheta.sh | 66 ----- .../amberish/amberish1-8k-doc-mask-cham.sh | 66 ----- .../amberish/amberish1-chameleon-launch.sh | 40 --- .../beaker/amberish/amberish1-chameleon.sh | 67 ----- .../amberish/amberish1-emb-init-1-launch.sh | 40 --- .../beaker/amberish/amberish1-emb-init-1.sh | 73 ----- scripts/beaker/amberish/amberish1-launch.sh | 40 --- .../amberish1-selective-updates-launch.sh | 40 --- .../amberish/amberish1-selective-updates.sh | 72 ----- .../amberish/amberish1-wd-all-launch.sh | 40 --- scripts/beaker/amberish/amberish1-wd-all.sh | 72 ----- .../amberish/amberish1-z-loss-launch.sh | 40 --- scripts/beaker/amberish/amberish1-z-loss.sh | 75 ----- scripts/beaker/amberish/amberish1.sh | 71 ----- scripts/beaker/amberish/amberish7-launch.sh | 40 --- scripts/beaker/amberish/amberish7.sh | 74 ----- scripts/beaker/amberish/amberish70-launch.sh | 41 --- scripts/beaker/amberish/amberish70.sh | 72 ----- .../annealing/launch_annealing_amberish.sh | 43 --- scripts/beaker/chameleon/llamaish1-launch.sh | 33 --- .../chameleon/llamaish1-normal-launch.sh | 33 --- ...sh1-normal-qk-norm-reorder-zloss-launch.sh | 33 --- .../llamaish1-normal-qk-norm-reorder-zloss.sh | 60 ---- scripts/beaker/chameleon/llamaish1-normal.sh | 61 ---- .../llamaish1-qk-norm-reorder-launch.sh | 33 --- .../llamaish1-qk-norm-reorder-zloss-launch.sh | 33 --- .../llamaish1-qk-norm-reorder-zloss.sh | 58 ---- .../chameleon/llamaish1-qk-norm-reorder.sh | 57 ---- scripts/beaker/chameleon/llamaish1.sh | 58 ---- ...sh7-normal-qk-norm-reorder-zloss-launch.sh | 33 --- .../llamaish7-normal-qk-norm-reorder-zloss.sh | 57 ---- .../chameleon/llamaish7-qk-norm-launch.sh | 33 --- .../llamaish7-qk-norm-reorder-launch.sh | 33 --- .../llamaish7-qk-norm-reorder-zloss-launch.sh | 33 --- .../llamaish7-qk-norm-reorder-zloss.sh | 54 ---- .../chameleon/llamaish7-qk-norm-reorder.sh | 52 ---- scripts/beaker/chameleon/llamaish7-qk-norm.sh | 50 ---- scripts/beaker/ib-ananya-1b.sh | 58 ---- scripts/beaker/llamaish7-launch.sh | 32 --- scripts/beaker/llamaish7-normal-launch.sh | 33 --- scripts/beaker/llamaish7-normal.sh | 54 ---- scripts/beaker/llamaish7.sh | 46 --- scripts/beaker/mitch-ish-7b.sh | 30 -- scripts/beaker/mitchish65.sh | 33 --- scripts/beaker/mitchish7-launch.sh | 32 --- scripts/beaker/mitchish7-llamainit-launch.sh | 32 --- scripts/beaker/mitchish7-llamainit.sh | 42 --- scripts/beaker/mitchish7.sh | 44 --- .../beaker/mitchish70-from160510-launch.sh | 34 --- scripts/beaker/mitchish70-from160510.sh | 42 --- scripts/beaker/mitchish70-launch.sh | 35 --- scripts/beaker/mitchish70-loadtest-launch.sh | 35 --- scripts/beaker/mitchish70-loadtest.sh | 36 --- scripts/beaker/mitchish70.sh | 31 -- .../beaker/olmo-small-ablation-on-gantry.sh | 59 ---- scripts/beaker/olmo7-ablation-baseline.sh | 36 --- scripts/beaker/olmo7-ablation-dedupeparas.sh | 36 --- scripts/beaker/olmo7-ablation-final2.sh | 36 --- scripts/beaker/olmo7-ablation-refheavy.sh | 36 --- scripts/beaker/pile-llamaish7-launch.sh | 32 --- scripts/beaker/pile-llamaish7.sh | 45 --- scripts/beaker/tiny-llamaish-launch.sh | 34 --- scripts/beaker/tiny-llamaish.sh | 50 ---- scripts/beaker/warm_hf_cache.sh | 9 - scripts/kempner/llama7.sh | 42 --- scripts/kempner/log_into_node.sh | 5 - scripts/kempner/mitch-ish-7b.sh | 53 ---- scripts/kempner/v1-mix-small.sh | 42 --- scripts/mcli/manage_run.py | 264 ------------------ scripts/mcli/unshard_mitchish70.sh | 21 -- scripts/pyspy_all_nodes.sh | 12 - scripts/pyspy_all_processes.sh | 14 - scripts/run_with_environment.sh | 30 -- 79 files changed, 3689 deletions(-) delete mode 100755 scripts/augusta/peteish1-muplr-launch.sh delete mode 100755 scripts/augusta/peteish1-muplr.sh delete mode 100755 scripts/beaker/amberish/amberish1-8k-cham-launch.sh delete mode 100755 scripts/beaker/amberish/amberish1-8k-cham.sh delete mode 100755 scripts/beaker/amberish/amberish1-8k-doc-mask-cham-launch.sh delete mode 100755 scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta-launch.sh delete mode 100755 scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta.sh delete mode 100755 scripts/beaker/amberish/amberish1-8k-doc-mask-cham.sh delete mode 100755 scripts/beaker/amberish/amberish1-chameleon-launch.sh delete mode 100755 scripts/beaker/amberish/amberish1-chameleon.sh delete mode 100755 scripts/beaker/amberish/amberish1-emb-init-1-launch.sh delete mode 100755 scripts/beaker/amberish/amberish1-emb-init-1.sh delete mode 100755 scripts/beaker/amberish/amberish1-launch.sh delete mode 100755 scripts/beaker/amberish/amberish1-selective-updates-launch.sh delete mode 100755 scripts/beaker/amberish/amberish1-selective-updates.sh delete mode 100755 scripts/beaker/amberish/amberish1-wd-all-launch.sh delete mode 100755 scripts/beaker/amberish/amberish1-wd-all.sh delete mode 100755 scripts/beaker/amberish/amberish1-z-loss-launch.sh delete mode 100755 scripts/beaker/amberish/amberish1-z-loss.sh delete mode 100755 scripts/beaker/amberish/amberish1.sh delete mode 100755 scripts/beaker/amberish/amberish7-launch.sh delete mode 100755 scripts/beaker/amberish/amberish7.sh delete mode 100755 scripts/beaker/amberish/amberish70-launch.sh delete mode 100755 scripts/beaker/amberish/amberish70.sh delete mode 100755 scripts/beaker/annealing/launch_annealing_amberish.sh delete mode 100755 scripts/beaker/chameleon/llamaish1-launch.sh delete mode 100755 scripts/beaker/chameleon/llamaish1-normal-launch.sh delete mode 100755 scripts/beaker/chameleon/llamaish1-normal-qk-norm-reorder-zloss-launch.sh delete mode 100755 scripts/beaker/chameleon/llamaish1-normal-qk-norm-reorder-zloss.sh delete mode 100755 scripts/beaker/chameleon/llamaish1-normal.sh delete mode 100755 scripts/beaker/chameleon/llamaish1-qk-norm-reorder-launch.sh delete mode 100755 scripts/beaker/chameleon/llamaish1-qk-norm-reorder-zloss-launch.sh delete mode 100755 scripts/beaker/chameleon/llamaish1-qk-norm-reorder-zloss.sh delete mode 100755 scripts/beaker/chameleon/llamaish1-qk-norm-reorder.sh delete mode 100755 scripts/beaker/chameleon/llamaish1.sh delete mode 100755 scripts/beaker/chameleon/llamaish7-normal-qk-norm-reorder-zloss-launch.sh delete mode 100755 scripts/beaker/chameleon/llamaish7-normal-qk-norm-reorder-zloss.sh delete mode 100755 scripts/beaker/chameleon/llamaish7-qk-norm-launch.sh delete mode 100755 scripts/beaker/chameleon/llamaish7-qk-norm-reorder-launch.sh delete mode 100755 scripts/beaker/chameleon/llamaish7-qk-norm-reorder-zloss-launch.sh delete mode 100755 scripts/beaker/chameleon/llamaish7-qk-norm-reorder-zloss.sh delete mode 100755 scripts/beaker/chameleon/llamaish7-qk-norm-reorder.sh delete mode 100755 scripts/beaker/chameleon/llamaish7-qk-norm.sh delete mode 100755 scripts/beaker/ib-ananya-1b.sh delete mode 100755 scripts/beaker/llamaish7-launch.sh delete mode 100755 scripts/beaker/llamaish7-normal-launch.sh delete mode 100755 scripts/beaker/llamaish7-normal.sh delete mode 100755 scripts/beaker/llamaish7.sh delete mode 100755 scripts/beaker/mitch-ish-7b.sh delete mode 100755 scripts/beaker/mitchish65.sh delete mode 100755 scripts/beaker/mitchish7-launch.sh delete mode 100755 scripts/beaker/mitchish7-llamainit-launch.sh delete mode 100755 scripts/beaker/mitchish7-llamainit.sh delete mode 100755 scripts/beaker/mitchish7.sh delete mode 100755 scripts/beaker/mitchish70-from160510-launch.sh delete mode 100755 scripts/beaker/mitchish70-from160510.sh delete mode 100755 scripts/beaker/mitchish70-launch.sh delete mode 100755 scripts/beaker/mitchish70-loadtest-launch.sh delete mode 100755 scripts/beaker/mitchish70-loadtest.sh delete mode 100755 scripts/beaker/mitchish70.sh delete mode 100755 scripts/beaker/olmo-small-ablation-on-gantry.sh delete mode 100755 scripts/beaker/olmo7-ablation-baseline.sh delete mode 100755 scripts/beaker/olmo7-ablation-dedupeparas.sh delete mode 100755 scripts/beaker/olmo7-ablation-final2.sh delete mode 100755 scripts/beaker/olmo7-ablation-refheavy.sh delete mode 100755 scripts/beaker/pile-llamaish7-launch.sh delete mode 100755 scripts/beaker/pile-llamaish7.sh delete mode 100755 scripts/beaker/tiny-llamaish-launch.sh delete mode 100755 scripts/beaker/tiny-llamaish.sh delete mode 100755 scripts/beaker/warm_hf_cache.sh delete mode 100644 scripts/kempner/llama7.sh delete mode 100755 scripts/kempner/log_into_node.sh delete mode 100644 scripts/kempner/mitch-ish-7b.sh delete mode 100644 scripts/kempner/v1-mix-small.sh delete mode 100644 scripts/mcli/manage_run.py delete mode 100755 scripts/mcli/unshard_mitchish70.sh delete mode 100755 scripts/pyspy_all_nodes.sh delete mode 100755 scripts/pyspy_all_processes.sh delete mode 100755 scripts/run_with_environment.sh diff --git a/scripts/augusta/peteish1-muplr-launch.sh b/scripts/augusta/peteish1-muplr-launch.sh deleted file mode 100755 index 568d42138..000000000 --- a/scripts/augusta/peteish1-muplr-launch.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=$1 -shift - -gantry run \ - --workspace ai2/13B \ - --task-name peteish1-muplr \ - --description "Peteish1 muP LR" \ - --priority high \ - --preemptible \ - --beaker-image michalg/cuda11.8-ubuntu20.04-arb \ - --cluster ai2/augusta-google-1 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 15m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=DIRKG_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=DIRKG_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=DIRKG_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - --allow-dirty \ - --retries 10 \ - -- /bin/bash -c "scripts/augusta/peteish1-muplr.sh \$BEAKER_LEADER_REPLICA_HOSTNAME \$BEAKER_REPLICA_RANK" diff --git a/scripts/augusta/peteish1-muplr.sh b/scripts/augusta/peteish1-muplr.sh deleted file mode 100755 index 7ca843161..000000000 --- a/scripts/augusta/peteish1-muplr.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# augusta specific environment -export LD_LIBRARY_PATH="/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}" -export NCCL_CROSS_NIC=0 -export NCCL_ALGO=Ring,Tree -export NCCL_PROTO=Simple -export NCCL_MIN_NCHANNELS=4 -export NCCL_P2P_NET_CHUNKSIZE=524288 -export NCCL_P2P_PCI_CHUNKSIZE=524288 -export NCCL_P2P_NVL_CHUNKSIZE=1048576 -export NCCL_FASTRAK_NUM_FLOWS=2 -export NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL=0 -export NCCL_BUFFSIZE=8388608 -export NCCL_FASTRAK_USE_SNAP=1 -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export NCCL_NET_GDR_LEVEL=PIX -export NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING=0 -export NCCL_TUNER_PLUGIN=libnccl-tuner.so -export NCCL_TUNER_CONFIG_PATH=/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto -export NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/var/lib/tcpxo/lib64/a3plus_guest_config.textproto -export NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS=600000 -export NCCL_NVLS_ENABLE=0 -export NCCL_DEBUG=WARN -export NCCL_FASTRAK_CTRL_DEV=enp0s12 -export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 -export NCCL_SOCKET_IFNAME=enp0s12 -export NCCL_USE_SNAP=1 -export NCCL_FASTRAK_USE_LLCM=1 -export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices - -# Install flash-attn -#conda install -y pytorch-cuda==12.4 packaging ninja cccl cuda-nvcc libcusolver-dev cuda-profiler-api libcusparse-dev libcublas-dev -c pytorch -c nvidia -#pip install flash-attn==2.5.9.post1 --no-build-isolation -pip install '.[train]' -pip freeze - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 -# Better error handling from Python -export PYTHONFAULTHANDLER=1 - -NAME=${GANTRY_TASK_NAME// /_} -RUN_NAME=$NAME-$(date -u +"%Y%m%d_%H%M%S") -SAVE_FOLDER=/data/$RUN_NAME -mkdir -p $SAVE_FOLDER - -torchrun \ - --nnodes "${BEAKER_REPLICA_COUNT}:${BEAKER_REPLICA_COUNT}" \ - --nproc-per-node 8 \ - --rdzv_id 12348 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/peteish1-google.yaml \ - --run_name=$RUN_NAME \ - --wandb.group=$NAME \ - --optimizer.learning_rate=7.81e-3 \ - --save_interval_ephemeral=10000 \ - --eval_interval=10000 \ - --fsdp.sharding_strategy=HYBRID_SHARD \ - --fsdp.hybrid_sharding_num_model_replicas="${BEAKER_REPLICA_COUNT}" \ - --fsdp.wrapping_strategy=by_block_and_size \ - --save_folder=$SAVE_FOLDER \ - --remote_save_folder="gs://ai2-llm/checkpoints/OLMo-medium/$NAME/" \ - --try_load_latest_save \ - --save_overwrite \ - --sharded_checkpointer=olmo_core \ - --device_train_microbatch_size=4 \ - --device_eval_batch_size=8 \ - --compile.fullgraph=false \ - --fused_loss=false \ - --model.flash_attention=false \ - --data.num_workers=32 \ - --optimizer.metrics_log_interval=10 \ - --data.prefetch_factor=8 diff --git a/scripts/beaker/amberish/amberish1-8k-cham-launch.sh b/scripts/beaker/amberish/amberish1-8k-cham-launch.sh deleted file mode 100755 index de5fb345e..000000000 --- a/scripts/beaker/amberish/amberish1-8k-cham-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=16 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish1-8k-cham \ - --description "Amberish 1B with 8k context length and chameleon fixes" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish1-8k-cham.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish1-8k-cham.sh b/scripts/beaker/amberish/amberish1-8k-cham.sh deleted file mode 100755 index e65db6df4..000000000 --- a/scripts/beaker/amberish/amberish1-8k-cham.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish1-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --model.max_sequence_length=8192 \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=512 \ - --fused_loss=true \ - --softmax_auxiliary_loss=true \ - --auxiliary_loss_multiplier=1e-5 \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/beaker/amberish/amberish1-8k-doc-mask-cham-launch.sh b/scripts/beaker/amberish/amberish1-8k-doc-mask-cham-launch.sh deleted file mode 100755 index 70bd4d937..000000000 --- a/scripts/beaker/amberish/amberish1-8k-doc-mask-cham-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=16 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish1-8k-doc-mask-cham \ - --description "Amberish 1B with 8k context length, doc masking, and chameleon fixes" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish1-8k-doc-mask-cham.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta-launch.sh b/scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta-launch.sh deleted file mode 100755 index 7a6bbb93c..000000000 --- a/scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=16 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish1-8k-doc-mask-cham-rtheta \ - --description "Amberish 1B with 8k context length, doc masking, and chameleon fixes" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta.sh b/scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta.sh deleted file mode 100755 index d37d6687d..000000000 --- a/scripts/beaker/amberish/amberish1-8k-doc-mask-cham-rtheta.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish1-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --model.max_sequence_length=8192 \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=512 \ - --fused_loss=true \ - --data.generate_doc_lengths=true \ - --softmax_auxiliary_loss=true \ - --auxiliary_loss_multiplier=1e-5 \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --model.rope_theta=500000 \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/beaker/amberish/amberish1-8k-doc-mask-cham.sh b/scripts/beaker/amberish/amberish1-8k-doc-mask-cham.sh deleted file mode 100755 index c9bb1bef6..000000000 --- a/scripts/beaker/amberish/amberish1-8k-doc-mask-cham.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish1-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --model.max_sequence_length=8192 \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=512 \ - --fused_loss=true \ - --data.generate_doc_lengths=true \ - --softmax_auxiliary_loss=true \ - --auxiliary_loss_multiplier=1e-5 \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - '--load_path=${path.last_checkpoint:${save_folder}}' \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/beaker/amberish/amberish1-chameleon-launch.sh b/scripts/beaker/amberish/amberish1-chameleon-launch.sh deleted file mode 100755 index 2056ed755..000000000 --- a/scripts/beaker/amberish/amberish1-chameleon-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish1-chameleon2 \ - --description "Amberish 1B with Chameleon stability settings" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish1-chameleon.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish1-chameleon.sh b/scripts/beaker/amberish/amberish1-chameleon.sh deleted file mode 100755 index 6d8a0e5d7..000000000 --- a/scripts/beaker/amberish/amberish1-chameleon.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -pip install '.[train]' -pip freeze - -# Warm HF cache -# mkdir -p /root/.cache -# pushd /root/.cache -# curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - -# popd -# export HF_DATASETS_OFFLINE=1 - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish1-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --device_train_microbatch_size=4 \ - --save_interval_ephemeral=null \ - --softmax_auxiliary_loss=true \ - --auxiliary_loss_multiplier=1e-5 \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --fused_loss=true \ - --save_overwrite - - #'--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/beaker/amberish/amberish1-emb-init-1-launch.sh b/scripts/beaker/amberish/amberish1-emb-init-1-launch.sh deleted file mode 100755 index 2e677db5d..000000000 --- a/scripts/beaker/amberish/amberish1-emb-init-1-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish1-emb-init-1 \ - --description "Amberish 1B train with embedding init of 1" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish1-emb-init-1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish1-emb-init-1.sh b/scripts/beaker/amberish/amberish1-emb-init-1.sh deleted file mode 100755 index 049dbcfd2..000000000 --- a/scripts/beaker/amberish/amberish1-emb-init-1.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# mkdir /root/checkpoint-unsharded -# aws s3 cp --no-progress --recursive --profile=S3 \ -# s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded \ -# /root/checkpoint-unsharded - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish1-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --device_train_microbatch_size=4 \ - '--load_path=${path.last_checkpoint:${save_folder}}' \ - --model.emb_init_std=1.0 \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ - # --model.emb_init_std=1.0 \ diff --git a/scripts/beaker/amberish/amberish1-launch.sh b/scripts/beaker/amberish/amberish1-launch.sh deleted file mode 100755 index a6bddd6b4..000000000 --- a/scripts/beaker/amberish/amberish1-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=16 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish1-doc-mask \ - --description "Amberish 1B with document masking" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish1-selective-updates-launch.sh b/scripts/beaker/amberish/amberish1-selective-updates-launch.sh deleted file mode 100755 index 93c12d292..000000000 --- a/scripts/beaker/amberish/amberish1-selective-updates-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=16 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish1-selective-updates \ - --description "Amberish 1B train with selective updates" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish1-selective-updates.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish1-selective-updates.sh b/scripts/beaker/amberish/amberish1-selective-updates.sh deleted file mode 100755 index ab962b1d7..000000000 --- a/scripts/beaker/amberish/amberish1-selective-updates.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# mkdir /root/checkpoint-unsharded -# aws s3 cp --no-progress --recursive --profile=S3 \ -# s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded \ -# /root/checkpoint-unsharded - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish1-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --optimizer.selective_updates=true \ - --device_train_microbatch_size=4 \ - '--load_path=${path.last_checkpoint:${save_folder}}' \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/beaker/amberish/amberish1-wd-all-launch.sh b/scripts/beaker/amberish/amberish1-wd-all-launch.sh deleted file mode 100755 index b4557ed44..000000000 --- a/scripts/beaker/amberish/amberish1-wd-all-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish1-wd-all \ - --description "Amberish 1B train with WD everywhere" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish1-wd-all.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish1-wd-all.sh b/scripts/beaker/amberish/amberish1-wd-all.sh deleted file mode 100755 index e7890c81b..000000000 --- a/scripts/beaker/amberish/amberish1-wd-all.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# mkdir /root/checkpoint-unsharded -# aws s3 cp --no-progress --recursive --profile=S3 \ -# s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded \ -# /root/checkpoint-unsharded - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish1-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --optimizer.decay_embeddings=true \ - --device_train_microbatch_size=4 \ - '--load_path=${path.last_checkpoint:${save_folder}}' \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/beaker/amberish/amberish1-z-loss-launch.sh b/scripts/beaker/amberish/amberish1-z-loss-launch.sh deleted file mode 100755 index 08ceb6917..000000000 --- a/scripts/beaker/amberish/amberish1-z-loss-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=32 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish1-z-loss \ - --description "Amberish 1B train with z-loss" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish1-z-loss.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish1-z-loss.sh b/scripts/beaker/amberish/amberish1-z-loss.sh deleted file mode 100755 index fd194e805..000000000 --- a/scripts/beaker/amberish/amberish1-z-loss.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# mkdir /root/checkpoint-unsharded -# aws s3 cp --no-progress --recursive --profile=S3 \ -# s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded \ -# /root/checkpoint-unsharded - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish1-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --device_train_microbatch_size=4 \ - --softmax_auxiliary_loss=true \ - --save_interval_ephemeral=null \ - '--load_path=${path.last_checkpoint:${save_folder}}' \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ - # --fsdp.sharding_strategy=HYBRID_SHARD \ - # --fsdp.hybrid_sharding_num_model_replicas=2 \ diff --git a/scripts/beaker/amberish/amberish1.sh b/scripts/beaker/amberish/amberish1.sh deleted file mode 100755 index 0a4c41471..000000000 --- a/scripts/beaker/amberish/amberish1.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -# curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# mkdir /root/checkpoint-unsharded -# aws s3 cp --no-progress --recursive --profile=S3 \ -# s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded \ -# /root/checkpoint-unsharded - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish1-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --device_train_microbatch_size=4 \ - --data.generate_doc_lengths=true \ - --save_overwrite - - # '--load_path=${path.last_checkpoint:${save_folder}}' \ diff --git a/scripts/beaker/amberish/amberish7-launch.sh b/scripts/beaker/amberish/amberish7-launch.sh deleted file mode 100755 index 2e3ce9468..000000000 --- a/scripts/beaker/amberish/amberish7-launch.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=16 - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name amberish7 \ - --description "Amberish 7B train" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish7.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish7.sh b/scripts/beaker/amberish/amberish7.sh deleted file mode 100755 index 9191431ff..000000000 --- a/scripts/beaker/amberish/amberish7.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# mkdir /root/checkpoint-unsharded -# aws s3 cp --no-progress --recursive --profile=S3 \ -# s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded \ -# /root/checkpoint-unsharded - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish7-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --save_overwrite \ - --save_interval_ephemeral=500 \ - --optimizer.metrics_log_interval=1 \ - --epoch=1 \ - '--load_path=${path.last_checkpoint:${save_folder}}' - - # '--load_path=${save_folder}/step409000' - # --fsdp.sharding_strategy=HYBRID_SHARD \ - # --fsdp.hybrid_sharding_num_model_replicas=4 \ diff --git a/scripts/beaker/amberish/amberish70-launch.sh b/scripts/beaker/amberish/amberish70-launch.sh deleted file mode 100755 index 82d91f284..000000000 --- a/scripts/beaker/amberish/amberish70-launch.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -# NUM_NODES=120 # 960 GPUs -NUM_NODES=112 # 896 GPUs - -gantry run \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name acceptance-test \ - --description "70B acceptance test" \ - --priority urgent \ - --preemptible \ - --beaker-image petew/olmo-torch23-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --weka oe-training-default:/weka/oe-training-default \ - --propagate-failure \ - --propagate-preemption \ - --synchronized-start-timeout 90m \ - --no-python \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env R2_PROFILE=R2 \ - --env S3_PROFILE=S3 \ - --env WEKA_PROFILE=WEKA \ - --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \ - --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \ - --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \ - --shared-memory 10GiB \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/amberish/amberish70.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/amberish/amberish70.sh b/scripts/beaker/amberish/amberish70.sh deleted file mode 100755 index 6802f4d2a..000000000 --- a/scripts/beaker/amberish/amberish70.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env bash - -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Setup Python environment. -conda shell.bash activate base - -# Install flash-attn -#conda install -y -c nvidia cuda-python -pip install packaging ninja -export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE -pip install flash-attn==2.5.9.post1 --no-build-isolation -# pip install awscli -pip install '.[train]' -pip freeze - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -# Move AWS credentials from env to relevant files -mkdir -p ~/.aws -printenv AWS_CONFIG > ~/.aws/config -printenv AWS_CREDENTIALS > ~/.aws/credentials - -# mkdir /root/checkpoint-unsharded -# aws s3 cp --no-progress --recursive --profile=S3 \ -# s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded \ -# /root/checkpoint-unsharded - -# Force processes to synchronize at init_process_group -export TORCH_DIST_INIT_BARRIER=1 - -# Tell OLMo all ranks share the same filesystem for checkpoints. -export OLMO_SHARED_FS=1 - -export NCCL_DEBUG=INFO -export NCCL_IB_HCA="^=mlx5_bond_0" -export NCCL_SOCKET_IFNAME=ib -# export NCCL_IB_GID_INDEX=0 - -mbz=4 - -torchrun \ - --nnodes "${NUM_NODES}:${NUM_NODES}" \ - --nproc-per-node 8 \ - --rdzv_id 12347 \ - --rdzv_backend static \ - --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \ - --node_rank "${BEAKER_REPLICA_RANK}" \ - --rdzv_conf 'read_timeout=420' \ - scripts/train.py \ - configs/amberish70-weka.yaml \ - --run_name="${GANTRY_TASK_NAME}" \ - --fsdp.sharding_strategy=HYBRID_SHARD \ - --fsdp.hybrid_sharding_num_model_replicas=8 \ - --device_train_microbatch_size="${mbz}" \ - --global_train_batch_size=$((NUM_NODES * 8 * mbz)) \ - --save_overwrite diff --git a/scripts/beaker/annealing/launch_annealing_amberish.sh b/scripts/beaker/annealing/launch_annealing_amberish.sh deleted file mode 100755 index fb5d10b91..000000000 --- a/scripts/beaker/annealing/launch_annealing_amberish.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash -# Similar to `launch_annealing.sh`, but doesn't use fused loss. - -set -ex - -CONFIG_NAME=$1 -NUM_NODES=$2 -CLUSTER=$3 -PRIORITY=$4 - -CONFIG_DIR=configs/annealing -CONFIG_PATH=${CONFIG_DIR}/${CONFIG_NAME}.yaml - -gantry run \ - --preemptible \ - --allow-dirty \ - --workspace ai2/OLMo-pretraining-stability \ - --task-name ${CONFIG_NAME} \ - --description ${CONFIG_NAME} \ - --priority $PRIORITY \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster $CLUSTER \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --propagate-failure \ - --synchronized-start-timeout "30m" \ - --host-networking \ - --nfs \ - --budget ai2/oe-training \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \ - --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - -- /bin/bash -c "source scripts/beaker/warm_hf_cache.sh && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} --model.flash_attention=true --fsdp.wrapping_strategy=by_block_and_size --fsdp.sharding_strategy=SHARD_GRAD_OP --activation_checkpointing=fine_grained --device_train_microbatch_size=2 --global_train_batch_size=1024 --gen1_gc_interval=8 --save_num_checkpoints_to_keep=2" diff --git a/scripts/beaker/chameleon/llamaish1-launch.sh b/scripts/beaker/chameleon/llamaish1-launch.sh deleted file mode 100755 index 5bf0260eb..000000000 --- a/scripts/beaker/chameleon/llamaish1-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=16 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish1 \ - --description "OLMo small - 1B - Llamaish" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 600m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/chameleon/llamaish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/chameleon/llamaish1-normal-launch.sh b/scripts/beaker/chameleon/llamaish1-normal-launch.sh deleted file mode 100755 index 5d82bb568..000000000 --- a/scripts/beaker/chameleon/llamaish1-normal-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish1-normal \ - --description "OLMo small - 1B - Llamaish Normal New" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 600m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/chameleon/llamaish1-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/chameleon/llamaish1-normal-qk-norm-reorder-zloss-launch.sh b/scripts/beaker/chameleon/llamaish1-normal-qk-norm-reorder-zloss-launch.sh deleted file mode 100755 index 0e1ff8bbc..000000000 --- a/scripts/beaker/chameleon/llamaish1-normal-qk-norm-reorder-zloss-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish1-normal-qk-norm-reorder-zloss \ - --description "OLMo small - 1B - Llamaish Normal QK norm reorder zloss New" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 600m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/chameleon/llamaish1-normal-qk-norm-reorder-zloss.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/chameleon/llamaish1-normal-qk-norm-reorder-zloss.sh b/scripts/beaker/chameleon/llamaish1-normal-qk-norm-reorder-zloss.sh deleted file mode 100755 index 497ef0445..000000000 --- a/scripts/beaker/chameleon/llamaish1-normal-qk-norm-reorder-zloss.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -export EXPERIMENT=llamaish1-normal-qk-norm-reorder-zloss-new - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish1-s3.yaml \ - --run_name=$EXPERIMENT \ - --wandb.name=$EXPERIMENT \ - --wandb.group=$EXPERIMENT \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=NO_SHARD \ - --gen1_gc_interval=null \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=4 \ - --global_train_batch_size=512 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.init_fn=normal \ - --model.init_std=0.02 \ - --model.clip_qkv=null \ - --scheduler.grad_clip_warmup_steps=null \ - --save_num_checkpoints_to_keep=3 \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --softmax_auxiliary_loss=true \ - --auxiliary_loss_multiplier=1e-5 \ - --load_path=s3://ai2-llm/checkpoints/OLMo-small/llamaish1-normal-new/step0 - #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llamaish1-normal-qk-norm-reorder-zloss-new/}' - diff --git a/scripts/beaker/chameleon/llamaish1-normal.sh b/scripts/beaker/chameleon/llamaish1-normal.sh deleted file mode 100755 index 3188ec7bd..000000000 --- a/scripts/beaker/chameleon/llamaish1-normal.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -export EXPERIMENT=llamaish1-normal-new - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish1-s3.yaml \ - --run_name=$EXPERIMENT \ - --wandb.name=$EXPERIMENT \ - --wandb.group=$EXPERIMENT \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=NO_SHARD \ - --gen1_gc_interval=null \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=4 \ - --global_train_batch_size=512 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.init_fn=normal \ - --model.init_std=0.02 \ - --model.clip_qkv=null \ - --save_num_checkpoints_to_keep=3 \ - --scheduler.grad_clip_warmup_steps=null \ - --scheduler.t_warmup=2000 \ - --scheduler.units=steps - #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llamaish1-normal-new/}' - # --model.attention_layer_norm=true \ - # --model.norm_after=true \ - # --softmax_auxiliary_loss=true \ - # --auxiliary_loss_multiplier=1e-5 - diff --git a/scripts/beaker/chameleon/llamaish1-qk-norm-reorder-launch.sh b/scripts/beaker/chameleon/llamaish1-qk-norm-reorder-launch.sh deleted file mode 100755 index 0cb706507..000000000 --- a/scripts/beaker/chameleon/llamaish1-qk-norm-reorder-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish1-qk-norm-reorder \ - --description "OLMo small - 1B - Llamaish QK norm reorder" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 600m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/chameleon/llamaish1-qk-norm-reorder.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/chameleon/llamaish1-qk-norm-reorder-zloss-launch.sh b/scripts/beaker/chameleon/llamaish1-qk-norm-reorder-zloss-launch.sh deleted file mode 100755 index 0f568060a..000000000 --- a/scripts/beaker/chameleon/llamaish1-qk-norm-reorder-zloss-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish1-qk-norm-reorder-zloss \ - --description "OLMo small - 1B - Llamaish QK norm reorder zloss" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 600m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/chameleon/llamaish1-qk-norm-reorder-zloss.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/chameleon/llamaish1-qk-norm-reorder-zloss.sh b/scripts/beaker/chameleon/llamaish1-qk-norm-reorder-zloss.sh deleted file mode 100755 index 470fc0871..000000000 --- a/scripts/beaker/chameleon/llamaish1-qk-norm-reorder-zloss.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -export EXPERIMENT=llamaish1-qk-norm-reorder-zloss - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish1-s3.yaml \ - --run_name=$EXPERIMENT \ - --wandb.name=$EXPERIMENT \ - --wandb.group=$EXPERIMENT \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=NO_SHARD \ - --gen1_gc_interval=null \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=4 \ - --global_train_batch_size=512 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.scale_emb_init \ - --model.clip_qkv=null \ - --scheduler.grad_clip_warmup_steps=null \ - --save_num_checkpoints_to_keep=3 \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --softmax_auxiliary_loss=true \ - --auxiliary_loss_multiplier=1e-5 \ - --load_path=s3://ai2-llm/checkpoints/OLMo-small/llamaish1/step0 - #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llamaish1-qk-norm-reorder-zloss/}' diff --git a/scripts/beaker/chameleon/llamaish1-qk-norm-reorder.sh b/scripts/beaker/chameleon/llamaish1-qk-norm-reorder.sh deleted file mode 100755 index 3bf523e9c..000000000 --- a/scripts/beaker/chameleon/llamaish1-qk-norm-reorder.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -export EXPERIMENT=llamaish1-qk-norm-reorder - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish1-s3.yaml \ - --run_name=$EXPERIMENT \ - --wandb.name=$EXPERIMENT \ - --wandb.group=$EXPERIMENT \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=NO_SHARD \ - --gen1_gc_interval=null \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=4 \ - --global_train_batch_size=512 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.scale_emb_init \ - --model.clip_qkv=null \ - --scheduler.grad_clip_warmup_steps=null \ - --save_num_checkpoints_to_keep=3 \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --softmax_auxiliary_loss=false \ - --load_path=s3://ai2-llm/checkpoints/OLMo-small/llamaish1/step0 - #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llamaish1-qk-norm-reorder/}' diff --git a/scripts/beaker/chameleon/llamaish1.sh b/scripts/beaker/chameleon/llamaish1.sh deleted file mode 100755 index fe7e7e62f..000000000 --- a/scripts/beaker/chameleon/llamaish1.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -export EXPERIMENT=llamaish1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish1-s3.yaml \ - --run_name=$EXPERIMENT \ - --wandb.name=$EXPERIMENT \ - --wandb.group=$EXPERIMENT \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=NO_SHARD \ - --gen1_gc_interval=null \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=4 \ - --global_train_batch_size=512 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.scale_emb_init \ - --model.clip_qkv=null \ - --save_num_checkpoints_to_keep=3 \ - --scheduler.grad_clip_warmup_steps=null \ - '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llamaish1/}' - # --model.attention_layer_norm=true \ - # --model.norm_after=true \ - # --softmax_auxiliary_loss=true \ - # --auxiliary_loss_multiplier=1e-5 - #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-small/llamaish1/}' diff --git a/scripts/beaker/chameleon/llamaish7-normal-qk-norm-reorder-zloss-launch.sh b/scripts/beaker/chameleon/llamaish7-normal-qk-norm-reorder-zloss-launch.sh deleted file mode 100755 index 7c72b27da..000000000 --- a/scripts/beaker/chameleon/llamaish7-normal-qk-norm-reorder-zloss-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish7-normal-qk-norm-reorder-zloss \ - --description "OLMo medium - 7B - Llamaish Normal QK norm reorder zloss" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 600m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/chameleon/llamaish7-normal-qk-norm-reorder-zloss.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/chameleon/llamaish7-normal-qk-norm-reorder-zloss.sh b/scripts/beaker/chameleon/llamaish7-normal-qk-norm-reorder-zloss.sh deleted file mode 100755 index ee73526a0..000000000 --- a/scripts/beaker/chameleon/llamaish7-normal-qk-norm-reorder-zloss.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -export EXPERIMENT=llamaish7-normal-qk-norm-reorder-zloss - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish7-s3.yaml \ - --run_name=$EXPERIMENT \ - --wandb.name=$EXPERIMENT \ - --wandb.group=$EXPERIMENT \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.init_fn=normal \ - --model.init_std=0.02 \ - --model.scale_emb_init \ - --model.clip_qkv=null \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --softmax_auxiliary_loss=true \ - --auxiliary_loss_multiplier=1e-5 \ - --save_num_checkpoints_to_keep=3 - # '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-qk-norm-reorder-zloss/}' diff --git a/scripts/beaker/chameleon/llamaish7-qk-norm-launch.sh b/scripts/beaker/chameleon/llamaish7-qk-norm-launch.sh deleted file mode 100755 index d78b56a0a..000000000 --- a/scripts/beaker/chameleon/llamaish7-qk-norm-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish7-qk-norm \ - --description "OLMo medium - 7B - Llamaish QK norm" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 600m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/chameleon/llamaish7-qk-norm.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/chameleon/llamaish7-qk-norm-reorder-launch.sh b/scripts/beaker/chameleon/llamaish7-qk-norm-reorder-launch.sh deleted file mode 100755 index dee45ea79..000000000 --- a/scripts/beaker/chameleon/llamaish7-qk-norm-reorder-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish7-qk-norm-reorder \ - --description "OLMo medium - 7B - Llamaish QK norm reorder" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 600m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/chameleon/llamaish7-qk-norm-reorder.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/chameleon/llamaish7-qk-norm-reorder-zloss-launch.sh b/scripts/beaker/chameleon/llamaish7-qk-norm-reorder-zloss-launch.sh deleted file mode 100755 index 68bcdcf00..000000000 --- a/scripts/beaker/chameleon/llamaish7-qk-norm-reorder-zloss-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish7-qk-norm-reorder-zloss \ - --description "OLMo medium - 7B - Llamaish QK norm reorder zloss" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 600m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/chameleon/llamaish7-qk-norm-reorder-zloss.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/chameleon/llamaish7-qk-norm-reorder-zloss.sh b/scripts/beaker/chameleon/llamaish7-qk-norm-reorder-zloss.sh deleted file mode 100755 index d61c40a4a..000000000 --- a/scripts/beaker/chameleon/llamaish7-qk-norm-reorder-zloss.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish7-s3.yaml \ - --run_name=llamaish7-qk-norm-reorder-zloss \ - --wandb.name=llamaish7-qk-norm-reorder-zloss \ - --wandb.group=llamaish7-qk-norm-reorder-zloss \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.scale_emb_init \ - --model.clip_qkv=null \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --softmax_auxiliary_loss=true \ - --auxiliary_loss_multiplier=1e-5 \ - --save_num_checkpoints_to_keep=3 \ - --load_path=s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded - # '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-qk-norm-reorder-zloss/}' diff --git a/scripts/beaker/chameleon/llamaish7-qk-norm-reorder.sh b/scripts/beaker/chameleon/llamaish7-qk-norm-reorder.sh deleted file mode 100755 index 95eb82733..000000000 --- a/scripts/beaker/chameleon/llamaish7-qk-norm-reorder.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish7-s3.yaml \ - --run_name=llamaish7-qk-norm-reorder \ - --wandb.name=llamaish7-qk-norm-reorder \ - --wandb.group=llamaish7-qk-norm-reorder \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.scale_emb_init \ - --model.clip_qkv=null \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --save_num_checkpoints_to_keep=3 \ - '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-qk-norm-reorder/}' - #--load_path=s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded diff --git a/scripts/beaker/chameleon/llamaish7-qk-norm.sh b/scripts/beaker/chameleon/llamaish7-qk-norm.sh deleted file mode 100755 index d41a7cae7..000000000 --- a/scripts/beaker/chameleon/llamaish7-qk-norm.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish7-s3.yaml \ - --run_name=llamaish7-qk-norm \ - --wandb.name=llamaish7-qk-norm \ - --wandb.group=llamaish7-qk-norm \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_interval=50 \ - --eval_interval=50 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.scale_emb_init \ - --model.clip_qkv=null \ - --model.attention_layer_norm=true \ - --load_path=s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-EmbInitFix/step0-unsharded - # '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-qk-norm/}' diff --git a/scripts/beaker/ib-ananya-1b.sh b/scripts/beaker/ib-ananya-1b.sh deleted file mode 100755 index aff1460c7..000000000 --- a/scripts/beaker/ib-ananya-1b.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -set -ex - -export LOAD_PATH_ARG="" -export CONFIG_PATH=scripts/ananya-1b-ib.yaml -export NCCL_DEBUG=INFO - -# get run name, we will use this as task name in gantry -RUN_NAME=$(cat $CONFIG_PATH | grep -ohP "^run_name\:\w*(.+)$" | sed 's/run_name:\s*//') - -# get a hash of the load path and config path; take the first 8 characters -RUN_HASH=$(echo "${LOAD_PATH_ARG}-${CONFIG_PATH}" | md5sum | cut -c 1-8) - -# compose the two -FULL_RUN_NAME="${RUN_NAME}-${RUN_HASH}" - -# check if there is an env var called 'WANDB_API_KEY' and if so, create a flag -# to pass to gantry -if [ -z ${WANDB_API_KEY+x} ]; then - WANDB_API_KEY_ARG="--env-secret WANDB_API_KEY=WANDB_API_KEY" -else - WANDB_API_KEY_ARG="--env WANDB_API_KEY=${WANDB_API_KEY}" -fi - -# check if there is an env var called 'AWS_ACCESS_KEY_ID' and 'AWS_SECRET_ACCESS_KEY' and if so, create a flag -# to pass to gantry -if [ -z ${WANDB_API_KEY+x} ]; then - AWS_ACCESS_KEY_ID_ARG="--env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID" - AWS_SECRET_ACCESS_KEY_ARG="--env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY" -else - AWS_ACCESS_KEY_ID_ARG="--env AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}" - AWS_SECRET_ACCESS_KEY_ARG="--env AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}" -fi - -NUM_NODES=2 - -gantry run \ - --workspace ai2/llm-testing \ - --task-name "${FULL_RUN_NAME}" \ - --description "${FULL_RUN_NAME}" \ - --priority "high" \ - --beaker-image olmo-torch2-gantry \ - --cluster ai2/general-cirrascale-a100-80g-ib \ - --gpus 8 \ - --replicas ${NUM_NODES} \ - --leader-selection \ - --host-networking \ - --nfs \ - ${WANDB_API_KEY_ARG} \ - ${AWS_ACCESS_KEY_ID_ARG} \ - ${AWS_SECRET_ACCESS_KEY_ARG} \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - -- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME scripts/train.py ${CONFIG_PATH} --model.flash_attention=true" diff --git a/scripts/beaker/llamaish7-launch.sh b/scripts/beaker/llamaish7-launch.sh deleted file mode 100755 index e7ced5ba6..000000000 --- a/scripts/beaker/llamaish7-launch.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/dirkg \ - --task-name llamaish7 \ - --description "OLMo medium - 7B - Llamaish" \ - --priority high \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/pluto-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 10m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/llamaish7.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/llamaish7-normal-launch.sh b/scripts/beaker/llamaish7-normal-launch.sh deleted file mode 100755 index de9bd286b..000000000 --- a/scripts/beaker/llamaish7-normal-launch.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=64 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name llamaish7-normal-qk-norm-reorder-zloss \ - --description "OLMo medium - 7B - Llamaish Normal" \ - --priority urgent \ - --preemptible \ - --beaker-image shanea/olmo-torch2.3-gantry \ - --cluster ai2/jupiter-cirrascale-2 \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 15m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/llamaish7-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/llamaish7-normal.sh b/scripts/beaker/llamaish7-normal.sh deleted file mode 100755 index 8b6b3bc3b..000000000 --- a/scripts/beaker/llamaish7-normal.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -export EXPERIMENT=llamaish7-normal - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish7-s3.yaml \ - --run_name=$EXPERIMENT \ - --wandb.name=$EXPERIMENT \ - --wandb.group=$EXPERIMENT \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_interval=250 \ - --eval_interval=250 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.init_fn=normal \ - --model.init_std=0.02 \ - --model.clip_qkv=null \ - --save_num_checkpoints_to_keep=3 \ - --scheduler.units=steps \ - --scheduler.t_warmup=2000 - # '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-normal/}' diff --git a/scripts/beaker/llamaish7.sh b/scripts/beaker/llamaish7.sh deleted file mode 100755 index 8bdd69a37..000000000 --- a/scripts/beaker/llamaish7.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/llamaish7-s3.yaml \ - --run_name=llamaish7-detailed \ - --wandb.name=llamaish7-detailed \ - --wandb.group=llamaish7-detailed \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_interval=50 \ - --eval_interval=50 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-detailed/}' diff --git a/scripts/beaker/mitch-ish-7b.sh b/scripts/beaker/mitch-ish-7b.sh deleted file mode 100755 index 3fd81cade..000000000 --- a/scripts/beaker/mitch-ish-7b.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -CONFIG_PATH=configs/v1_5-mix-medium-mitch-ish-s3.yaml -NUM_NODES=4 -ARGS='--activation_checkpointing=fine_grained wandb.name=v1_5-mix-mitch-ish-mcli-final --epoch=1 --optimizer.learning_rate=0.000023 --scheduler.t_warmup=556000 --scheduler.t_max=557000 --scheduler.alpha_f=0.001 --stop_at=557000' - -gantry run \ - --allow-dirty \ - --workspace ai2/llm-testing \ - --task-name mitchish-mcli-final \ - --description mitchish-mcli-final \ - --priority high \ - --beaker-image olmo-torch2-gantry \ - --cluster ai2/general-cirrascale-a100-80g-ib \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --nfs \ - --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - -- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}" diff --git a/scripts/beaker/mitchish65.sh b/scripts/beaker/mitchish65.sh deleted file mode 100755 index 9c2061bd5..000000000 --- a/scripts/beaker/mitchish65.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -CONFIG_PATH=configs/mitchish65-s3.yaml -NUM_NODES=4 -ARGS='--device_train_microbatch_size=4 --model.flash_attention=true' - -gantry run \ - --allow-dirty \ - --workspace ai2/llm-testing \ - --task-name mitchish65 \ - --description mitchish65 \ - --priority high \ - --beaker-image olmo-torch2-gantry \ - --cluster ai2/general-cirrascale-a100-80g-ib \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --lead-selection \ - --host-networking \ - --budget ai2/oe-training \ - --nfs \ - --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - -- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}" diff --git a/scripts/beaker/mitchish7-launch.sh b/scripts/beaker/mitchish7-launch.sh deleted file mode 100755 index 3e41098a7..000000000 --- a/scripts/beaker/mitchish7-launch.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/dirkg \ - --task-name mitchish7 \ - --description "OLMo medium - 7B" \ - --priority normal \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 10m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/mitchish7.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/mitchish7-llamainit-launch.sh b/scripts/beaker/mitchish7-llamainit-launch.sh deleted file mode 100755 index 7c39bb9b7..000000000 --- a/scripts/beaker/mitchish7-llamainit-launch.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/dirkg \ - --task-name mitchish7-llamainit \ - --description "OLMo medium - 7B - Llama Init" \ - --priority high \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/pluto-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 10m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/mitchish7-llamainit.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/mitchish7-llamainit.sh b/scripts/beaker/mitchish7-llamainit.sh deleted file mode 100755 index 640dd2bf6..000000000 --- a/scripts/beaker/mitchish7-llamainit.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12346 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/mitchish7-llamainit-s3.yaml \ - --run_name=mitchish7-llamainit \ - --wandb.name=mitchish7-llamainit \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_overwrite \ - '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/mitchish7-llamainit/}' \ No newline at end of file diff --git a/scripts/beaker/mitchish7.sh b/scripts/beaker/mitchish7.sh deleted file mode 100755 index d91c6eaac..000000000 --- a/scripts/beaker/mitchish7.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/mitchish7-s3.yaml \ - --run_name=mitchish7-datafix \ - --wandb.name=mitchish7-datafix \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --gen1_gc_interval=32 \ - --save_overwrite \ - '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/}' - # --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step614000/ \ No newline at end of file diff --git a/scripts/beaker/mitchish70-from160510-launch.sh b/scripts/beaker/mitchish70-from160510-launch.sh deleted file mode 100755 index 7d0279ca0..000000000 --- a/scripts/beaker/mitchish70-from160510-launch.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name mitchish70-from160510 \ - --description "OLMo large - 70B - from160510" \ - --priority normal \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --cluster ai2/pluto-cirrascale \ - --preemptible \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 10m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=DIRKG_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/mitchish70-from160510.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/mitchish70-from160510.sh b/scripts/beaker/mitchish70-from160510.sh deleted file mode 100755 index a35cfee4d..000000000 --- a/scripts/beaker/mitchish70-from160510.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=52346 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/mitchish70-s3.yaml \ - --run_name=mitchish70-from160510 \ - '--wandb.group=${run_name}' \ - '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ - --load_path_sharded_checkpointer=olmo_core \ - --sharded_checkpointer=olmo_core \ - --global_train_batch_size=3584 \ - --device_train_microbatch_size=4 \ - --fsdp.sharding_strategy=FULL_SHARD \ - --save_overwrite \ - --optimizer.learning_rate=3.0e-05 \ - --scheduler.alpha_f=1.0 \ - --scheduler.t_warmup=0 diff --git a/scripts/beaker/mitchish70-launch.sh b/scripts/beaker/mitchish70-launch.sh deleted file mode 100755 index 0b2c88872..000000000 --- a/scripts/beaker/mitchish70-launch.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=4 - -gantry run \ - --workspace ai2/dirkg \ - --task-name mitchish70 \ - --description "OLMo large - 70B" \ - --priority high \ - --stop-preemptible \ - --beaker-image petew/olmo-torch2-gantry \ - --cluster ai2/pluto-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --nfs \ - --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \ - --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/mitchish70.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES}" diff --git a/scripts/beaker/mitchish70-loadtest-launch.sh b/scripts/beaker/mitchish70-loadtest-launch.sh deleted file mode 100755 index 8718f9446..000000000 --- a/scripts/beaker/mitchish70-loadtest-launch.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=4 - -gantry run \ - --workspace ai2/dirkg \ - --task-name mitchish70-loadtest \ - --description "OLMo large - 70B - loadtest" \ - --priority high \ - --stop-preemptible \ - --beaker-image petew/olmo-torch2-gantry \ - --cluster ai2/pluto-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --nfs \ - --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \ - --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/mitchish70-loadtest.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES}" diff --git a/scripts/beaker/mitchish70-loadtest.sh b/scripts/beaker/mitchish70-loadtest.sh deleted file mode 100755 index d24b103b1..000000000 --- a/scripts/beaker/mitchish70-loadtest.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=101 \ - --rdzv_backend=c10d \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - scripts/train.py \ - configs/mitchish70-s3.yaml \ - --run_name=mitchish70-loadtest \ - --wandb.name=mitchish70-loadtest \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --save_folder=runs/ \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=512 \ - --save_overwrite \ - --remote_save_folder=null \ - --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step32300-unsharded \ No newline at end of file diff --git a/scripts/beaker/mitchish70.sh b/scripts/beaker/mitchish70.sh deleted file mode 100755 index 3aec7fcab..000000000 --- a/scripts/beaker/mitchish70.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=101 \ - --rdzv_backend=c10d \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - scripts/train.py \ - configs/mitchish70-s3.yaml \ - --run_name=mitchish70-002 \ - --wandb.name=mitchish70-official \ - --device_train_microbatch_size=3 \ - --global_train_batch_size=1536 \ - '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ - --save_overwrite \ No newline at end of file diff --git a/scripts/beaker/olmo-small-ablation-on-gantry.sh b/scripts/beaker/olmo-small-ablation-on-gantry.sh deleted file mode 100755 index 15381246f..000000000 --- a/scripts/beaker/olmo-small-ablation-on-gantry.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -set -ex - -# check if LOAD_PATH is provided as an environment variable; if so, create an argument -# to pass to the training script -if [ -z ${LOAD_PATH+x} ]; then - LOAD_PATH_ARG="" -else - LOAD_PATH_ARG="--load_path=${LOAD_PATH}" -fi - - -# check if CONFIG PATH is provided as an environment variable; -# if so, use that instead of olmo-small-ablation.yaml -if [ -z ${CONFIG_PATH+x} ]; then - export CONFIG_PATH=configs/olmo-small-ablation.yaml -else - export CONFIG_PATH="${CONFIG_PATH}" -fi - -# get run name, we will use this as task name in gantry -RUN_NAME=$(cat $CONFIG_PATH | grep -ohP "^run_name\:\w*(.+)$" | sed 's/run_name:\s*//') - -# get a hash of the load path and config path; take the first 8 characters -RUN_HASH=$(echo "${LOAD_PATH_ARG}-${CONFIG_PATH}" | md5sum | cut -c 1-8) - -# compose the two -FULL_RUN_NAME="${RUN_NAME}-${RUN_HASH}" - -# check if there is an env var called 'WANDB_API_KEY' and if so, create a flag -# to pass to gantry -if [ -z ${WANDB_API_KEY+x} ]; then - WANDB_API_KEY_ARG="--env-secret WANDB_API_KEY=WANDB_API_KEY" -else - WANDB_API_KEY_ARG="--env WANDB_API_KEY=${WANDB_API_KEY}" -fi - -NUM_NODES=4 - -gantry run \ - --workspace ai2/llm-testing \ - --task-name "${FULL_RUN_NAME}" \ - --description "${FULL_RUN_NAME}" \ - --priority "normal" \ - --beaker-image olmo-torch2-gantry \ - --cluster ai2/general-cirrascale-a100-80g-ib \ - --gpus 8 \ - --replicas ${NUM_NODES} \ - --leader-selection \ - --host-networking \ - --nfs \ - ${WANDB_API_KEY_ARG} \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - -- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} --run_name=${FULL_RUN_NAME} ${LOAD_PATH_ARG} --device_train_microbatch_size=8 --model.flash_attention=true ${@}" diff --git a/scripts/beaker/olmo7-ablation-baseline.sh b/scripts/beaker/olmo7-ablation-baseline.sh deleted file mode 100755 index cd64e59e1..000000000 --- a/scripts/beaker/olmo7-ablation-baseline.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -CONFIG_PATH=configs/olmo7-ablation-baseline.yaml -NUM_NODES=8 -ARGS='--run_name=olmo7-ablation-baseline --wandb.name=baseline --model.flash_attention=true --fsdp.wrapping_strategy=by_block_and_size --fsdp.sharding_strategy=SHARD_GRAD_OP --save_folder=runs/ --device_train_microbatch_size=3 --global_train_batch_size=6144 --wandb.group=baseline --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/baseline3 --load_path=s3://ai2-llm/checkpoints/olmo7-ablation/baseline3/step7800' - -gantry run \ - --allow-dirty \ - --workspace ai2/llm-testing \ - --task-name olmo7-ablation-baseline \ - --description olmo7-ablation-baseline \ - --priority high \ - --beaker-image olmo-torch2-gantry \ - --cluster ai2/pluto-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --nfs \ - --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \ - --budget ai2/oe-training \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \ - --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - -- /bin/bash -c "source scripts/beaker/warm_hf_cache.sh && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}" diff --git a/scripts/beaker/olmo7-ablation-dedupeparas.sh b/scripts/beaker/olmo7-ablation-dedupeparas.sh deleted file mode 100755 index 0f9e6badf..000000000 --- a/scripts/beaker/olmo7-ablation-dedupeparas.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -CONFIG_PATH=configs/olmo7-ablation-dedupeparas.yaml -NUM_NODES=8 -ARGS='--run_name=olmo7-ablation-dedupeparas --wandb.name=dedupeparas --model.flash_attention=true --fsdp.wrapping_strategy=by_block_and_size --fsdp.sharding_strategy=SHARD_GRAD_OP --save_folder=runs/ --device_train_microbatch_size=3 --global_train_batch_size=6144 --wandb.group=dedupeparas --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/dedupeparas' - -gantry run \ - --allow-dirty \ - --workspace ai2/llm-testing \ - --task-name olmo7-ablation-dedupeparas \ - --description olmo7-ablation-dedupeparas \ - --priority high \ - --beaker-image olmo-torch2-gantry \ - --cluster ai2/pluto-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --nfs \ - --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \ - --budget ai2/oe-training \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \ - --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - -- /bin/bash -c "source scripts/beaker/warm_hf_cache.sh && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}" diff --git a/scripts/beaker/olmo7-ablation-final2.sh b/scripts/beaker/olmo7-ablation-final2.sh deleted file mode 100755 index 3fbf72573..000000000 --- a/scripts/beaker/olmo7-ablation-final2.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -CONFIG_PATH=configs/olmo7-ablation-final2.yaml -NUM_NODES=8 -ARGS='--run_name=olmo7-ablation-final2 --wandb.name=final2 --model.flash_attention=true --fsdp.wrapping_strategy=by_block_and_size --fsdp.sharding_strategy=SHARD_GRAD_OP --save_folder=runs/ --device_train_microbatch_size=3 --global_train_batch_size=6144 --wandb.group=final2 --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/final2' - -gantry run \ - --allow-dirty \ - --workspace ai2/llm-testing \ - --task-name olmo7-ablation-final2 \ - --description olmo7-ablation-final2 \ - --priority high \ - --beaker-image olmo-torch2-gantry \ - --cluster ai2/pluto-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --nfs \ - --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \ - --budget ai2/oe-training \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \ - --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - -- /bin/bash -c "source scripts/beaker/warm_hf_cache.sh && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}" diff --git a/scripts/beaker/olmo7-ablation-refheavy.sh b/scripts/beaker/olmo7-ablation-refheavy.sh deleted file mode 100755 index fe1c61aa1..000000000 --- a/scripts/beaker/olmo7-ablation-refheavy.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -CONFIG_PATH=configs/olmo7-ablation-refheavy.yaml -NUM_NODES=8 -ARGS='--run_name=olmo7-ablation-refheavy --wandb.name=refheavy --model.flash_attention=true --fsdp.wrapping_strategy=by_block_and_size --fsdp.sharding_strategy=SHARD_GRAD_OP --save_folder=runs/ --device_train_microbatch_size=3 --global_train_batch_size=6144 --wandb.group=refheavy --remote_save_folder=s3://ai2-llm/checkpoints/olmo7-ablation/refheavy' - -gantry run \ - --allow-dirty \ - --workspace ai2/llm-testing \ - --task-name olmo7-ablation-refheavy \ - --description olmo7-ablation-refheavy \ - --priority high \ - --beaker-image olmo-torch2-gantry \ - --cluster ai2/pluto-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --nfs \ - --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \ - --budget ai2/oe-training \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \ - --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \ - --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - -- /bin/bash -c "source scripts/beaker/warm_hf_cache.sh && torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}" diff --git a/scripts/beaker/pile-llamaish7-launch.sh b/scripts/beaker/pile-llamaish7-launch.sh deleted file mode 100755 index 129ffb0e1..000000000 --- a/scripts/beaker/pile-llamaish7-launch.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=8 - -gantry run \ - --workspace ai2/akshitab \ - --task-name pile-llamaish7 \ - --description "OLMo medium - 7B - Llamaish - Pile" \ - --priority high \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --gpus 8 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --synchronized-start-timeout 20m \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - -- /bin/bash -c "scripts/beaker/pile-llamaish7.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/pile-llamaish7.sh b/scripts/beaker/pile-llamaish7.sh deleted file mode 100755 index f2e8fcf5c..000000000 --- a/scripts/beaker/pile-llamaish7.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 8 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/pile-llamaish7-s3.yaml \ - --run_name=pile-llamaish7 \ - --wandb.name=pile-llamaish7 \ - --wandb.group=pile-llamaish7 \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_interval=50 \ - --eval_interval=50 \ - --optimizer.metrics_log_interval=1 \ - '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/pile-llamaish7/}' diff --git a/scripts/beaker/tiny-llamaish-launch.sh b/scripts/beaker/tiny-llamaish-launch.sh deleted file mode 100755 index 26c8f6866..000000000 --- a/scripts/beaker/tiny-llamaish-launch.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -NUM_NODES=1 - -gantry run \ - --workspace ai2/OLMo-training \ - --task-name tiny-llamaish \ - --description "OLMo tiny-llamaish test" \ - --priority high \ - --preemptible \ - --beaker-image shanea/olmo-torch2.2-gantry \ - --cluster ai2/jupiter-cirrascale \ - --gpus 2 \ - --replicas "${NUM_NODES}" \ - --leader-selection \ - --host-networking \ - --budget ai2/oe-training \ - --no-nfs \ - --propagate-failure \ - --env LOG_FILTER_TYPE=local_rank0_only \ - --env OMP_NUM_THREADS=8 \ - --env OLMO_TASK=model \ - --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ - --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ - --shared-memory 10GiB \ - --venv base \ - --yes \ - --timeout=-1 \ - --allow-dirty \ - -- /bin/bash -c "scripts/beaker/tiny-llamaish.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" - #--synchronized-start-timeout 600m diff --git a/scripts/beaker/tiny-llamaish.sh b/scripts/beaker/tiny-llamaish.sh deleted file mode 100755 index b5731dc56..000000000 --- a/scripts/beaker/tiny-llamaish.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -set -exuo pipefail -IFS=$'\n\t' - -BEAKER_LEADER_REPLICA_HOSTNAME=$1 -shift - -NUM_NODES=$1 -shift - -BEAKER_REPLICA_RANK=$1 -shift - -# Warm HF cache -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 - -torchrun \ - --nnodes ${NUM_NODES}:${NUM_NODES} \ - --nproc-per-node 1 \ - --rdzv_id=12347 \ - --rdzv_backend=static \ - --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ - --node_rank=$BEAKER_REPLICA_RANK \ - --rdzv_conf="read_timeout=420" \ - scripts/train.py \ - configs/tiny-llamaish-s3.yaml \ - --run_name=tiny-llamaish \ - --wandb.name=tiny-llamaish \ - --wandb.group=tiny-llamaish \ - --model.flash_attention=true \ - --fsdp.wrapping_strategy=by_block_and_size \ - --fsdp.sharding_strategy=SHARD_GRAD_OP \ - --save_folder=runs/ \ - --activation_checkpointing=fine_grained \ - --fused_loss=true \ - --device_train_microbatch_size=2 \ - --global_train_batch_size=1024 \ - --save_interval=50 \ - --eval_interval=50 \ - --optimizer.metrics_log_interval=1 \ - --save_overwrite \ - --model.scale_emb_init \ - --model.attention_layer_norm=true \ - --model.norm_after=true \ - --softmax_auxiliary_loss=true - #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-qk-norm-reorder/}' diff --git a/scripts/beaker/warm_hf_cache.sh b/scripts/beaker/warm_hf_cache.sh deleted file mode 100755 index 66ab383d9..000000000 --- a/scripts/beaker/warm_hf_cache.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -mkdir -p /root/.cache -pushd /root/.cache -curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf - -popd -export HF_DATASETS_OFFLINE=1 diff --git a/scripts/kempner/llama7.sh b/scripts/kempner/llama7.sh deleted file mode 100644 index c7d35ad40..000000000 --- a/scripts/kempner/llama7.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=llama7 -#SBATCH --account=kempner_lab -#SBATCH --output=/n/holyscratch01/kempner_lab/Lab/logs/%j.log -#SBATCH --nodes=16 # Total number of nodes -#SBATCH --ntasks-per-node=4 -#SBATCH --gpus-per-node=4 # Allocate one gpu per MPI rank -#SBATCH --cpus-per-task=16 -#SBATCH --time=167:00:00 -#SBATCH --mem=0 # All memory on the node -#SBATCH --partition=kempner_project - -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export MPICH_GPU_SUPPORT_ENABLED=1 -export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID} -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} - -export PYTHONPATH=.:${PYTHONPATH} - -# Try playing with max_split_size_mb if you run into OOM errors. -# export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:512 - -export DATA_PATH=/n/home06/dgroeneveld/data/preprocessed/olmo-mix -export EVAL_DATA_PATH=/n/home06/dgroeneveld/data/eval-data -export CHECKPOINTS_PATH=/n/home06/dgroeneveld/checkpoints - -export PYTORCH_KERNEL_CACHE_PATH=/tmp/pytorch_kernel_cache/ -mkdir -p $PYTORCH_KERNEL_CACHE_PATH - -srun \ - --cpus-per-task=$SLURM_CPUS_PER_TASK \ - --distribution=block:block \ - --kill-on-bad-exit \ - scripts/run_with_environment.sh \ - $HOME/miniconda3/envs/LLM/bin/python -u scripts/train.py configs/llama7.yaml \ - --run_name=kempner_llama7_${SLURM_JOB_ID} \ - --save_folder=/n/holyscratch01/kempner_lab/Lab/checkpoints/${SLURM_JOB_ID}/ \ - --data.num_workers=4 \ - --device_train_microbatch_size=6 \ - --time_limit=$((167 * 60 * 60)) \ - --model.flash_attention=true \ - ${@} diff --git a/scripts/kempner/log_into_node.sh b/scripts/kempner/log_into_node.sh deleted file mode 100755 index d4785d592..000000000 --- a/scripts/kempner/log_into_node.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -set -euxo pipefail - -srun --interactive --pty --jobid=$1 bash \ No newline at end of file diff --git a/scripts/kempner/mitch-ish-7b.sh b/scripts/kempner/mitch-ish-7b.sh deleted file mode 100644 index 3d206bd72..000000000 --- a/scripts/kempner/mitch-ish-7b.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=v1.5-mix-medium-mitch-ish -#SBATCH --account=kempner_lab -#SBATCH --output=/n/holyscratch01/kempner_lab/Lab/logs-petew/%j.log -#SBATCH --nodes=8 # Total number of nodes -#SBATCH --ntasks-per-node=4 -#SBATCH --gpus-per-node=4 # Allocate one gpu per MPI rank -#SBATCH --cpus-per-task=16 -#SBATCH --time=167:00:00 -#SBATCH --mem=0 # All memory on the node -#SBATCH --partition=kempner_project - -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export MPICH_GPU_SUPPORT_ENABLED=1 -export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID} -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} - -export PYTHONPATH=.:${PYTHONPATH} - -# Try playing with max_split_size_mb if you run into OOM errors. -# export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:512 - -export DATA_PATH=/n/home06/dgroeneveld/data/preprocessed/olmo-mix -export EVAL_DATA_PATH=/n/home06/dgroeneveld/data/eval-data -export CHECKPOINTS_PATH=/n/home06/dgroeneveld/checkpoints - -export PYTORCH_KERNEL_CACHE_PATH=/tmp/pytorch_kernel_cache/ -mkdir -p $PYTORCH_KERNEL_CACHE_PATH - -LOAD_PATH=s3://ai2-llm/checkpoints/7b/v1_5-mix-mitch-ish/step556000-unsharded -# SAVE_PATH=s3://ai2-llm/checkpoints/7b/v1_5-mix-mitch-ish-final-tulu - -srun \ - "--cpus-per-task=$SLURM_CPUS_PER_TASK" \ - --distribution=block:block \ - --kill-on-bad-exit \ - scripts/run_with_environment.sh \ - $HOME/miniconda3/envs/LLM/bin/python -u scripts/train.py configs/v1_5-mix-medium-mitch-ish-s3.yaml \ - "--run_name=kempner_${SLURM_JOB_ID}" \ - --wandb.name=v1_5-mix-mitch-ish-final-tulu \ - '--data.paths=[s3://ai2-llm/preprocessed/tulu-v2-sft-mixture/gpt-neox-20b-pii-special/data.npy,s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample-9B/gpt-neox-20b-pii-special/data.npy]' \ - --eval_interval=100 \ - --save_interval=500 \ - "--load_path=${LOAD_PATH}" \ - --restore_dataloader=false \ - --optimizer.learning_rate=0.000023 \ - --scheduler.t_warmup=556000 \ - --scheduler.alpha_f=0.001 \ - --scheduler.t_max=558223 \ - --stop_at=558223 \ - --time_limit=$((167 * 60 * 60)) \ - --model.flash_attention=true \ - "--save_folder=/n/holyscratch01/kempner_lab/Lab/checkpoints/${SLURM_JOB_ID}/" diff --git a/scripts/kempner/v1-mix-small.sh b/scripts/kempner/v1-mix-small.sh deleted file mode 100644 index 8d0c026a8..000000000 --- a/scripts/kempner/v1-mix-small.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=v1-mix-small -#SBATCH --account=kempner_lab -#SBATCH --output=/n/holyscratch01/kempner_lab/Lab/logs/%j.log -#SBATCH --nodes=16 # Total number of nodes -#SBATCH --ntasks-per-node=4 -#SBATCH --gpus-per-node=4 # Allocate one gpu per MPI rank -#SBATCH --cpus-per-task=16 -#SBATCH --time=167:00:00 -#SBATCH --mem=0 # All memory on the node -#SBATCH --partition=kempner_project - -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export MPICH_GPU_SUPPORT_ENABLED=1 -export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID} -export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} - -export PYTHONPATH=.:${PYTHONPATH} - -# Try playing with max_split_size_mb if you run into OOM errors. -# export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:512 - -export DATA_PATH=/n/home06/dgroeneveld/data/preprocessed/olmo-mix -export EVAL_DATA_PATH=/n/home06/dgroeneveld/data/eval-data -export CHECKPOINTS_PATH=/n/home06/dgroeneveld/checkpoints - -export PYTORCH_KERNEL_CACHE_PATH=/tmp/pytorch_kernel_cache/ -mkdir -p $PYTORCH_KERNEL_CACHE_PATH - -srun \ - --cpus-per-task=$SLURM_CPUS_PER_TASK \ - --distribution=block:block \ - --kill-on-bad-exit \ - scripts/run_with_environment.sh \ - $HOME/miniconda3/envs/LLM/bin/python -u scripts/train.py configs/v1-mix-small.yaml \ - --run_name=kempner_${SLURM_JOB_ID} \ - --time_limit=$((167 * 60 * 60)) \ - --device_train_microbatch_size=2 \ - --fsdp.sharding_strategy=NO_SHARD \ - --save_folder=/n/holyscratch01/kempner_lab/Lab/checkpoints/${SLURM_JOB_ID}/ \ - --model.flash_attention=true \ - ${@} diff --git a/scripts/mcli/manage_run.py b/scripts/mcli/manage_run.py deleted file mode 100644 index 8b008d8dd..000000000 --- a/scripts/mcli/manage_run.py +++ /dev/null @@ -1,264 +0,0 @@ -""" -This script is meant to be run periodically (e.g. every 30 minutes) to automatically -restart a run if necessary on MosaicML's platform. -You can also use it as an alternative to `mcli run` as a one-off script for launching a new run. -The benefit of using this script is that it will automatically detect bad nodes before launching the run. - -It takes an MCLI run config and attempts to manage the run as follows: -- If a run with the same name on the specified cluster is already running or queued, it does nothing. -- If there's enough nodes available to run the job, it submits and monitors a light-weight test run on each - available node to determine which nodes are working properly. -- If there's enough working nodes it will launch a new run on a subset of the working nodes. - -For example: - python scripts/mcli/manage_run.py configs/mcli/mitchish7.yaml - -Notes: -- This script will always override the `compute.node_names` field in your MCLI config when it launches - a new run, so there is no need to specify `node_names` manually. Just specify the number of gpus - (`compute.gpus`). -""" - -import argparse -import sys -import time -from concurrent.futures import as_completed -from typing import List, Optional, Set - -import mcli -import mcli.api.runs -import yaml -from mcli.api.model.cluster_details import Instance -from mcli.api.runs import Run, RunConfig, RunStatus -from rich import print -from rich.progress import track -from rich.prompt import Confirm - -_SKIP_CONFIRMATION = False -_DEFAULT_TIMEOUT = 360 - - -def get_test_config( - *, cluster_name: str, image_name: str, node_name: str, instance_name: Optional[str] = None -) -> RunConfig: - """ - Get a run config for testing if a node is working properly. - """ - run_config = RunConfig( - name="test-run", - image=image_name, - compute=dict(cluster=cluster_name, nodes=1, node_names=[node_name]), # type: ignore - command='''python -c "import torch; torch.rand(2, 3).cuda() @ torch.rand(3, 2).cuda(); print('All good!')"''', - ) - if instance_name is not None: - run_config.compute["instance"] = instance_name - return run_config - - -def submit_runs(run_configs: List[RunConfig], timeout: int = _DEFAULT_TIMEOUT) -> List[Run]: - """ - Submit a list of runs. - """ - futures = [] - for run_config in run_configs: - futures.append(mcli.api.runs.create_run(run_config, future=True)) - - runs = [] - for future in track( - as_completed(futures, timeout=timeout), total=len(futures), description="Submitting runs..." - ): - runs.append(future.result()) - - return runs - - -def wait_on_runs(runs: List[Run], timeout: int = _DEFAULT_TIMEOUT) -> List[Run]: - """ - Wait on a list of runs to reach 'COMPLETED' status (or a failure of some kind). - """ - futures = [] - for i, run in enumerate(runs): - futures.append(mcli.api.runs.wait_for_run_status(run, RunStatus.COMPLETED, future=True)) - if i == 0: - # HACK: this works around a bug in `mcli`. - time.sleep(0.05) - - results = [] - for future in track( - as_completed(futures, timeout=timeout), total=len(futures), description="Waiting on runs..." - ): - results.append(future.result()) - - return results - - -def identify_bad_nodes( - *, - available_nodes: Set[str], - cluster_name: str, - image_name: str, - instance_name: Optional[str] = None, - timeout: int = _DEFAULT_TIMEOUT, -) -> Set[str]: - """ - Identify faulty nodes from a set of nodes on a cluster. - """ - bad_nodes = set() - test_runs = submit_runs( - [ - get_test_config( - cluster_name=cluster_name, image_name=image_name, node_name=node_name, instance_name=instance_name - ) - for node_name in available_nodes - ], - timeout=timeout, - ) - - try: - test_runs = wait_on_runs(test_runs, timeout=timeout) - except BaseException: - print("Stopping test runs due to error...") - mcli.api.runs.stop_runs(test_runs) - raise - - for run in test_runs: - if not run.nodes: - run = mcli.api.runs.get_run(run) - assert len(run.nodes) == 1 - node_name = run.nodes[0].name - if run.status in {RunStatus.FAILED, RunStatus.UNKNOWN, RunStatus.STOPPED}: - bad_nodes.add(node_name) - print(f" [red]✖️[/] '{node_name}' {run.status} (run '{run.name}')") - elif run.status in {RunStatus.COMPLETED}: - print(f" [green]✔️[/] '{node_name}' {run.status} (run '{run.name}')") - else: - print(f" [yellow]?[/] '{node_name}' {run.status} (run '{run.name}')") - - return bad_nodes - - -def confirm_continue(prompt: str) -> bool: - if _SKIP_CONFIRMATION: - print(prompt) - return True - else: - return Confirm.ask(f"{prompt} Continue?") - - -def main(config_path: str, timeout: int = _DEFAULT_TIMEOUT) -> int: - # Read target run config and grab relevant fields. - with open(config_path, "r") as f: - config = yaml.safe_load(f) - cluster_name = config["compute"]["cluster"] - instance_name = config["compute"].get("instance") - image_name = config["image"] - run_prefix = config["name"] - gpus_required = config["compute"]["gpus"] - - # Get cluster metadata. - cluster = mcli.get_cluster(cluster_name) - assert cluster.utilization is not None - - # Check if config is already running or queued on the cluster. - for run in cluster.utilization.active_runs_by_user: - if run.name.startswith(f"{run_prefix}-"): - print(f"[green]✔️[/] Run '{run.name}' is already active") - return 0 - for run in cluster.utilization.queued_runs_by_user: - if run.name.startswith(f"{run_prefix}-"): - print(f"[green]✔️[/] Run '{run.name}' is already queued") - return 0 - - # Collect cluster instance metadata. - instance: Optional[Instance] = None - for instance_util in cluster.utilization.cluster_instance_utils: - if instance_name is None or instance_util.instance.name == instance_name: - instance = instance_util.instance - break - assert instance is not None - assert gpus_required % instance.gpus == 0 - nodes_required = gpus_required // instance.gpus - - # Gather all nodes. - all_nodes = set() - for node in instance.node_details: - all_nodes.add(node.name) - - print(f"There are {len(all_nodes)} total nodes") - - if nodes_required > len(all_nodes): - print(f"[yellow]Not enough nodes to meet requirement of {nodes_required} ({gpus_required} GPUs)[/]") - return 1 - - # Filter out nodes that already have a job. - available_nodes = all_nodes.copy() - for run in cluster.utilization.active_runs_by_user: - run = mcli.get_run(run.name) - for node in run.nodes: - if node.name in available_nodes: - available_nodes.remove(node.name) - - print(f"There are {len(available_nodes)} available nodes") - - if nodes_required > len(available_nodes): - print( - f"[yellow]Not enough nodes available to meet requirement of {nodes_required} ({gpus_required} GPUs)[/]" - ) - return 1 - - if not confirm_continue( - f"Submitting test runs to the {len(available_nodes)} available nodes to determine working nodes..." - ): - return 1 - bad_nodes = identify_bad_nodes( - available_nodes=available_nodes, - cluster_name=cluster_name, - image_name=image_name, - instance_name=instance_name, - timeout=timeout, - ) - if bad_nodes: - print( - f"[yellow]Identified {len(bad_nodes)} bad nodes. Please notify MosaicML team if you haven't already.[/]" - ) - - # Gather all working nodes. - working_nodes = set() - for node in available_nodes: - if node not in bad_nodes: - working_nodes.add(node) - - print(f"There are {len(working_nodes)} working available nodes") - - if nodes_required > len(working_nodes): - print( - f"[yellow]Not enough working nodes available to meet requirement of {nodes_required} ({gpus_required} GPUs)[/]" - ) - return 1 - - # Initialize run config to submit. - run_config = RunConfig(**config) - run_config.compute["node_names"] = list(working_nodes)[:nodes_required] - - # Submit job. - if not confirm_continue("Launching new run..."): - return 1 - run = mcli.create_run(run_config, timeout=timeout) - print(f"[green]✔️[/] Launched new run '{run.name}'") - - return 0 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(prog="mcli-run-manager") - parser.add_argument("run_config") - parser.add_argument("-y", "--yes", action="store_true", help="Skip confirmation prompts") - parser.add_argument( - "-t", "--timeout", type=int, default=_DEFAULT_TIMEOUT, help="Timeout in seconds to wait for jobs" - ) - - args = parser.parse_args() - if args.yes: - _SKIP_CONFIRMATION = True - - sys.exit(main(args.run_config, timeout=args.timeout)) diff --git a/scripts/mcli/unshard_mitchish70.sh b/scripts/mcli/unshard_mitchish70.sh deleted file mode 100755 index 9fcdeb3f9..000000000 --- a/scripts/mcli/unshard_mitchish70.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -remote_sharded_checkpoint=$(python -c "from olmo.util import find_latest_checkpoint; print(find_latest_checkpoint('s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002'))") -local_folder=~/checkpoints - -mkdir -p ${local_folder} - -local_sharded_checkpoint="${local_folder}/$(basename ${remote_sharded_checkpoint})" -remote_unsharded_checkpoint="${remote_sharded_checkpoint}-unsharded" -local_unsharded_checkpoint="${local_sharded_checkpoint}-unsharded" - -echo "Downloading '${remote_sharded_checkpoint}' to '${local_sharded_checkpoint}'..." -aws s3 cp --recursive ${remote_sharded_checkpoint} ${local_sharded_checkpoint} - -echo "Unsharding '${local_sharded_checkpoint}' to '${local_unsharded_checkpoint}'..." -python scripts/unshard.py ${local_sharded_checkpoint} ${local_unsharded_checkpoint} --safe-tensors --type=local - -echo "Uploading '${local_unsharded_checkpoint}' to '${remote_unsharded_checkpoint}'..." -aws s3 cp --recursive ${local_unsharded_checkpoint} ${remote_unsharded_checkpoint} diff --git a/scripts/pyspy_all_nodes.sh b/scripts/pyspy_all_nodes.sh deleted file mode 100755 index 8efca7798..000000000 --- a/scripts/pyspy_all_nodes.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -set -euxo pipefail - -srun --overlap --jobid $1 \ - singularity exec \ - -B"$PROJECT_DIR:$PROJECT_DIR" \ - -B"$SCRATCH_DIR:$SCRATCH_DIR" \ - -B"$FLASH_DIR:$FLASH_DIR" \ - -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \ - $OLMO_CONTAINER \ - bash scripts/pyspy_all_processes.sh | sort -s -t: -k1,1 diff --git a/scripts/pyspy_all_processes.sh b/scripts/pyspy_all_processes.sh deleted file mode 100755 index 0d6f1eee4..000000000 --- a/scripts/pyspy_all_processes.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -export NODENAME=$(hostname -s) - -# Redirect stdout and stderr so that we get a prefix with the node name -exec > >(trap "" INT TERM; sed -u "s/^/$NODENAME out: /") -exec 2> >(trap "" INT TERM; sed -u "s/^/$NODENAME err: /" >&2) - -ps -x -o pid,comm | grep " python" | sed -r 's/^[ ]*([0-9]+) .*/\1/g' | while read i; do - echo "Process $i:" - py-spy dump --pid $i; -done diff --git a/scripts/run_with_environment.sh b/scripts/run_with_environment.sh deleted file mode 100755 index 89566cf25..000000000 --- a/scripts/run_with_environment.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Note: This script does not run inside the container. It runs on the bare compute node. - -set -euo pipefail - -export NODENAME=$(hostname -s) -export MASTER_ADDR=$(scontrol show hostnames | head -n 1) -export MASTER_PORT=39591 -export WORLD_SIZE=$SLURM_NTASKS -export RANK=$SLURM_PROCID -export FS_LOCAL_RANK=$SLURM_PROCID -export LOCAL_WORLD_SIZE=$SLURM_NTASKS_PER_NODE -export LOCAL_RANK=$SLURM_LOCALID -export NODE_RANK=$((($RANK - $LOCAL_RANK) / $LOCAL_WORLD_SIZE)) - -# Redirect stdout and stderr so that we get a prefix with the node name -exec > >(trap "" INT TERM; sed -u "s/^/$NODENAME:$LOCAL_RANK out: /") -exec 2> >(trap "" INT TERM; sed -u "s/^/$NODENAME:$LOCAL_RANK err: /" >&2) - -if [ $SLURM_LOCALID -eq 0 ] ; then - if command -v rocm-smi &> /dev/null ; then - rm -rf /dev/shm/* || true - rocm-smi || true # rocm-smi returns exit code 2 even when it succeeds - fi -else - sleep 2 -fi - -exec $*