Skip to content

Commit

Permalink
Add ladder eval stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
liujch1998 committed Nov 21, 2024
1 parent 5b91f0f commit a508eb9
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 1 deletion.
39 changes: 39 additions & 0 deletions scripts/beaker/ladder_peteish_eval-launch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
#
# Launch a ladder evaluation job on Beaker through `gantry run`.
#
# Usage:
#   ladder_peteish_eval-launch.sh NUM_NODES [args...]
#
# NUM_NODES is the number of 8-GPU replicas to request. All remaining
# arguments are appended verbatim to the per-node command line of
# scripts/beaker/ladder_peteish_eval.sh.

set -ex

# Without this guard an omitted NUM_NODES silently falls into the multi-node
# branch and produces an empty `--replicas` value.
if [[ $# -lt 1 ]]; then
  echo "usage: $0 NUM_NODES [args...]" >&2
  exit 1
fi

NUM_NODES=$1
shift

if [[ $NUM_NODES -eq 1 ]]; then
  MULTI_NODE_ARGS=""
  # Single-node runs must use the eval per-node script as well; this branch
  # previously pointed at scripts/beaker/ladder_peteish.sh, which is not the
  # eval entry point used by the multi-node branch below.
  COMMAND="scripts/beaker/ladder_peteish_eval.sh localhost ${NUM_NODES} 0 $*"
else
  MULTI_NODE_ARGS="--replicas ${NUM_NODES} --leader-selection --host-networking --propagate-failure --propagate-preemption --synchronized-start-timeout 30m"
  # The BEAKER_* variables are escaped so they are expanded by the shell that
  # runs COMMAND inside the job, not by this launcher.
  COMMAND="scripts/beaker/ladder_peteish_eval.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK $*"
fi

# MULTI_NODE_ARGS is intentionally left unquoted so it splits into separate
# flags (and disappears entirely when empty).
gantry run \
  --allow-dirty \
  --workspace ai2/OLMo-tiny \
  --task-name ladder \
  --description "OLMo ladder with $*" \
  --priority high \
  --preemptible \
  --beaker-image shanea/olmo-torch23-gantry \
  --cluster ai2/jupiter-cirrascale-2 \
  --gpus 8 \
  $MULTI_NODE_ARGS \
  --budget ai2/oe-training \
  --no-nfs \
  --weka oe-training-default:/weka/oe-training-default \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env OLMO_TASK=model \
  --env-secret WANDB_API_KEY=JIACHENGL_WANDB_API_KEY \
  --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \
  --shared-memory 10GiB \
  --venv base \
  --yes \
  -- /bin/bash -c "${COMMAND}"
43 changes: 43 additions & 0 deletions scripts/beaker/ladder_peteish_eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env bash
#
# Per-node entry point for ladder evaluation: installs the Python
# dependencies, exports the NCCL / torch settings, and starts
# `scripts/ladder_peteish.py eval` under torchrun with 8 processes per node.
#
# Positional arguments:
#   $1  hostname of the leader replica (used as the rendezvous endpoint)
#   $2  number of nodes
#   $3  rank of this replica
#   $@  everything after that is forwarded to `scripts/ladder_peteish.py eval`
set -exuo pipefail
IFS=$'\n\t'

BEAKER_LEADER_REPLICA_HOSTNAME=$1
NUM_NODES=$2
BEAKER_REPLICA_RANK=$3
shift 3

## Install flash attn
pip install packaging ninja
export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
pip install flash-attn==2.5.9.post1 --no-build-isolation
pip install '.[train]'

# Force processes to synchronize at init_process_group
export TORCH_DIST_INIT_BARRIER=1

# Tell OLMo all ranks share the same filesystem for checkpoints.
export OLMO_SHARED_FS=1

export NCCL_DEBUG=INFO
export NCCL_IB_HCA="^=mlx5_bond_0"
export NCCL_SOCKET_IFNAME=ib
# export NCCL_IB_GID_INDEX=0

# debug flags for IB NCCL error
export TORCH_SHOW_CPP_STACKTRACES=1
# NOTE(review): NCCL_INFO does not appear to be a documented NCCL variable
# (NCCL_DEBUG above is); kept as-is, confirm whether it is still needed.
export NCCL_INFO=DEBUG

torchrun \
  --nnodes "${NUM_NODES}:${NUM_NODES}" \
  --nproc-per-node 8 \
  --rdzv_id=12347 \
  --rdzv_backend=static \
  --rdzv_endpoint="${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \
  --node_rank="${BEAKER_REPLICA_RANK}" \
  --rdzv_conf="read_timeout=420" \
  scripts/ladder_peteish.py eval "$@"
23 changes: 22 additions & 1 deletion scripts/ladder_peteish.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,24 @@ def train_cmd(args: argparse.Namespace):
main(cfg)


def eval_cmd(args: argparse.Namespace):
    """Run the ``eval`` sub-command for the ladder configuration in ``args``.

    Builds the config with ``config_from_args``, selects this rank's CUDA
    device, initializes the NCCL process group, prepares the CLI environment,
    and finally hands the config to ``eval.main``.
    """
    cfg = config_from_args(args)
    log.info(f"save folder from config: {cfg.save_folder}")

    # Best effort: a RuntimeError here is reported and otherwise ignored.
    try:
        mp.set_start_method("spawn", force=True)
    except RuntimeError as err:
        print(f"failed to set multiprocessing start method: {err}")

    # Bind this process to its local GPU before creating the process group.
    local_device = f"cuda:{get_local_rank()}"
    torch.cuda.set_device(local_device)
    dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))

    prepare_cli_environment()
    add_cached_path_clients()

    # Imported at call time, inside the function, rather than at module level.
    from eval import main as run_eval

    run_eval(cfg)


if __name__ == "__main__":
parser = argparse.ArgumentParser(os.path.basename(__file__))
subparsers = parser.add_subparsers(required=True)
Expand Down Expand Up @@ -535,7 +553,10 @@ def train_cmd(args: argparse.Namespace):
train_parser = subparsers.add_parser("train")
train_parser.set_defaults(func=train_cmd)

for subparser in [dump_parser, train_parser]:
eval_parser = subparsers.add_parser("eval")
eval_parser.set_defaults(func=eval_cmd)

for subparser in [dump_parser, train_parser, eval_parser]:
subparser.add_argument("--model", type=str, required=True)
subparser.add_argument("--data", type=str, required=True)
subparser.add_argument("--length", type=str, default="2xC")
Expand Down
23 changes: 23 additions & 0 deletions scripts/ladder_peteish_eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Launch final-checkpoint evaluations for every ladder model size and training
# length. Each line starts one 2-node job; --model must match the model size
# in the --load_path checkpoint directory.

# 1xC
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 190M --data olmoe-mix-0924 --length 1xC --name peteish-final-eval --save_overwrite --device_batch_size 4 --batch_size_divisor 64 --device_eval_batch_size 16 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-190M-1xC/step7272-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 370M --data olmoe-mix-0924 --length 1xC --name peteish-final-eval --save_overwrite --device_batch_size 4 --batch_size_divisor 64 --device_eval_batch_size 16 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-370M-1xC/step14173-unsharded
# The next two lines previously had --model 760M / 600M swapped relative to
# their checkpoint paths.
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 600M --data olmoe-mix-0924 --length 1xC --name peteish-final-eval --save_overwrite --device_batch_size 2 --batch_size_divisor 64 --device_eval_batch_size 8 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-600M-1xC/step11405-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 760M --data olmoe-mix-0924 --length 1xC --name peteish-final-eval --save_overwrite --device_batch_size 2 --batch_size_divisor 64 --device_eval_batch_size 8 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-760M-1xC/step14000-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 1B --data olmoe-mix-0924 --length 1xC --name peteish-final-eval --save_overwrite --device_batch_size 1 --batch_size_divisor 64 --device_eval_batch_size 4 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-1B-1xC/step16000-unsharded

# 2xC
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 190M --data olmoe-mix-0924 --length 2xC --name peteish-final-eval --save_overwrite --device_batch_size 4 --batch_size_divisor 64 --device_eval_batch_size 16 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-190M-2xC/step14000-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 370M --data olmoe-mix-0924 --length 2xC --name peteish-final-eval --save_overwrite --device_batch_size 4 --batch_size_divisor 64 --device_eval_batch_size 16 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-370M-2xC/step28336-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 600M --data olmoe-mix-0924 --length 2xC --name peteish-final-eval --save_overwrite --device_batch_size 2 --batch_size_divisor 64 --device_eval_batch_size 8 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-600M-2xC/step22799-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 760M --data olmoe-mix-0924 --length 2xC --name peteish-final-eval --save_overwrite --device_batch_size 2 --batch_size_divisor 64 --device_eval_batch_size 8 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-760M-2xC/step28934-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 1B --data olmoe-mix-0924 --length 2xC --name peteish-final-eval --save_overwrite --device_batch_size 1 --batch_size_divisor 64 --device_eval_batch_size 4 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-1B-2xC/step32547-unsharded

# 5xC
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 190M --data olmoe-mix-0924 --length 5xC --name peteish-final-eval --save_overwrite --device_batch_size 4 --batch_size_divisor 64 --device_eval_batch_size 16 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-190M-5xC/step36318-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 370M --data olmoe-mix-0924 --length 5xC --name peteish-final-eval --save_overwrite --device_batch_size 4 --batch_size_divisor 64 --device_eval_batch_size 16 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-370M-5xC/step70823-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 600M --data olmoe-mix-0924 --length 5xC --name peteish-final-eval --save_overwrite --device_batch_size 2 --batch_size_divisor 64 --device_eval_batch_size 8 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-600M-5xC/step55000-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 760M --data olmoe-mix-0924 --length 5xC --name peteish-final-eval --save_overwrite --device_batch_size 2 --batch_size_divisor 64 --device_eval_batch_size 8 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-760M-5xC/step72320-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 1B --data olmoe-mix-0924 --length 5xC --name peteish-final-eval --save_overwrite --device_batch_size 1 --batch_size_divisor 64 --device_eval_batch_size 4 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-1B-5xC/step81352-unsharded

# 10xC
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 190M --data olmoe-mix-0924 --length 10xC --name peteish-final-eval --save_overwrite --device_batch_size 4 --batch_size_divisor 64 --device_eval_batch_size 16 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-190M-10xC/step72625-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 370M --data olmoe-mix-0924 --length 10xC --name peteish-final-eval --save_overwrite --device_batch_size 4 --batch_size_divisor 64 --device_eval_batch_size 16 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-370M-10xC/step141636-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 600M --data olmoe-mix-0924 --length 10xC --name peteish-final-eval --save_overwrite --device_batch_size 2 --batch_size_divisor 64 --device_eval_batch_size 8 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-600M-10xC/step113000-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 760M --data olmoe-mix-0924 --length 10xC --name peteish-final-eval --save_overwrite --device_batch_size 2 --batch_size_divisor 64 --device_eval_batch_size 8 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-760M-10xC/step144630-unsharded
./scripts/beaker/ladder_peteish_eval-launch.sh 2 --model 1B --data olmoe-mix-0924 --length 10xC --name peteish-final-eval --save_overwrite --device_batch_size 1 --batch_size_divisor 64 --device_eval_batch_size 4 --load_path /weka/oe-training-default/ai2-llm/checkpoints/OLMo-ladder/peteish-final-1B-10xC/step162000-unsharded

0 comments on commit a508eb9

Please sign in to comment.