diff --git a/scripts/lumi/c4-large-on-lumi.sh b/scripts/lumi/c4-large-on-lumi.sh
deleted file mode 100644
index b822090cb..000000000
--- a/scripts/lumi/c4-large-on-lumi.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=c4-large
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=32 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=00:15:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/22.08 partition/G
-
-export OLMO_CONTAINER=llm-lumi_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:512
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/c4-large.yaml --run_name=${SLURM_JOB_ID} ${@}
diff --git a/scripts/lumi/c4-medium-on-lumi.sh b/scripts/lumi/c4-medium-on-lumi.sh
deleted file mode 100644
index c0a5ea282..000000000
--- a/scripts/lumi/c4-medium-on-lumi.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=c4-medium
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=64 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/22.08 partition/G
-
-export OLMO_CONTAINER=llm-lumi_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:512
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/c4-medium.yaml --run_name=${SLURM_JOB_ID} ${@}
diff --git a/scripts/lumi/c4-small-on-lumi.sh b/scripts/lumi/c4-small-on-lumi.sh
deleted file mode 100644
index 42e7ed060..000000000
--- a/scripts/lumi/c4-small-on-lumi.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=c4-small
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=32 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/22.08 partition/G
-
-export OLMO_CONTAINER=llm-lumi_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-# export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:512
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/c4-small.yaml --run_name=${SLURM_JOB_ID} ${@}
diff --git a/scripts/lumi/demo.sh b/scripts/lumi/demo.sh
deleted file mode 100755
index 3a876e2ec..000000000
--- a/scripts/lumi/demo.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env bash
-#
-# Demo script for running multinode jobs on LUMI. You can run this as a batch job using
-# sbatch or as part of an interactive session by running this script as an executable.
-#
-#SBATCH --job-name=demo
-#SBATCH --account=project_462000229
-#SBATCH --output=/scratch/project_462000229/logs/%j.log
-#SBATCH --nodes=128 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --time-min=12:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/24.03 partition/G
-
-## Container-dependent settings
-export OLMO_CONTAINER=$PROJECT_DIR/containers/lumi-torch25rc-rocm62-py312.sif
-export ROCM_PATH=/opt/rocm
-export CONDA_ENV=pytorch
-export PYTHONPATH=.:${PYTHONPATH}
-
-## General LUMI settings (these rarely change)
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-export GPU_MAX_HW_QUEUES=8
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-## Job settings
-export CHECKPOINTS_PATH=$SCRATCH_DIR/checkpoints
-export HF_DATASETS_OFFLINE=1
-export SINGULARITYENV_TORCH_DIST_INIT_BARRIER=1
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-## Debug settings
-#export NCCL_DEBUG=INFO
-#export FI_LOG_LEVEL=INFO
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
-  $OLMO_CONTAINER \
-  scripts/lumi/run-in-container.sh \
-  python scripts/train.py configs/mitchish1-s3.yaml \
-    "${@}"
diff --git a/scripts/lumi/llama7.sh b/scripts/lumi/llama7.sh
deleted file mode 100644
index 8a3947106..000000000
--- a/scripts/lumi/llama7.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=llama7
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=128 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --time-min=24:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/22.08 partition/G
-
-export OLMO_CONTAINER=llm-lumi_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
-export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
-export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/llama7.yaml --run_name=${SLURM_JOB_ID} ${@}
diff --git a/scripts/lumi/log_into_node.sh b/scripts/lumi/log_into_node.sh
deleted file mode 100755
index 2d4e8c8cc..000000000
--- a/scripts/lumi/log_into_node.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-set -euxo pipefail
-
-srun --interactive --pty --jobid=$1 \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
-  $OLMO_CONTAINER \
-  fish
diff --git a/scripts/lumi/lumi-interactive.sh b/scripts/lumi/lumi-interactive.sh
deleted file mode 100755
index 856a1958c..000000000
--- a/scripts/lumi/lumi-interactive.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Run an interactive shell in our singularity image on LUMI.
-
-singularity shell \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
-  $OLMO_CONTAINER
diff --git a/scripts/lumi/mitch-ish-7b.sh b/scripts/lumi/mitch-ish-7b.sh
deleted file mode 100644
index e30470c30..000000000
--- a/scripts/lumi/mitch-ish-7b.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=v1.5-mix-medium-mitch-ish
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=128 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --time-min=24:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/22.08 partition/G
-
-# export OLMO_CONTAINER=llm-lumi_latest.sif
-export OLMO_CONTAINER=llm-lumi-torch21_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-export GPU_MAX_HW_QUEUES=8
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export HF_DATASETS_OFFLINE=1
-
-export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
-export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
-export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/v1_5-mix-medium-mitch-ish.yaml ${@} \
-    --run_name=${SLURM_JOB_ID} \
-    --activation_checkpointing=fine_grained \
-    --fsdp.wrapping_strategy=one_in_four \
-    --fsdp.sharding_strategy=FULL_SHARD \
-    --sharded_checkpointer=local \
-    --wandb.name=v1_5-mix-mitch-ish-lumi \
-    --save_interval=10000 \
-    --save_interval_ephemeral=1000 \
-    --remote_save_folder=s3://ai2-llm/checkpoints/7b/mitchish-lumi \
-    --save_folder=${FLASH_DIR}/checkpoints/mitchish-lumi
diff --git a/scripts/lumi/mitchish65-randomseed.sh b/scripts/lumi/mitchish65-randomseed.sh
deleted file mode 100644
index 064fb86b4..000000000
--- a/scripts/lumi/mitchish65-randomseed.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=mitchish65
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=64 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-SEED=$1
-shift
-
-export OLMO_CONTAINER=llm-lumi-torch21_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
-export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
-export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/mitchish65.yaml \
-    --run_name=seed${SEED}-${SLURM_JOB_ID} \
-    --time_limit=$((47 * 60 * 60)) \
-    --canceled_check_interval=10 \
-    --device_train_microbatch_size=2 \
-    --global_train_batch_size=1024 \
-    --seed=${SEED}
-    ${@}
diff --git a/scripts/lumi/mitchish65.sh b/scripts/lumi/mitchish65.sh
deleted file mode 100644
index 859029d37..000000000
--- a/scripts/lumi/mitchish65.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=mitchish65
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=128 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-export OLMO_CONTAINER=llm-lumi-torch21_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
-export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
-export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/mitchish65.yaml \
-    --run_name=${SLURM_JOB_ID} \
-    --time_limit=$((47 * 60 * 60)) \
-    --canceled_check_interval=10 \
-    --device_train_microbatch_size=2 \
-    --save_interval=1000 \
-    ${@}
diff --git a/scripts/lumi/mitchish70.sh b/scripts/lumi/mitchish70.sh
deleted file mode 100644
index e06b39437..000000000
--- a/scripts/lumi/mitchish70.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=mitchish70
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=64 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-export OLMO_CONTAINER=llm-lumi-torch21_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export HF_DATASETS_OFFLINE=1
-
-export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
-export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
-export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/mitchish70.yaml \
-    --run_name=${SLURM_JOB_ID} \
-    --time_limit=$((47 * 60 * 60)) \
-    --canceled_check_interval=10 \
-    --device_train_microbatch_size=2 \
-    --save_interval=1000 \
-    ${@}
diff --git a/scripts/lumi/olmo-small-ablation-on-lumi.sh b/scripts/lumi/olmo-small-ablation-on-lumi.sh
deleted file mode 100644
index 205b4d122..000000000
--- a/scripts/lumi/olmo-small-ablation-on-lumi.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=olmo-small-ablation
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=16 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/22.08 partition/G
-
-# check if LOAD_PATH is provided as an environment variable; if so, create an argument
-# to pass to the training script
-if [ -z ${LOAD_PATH+x} ]; then
-  LOAD_PATH_ARG=""
-else
-  LOAD_PATH_ARG="--load_path=${LOAD_PATH}"
-fi
-
-
-export OLMO_CONTAINER=llm-lumi_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export WANDB_PROJECT=c4-small
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-export CONFIG_PATH=configs/olmo-small-ablation.yaml
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-# export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:512
-
-# get run name, we will postpend it with the job id of this slurm run
-export RUN_NAME=$(cat $CONFIG_PATH | grep -ohP "^run_name\:\w*(.+)$" | sed 's/run_name:\s*//')
-
-# actually run the training script
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py $CONFIG_PATH \
-    --run_name="${RUN_NAME}_${SLURM_JOB_ID}" \
-    --wandb.project=$WANDB_PROJECT \
-    $LOAD_PATH_ARG \
-    ${@}
diff --git a/scripts/lumi/olmo7-ablations.sh b/scripts/lumi/olmo7-ablations.sh
deleted file mode 100644
index e91b6cce2..000000000
--- a/scripts/lumi/olmo7-ablations.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=olmo7-ablation
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=128 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=39:15:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-WANDB_GROUP=$1
-shift
-
-export OLMO_CONTAINER=llm-lumi-torch21_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export HF_DATASETS_OFFLINE=1
-
-export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
-export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
-export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/olmo7-ablation.yaml ${@} \
-    --run_name=${SLURM_JOB_ID} \
-    --activation_checkpointing=fine_grained \
-    --fsdp.wrapping_strategy=one_in_four \
-    --fsdp.sharding_strategy=FULL_SHARD \
-    --sharded_checkpointer=local \
-    --time_limit=$((39 * 60 * 60)) \
-    --wandb.group=$WANDB_GROUP
diff --git a/scripts/lumi/peteish13-highlr.sh b/scripts/lumi/peteish13-highlr.sh
deleted file mode 100644
index 4c10bc519..000000000
--- a/scripts/lumi/peteish13-highlr.sh
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=peteish13
-#SBATCH --account=project_462000229
-#SBATCH --output=/scratch/project_462000229/logs/%j.log
-#SBATCH --nodes=128 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --time-min=8:30:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/24.03 partition/G
-
-export OLMO_CONTAINER=lumi-torch25rc-rocm62-py312.sif
-export SIF_CONTAINER=$PROJECT_DIR/containers/$OLMO_CONTAINER
-#export SIF_CONTAINER=$SIF
-export CONDA_ENV=pytorch
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-export GPU_MAX_HW_QUEUES=8
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-#export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64:/opt/rocm/lib
-export SINGULARITYENV_TORCH_DIST_INIT_BARRIER=1
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export CHECKPOINTS_PATH=$SCRATCH_DIR/checkpoints
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
-  $SIF_CONTAINER \
-  scripts/lumi/run-in-container.sh \
-  python scripts/train.py configs/peteish13-lumi.yaml \
-    --run_name=peteish13-highlr_${SLURM_JOB_ID} \
-    --wandb.name=peteish13-highlr_${SLURM_JOB_ID} \
-    --wandb.group=peteish13-highlr \
-    --data.num_workers=$SLURM_CPUS_PER_TASK \
-    --data.prefetch_factor=2 \
-    --save_folder=$CHECKPOINTS_PATH/peteish13-highlr/${SLURM_JOB_ID} \
-    --remote_save_folder=s3://ai2-llm/checkpoints/OLMo-medium/peteish13-highlr/ \
-    --fused_loss=false \
-    --model.flash_attention=false \
-    --device_train_microbatch_size=2 \
-    --activation_checkpointing=whole_layer \
-    --fsdp.sharding_strategy=HYBRID_SHARD \
-    --fsdp.hybrid_sharding_num_model_replicas=$SLURM_NNODES \
-    --sharded_checkpointer=olmo_core \
-    --save_overwrite \
-    --time_limit=$((8 * 60 * 60)) \
-    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
-    --optimizer.learning_rate=9.0e-4 \
-    "${@}"
diff --git a/scripts/lumi/peteish13-medlr.sh b/scripts/lumi/peteish13-medlr.sh
deleted file mode 100644
index 23fb7699c..000000000
--- a/scripts/lumi/peteish13-medlr.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=peteish13
-#SBATCH --account=project_462000229
-#SBATCH --output=/scratch/project_462000229/logs/%j.log
-#SBATCH --nodes=128 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=12:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/24.03 partition/G
-
-export OLMO_CONTAINER=lumi-torch25rc-rocm62-py312.sif
-export SIF_CONTAINER=$PROJECT_DIR/containers/$OLMO_CONTAINER
-#export SIF_CONTAINER=$SIF
-export CONDA_ENV=pytorch
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-export GPU_MAX_HW_QUEUES=8
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-#export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64:/opt/rocm/lib
-export SINGULARITYENV_TORCH_DIST_INIT_BARRIER=1
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export CHECKPOINTS_PATH=$SCRATCH_DIR/checkpoints
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
-  $SIF_CONTAINER \
-  scripts/lumi/run-in-container.sh \
-  python scripts/train.py configs/peteish13-lumi.yaml \
-    --run_name=peteish13-medlr_${SLURM_JOB_ID} \
-    --wandb.name=peteish13-medlr_${SLURM_JOB_ID} \
-    --wandb.group=peteish13-medlr \
-    --data.num_workers=$SLURM_CPUS_PER_TASK \
-    --data.prefetch_factor=2 \
-    --save_folder=$CHECKPOINTS_PATH/peteish13-medlr/${SLURM_JOB_ID} \
-    --remote_save_folder=s3://ai2-llm/checkpoints/OLMo-medium/peteish13-medlr/ \
-    --fused_loss=false \
-    --model.flash_attention=false \
-    --device_train_microbatch_size=2 \
-    --activation_checkpointing=whole_layer \
-    --fsdp.sharding_strategy=HYBRID_SHARD \
-    --fsdp.hybrid_sharding_num_model_replicas=$SLURM_NNODES \
-    --sharded_checkpointer=olmo_core \
-    --save_overwrite \
-    --time_limit=$((11 * 60 * 60)) \
-    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
-    --optimizer.learning_rate=6e-4 \
-    "${@}"
diff --git a/scripts/lumi/peteish13.sh b/scripts/lumi/peteish13.sh
deleted file mode 100644
index 0b419eb7d..000000000
--- a/scripts/lumi/peteish13.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=peteish13
-#SBATCH --account=project_462000229
-#SBATCH --output=/scratch/project_462000229/logs/%j.log
-#SBATCH --nodes=128 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --time-min=48:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/24.03 partition/G
-
-export OLMO_CONTAINER=lumi-torch25rc-rocm62-py312.sif
-export SIF_CONTAINER=$PROJECT_DIR/containers/$OLMO_CONTAINER
-#export SIF_CONTAINER=$SIF
-export CONDA_ENV=pytorch
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-export GPU_MAX_HW_QUEUES=8
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-#export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64:/opt/rocm/lib
-export SINGULARITYENV_TORCH_DIST_INIT_BARRIER=1
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export CHECKPOINTS_PATH=$SCRATCH_DIR/checkpoints
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
-  $SIF_CONTAINER \
-  scripts/lumi/run-in-container.sh \
-  python scripts/train.py configs/peteish13-s3.yaml \
-    --run_name=peteish13-lumi_${SLURM_JOB_ID} \
-    --wandb.name=peteish13-lumi_${SLURM_JOB_ID} \
-    --wandb.group=peteish13-lumi \
-    --data.num_workers=$SLURM_CPUS_PER_TASK \
-    --data.prefetch_factor=2 \
-    --save_folder=$CHECKPOINTS_PATH/peteish13/${SLURM_JOB_ID} \
-    --remote_save_folder=s3://ai2-llm/checkpoints/OLMo-medium/peteish13-lumi/ \
-    --fused_loss=false \
-    --model.flash_attention=false \
-    --device_train_microbatch_size=2 \
-    --activation_checkpointing=whole_layer \
-    --fsdp.sharding_strategy=HYBRID_SHARD \
-    --fsdp.hybrid_sharding_num_model_replicas=$SLURM_NNODES \
-    --sharded_checkpointer=olmo_core \
-    --save_overwrite \
-    --time_limit=$((47 * 60 * 60)) \
-    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
-    "${@}"
diff --git a/scripts/lumi/peteish7.sh b/scripts/lumi/peteish7.sh
deleted file mode 100644
index 22bfe370b..000000000
--- a/scripts/lumi/peteish7.sh
+++ /dev/null
@@ -1,74 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=peteish7
-#SBATCH --account=project_462000229
-#SBATCH --output=/scratch/project_462000229/logs/%j.log
-#SBATCH --nodes=64 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=08:00:00
-#SBATCH --time-min=08:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/23.09 partition/G
-
-export OLMO_CONTAINER=llm-lumi-torch22_latest.sif
-export SIF_CONTAINER=$PROJECT_DIR/containers/$OLMO_CONTAINER
-#export SIF_CONTAINER=$SIF
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-export GPU_MAX_HW_QUEUES=8
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64:/opt/rocm/lib
-export SINGULARITYENV_TORCH_DIST_INIT_BARRIER=1
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export CHECKPOINTS_PATH=$SCRATCH_DIR/checkpoints
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $SIF_CONTAINER \
-  python scripts/train.py configs/peteish7-s3.yaml \
-    --run_name=peteish7-lumi_${SLURM_JOB_ID} \
-    --wandb.name=peteish7-lumi_${SLURM_JOB_ID} \
-    --wandb.group=peteish7-lumi \
-    --data.num_workers=$SLURM_CPUS_PER_TASK \
-    --data.prefetch_factor=2 \
-    --save_folder=$CHECKPOINTS_PATH/peteish7/${SLURM_JOB_ID} \
-    --remote_save_folder=s3://ai2-llm/checkpoints/OLMo-medium/peteish7-lumi2/ \
-    --fused_loss=false \
-    --model.flash_attention=false \
-    --device_train_microbatch_size=2 \
-    --activation_checkpointing=whole_layer \
-    --fsdp.sharding_strategy=SHARD_GRAD_OP \
-    --sharded_checkpointer=local \
-    --save_overwrite \
-    --load_path=/users/dgroeneveld/scratch_dir/ai2-llm/checkpoints/OLMo-medium/peteish7/step0-unsharded \
-    "${@}"
-
-# '--load_path=${path.last_checkpoint:${save_folder}}' \
diff --git a/scripts/lumi/pile-llamaish7.sh b/scripts/lumi/pile-llamaish7.sh
deleted file mode 100644
index c032ddbe3..000000000
--- a/scripts/lumi/pile-llamaish7.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=pile-llamaish7
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=8 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=39:15:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-WANDB_GROUP=$1
-shift
-
-export OLMO_CONTAINER=llm-lumi-torch21_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export HF_DATASETS_OFFLINE=1
-
-export DATA_PATH=$FLASH_DIR/preprocessed/pile
-export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
-export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/pile-llamaish7.yaml ${@} \
-    --run_name=${SLURM_JOB_ID} \
-    --activation_checkpointing=fine_grained \
-    --fsdp.wrapping_strategy=one_in_four \
-    --fsdp.sharding_strategy=FULL_SHARD \
-    --sharded_checkpointer=local \
-    --time_limit=$((39 * 60 * 60)) \
-    --wandb.group=$WANDB_GROUP
diff --git a/scripts/lumi/run-in-container.sh b/scripts/lumi/run-in-container.sh
deleted file mode 100755
index 57654031d..000000000
--- a/scripts/lumi/run-in-container.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env bash
-# Put setup of conda in an env variable if conda is needed
-if [[ ! -z "${CONDA_ENV}" ]]; then
-  source /opt/miniconda3/bin/activate ${CONDA_ENV}
-fi
-
-${@}
diff --git a/scripts/lumi/v1-mix-medium-on-lumi.sh b/scripts/lumi/v1-mix-medium-on-lumi.sh
deleted file mode 100644
index 23861f2fe..000000000
--- a/scripts/lumi/v1-mix-medium-on-lumi.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=v1-mix-medium
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=32 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --time-min=8:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/22.08 partition/G
-
-export OLMO_CONTAINER=llm-lumi_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
-
-export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
-export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
-export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/v1-mix-medium.yaml --run_name=${SLURM_JOB_ID} ${@}
diff --git a/scripts/lumi/v1-mix-small-on-lumi.sh b/scripts/lumi/v1-mix-small-on-lumi.sh
deleted file mode 100644
index 9b8607024..000000000
--- a/scripts/lumi/v1-mix-small-on-lumi.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=v1-mix-small
-#SBATCH --account=project_462000229
-#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
-#SBATCH --nodes=32 # Total number of nodes
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
-#SBATCH --cpus-per-task=6
-#SBATCH --time=48:00:00
-#SBATCH --time-min=24:00:00
-#SBATCH --mem=0 # All memory on the node
-#SBATCH --partition=standard-g
-
-module load LUMI/22.08 partition/G
-
-export OLMO_CONTAINER=llm-lumi_latest.sif
-
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
-export MPICH_GPU_SUPPORT_ENABLED=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET_GDR_LEVEL=3
-export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
-export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
-export CXI_FORK_SAFE=1
-export CXI_FORK_SAFE_HP=1
-export FI_CXI_DISABLE_CQ_HUGETLB=1
-
-# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
-export FI_CXI_DEFAULT_CQ_SIZE=131072
-
-#export NCCL_DEBUG=INFO
-export PYTHONPATH=.:${PYTHONPATH}
-export ROCM_PATH=/opt/rocm
-export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
-
-# Try playing with max_split_size_mb if you run into OOM errors.
-#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:512
-
-export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
-export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
-export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
-
-srun \
-  --cpus-per-task=$SLURM_CPUS_PER_TASK \
-  --distribution=block:block \
-  --kill-on-bad-exit \
-  scripts/run_with_environment.sh \
-  singularity exec \
-  -B"$PROJECT_DIR:$PROJECT_DIR" \
-  -B"$FLASH_DIR:$FLASH_DIR" \
-  -B"$SCRATCH_DIR:$SCRATCH_DIR" \
-  -B /opt/cray:/opt/cray \
-  -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-  -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
-  $PROJECT_DIR/containers/$OLMO_CONTAINER \
-  python scripts/train.py configs/v1-mix-small.yaml --run_name=${SLURM_JOB_ID} ${@}