From 648697993f4e65481509f3acdc739adf418c39af Mon Sep 17 00:00:00 2001
From: Varuni Sastry <88804132+vksastry@users.noreply.github.com>
Date: Thu, 14 Dec 2023 15:04:31 -0600
Subject: [PATCH 1/3] Update example-programs.md

---
 docs/ai-testbed/sambanova/example-programs.md | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/docs/ai-testbed/sambanova/example-programs.md b/docs/ai-testbed/sambanova/example-programs.md
index e619697e2..a30070a4b 100644
--- a/docs/ai-testbed/sambanova/example-programs.md
+++ b/docs/ai-testbed/sambanova/example-programs.md
@@ -272,7 +272,7 @@ inner train loop time : 374.6789753437042 for 10 epochs, number of global steps:
 ## Gpt 1.5B
 
 The Gpt 1.5B application example is provided at the path: `/opt/sambaflow/apps/nlp/transformers_on_rdu/`.
-The scripts containing the `compile` and `run` commands for Gpt1.5B model can be accessed at [Gpt1.5B_single.sh](./files/Gpt1.5B_single.sh "Gpt1.5B_single.sh") or at `/data/ANL/scripts/Gpt1.5B_single.sh` on any SN30 compute node. This script is compiled and run for only 1 instance and the model fits on 4 tiles or half of a RDU.
+The scripts containing the `compile` and `run` commands for the Gpt1.5B model can be accessed at the paths `/data/ANL/scripts/Gpt1.5B_base_single_compile.sh` and `/data/ANL/scripts/Gpt1.5B_base_single_run.sh` on any SN30 compute node. These scripts compile and run a single instance of the model, which fits on 4 tiles, or half of an RDU.
 The scripts are provided for reference.
 
 Change directory and copy files.
@@ -282,20 +282,21 @@ cd ~/apps/nlp/Gpt1.5B_single
 ```
 
 Copy and paste the contents of
-[Gpt1.5B_single.sh](./files/Gpt1.5B_single.sh "Gpt1.5B_single.sh")
-to a file with the same name into the current directory using your favorite editor.
+[Gpt1.5B_base_single_compile.sh](./files/Gpt1.5B_base_single_compile.sh "Gpt1.5B_base_single_compile.sh") and [Gpt1.5B_base_single_run.sh](./files/Gpt1.5B_base_single_run.sh "Gpt1.5B_base_single_run.sh")
+to files with the same names in the current directory using your favorite editor,
 
-or copy the contents from `/data/ANL/scripts/Gpt1.5B_single.sh`.
+or copy the contents from `/data/ANL/scripts/Gpt1.5B_base_single_compile.sh` and `/data/ANL/scripts/Gpt1.5B_base_single_run.sh`.
 
 ```bash
-cp /data/ANL/scripts/Gpt1.5B_single.sh ~/apps/nlp/Gpt1.5B_single/
+cp /data/ANL/scripts/Gpt1.5B_base_single_compile.sh ~/apps/nlp/Gpt1.5B_single/
+cp /data/ANL/scripts/Gpt1.5B_base_single_run.sh ~/apps/nlp/Gpt1.5B_single/
 ```
 
 Run the script.
 
 ```bash
-chmod +x Gpt1.5B_single.sh
-./Gpt1.5B_single.sh
+chmod +x Gpt1.5B_base_single_compile.sh
+./Gpt1.5B_base_single_compile.sh
 ```
 
 You can inspect the `compile` and `run` commands in the scripts to learn that this model trains with a batch size of 16 for 1 instance over 4 tiles. The human decision file and the compiler config file help to optimize the compute and memory resources specific to this Gpt 1.5B model run.
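A note on usage: the compile wrapper above takes the batch size as its first argument and an optional log-directory name as its second (see the `$1`/`$2` handling in the script added in PATCH 2/3 below). A minimal usage sketch, assuming the scripts have been copied into `~/apps/nlp/Gpt1.5B_single/` as described; the directory name `my_logs` is illustrative, not a required value:

```bash
# Compile a single GPT 1.5B instance with batch size 16. Output lands under
# /data/ANL/results/$(hostname)/${USER}/my_logs/ instead of the default
# date-stamped directory ("my_logs" is a placeholder).
cd ~/apps/nlp/Gpt1.5B_single
chmod +x Gpt1.5B_base_single_compile.sh
./Gpt1.5B_base_single_compile.sh 16 my_logs
```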
From 1480a33b5197e3e3a6da28487bebb23ef84189ac Mon Sep 17 00:00:00 2001
From: Varuni Sastry <88804132+vksastry@users.noreply.github.com>
Date: Thu, 14 Dec 2023 15:15:31 -0600
Subject: [PATCH 2/3] Add gpt1.5 single instance files

---
 .../files/Gpt1.5B_base_single_compile.sh      | 74 +++++++++++++++++++
 .../files/Gpt1.5B_base_single_run.sh          | 55 ++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_compile.sh
 create mode 100644 docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_run.sh

diff --git a/docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_compile.sh b/docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_compile.sh
new file mode 100644
index 000000000..a9361fd55
--- /dev/null
+++ b/docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_compile.sh
@@ -0,0 +1,74 @@
+#! /bin/bash
+set -e
+export SOFTWARE_HOME=/opt
+ACTIVATE=/opt/sambaflow/apps/nlp/transformers_on_rdu/venv/bin/activate
+LOGDIR=`date +%m%d%y.%H`
+if [ "$2" ] ; then
+LOGDIR=$2
+fi
+MODEL_NAME="GPT1.5B_base_single_$1"
+OUTPUT_PATH=/data/ANL/results/$(hostname)/${USER}/${LOGDIR}/${MODEL_NAME}.out
+echo "Using ${OUTPUT_PATH} for output"
+mkdir -p /data/ANL/results/$(hostname)/${USER}/${LOGDIR}
+
+#######################
+# Edit these variables.
+#######################
+export OMP_NUM_THREADS=18
+export REQUESTS_CA_BUNDLE=/usr/local/lib/python3.8/site-packages/certifi/cacert.pem
+export CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
+
+#######################
+# Start script timer
+SECONDS=0
+# Temp file location
+DIRECTORY=$$
+OUTDIR=/data/scratch/${USER}/${MODEL_NAME}
+mkdir -p ${OUTDIR}
+source ${ACTIVATE}
+echo "Model: " ${MODEL_NAME} > ${OUTPUT_PATH} 2>&1
+echo "Date: " $(date +%m/%d/%y) >> ${OUTPUT_PATH} 2>&1
+echo "Time: " $(date +%H:%M) >> ${OUTPUT_PATH} 2>&1
+apt list --installed sambaflow >> ${OUTPUT_PATH} 2>&1
+cd ${OUTDIR}
+#######################
+echo "Machine State Before: " >> ${OUTPUT_PATH} 2>&1
+/opt/sambaflow/bin/snfadm -l inventory >> ${OUTPUT_PATH} 2>&1
+#######################
+export SN_NUM_THREADS=32
+
+if [ $1 -eq 256 ] ; then
+  BATCH_SIZE=256
+elif [ $1 -eq 128 ] ; then
+  BATCH_SIZE=128
+elif [ $1 -eq 64 ] ; then
+  BATCH_SIZE=64
+elif [ $1 -eq 32 ] ; then
+  BATCH_SIZE=32
+elif [ $1 -eq 16 ] ; then
+  BATCH_SIZE=16
+else
+  echo "Batch size $1 is invalid; use 16, 32, 64, 128, or 256" $2 >> ${OUTPUT_PATH} 2>&1
+  exit 1
+fi
+
+if [ ! -e ${OUTDIR}/${MODEL_NAME}/${MODEL_NAME}.pef ] ; then
+  echo "COMPILE START AT ${SECONDS}" >> ${OUTPUT_PATH} 2>&1
+  export GAS=1
+
+  export CC=compiler_configs_gpt1dot5b_perf.json
+  #env | grep PYTHONPATH >> ${OUTPUT_PATH} 2>&1
+  COMMAND="python /opt/sambaflow/apps/nlp/transformers_on_rdu/transformers_hook.py compile --pef-name=${MODEL_NAME} --output-folder=${OUTDIR} --module_name gpt2_pretrain --task_name clm --max_seq_length 1024 -b $BATCH_SIZE --output_dir=${OUTDIR}/hf_gpt1dot5b_ss1k_gas_${GAS}_bs${BATCH_SIZE} --overwrite_output_dir --do_train --per_device_train_batch_size ${BATCH_SIZE} --tokenizer_name gpt2 --model_name gpt2 --mac-v2 --non_split_head --mac-human-decision /opt/sambaflow/apps/nlp/transformers_on_rdu/human_decisions_gm/mac_v2_overrides/gpt2_48_enc_full_recompute_training_spatialmapping_tiling16_clmerge_gm_pardp2_lnsd.json --compiler-configs-file /opt/sambaflow/apps/nlp/transformers_on_rdu/human_decisions_gm/compiler_configs/$CC --skip_broadcast_patch --config_name /opt/sambaflow/apps/nlp/transformers_on_rdu/customer_specific/mv/configs/gpt2_config_xl_50260.json --no_index_select_patch --weight_decay 0.1 --max_grad_norm_clip 1.0 --num-tiles 4 --enable-stochastic-rounding"
+
+
+  echo "COMPILE COMMAND: $COMMAND" >> ${OUTPUT_PATH} 2>&1
+  eval $COMMAND >> ${OUTPUT_PATH} 2>&1
+  echo "COMPILE END AT ${SECONDS}" >> ${OUTPUT_PATH} 2>&1
+fi
+#######################
+echo "RUN" >> ${OUTPUT_PATH} 2>&1
+/usr/local/bin/sbatch --output=${HOME}/slurm-%A.out --ntasks 1 --gres=rdu:8 --ntasks-per-node 16 --nodes 1 --nodelist $(hostname) --cpus-per-task=8 /data/ANL/scripts/Gpt1.5B_base_single_run.sh $BATCH_SIZE $2 >> ${OUTPUT_PATH} 2>&1
+
+echo "Machine state After: " >> ${OUTPUT_PATH} 2>&1
+/opt/sambaflow/bin/snfadm -l inventory >> ${OUTPUT_PATH} 2>&1
+echo "Duration: " $SECONDS >> ${OUTPUT_PATH} 2>&1
diff --git a/docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_run.sh b/docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_run.sh
new file mode 100644
index 000000000..132667cd1
--- /dev/null
+++ b/docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_run.sh
@@ -0,0 +1,55 @@
+#! /bin/bash
+set -e
+export SOFTWARE_HOME=/opt
+LOGDIR=`date +%m%d%y.%H`
+if [ "$2" ] ; then
+LOGDIR=$2
+fi
+MODEL_NAME="GPT1.5B_base_single_$1"
+OUTPUT_PATH=/data/ANL/results/$(hostname)/${USER}/${LOGDIR}/${MODEL_NAME}.out
+echo "Using ${OUTPUT_PATH} for output"
+mkdir -p /data/ANL/results/$(hostname)/${USER}/${LOGDIR}
+
+ACTIVATE=/opt/sambaflow/apps/nlp/transformers_on_rdu/venv/bin/activate
+#######################
+# Edit these variables.
+#######################
+export OMP_NUM_THREADS=18
+#######################
+# Start script timer
+SECONDS=0
+# Temp file location
+DIRECTORY=$$
+OUTDIR=/data/scratch/${USER}/${MODEL_NAME}
+mkdir -p ${OUTDIR}
+source ${ACTIVATE}
+echo "Model: " ${MODEL_NAME} >> ${OUTPUT_PATH} 2>&1
+echo "Date: " $(date +%m/%d/%y) >> ${OUTPUT_PATH} 2>&1
+echo "Time: " $(date +%H:%M) >> ${OUTPUT_PATH} 2>&1
+apt list --installed sambaflow >> ${OUTPUT_PATH} 2>&1
+cd ${OUTDIR}
+#######################
+echo "Machine State Before: " >> ${OUTPUT_PATH} 2>&1
+/opt/sambaflow/bin/snfadm -l inventory >> ${OUTPUT_PATH} 2>&1
+#######################
+if [ ! -e ${OUTDIR}/${MODEL_NAME}/${MODEL_NAME}.pef ] ; then
+  echo "PEF ${OUTDIR}/${MODEL_NAME}/${MODEL_NAME}.pef does not exist, exiting" >> ${OUTPUT_PATH} 2>&1
+  exit 1
+fi
+
+#######################
+echo "RUN" >> ${OUTPUT_PATH} 2>&1
+#export CCL_TIMEOUT=3600
+export REQUESTS_CA_BUNDLE=/usr/local/lib/python3.8/site-packages/certifi/cacert.pem
+export CURL_CA_BUNDLE="/etc/ssl/certs/ca-certificates.crt"
+export SAMBA_CCL_HIERARCHICAL_ALLREDUCE=1
+
+COMMAND="/usr/local/bin/srun --mpi=pmi2 python /opt/sambaflow/apps/nlp/transformers_on_rdu/transformers_hook.py run -b $1 --data_dir /data/ANL/ss1024 --pef=${OUTDIR}/${MODEL_NAME}/${MODEL_NAME}.pef --output_dir=${OUTDIR}/hf_gpt1dot5b_ss1k_gas_1_bs16 --module_name gpt2_pretrain --task_name clm --max_seq_length 1024 --overwrite_output_dir --do_train --per_device_train_batch_size $1 --tokenizer_name gpt2 --model_name gpt2 --non_split_head --skip_broadcast_patch --no_index_select_patch --config_name /opt/sambaflow/apps/nlp/transformers_on_rdu/customer_specific/mv/configs/gpt2_config_xl_50260.json --max_grad_norm_clip 1.0 --skip_checkpoint --logging_steps 1 --max_steps 75000 --learning_rate 0.00025 --steps_this_run 100"
+
+echo "COMMAND= $COMMAND" >> ${OUTPUT_PATH} 2>&1
+eval $COMMAND >> ${OUTPUT_PATH} 2>&1
+
+#######################
+echo "Machine state After: " >> ${OUTPUT_PATH} 2>&1
+/opt/sambaflow/bin/snfadm -l inventory >> ${OUTPUT_PATH} 2>&1
+echo "Duration: " $SECONDS >> ${OUTPUT_PATH} 2>&1

From 149df8cefeddaa912a6e00e715fd9a296edbb544 Mon Sep 17 00:00:00 2001
From: Varuni Sastry <88804132+vksastry@users.noreply.github.com>
Date: Fri, 15 Dec 2023 08:53:11 -0600
Subject: [PATCH 3/3] Update example-programs.md

---
 docs/ai-testbed/sambanova/example-programs.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/ai-testbed/sambanova/example-programs.md b/docs/ai-testbed/sambanova/example-programs.md
index a30070a4b..0a5c29ece 100644
--- a/docs/ai-testbed/sambanova/example-programs.md
+++ b/docs/ai-testbed/sambanova/example-programs.md
@@ -292,30 +292,30 @@ cp /data/ANL/scripts/Gpt1.5B_base_single_compile.sh ~/apps/nlp/Gpt1.5B_single/
 cp /data/ANL/scripts/Gpt1.5B_base_single_run.sh ~/apps/nlp/Gpt1.5B_single/
 ```
 
-Run the script.
+Run the script with the batch size as an argument (shown below with an example of 32).
 
 ```bash
 chmod +x Gpt1.5B_base_single_compile.sh
-./Gpt1.5B_base_single_compile.sh
+./Gpt1.5B_base_single_compile.sh 32
 ```
 
-You can inspect the `compile` and `run` commands in the scripts to learn that this model trains with a batch size of 16 for 1 instance over 4 tiles. The human decision file and the compiler config file help to optimize the compute and memory resources specific to this Gpt 1.5B model run.
+The Gpt1.5B_base_single_compile.sh script internally submits Gpt1.5B_base_single_run.sh via sbatch to perform the training. You can inspect the `compile` and `run` commands in the scripts to learn that this model trains with a batch size of 32 for 1 instance over 4 tiles. The human decision file and the compiler config file help to optimize the compute and memory resources specific to this Gpt 1.5B model run.
 
 ```bash
-python /opt/sambaflow/apps/nlp/transformers_on_rdu/transformers_hook.py compile --module_name gpt2_pretrain --task_name clm --max_seq_length 1024 -b 16 --output_dir=${OUTDIR}/hf_output --overwrite_output_dir --do_train --per_device_train_batch_size 16 --cache ${OUTDIR}/cache/ --tokenizer_name gpt2 --model_name gpt2 --mac-v2 --non_split_head --mac-human-decision /opt/sambaflow/apps/nlp/transformers_on_rdu/human_decisions_gm/mac_v2_overrides/gpt2_48_enc_full_recompute_training_spatialmapping_tiling16_clmerge_gm_nonpardp_lnsd.json --compiler-configs-file /opt/sambaflow/apps/nlp/transformers_on_rdu/human_decisions_gm/compiler_configs/compiler_configs_gpt2_sc_recompute_spatialmapping_tiling16_clsmerge_withcls_nonpardp_norc_e2e.json --skip_broadcast_patch --config_name /opt/sambaflow/apps/nlp/transformers_on_rdu/customer_specific/mv/configs/gpt2_config_xl_50260.json --no_index_select_patch --weight_decay 0.1 --max_grad_norm_clip 1.0 --num-tiles 4 --pef-name=gpt15_single --output-folder=${OUTDIR}
+python /opt/sambaflow/apps/nlp/transformers_on_rdu/transformers_hook.py compile --pef-name=GPT1.5B_base_single_32 --output-folder=/data/scratch/user/GPT1.5B_base_single_32 --module_name gpt2_pretrain --task_name clm --max_seq_length 1024 -b 32 --output_dir=/data/scratch/user/GPT1.5B_base_single_32/hf_gpt1dot5b_ss1k_gas_1_bs32 --overwrite_output_dir --do_train --per_device_train_batch_size 32 --tokenizer_name gpt2 --model_name gpt2 --mac-v2 --non_split_head --mac-human-decision /opt/sambaflow/apps/nlp/transformers_on_rdu/human_decisions_gm/mac_v2_overrides/gpt2_48_enc_full_recompute_training_spatialmapping_tiling16_clmerge_gm_pardp2_lnsd.json --compiler-configs-file /opt/sambaflow/apps/nlp/transformers_on_rdu/human_decisions_gm/compiler_configs/compiler_configs_gpt1dot5b_perf.json --skip_broadcast_patch --config_name /opt/sambaflow/apps/nlp/transformers_on_rdu/customer_specific/mv/configs/gpt2_config_xl_50260.json --no_index_select_patch --weight_decay 0.1 --max_grad_norm_clip 1.0 --num-tiles 4 --enable-stochastic-rounding
 ```
 
 ```bash
-python /opt/sambaflow/apps/nlp/transformers_on_rdu/transformers_hook.py run -b 16 --module_name gpt2_pretrain --task_name clm --max_seq_length 1024 --overwrite_output_dir --do_train --per_device_train_batch_size 16 --cache ${OUTDIR}/cache/ --tokenizer_name gpt2 --model_name gpt2 --non_split_head --skip_broadcast_patch --no_index_select_patch --output_dir=${OUTDIR}/hf_output --config_name /opt/sambaflow/apps/nlp/transformers_on_rdu/customer_specific/mv/configs/gpt2_config_xl_50260.json --max_grad_norm_clip 1.0 --skip_checkpoint --data_dir /data/ANL/ss1024 --logging_steps 1 --max_steps 900000 --learning_rate 0.00025 --steps_this_run 100 --pef=${OUTDIR}/gpt15_single/gpt15_single.pef >> ${OUTPUT_PATH} 2>&1
+COMMAND= /usr/local/bin/srun --mpi=pmi2 python /opt/sambaflow/apps/nlp/transformers_on_rdu/transformers_hook.py run -b 32 --data_dir /data/ANL/ss1024 --pef=/data/scratch/user/GPT1.5B_base_single_32/GPT1.5B_base_single_32/GPT1.5B_base_single_32.pef --output_dir=/data/scratch/user/GPT1.5B_base_single_32/hf_gpt1dot5b_ss1k_gas_1_bs16 --module_name gpt2_pretrain --task_name clm --max_seq_length 1024 --overwrite_output_dir --do_train --per_device_train_batch_size 32 --tokenizer_name gpt2 --model_name gpt2 --non_split_head --skip_broadcast_patch --no_index_select_patch --config_name /opt/sambaflow/apps/nlp/transformers_on_rdu/customer_specific/mv/configs/gpt2_config_xl_50260.json --max_grad_norm_clip 1.0 --skip_checkpoint --logging_steps 1 --max_steps 75000 --learning_rate 0.00025 --steps_this_run 100
 ```
 
 The `sntilestat` command shows that the application runs on 4 tiles as shown below.
 
 ```bash
-/XRDU_0/RDU_0/TILE_0 2.1 96.9 0.8 0.1 0.0 0.0 796481 vsastry python /opt/sambaflow/apps/nlp/transformers_on_rdu/
-/XRDU_0/RDU_0/TILE_1 2.1 96.9 0.8 0.1 0.0 0.0 796481 vsastry python /opt/sambaflow/apps/nlp/transformers_on_rdu/
-/XRDU_0/RDU_0/TILE_2 2.5 96.9 0.4 0.1 0.0 0.0 796481 vsastry python /opt/sambaflow/apps/nlp/transformers_on_rdu/
-/XRDU_0/RDU_0/TILE_3 2.5 96.9 0.4 0.1 0.0 0.0 796481 vsastry python /opt/sambaflow/apps/nlp/transformers_on_rdu/
+/XRDU_0/RDU_0/TILE_0 2.1 96.9 0.8 0.1 0.0 0.0 796481 user python /opt/sambaflow/apps/nlp/transformers_on_rdu/
+/XRDU_0/RDU_0/TILE_1 2.1 96.9 0.8 0.1 0.0 0.0 796481 user python /opt/sambaflow/apps/nlp/transformers_on_rdu/
+/XRDU_0/RDU_0/TILE_2 2.5 96.9 0.4 0.1 0.0 0.0 796481 user python /opt/sambaflow/apps/nlp/transformers_on_rdu/
+/XRDU_0/RDU_0/TILE_3 2.5 96.9 0.4 0.1 0.0 0.0 796481 user python /opt/sambaflow/apps/nlp/transformers_on_rdu/
 /XRDU_0/RDU_0/TILE_4 100.0 0.0 0.0 0.0 0.0 0.0
 /XRDU_0/RDU_0/TILE_5 100.0 0.0 0.0 0.0 0.0 0.0
 ...
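# While the job is active, progress can also be followed from a login shell.
# A minimal sketch, assuming the OUTPUT_PATH convention from the scripts
# above; "my_logs" and batch size 32 are illustrative values, not required.
tail -f /data/ANL/results/$(hostname)/${USER}/my_logs/GPT1.5B_base_single_32.out
# Refresh the tile utilization view every few seconds.
watch -n 5 sntilestat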