Merge pull request #307 from argonne-lcf/feature/SN_1.17
Feature/sn 1.17
Showing 3 changed files with 145 additions and 15 deletions.
docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_compile.sh (74 additions, 0 deletions)
@@ -0,0 +1,74 @@
#! /bin/bash
set -e
export SOFTWARE_HOME=/opt
ACTIVATE=/opt/sambaflow/apps/nlp/transformers_on_rdu/venv/bin/activate
LOGDIR=`date +%m%d%y.%H`
if [ "$2" ] ; then
  LOGDIR=$2
fi
MODEL_NAME="GPT1.5B_base_single_$1"
OUTPUT_PATH=/data/ANL/results/$(hostname)/${USER}/${LOGDIR}/${MODEL_NAME}.out
echo "Using ${OUTPUT_PATH} for output"
mkdir -p /data/ANL/results/$(hostname)/${USER}/${LOGDIR}

#######################
# Edit these variables.
#######################
export OMP_NUM_THREADS=18
export REQUESTS_CA_BUNDLE=/usr/local/lib/python3.8/site-packages/certifi/cacert.pem
export CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt

#######################
# Start script timer
SECONDS=0
# Temp file location
DIRECTORY=$$
OUTDIR=/data/scratch/${USER}/${MODEL_NAME}
mkdir -p ${OUTDIR}
source ${ACTIVATE}
echo "Model: " ${MODEL_NAME} > ${OUTPUT_PATH} 2>&1
echo "Date: " $(date +%m/%d/%y) >> ${OUTPUT_PATH} 2>&1
echo "Time: " $(date +%H:%M) >> ${OUTPUT_PATH} 2>&1
apt list --installed sambaflow >> ${OUTPUT_PATH} 2>&1
cd ${OUTDIR}
#######################
echo "Machine State Before: " >> ${OUTPUT_PATH} 2>&1
/opt/sambaflow/bin/snfadm -l inventory >> ${OUTPUT_PATH} 2>&1
#######################
export SN_NUM_THREADS=32

if [ $1 -eq 256 ] ; then
  BATCH_SIZE=256
elif [ $1 -eq 128 ] ; then
  BATCH_SIZE=128
elif [ $1 -eq 64 ] ; then
  BATCH_SIZE=64
elif [ $1 -eq 32 ] ; then
  BATCH_SIZE=32
elif [ $1 -eq 16 ] ; then
  BATCH_SIZE=16
else
echo "Batchsize $1 is invalid use 16,32,64,or 128,256" $2 >> ${OUTPUT_PATH} 2>&1 | ||
  exit 1
fi

if [ ! -e ${OUTDIR}/${MODEL_NAME}/${MODEL_NAME}.pef ] ; then
  echo "COMPILE START AT ${SECONDS}" >> ${OUTPUT_PATH} 2>&1
  export GAS=1

  export CC=compiler_configs_gpt1dot5b_perf.json
  #env | grep PYTHONPATH >> ${OUTPUT_PATH} 2>&1
  COMMAND="python /opt/sambaflow/apps/nlp/transformers_on_rdu/transformers_hook.py compile --pef-name=${MODEL_NAME} --output-folder=${OUTDIR} --module_name gpt2_pretrain --task_name clm --max_seq_length 1024 -b $BATCH_SIZE --output_dir=${OUTDIR}/hf_gpt1dot5b_ss1k_gas_${GAS}_bs${BATCH_SIZE} --overwrite_output_dir --do_train --per_device_train_batch_size ${BATCH_SIZE} --tokenizer_name gpt2 --model_name gpt2 --mac-v2 --non_split_head --mac-human-decision /opt/sambaflow/apps/nlp/transformers_on_rdu/human_decisions_gm/mac_v2_overrides/gpt2_48_enc_full_recompute_training_spatialmapping_tiling16_clmerge_gm_pardp2_lnsd.json --compiler-configs-file /opt/sambaflow/apps/nlp/transformers_on_rdu/human_decisions_gm/compiler_configs/$CC --skip_broadcast_patch --config_name /opt/sambaflow/apps/nlp/transformers_on_rdu/customer_specific/mv/configs/gpt2_config_xl_50260.json --no_index_select_patch --weight_decay 0.1 --max_grad_norm_clip 1.0 --num-tiles 4 --enable-stochastic-rounding"

  echo "COMPILE COMMAND: $COMMAND" >> ${OUTPUT_PATH} 2>&1
  eval $COMMAND >> ${OUTPUT_PATH} 2>&1
  echo "COMPILE END AT ${SECONDS}" >> ${OUTPUT_PATH} 2>&1
fi
#######################
echo "RUN" >> ${OUTPUT_PATH} 2>&1
/usr/local/bin/sbatch --output=${HOME}/slurm-%A.out --ntasks 1 --gres=rdu:8 --ntasks-per-node 16 --nodes 1 --nodelist $(hostname) --cpus-per-task=8 /data/ANL/scripts/Gpt1.5B_base_single_run.sh $BATCH_SIZE $2 >> ${OUTPUT_PATH} 2>&1

echo "Machine state After: " >> ${OUTPUT_PATH} 2>&1
/opt/sambaflow/bin/snfadm -l inventory >> ${OUTPUT_PATH} 2>&1
echo "Duration: " $SECONDS >> ${OUTPUT_PATH} 2>&1
docs/ai-testbed/sambanova/files/Gpt1.5B_base_single_run.sh (55 additions, 0 deletions)
@@ -0,0 +1,55 @@
#! /bin/bash
set -e
export SOFTWARE_HOME=/opt
LOGDIR=`date +%m%d%y.%H`
if [ "$2" ] ; then
  LOGDIR=$2
fi
MODEL_NAME="GPT1.5B_base_single_$1"
OUTPUT_PATH=/data/ANL/results/$(hostname)/${USER}/${LOGDIR}/${MODEL_NAME}.out
echo "Using ${OUTPUT_PATH} for output"
mkdir -p /data/ANL/results/$(hostname)/${USER}/${LOGDIR}

ACTIVATE=/opt/sambaflow/apps/nlp/transformers_on_rdu/venv/bin/activate
#######################
# Edit these variables.
#######################
export OMP_NUM_THREADS=18
#######################
# Start script timer
SECONDS=0
# Temp file location
DIRECTORY=$$
OUTDIR=/data/scratch/${USER}/${MODEL_NAME}
mkdir -p ${OUTDIR}
source ${ACTIVATE}
echo "Model: " ${MODEL_NAME} >> ${OUTPUT_PATH} 2>&1
echo "Date: " $(date +%m/%d/%y) >> ${OUTPUT_PATH} 2>&1
echo "Time: " $(date +%H:%M) >> ${OUTPUT_PATH} 2>&1
apt list --installed sambaflow >> ${OUTPUT_PATH} 2>&1
cd ${OUTDIR}
#######################
echo "Machine State Before: " >> ${OUTPUT_PATH} 2>&1
/opt/sambaflow/bin/snfadm -l inventory >> ${OUTPUT_PATH} 2>&1
#######################
if [ ! -e ${OUTDIR}/${MODEL_NAME}/${MODEL_NAME}.pef ] ; then
  echo "PEF ${OUTDIR}/${MODEL_NAME}/${MODEL_NAME}.pef does not exist, exiting" >> ${OUTPUT_PATH} 2>&1
  exit 1
fi

#######################
echo "RUN" >> ${OUTPUT_PATH} 2>&1
#export CCL_TIMEOUT=3600
export REQUESTS_CA_BUNDLE=/usr/local/lib/python3.8/site-packages/certifi/cacert.pem
export CURL_CA_BUNDLE="/etc/ssl/certs/ca-certificates.crt"
export SAMBA_CCL_HIERARCHICAL_ALLREDUCE=1

COMMAND="/usr/local/bin/srun --mpi=pmi2 python /opt/sambaflow/apps/nlp/transformers_on_rdu/transformers_hook.py run -b $1 --data_dir /data/ANL/ss1024 --pef=${OUTDIR}/${MODEL_NAME}/${MODEL_NAME}.pef --output_dir=${OUTDIR}/hf_gpt1dot5b_ss1k_gas_1_bs16 --module_name gpt2_pretrain --task_name clm --max_seq_length 1024 --overwrite_output_dir --do_train --per_device_train_batch_size $1 --tokenizer_name gpt2 --model_name gpt2 --non_split_head --skip_broadcast_patch --no_index_select_patch --config_name /opt/sambaflow/apps/nlp/transformers_on_rdu/customer_specific/mv/configs/gpt2_config_xl_50260.json --max_grad_norm_clip 1.0 --skip_checkpoint --logging_steps 1 --max_steps 75000 --learning_rate 0.00025 --steps_this_run 100" >> ${OUTPUT_PATH} 2>&1 | ||

echo "COMMAND= $COMMAND" >> ${OUTPUT_PATH} 2>&1
eval $COMMAND >> ${OUTPUT_PATH} 2>&1

#######################
echo "Machine state After: " >> ${OUTPUT_PATH} 2>&1
/opt/sambaflow/bin/snfadm -l inventory >> ${OUTPUT_PATH} 2>&1
echo "Duration: " $SECONDS >> ${OUTPUT_PATH} 2>&1