Commit

Megatron-DeepSpeed-fork content for 1.17.0
Signed-off-by: SW publisher <[email protected]>
SW publisher authored and Jenkins committed Aug 9, 2024
1 parent 7eb36a1 commit f315700
Showing 93 changed files with 6,491 additions and 1,334 deletions.
826 changes: 285 additions & 541 deletions README.md

Large diffs are not rendered by default.

37 changes: 26 additions & 11 deletions examples_deepspeed/MoE/ds_pretrain_gpt_125M_MoE64.sh
@@ -1,3 +1,5 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
DIR=`pwd`
###############################################################################
@@ -119,8 +121,14 @@ MP_SIZE=1
## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
## to 1 and use the "--no-pipeline-parallel" arg.
PP_SIZE=1
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
nvidia-smi || count_GPU=0
if [[ ${count_GPU} == 0 ]];then
NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
NUM_GPUS_PERNODE=${NUM_GPUS}
else
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
###############################################################################
### MoE configs
@@ -172,6 +180,7 @@ LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=10000
EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

## Standard deviation for weight initialization
## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
@@ -241,13 +250,17 @@ if [ "${USE_INTERNAL_DATA}" = "true" ]; then
0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
0.01359 ${ARX} 0.01588 ${GIT}"
else
VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
#VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
#MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
#DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
# For cluster Azure-WestUS3-A100
# DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
fi
###############################################################################
data_options=" \
@@ -284,6 +297,7 @@ megatron_options=" \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--exit-interval ${EXIT_INTERVAL} \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
@@ -299,11 +313,12 @@ megatron_options=" \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--no-gradient-accumulation-fusion \
--tensorboard-dir ${TENSORBOARD_DIR}"

if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
--checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
fi

if [[ $EP_SIZE -gt 1 ]]; then
@@ -329,12 +344,12 @@ sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
| sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
| sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
| sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
> ${config_json}

deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--pipeline-model-parallel-size ${PP_SIZE}"

# Currently MoE is not compatible with pipeline parallel
if [[ $EP_SIZE -gt 1 ]]; then
@@ -369,4 +384,4 @@ fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x
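The main functional change in this script is the accelerator-detection fallback near the top of the diff: when nvidia-smi is unavailable, the node's Habana Gaudi devices are counted from the PCI bus instead. Below is a minimal standalone sketch of that pattern; the variable names mirror the script, the wrapper itself is only illustrative, and the subtraction of two lines from the ds_ssh output follows the original script's handling of its combined per-node output.

#!/bin/bash
# Illustrative sketch of the device-count fallback added in this commit:
# prefer nvidia-smi when it works, otherwise count Habana accelerators via lspci.
if ! nvidia-smi > /dev/null 2>&1; then
    # No NVIDIA stack present: count Habana Gaudi devices on the PCI bus.
    NUM_GPUS=$(lspci | grep -ci "Processing accelerators: Habana Labs Ltd.")
    NUM_GPUS_PERNODE=${NUM_GPUS}
else
    # ds_ssh (shipped with DeepSpeed) runs the query on every node in the hostfile;
    # the original script drops two non-GPU lines from that combined output.
    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( NUM_GPUS / NUM_GPUS_PERNODE ))
echo "devices=${NUM_GPUS} per_node=${NUM_GPUS_PERNODE} nodes=${NUM_NODE}"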
29 changes: 22 additions & 7 deletions examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh
@@ -1,3 +1,5 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
DIR=`pwd`
###############################################################################
@@ -123,8 +125,14 @@ NO_PP="true"
ZERO_STAGE=0

## Total number of GPUs
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
nvidia-smi || count_GPU=0
if [[ ${count_GPU} == 0 ]];then
NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
NUM_GPUS_PERNODE=${NUM_GPUS}
else
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
###############################################################################
@@ -143,6 +151,7 @@ LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=1000
EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

## Standard deviation for weight initialization. Usually larger model needs
## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
@@ -175,13 +184,17 @@ mkdir -p ${LOG_PATH}
mkdir -p ${TENSORBOARD_PATH}
mkdir -p ${CHECKPOINT_PATH}

VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
#VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
#MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
#DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
# For cluster Azure-WestUS3-A100
# DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
###############################################################################
data_options=" \
--vocab-file ${VOCAB_PATH} \
@@ -211,6 +224,7 @@ megatron_options=" \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--exit-interval ${EXIT_INTERVAL} \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
@@ -226,11 +240,12 @@ megatron_options=" \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--no-gradient-accumulation-fusion \
--tensorboard-dir ${TENSORBOARD_PATH}"

if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
--checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
fi

if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
@@ -306,4 +321,4 @@ fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x
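Both the MoE and dense scripts are now parameterized through HL_* environment variables using bash default expansion (for example ${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}), so a run can be retargeted without editing the script. A hypothetical invocation, with the dataset path as a placeholder, might look like this:

# Hypothetical invocation; /path/to/oscar-en is a placeholder. The directory is
# expected to hold gpt2-vocab.json, gpt2-merges.txt and the meg-gpt2_text_document
# indexed dataset. HL_EXIT_INTERVAL=0 (the default) disables the early exit.
HL_DATA_DIR_ROOT=/path/to/oscar-en \
HL_EXIT_INTERVAL=100 \
bash examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh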
2 changes: 1 addition & 1 deletion examples_deepspeed/MoE/readme_evalharness.md
@@ -165,4 +165,4 @@ Import location: Replace data at selected cell
4. Now it should be easy to align the new records with the old ones - delete irrelevant records and Insert->Cells where data is missing until the first 2 columns match
5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. -->
20 changes: 13 additions & 7 deletions examples_deepspeed/run_deepspeed_example.sh
@@ -1,8 +1,12 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
set -ex

BASE_PATH=/vc_data/Megatron-LM/data
DATA_PATH=${BASE_PATH}/indexed_datasets/megatron
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DS_CONFIG=ds_config.json

TP=1
@@ -48,7 +52,7 @@ ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
ds_args=" --deepspeed-activation-checkpointing ${ds_args}"


deepspeed pretrain_gpt.py \
deepspeed ../pretrain_gpt.py \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $NLAYERS \
@@ -67,8 +71,8 @@ deepspeed pretrain_gpt.py \
--eval-iters 40 \
--eval-interval 1000 \
--data-path $DATA_PATH \
--vocab-file $BASE_PATH/gpt2-vocab.json \
--merge-file $BASE_PATH/gpt2-merges.txt \
--vocab-file $VOCAB_PATH \
--merge-file $MERGE_PATH \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
@@ -78,7 +82,9 @@ deepspeed pretrain_gpt.py \
--init-method-std 0.006 \
--fp16 \
--checkpoint-activations \
--recompute-granularity=full \
--recompute-method=uniform \
--no-gradient-accumulation-fusion \
--tensorboard-dir $OUTPUT_DIR \
$ds_args \
--exit-interval 5000 | tee ${OUTPUT_DIR}/output.log

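run_deepspeed_example.sh hands ds_config.json to the launcher through $ds_args together with --zero-stage and --deepspeed-activation-checkpointing, but the contents of that config are outside the hunks shown here. The sketch below shows one plausible way such a file is emitted with a heredoc; the JSON keys are standard DeepSpeed config fields, while GLOBAL_BATCH, MICRO_BATCH and ZERO_STAGE are assumed to be set earlier in the script.

# Assumed sketch only -- the commit does not show how ds_config.json is generated.
# GLOBAL_BATCH, MICRO_BATCH and ZERO_STAGE stand for values defined earlier in
# the script; the keys are standard DeepSpeed config fields.
cat <<EOT > $DS_CONFIG
{
  "train_batch_size": $GLOBAL_BATCH,
  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
  "steps_per_print": 10,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true
  }
}
EOT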
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
dir=`pwd`
###############################################################################
@@ -147,8 +149,14 @@ no_pp="true"
zero_stage=1

## Total number of GPUs. ds_ssh is from DeepSpeed library.
num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
nvidia-smi || count_GPU=0
if [[ ${count_GPU} == 0 ]];then
num_gpus=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
num_gpus_pernode=${num_gpus}
else
num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))

## Data parallel size.
@@ -187,21 +195,28 @@ host="${HOSTNAME}"
seed=1234
num_workers=0

data_path="BookCorpusDataset_text_document"
if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
fi
if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
fi

vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
fi
merge_path="gpt2-merges.txt"
if [ ! -f "$merge_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
USE_INTERNAL_DATA="false"
if [ "${USE_INTERNAL_DATA}" = "true" ]; then
data_path="BookCorpusDataset_text_document"
if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
fi
if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
fi
vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
fi
merge_path="gpt2-merges.txt"
if [ ! -f "$merge_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
fi
else
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
data_path=${BASE_DATA_PATH}/meg-gpt2_text_document
vocab_path=${BASE_DATA_PATH}/gpt2-vocab.json
merge_path=${BASE_DATA_PATH}/gpt2-merges.txt
fi

prescale_grad="true"
@@ -282,11 +297,12 @@ megatron_options=" \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--no-gradient-accumulation-fusion \
--tensorboard-dir ${tensorboard_path}"

if [ "${activation_checkpoint}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
--checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
fi

if [ "${log_optimizer_state}" = "true" ]; then
@@ -338,4 +354,4 @@ if [[ $iteration -gt 0 ]]; then
ds_ssh "echo $iteration_2 > $iteration_file_2"
fi

deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
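All of the reworked scripts resolve their inputs from ${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}. A small preflight check such as the one below (not part of the commit; meg-gpt2_text_document is assumed to be a Megatron-style indexed dataset with a .bin/.idx pair) can catch a misconfigured dataset root before a multi-node launch:

# Illustrative preflight check, not part of the commit. Verifies that the dataset
# root selected via HL_DATA_DIR_ROOT contains everything the scripts expect;
# meg-gpt2_text_document is assumed to be a Megatron-style .bin/.idx pair.
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
for f in gpt2-vocab.json gpt2-merges.txt \
         meg-gpt2_text_document.bin meg-gpt2_text_document.idx; do
    if [ ! -f "${BASE_DATA_PATH}/${f}" ]; then
        echo "missing: ${BASE_DATA_PATH}/${f}" >&2
        exit 1
    fi
done
echo "dataset root ${BASE_DATA_PATH} looks complete"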