Commit

Megatron-DeepSpeed-fork content for 1.17.0
Signed-off-by: SW publisher <[email protected]>
SW publisher authored and Jenkins committed Aug 9, 2024
1 parent 7eb36a1 commit f315700
Showing 93 changed files with 6,491 additions and 1,334 deletions.
826 changes: 285 additions & 541 deletions README.md

Large diffs are not rendered by default.

37 changes: 26 additions & 11 deletions examples_deepspeed/MoE/ds_pretrain_gpt_125M_MoE64.sh
@@ -1,3 +1,5 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
DIR=`pwd`
###############################################################################
@@ -119,8 +121,14 @@ MP_SIZE=1
## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
## to 1 and use the "--no-pipeline-parallel" arg.
PP_SIZE=1
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
nvidia-smi || count_GPU=0
if [[ ${count_GPU} == 0 ]];then
NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
NUM_GPUS_PERNODE=${NUM_GPUS}
else
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
###############################################################################
### MoE configs
@@ -172,6 +180,7 @@ LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=10000
EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

## Standard deviation for weight initialization
## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
@@ -241,13 +250,17 @@ if [ "${USE_INTERNAL_DATA}" = "true" ]; then
0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
0.01359 ${ARX} 0.01588 ${GIT}"
else
VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
#VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
#MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
#DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
# For cluster Azure-WestUS3-A100
# DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
fi
###############################################################################
data_options=" \
@@ -284,6 +297,7 @@ megatron_options=" \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--exit-interval ${EXIT_INTERVAL} \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
@@ -299,11 +313,12 @@ megatron_options=" \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--no-gradient-accumulation-fusion \
--tensorboard-dir ${TENSORBOARD_DIR}"

if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
--checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
fi

if [[ $EP_SIZE -gt 1 ]]; then
@@ -329,12 +344,12 @@ sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
| sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
| sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
| sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
> ${config_json}

deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--pipeline-model-parallel-size ${PP_SIZE}"

# Currently MoE is not compatible with pipeline parallel
if [[ $EP_SIZE -gt 1 ]]; then
@@ -369,4 +384,4 @@ fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x
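The main functional change in this script is the accelerator-detection fallback near the top of the diff: when nvidia-smi is unavailable, the node's Habana Gaudi devices are counted from the PCI bus instead. Below is a minimal standalone sketch of that pattern; the variable names mirror the script, the wrapper itself is only illustrative, and the subtraction of two lines from the ds_ssh output follows the original script's handling of its combined per-node output.

#!/bin/bash
# Illustrative sketch of the device-count fallback added in this commit:
# prefer nvidia-smi when it works, otherwise count Habana accelerators via lspci.
if ! nvidia-smi > /dev/null 2>&1; then
    # No NVIDIA stack present: count Habana Gaudi devices on the PCI bus.
    NUM_GPUS=$(lspci | grep -ci "Processing accelerators: Habana Labs Ltd.")
    NUM_GPUS_PERNODE=${NUM_GPUS}
else
    # ds_ssh (shipped with DeepSpeed) runs the query on every node in the hostfile;
    # the original script drops two non-GPU lines from that combined output.
    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( NUM_GPUS / NUM_GPUS_PERNODE ))
echo "devices=${NUM_GPUS} per_node=${NUM_GPUS_PERNODE} nodes=${NUM_NODE}"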
29 changes: 22 additions & 7 deletions examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh
@@ -1,3 +1,5 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
DIR=`pwd`
###############################################################################
@@ -123,8 +125,14 @@ NO_PP="true"
ZERO_STAGE=0

## Total number of GPUs
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
nvidia-smi || count_GPU=0
if [[ ${count_GPU} == 0 ]];then
NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
NUM_GPUS_PERNODE=${NUM_GPUS}
else
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
###############################################################################
@@ -143,6 +151,7 @@ LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=1000
EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

## Standard deviation for weight initialization. Usually larger model needs
## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
@@ -175,13 +184,17 @@ mkdir -p ${LOG_PATH}
mkdir -p ${TENSORBOARD_PATH}
mkdir -p ${CHECKPOINT_PATH}

VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
#VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
#MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
#DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
# For cluster Azure-WestUS3-A100
# DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
###############################################################################
data_options=" \
--vocab-file ${VOCAB_PATH} \
@@ -211,6 +224,7 @@ megatron_options=" \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--exit-interval ${EXIT_INTERVAL} \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
@@ -226,11 +240,12 @@ megatron_options=" \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--no-gradient-accumulation-fusion \
--tensorboard-dir ${TENSORBOARD_PATH}"

if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
--checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
fi

if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
@@ -306,4 +321,4 @@ fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x
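Both the MoE and dense scripts are now parameterized through HL_* environment variables using bash default expansion (for example ${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}), so a run can be retargeted without editing the script. A hypothetical invocation, with the dataset path as a placeholder, might look like this:

# Hypothetical invocation; /path/to/oscar-en is a placeholder. The directory is
# expected to hold gpt2-vocab.json, gpt2-merges.txt and the meg-gpt2_text_document
# indexed dataset. HL_EXIT_INTERVAL=0 (the default) disables the early exit.
HL_DATA_DIR_ROOT=/path/to/oscar-en \
HL_EXIT_INTERVAL=100 \
bash examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh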
2 changes: 1 addition & 1 deletion examples_deepspeed/MoE/readme_evalharness.md
@@ -165,4 +165,4 @@ Import location: Replace data at selected cell
4. Now it should be easy to align the new records with the old ones - delete irrelevant records and Insert->Cells where data is missing until the first 2 columns match
5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. -->
20 changes: 13 additions & 7 deletions examples_deepspeed/run_deepspeed_example.sh
@@ -1,8 +1,12 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
set -ex

BASE_PATH=/vc_data/Megatron-LM/data
DATA_PATH=${BASE_PATH}/indexed_datasets/megatron
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DS_CONFIG=ds_config.json

TP=1
@@ -48,7 +52,7 @@ ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
ds_args=" --deepspeed-activation-checkpointing ${ds_args}"


deepspeed pretrain_gpt.py \
deepspeed ../pretrain_gpt.py \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $NLAYERS \
@@ -67,8 +71,8 @@ deepspeed pretrain_gpt.py \
--eval-iters 40 \
--eval-interval 1000 \
--data-path $DATA_PATH \
--vocab-file $BASE_PATH/gpt2-vocab.json \
--merge-file $BASE_PATH/gpt2-merges.txt \
--vocab-file $VOCAB_PATH \
--merge-file $MERGE_PATH \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
@@ -78,7 +82,9 @@ deepspeed pretrain_gpt.py \
--init-method-std 0.006 \
--fp16 \
--checkpoint-activations \
--recompute-granularity=full \
--recompute-method=uniform \
--no-gradient-accumulation-fusion \
--tensorboard-dir $OUTPUT_DIR \
$ds_args \
--exit-interval 5000 | tee ${OUTPUT_DIR}/output.log

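run_deepspeed_example.sh hands ds_config.json to the launcher through $ds_args together with --zero-stage and --deepspeed-activation-checkpointing, but the contents of that config are outside the hunks shown here. The sketch below shows one plausible way such a file is emitted with a heredoc; the JSON keys are standard DeepSpeed config fields, while GLOBAL_BATCH, MICRO_BATCH and ZERO_STAGE are assumed to be set earlier in the script.

# Assumed sketch only -- the commit does not show how ds_config.json is generated.
# GLOBAL_BATCH, MICRO_BATCH and ZERO_STAGE stand for values defined earlier in
# the script; the keys are standard DeepSpeed config fields.
cat <<EOT > $DS_CONFIG
{
  "train_batch_size": $GLOBAL_BATCH,
  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
  "steps_per_print": 10,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true
  }
}
EOT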
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
dir=`pwd`
###############################################################################
@@ -147,8 +149,14 @@ no_pp="true"
zero_stage=1

## Total number of GPUs. ds_ssh is from DeepSpeed library.
num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
nvidia-smi || count_GPU=0
if [[ ${count_GPU} == 0 ]];then
num_gpus=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
num_gpus_pernode=${num_gpus}
else
num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))

## Data parallel size.
@@ -187,21 +195,28 @@ host="${HOSTNAME}"
seed=1234
num_workers=0

data_path="BookCorpusDataset_text_document"
if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
fi
if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
fi

vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
fi
merge_path="gpt2-merges.txt"
if [ ! -f "$merge_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
USE_INTERNAL_DATA="false"
if [ "${USE_INTERNAL_DATA}" = "true" ]; then
data_path="BookCorpusDataset_text_document"
if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
fi
if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
fi
vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
fi
merge_path="gpt2-merges.txt"
if [ ! -f "$merge_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
fi
else
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
data_path=${BASE_DATA_PATH}/meg-gpt2_text_document
vocab_path=${BASE_DATA_PATH}/gpt2-vocab.json
merge_path=${BASE_DATA_PATH}/gpt2-merges.txt
fi

prescale_grad="true"
@@ -282,11 +297,12 @@ megatron_options=" \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--no-gradient-accumulation-fusion \
--tensorboard-dir ${tensorboard_path}"

if [ "${activation_checkpoint}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
--checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
fi

if [ "${log_optimizer_state}" = "true" ]; then
@@ -338,4 +354,4 @@ if [[ $iteration -gt 0 ]]; then
ds_ssh "echo $iteration_2 > $iteration_file_2"
fi

deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
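All of the reworked scripts resolve their inputs from ${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}. A small preflight check such as the one below (not part of the commit; meg-gpt2_text_document is assumed to be a Megatron-style indexed dataset with a .bin/.idx pair) can catch a misconfigured dataset root before a multi-node launch:

# Illustrative preflight check, not part of the commit. Verifies that the dataset
# root selected via HL_DATA_DIR_ROOT contains everything the scripts expect;
# meg-gpt2_text_document is assumed to be a Megatron-style .bin/.idx pair.
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
for f in gpt2-vocab.json gpt2-merges.txt \
         meg-gpt2_text_document.bin meg-gpt2_text_document.idx; do
    if [ ! -f "${BASE_DATA_PATH}/${f}" ]; then
        echo "missing: ${BASE_DATA_PATH}/${f}" >&2
        exit 1
    fi
done
echo "dataset root ${BASE_DATA_PATH} looks complete"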