Update README.md #4

Open · wants to merge 2 commits into main
829 changes: 288 additions & 541 deletions README.md

Large diffs are not rendered by default.

37 changes: 26 additions & 11 deletions examples_deepspeed/MoE/ds_pretrain_gpt_125M_MoE64.sh
@@ -1,3 +1,5 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
DIR=`pwd`
###############################################################################
@@ -119,8 +121,14 @@ MP_SIZE=1
## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
## to 1 and use the "--no-pipeline-parallel" arg.
PP_SIZE=1
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
nvidia-smi || count_GPU=0
if [[ ${count_GPU} == 0 ]];then
NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
NUM_GPUS_PERNODE=${NUM_GPUS}
else
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
###############################################################################
### MoE configs
@@ -172,6 +180,7 @@ LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=10000
EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

## Standard deviation for weight initialization
## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
@@ -241,13 +250,17 @@ if [ "${USE_INTERNAL_DATA}" = "true" ]; then
0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
0.01359 ${ARX} 0.01588 ${GIT}"
else
VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
#VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
#MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
#DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
# For cluster Azure-WestUS3-A100
# DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
fi
###############################################################################
data_options=" \
@@ -284,6 +297,7 @@ megatron_options=" \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--exit-interval ${EXIT_INTERVAL} \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
@@ -299,11 +313,12 @@ megatron_options=" \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--no-gradient-accumulation-fusion \
--tensorboard-dir ${TENSORBOARD_DIR}"

if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
--checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
fi

if [[ $EP_SIZE -gt 1 ]]; then
@@ -329,12 +344,12 @@ sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
| sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
| sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
| sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
> ${config_json}
> ${config_json}

deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--pipeline-model-parallel-size ${PP_SIZE}"
--deepspeed \
--deepspeed_config ${config_json} \
--pipeline-model-parallel-size ${PP_SIZE}"

# Currently MoE is not compatible with pipeline parallel
if [[ $EP_SIZE -gt 1 ]]; then
@@ -369,4 +384,4 @@ fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x
set +x
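
The GPU-count change above is what lets the launcher run unmodified on Habana Gaudi nodes: when nvidia-smi is unavailable, the script falls back to counting Habana devices on the PCI bus. A minimal standalone sketch of that detection, assuming only that lspci reports Gaudi cards as "Processing accelerators: Habana Labs Ltd." (as the diff does); the explicit exit-status check is a small robustness tweak, not part of the PR:

#!/bin/bash
# Count accelerators on this node, preferring NVIDIA GPUs and falling back to Habana Gaudi.
if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
    # nvidia-smi prints one line per visible GPU.
    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
else
    # No NVIDIA driver: count Gaudi devices reported by lspci.
    NUM_GPUS_PERNODE=$(lspci | grep -ci "Processing accelerators: Habana Labs Ltd.")
fi
echo "Accelerators on this node: ${NUM_GPUS_PERNODE}"
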
29 changes: 22 additions & 7 deletions examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh
@@ -1,3 +1,5 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
DIR=`pwd`
###############################################################################
@@ -123,8 +125,14 @@ NO_PP="true"
ZERO_STAGE=0

## Total number of GPUs
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
nvidia-smi || count_GPU=0
if [[ ${count_GPU} == 0 ]];then
NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
NUM_GPUS_PERNODE=${NUM_GPUS}
else
NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
###############################################################################
@@ -143,6 +151,7 @@ LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=1000
EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

## Standard deviation for weight initialization. Usually larger model needs
## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
@@ -175,13 +184,17 @@ mkdir -p ${LOG_PATH}
mkdir -p ${TENSORBOARD_PATH}
mkdir -p ${CHECKPOINT_PATH}

VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
#VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
#MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
#DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
# For cluster Azure-WestUS3-A100
# DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
###############################################################################
data_options=" \
--vocab-file ${VOCAB_PATH} \
@@ -211,6 +224,7 @@ megatron_options=" \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--exit-interval ${EXIT_INTERVAL} \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
@@ -226,11 +240,12 @@ megatron_options=" \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--no-gradient-accumulation-fusion \
--tensorboard-dir ${TENSORBOARD_PATH}"

if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
--checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
fi

if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
@@ -306,4 +321,4 @@ fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x
set +x
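
Both MoE example scripts now take their early-exit point and dataset root from the environment: HL_EXIT_INTERVAL (default 0, i.e. never exit early) feeds --exit-interval, and HL_DATA_DIR_ROOT (default /data/bigscience/oscar-en/) sets the vocab, merges and dataset paths. A usage sketch; the dataset location below is an assumed example, not one mandated by the PR:

# Smoke-test the dense 125M config: stop after 100 iterations, read data from a custom root.
HL_EXIT_INTERVAL=100 \
HL_DATA_DIR_ROOT=/mnt/datasets/oscar-en \
bash examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh
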
2 changes: 1 addition & 1 deletion examples_deepspeed/MoE/readme_evalharness.md
@@ -165,4 +165,4 @@ Import location: Replace data at selected cell

4. Now it should be easy to align the new records with the old ones - delete irrelevant records and Insert->Cells where data is missing until the first 2 columns match

5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. -->
5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. -->
20 changes: 13 additions & 7 deletions examples_deepspeed/run_deepspeed_example.sh
@@ -1,8 +1,12 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
set -ex

BASE_PATH=/vc_data/Megatron-LM/data
DATA_PATH=${BASE_PATH}/indexed_datasets/megatron
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
DS_CONFIG=ds_config.json

TP=1
@@ -48,7 +52,7 @@ ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
ds_args=" --deepspeed-activation-checkpointing ${ds_args}"


deepspeed pretrain_gpt.py \
deepspeed ../pretrain_gpt.py \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $NLAYERS \
@@ -67,8 +71,8 @@ deepspeed pretrain_gpt.py \
--eval-iters 40 \
--eval-interval 1000 \
--data-path $DATA_PATH \
--vocab-file $BASE_PATH/gpt2-vocab.json \
--merge-file $BASE_PATH/gpt2-merges.txt \
--vocab-file $VOCAB_PATH \
--merge-file $MERGE_PATH \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
@@ -78,7 +82,9 @@ deepspeed pretrain_gpt.py \
--init-method-std 0.006 \
--fp16 \
--checkpoint-activations \
--recompute-granularity=full \
--recompute-method=uniform \
--no-gradient-accumulation-fusion \
--tensorboard-dir $OUTPUT_DIR \
$ds_args \
--exit-interval 5000 | tee ${OUTPUT_DIR}/output.log

--exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
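
run_deepspeed_example.sh now resolves pretrain_gpt.py one directory up and derives all of its data paths from the same HL_DATA_DIR_ROOT override, relying on bash's ${VAR:-default} expansion for the fallback. An invocation sketch, assuming the script is launched from inside examples_deepspeed/ and that /mnt/datasets/oscar-en is just an example location:

cd examples_deepspeed
# When HL_DATA_DIR_ROOT is unset, the script falls back to /data/bigscience/oscar-en/.
HL_DATA_DIR_ROOT=/mnt/datasets/oscar-en bash run_deepspeed_example.sh
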
@@ -1,3 +1,5 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

#!/bin/bash
dir=`pwd`
###############################################################################
@@ -147,8 +149,14 @@ no_pp="true"
zero_stage=1

## Total number of GPUs. ds_ssh is from DeepSpeed library.
num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
nvidia-smi || count_GPU=0
if [[ ${count_GPU} == 0 ]];then
num_gpus=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
num_gpus_pernode=${num_gpus}
else
num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))

## Data parallel size.
@@ -187,21 +195,28 @@ host="${HOSTNAME}"
seed=1234
num_workers=0

data_path="BookCorpusDataset_text_document"
if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
fi
if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
fi

vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
fi
merge_path="gpt2-merges.txt"
if [ ! -f "$merge_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
USE_INTERNAL_DATA="false"
if [ "${USE_INTERNAL_DATA}" = "true" ]; then
data_path="BookCorpusDataset_text_document"
if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
fi
if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
fi
vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
fi
merge_path="gpt2-merges.txt"
if [ ! -f "$merge_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
fi
else
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
data_path=${BASE_DATA_PATH}/meg-gpt2_text_document
vocab_path=${BASE_DATA_PATH}/gpt2-vocab.json
merge_path=${BASE_DATA_PATH}/gpt2-merges.txt
fi

prescale_grad="true"
@@ -282,11 +297,12 @@ megatron_options=" \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--no-gradient-accumulation-fusion \
--tensorboard-dir ${tensorboard_path}"

if [ "${activation_checkpoint}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
--checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
fi

if [ "${log_optimizer_state}" = "true" ]; then
@@ -338,4 +354,4 @@ if [[ $iteration -gt 0 ]]; then
ds_ssh "echo $iteration_2 > $iteration_file_2"
fi

deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
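
This last script wraps the original BookCorpus and GPT-2 vocab downloads in a USE_INTERNAL_DATA gate and otherwise reuses the shared HL_DATA_DIR_ROOT layout. A reduced sketch of the same gate; wget -nc ("no clobber") stands in for the per-file existence checks and is an assumption, not what the PR itself uses:

USE_INTERNAL_DATA="false"   # flip to "true" to fetch the public BookCorpus shards
if [ "${USE_INTERNAL_DATA}" = "true" ]; then
    # Download the dataset, vocab and merges into the working directory (skipped if already present).
    wget -nc https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
    wget -nc https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
    wget -nc https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
    wget -nc https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
    data_path="BookCorpusDataset_text_document"
    vocab_path="gpt2-vocab.json"
    merge_path="gpt2-merges.txt"
else
    # Point at a pre-processed dataset root instead.
    BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
    data_path=${BASE_DATA_PATH}/meg-gpt2_text_document
    vocab_path=${BASE_DATA_PATH}/gpt2-vocab.json
    merge_path=${BASE_DATA_PATH}/gpt2-merges.txt
fi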