diff --git a/examples/sc21/CONFIG.sh b/examples/sc21/CONFIG.sh
new file mode 100755
index 0000000000..f17ccd7b02
--- /dev/null
+++ b/examples/sc21/CONFIG.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Cluster-specific settings and the Megatron argument list shared by the
+# run_*.sh scripts in this directory. This file is meant to be sourced: the
+# sourcing script must already have set TP, PP, MBS, GBS, NLS, HS, NAH and DDP
+# (and, optionally, MEGATRON_EXTRA_PARAMS), since they are expanded into
+# MEGATRON_PARAMS below.
+
+
+# SLURM options.
+export SLURM_PARTITION=
+export SLURM_ACCOUNT=
+
+
+# Source code.
+export MEGATRON_CODE_DIR=
+
+
+# This variable is used to mount the relevant part of the filesystem
+# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
+# launch directory already get mounted; this variable should be used to
+# mount the directories that contain the data and tokenizer files.
+export DOCKER_MOUNT_DIR=
+
+
+# Data and tokenizer files.
+MEGATRON_DATA=
+BPE_VOCAB_FILE=
+BPE_MERGE_FILE=
+
+
+# Fail fast while any value above is still empty. An empty value would
+# otherwise be spliced silently into the sbatch, srun and pretrain_gpt.py
+# command lines, shifting or dropping arguments.
+: "${SLURM_PARTITION:?must be set in CONFIG.sh}"
+: "${SLURM_ACCOUNT:?must be set in CONFIG.sh}"
+: "${MEGATRON_CODE_DIR:?must be set in CONFIG.sh}"
+: "${DOCKER_MOUNT_DIR:?must be set in CONFIG.sh}"
+: "${MEGATRON_DATA:?must be set in CONFIG.sh}"
+: "${BPE_VOCAB_FILE:?must be set in CONFIG.sh}"
+: "${BPE_MERGE_FILE:?must be set in CONFIG.sh}"
+
+
+# Megatron input parameters.
+# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
+# that are not listed here.
+export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
+        --tensor-model-parallel-size ${TP} \
+        --pipeline-model-parallel-size ${PP} \
+        --micro-batch-size ${MBS} \
+        --global-batch-size ${GBS} \
+        --num-layers ${NLS} \
+        --hidden-size ${HS} \
+        --num-attention-heads ${NAH} \
+        --DDP-impl ${DDP} \
+        --data-path ${MEGATRON_DATA} \
+        --vocab-file ${BPE_VOCAB_FILE} \
+        --merge-file ${BPE_MERGE_FILE} \
+        --log-interval 5 \
+        --seq-length 2048 \
+        --max-position-embeddings 2048 \
+        --train-iters 500 \
+        --lr-decay-iters 320 \
+        --lr 0.0001 \
+        --min-lr 0.00001 \
+        --lr-decay-style cosine \
+        --lr-warmup-fraction 0.01 \
+        --split 969,30,1 \
+        --eval-iters 100 \
+        --eval-interval 1000 \
+        --clip-grad 1.0 \
+        --fp16 \
+        --loss-scale 8192 "
+
+
diff --git a/examples/sc21/README.md b/examples/sc21/README.md
new file mode 100644
index 0000000000..940c37903e
--- /dev/null
+++ b/examples/sc21/README.md
@@ -0,0 +1,45 @@
+# Reproducing Figures in SC21 Paper
+
+
+This directory contains some of the scripts that were used to produce the
+results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
+to appear at [SuperComputing 
2021](https://sc21.supercomputing.org/). These +scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the +[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other +schedulers as well. + + +## Setup + +All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please +fill in the values that are left empty in that file before launching any +scripts. + + + +## Scripts + +Below is a list of scripts that can be used to reproduce various figures in our +[paper](https://arxiv.org/pdf/2104.04473.pdf): + +* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput +for GPT models ranging from 1 billion to 1 trillion parameters. +* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling +performance of pipeline parallelism. +* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of +the interleaved schedule on a 175B GPT model. +* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of +different degrees of pipeline and tensor model parallelism on a model with +162.2 billion parameters. +* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of +different degrees of data and pipeline model parallelism on a model with +5.9 billion parameters. +* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of +different degrees of data and tensor model parallelism on a model with +5.9 billion parameters. +* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of +microbatch size. +* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of +activation recomputation. +* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of +the scatter-gather communication optimization. 
diff --git a/examples/sc21/SBATCH.sh b/examples/sc21/SBATCH.sh
new file mode 100755
index 0000000000..95431b9b7e
--- /dev/null
+++ b/examples/sc21/SBATCH.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Submits SRUN.sh to Slurm. Meant to be sourced by a run_*.sh script after
+# CONFIG.sh, so that SLURM_PARTITION, SLURM_ACCOUNT, JOB_NAME and NNODES are
+# set. Only MEGATRON_CODE_DIR, MEGATRON_PARAMS and DOCKER_MOUNT_DIR are named
+# in --export for the job. SRUN.sh is given as a relative path.
+
+
+# Quoted so that an empty value is passed as an empty argument instead of
+# letting the option consume the next flag.
+sbatch -p "${SLURM_PARTITION}" \
+       -A "${SLURM_ACCOUNT}" \
+       --job-name="${JOB_NAME}" \
+       --nodes="${NNODES}" \
+       --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
+
+# Report sbatch's own status; because this file is sourced, `exit` also ends
+# the calling run_*.sh script.
+exit $?
+
+
+
diff --git a/examples/sc21/SRUN.sh b/examples/sc21/SRUN.sh
new file mode 100755
index 0000000000..52a9aff0c1
--- /dev/null
+++ b/examples/sc21/SRUN.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
+
+# Batch script that SBATCH.sh hands to sbatch. Runs pretrain_gpt.py through
+# srun inside a container and writes the log under logs/ in the directory the
+# job starts in. Reads MEGATRON_CODE_DIR, MEGATRON_PARAMS and DOCKER_MOUNT_DIR
+# from the environment.
+
+
+THIS_DIR=$(pwd)
+DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
+mkdir -p "${THIS_DIR}/logs"
+
+
+# MEGATRON_PARAMS stays a single string on purpose: `sh -c` below splits it
+# back into individual arguments.
+CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
+
+
+# Each directory is mounted at the same path inside the container. %x and %j
+# in the log name are Slurm's job-name and job-id placeholders.
+srun -l \
+     --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
+     --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
+     --output="${THIS_DIR}/logs/%x_%j_${DATETIME}.log" sh -c "${CMD}"
+
diff --git a/examples/sc21/run_figure_11.sh b/examples/sc21/run_figure_11.sh
new file mode 100755
index 0000000000..136db85104
--- /dev/null
+++ b/examples/sc21/run_figure_11.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Submits one configuration of the Figure 11 experiment (weak-scaling
+# performance of pipeline parallelism). Edit the two choices below, then run
+# this script from the directory that holds CONFIG.sh, SBATCH.sh and SRUN.sh.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Pipeline-parallel size options = [1, 2, 4, 8].
+PP=1
+
+# Batch size (global batch size) options = [8, 128].
+GBS=8
+
+
+
+
+
+# Set pipeline-parallel size options: layers and nodes both scale with PP.
+NLS=$((3*PP))
+NNODES=${PP}
+
+
+# Other params.
+TP=8
+MBS=1
+HS=20480
+NAH=128
+DDP=local
+MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+
+
+# Name of the job.
+export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
+
+
+# Import the configs; stop if CONFIG.sh cannot be sourced.
+. "$(pwd)/CONFIG.sh" || exit 1
+
+
+# Submit the job.
+. 
"$(pwd)/SBATCH.sh" || exit 1
+
+
+exit 0
+
+
+
diff --git a/examples/sc21/run_figure_12.sh b/examples/sc21/run_figure_12.sh
new file mode 100755
index 0000000000..f57554b36b
--- /dev/null
+++ b/examples/sc21/run_figure_12.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# Submits one configuration of the Figure 12 experiment (effect of the
+# interleaved schedule on a 175B GPT model). Edit the two choices below, then
+# run this script from the directory that holds CONFIG.sh, SBATCH.sh and
+# SRUN.sh.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Interleaved schedule options = [YES, NO].
+INTERLEAVED=YES
+
+# Batch size (global batch size) options = [12, 24, 36, ..., 60].
+GBS=12
+
+
+
+
+
+# Set interleaved schedule options.
+case "${INTERLEAVED}" in
+  YES)
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations --num-layers-per-virtual-pipeline-stage 2 "
+    ;;
+  NO)
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+    ;;
+  *)
+    echo "Invalid configuration" >&2
+    exit 1
+    ;;
+esac
+
+
+# Other params.
+TP=8
+PP=12
+MBS=1
+NLS=96
+HS=12288
+NAH=96
+DDP=local
+NNODES=12
+
+
+# Name of the job.
+export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
+
+
+# Import the configs; stop if CONFIG.sh cannot be sourced.
+. "$(pwd)/CONFIG.sh" || exit 1
+
+
+# Submit the job; stop if SBATCH.sh cannot be sourced.
+. "$(pwd)/SBATCH.sh" || exit 1
+
+
+exit 0
+
+
+
diff --git a/examples/sc21/run_figure_13.sh b/examples/sc21/run_figure_13.sh
new file mode 100755
index 0000000000..461aa77c6b
--- /dev/null
+++ b/examples/sc21/run_figure_13.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Submits one configuration of the Figure 13 experiment (different degrees of
+# pipeline and tensor model parallelism on a 162.2 billion parameter model).
+# Edit the two choices below, then run this script from the directory that
+# holds CONFIG.sh, SBATCH.sh and SRUN.sh.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Pipeline-parallel size options = [2, 4, 8, 16, 32].
+PP=2
+
+# Batch size (global batch size) options = [32, 128].
+GBS=32
+
+
+
+
+
+# Set pipeline-parallel and tensor-parallel size options.
+TP=$((64/PP))
+
+
+# Other params.
+MBS=1
+NLS=32
+HS=20480
+NAH=128
+DDP=local
+MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+NNODES=8
+
+
+# Name of the job.
+export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
+
+
+# Import the configs; stop if CONFIG.sh cannot be sourced.
+. "$(pwd)/CONFIG.sh" || exit 1
+
+
+# Submit the job.
+. 
"$(pwd)/SBATCH.sh" || exit 1
+
+
+exit 0
+
+
+
diff --git a/examples/sc21/run_figure_14.sh b/examples/sc21/run_figure_14.sh
new file mode 100755
index 0000000000..a578b6ce89
--- /dev/null
+++ b/examples/sc21/run_figure_14.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Submits one configuration of the Figure 14 experiment (different degrees of
+# data and pipeline model parallelism on a 5.9 billion parameter model). Edit
+# the two choices below, then run this script from the directory that holds
+# CONFIG.sh, SBATCH.sh and SRUN.sh.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Pipeline-parallel size options = [2, 4, 8, 16, 32].
+PP=2
+
+# Batch size (global batch size) options = [32, 512].
+GBS=32
+
+
+
+
+
+# Set pipeline-parallel and data-parallel size options.
+# Within this script, DP only appears in the job name below.
+DP=$((64/PP))
+
+
+# Other params.
+TP=1
+MBS=1
+NLS=32
+HS=3840
+NAH=32
+DDP=local
+MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+NNODES=8
+
+
+# Name of the job.
+export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
+
+
+# Import the configs; stop if CONFIG.sh cannot be sourced.
+. "$(pwd)/CONFIG.sh" || exit 1
+
+
+# Submit the job; stop if SBATCH.sh cannot be sourced.
+. "$(pwd)/SBATCH.sh" || exit 1
+
+
+exit 0
+
+
+
diff --git a/examples/sc21/run_figure_15.sh b/examples/sc21/run_figure_15.sh
new file mode 100755
index 0000000000..8fad224570
--- /dev/null
+++ b/examples/sc21/run_figure_15.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Submits one configuration of the Figure 15 experiment (different degrees of
+# data and tensor model parallelism on a 5.9 billion parameter model). Edit
+# the two choices below, then run this script from the directory that holds
+# CONFIG.sh, SBATCH.sh and SRUN.sh.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Tensor-parallel size options = [2, 4, 8, 16, 32].
+TP=2
+
+# Batch size (global batch size) options = [32, 128, 512].
+GBS=32
+
+
+
+
+
+# Set tensor-parallel and data-parallel size options.
+# Within this script, DP only appears in the job name below.
+DP=$((64/TP))
+
+
+# Other params.
+PP=1
+MBS=1
+NLS=32
+HS=3840
+NAH=32
+DDP=local
+MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+NNODES=8
+
+
+# Name of the job.
+export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
+
+
+# Import the configs; stop if CONFIG.sh cannot be sourced.
+. "$(pwd)/CONFIG.sh" || exit 1
+
+
+# Submit the job.
+. 
"$(pwd)/SBATCH.sh" || exit 1
+
+
+exit 0
+
+
+
diff --git a/examples/sc21/run_figure_16.sh b/examples/sc21/run_figure_16.sh
new file mode 100755
index 0000000000..0fb78f4ad3
--- /dev/null
+++ b/examples/sc21/run_figure_16.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Submits one configuration of the Figure 16 experiment (effect of microbatch
+# size). Edit the two choices below, then run this script from the directory
+# that holds CONFIG.sh, SBATCH.sh and SRUN.sh.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Microbatch size options = [1, 2, 4, 8].
+MBS=1
+
+# Batch size (global batch size) options = [128, 512].
+GBS=128
+
+
+
+
+
+# Other params.
+TP=8
+PP=8
+NLS=32
+HS=15360
+NAH=128
+DDP=local
+MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+NNODES=8
+
+
+# Name of the job.
+export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
+
+
+# Import the configs; stop if CONFIG.sh cannot be sourced.
+. "$(pwd)/CONFIG.sh" || exit 1
+
+
+# Submit the job; stop if SBATCH.sh cannot be sourced.
+. "$(pwd)/SBATCH.sh" || exit 1
+
+
+exit 0
+
+
+
diff --git a/examples/sc21/run_figure_17.sh b/examples/sc21/run_figure_17.sh
new file mode 100755
index 0000000000..8ec7ee2bfe
--- /dev/null
+++ b/examples/sc21/run_figure_17.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# Submits one configuration of the Figure 17 experiment (effect of activation
+# recomputation). Edit the two choices below, then run this script from the
+# directory that holds CONFIG.sh, SBATCH.sh and SRUN.sh.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Activation recomputation options = [YES, NO].
+ACTIVATION_RECOMPUTATION=YES
+
+# Batch size (global batch size) options = [1, 2, 4, ..., 256].
+GBS=1
+
+
+
+
+
+# Set activation recomputation.
+case "${ACTIVATION_RECOMPUTATION}" in
+  YES)
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+    ;;
+  NO)
+    MEGATRON_EXTRA_PARAMS=""
+    ;;
+  *)
+    echo "Invalid configuration" >&2
+    exit 1
+    ;;
+esac
+
+
+# Other params.
+TP=8
+PP=16
+MBS=1
+NLS=80
+HS=12288
+NAH=96
+DDP=local
+NNODES=16
+
+
+# Name of the job.
+export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
+
+
+# Import the configs; stop if CONFIG.sh cannot be sourced.
+. "$(pwd)/CONFIG.sh" || exit 1
+
+
+# Submit the job.
+. 
"$(pwd)/SBATCH.sh" || exit 1
+
+
+exit 0
+
+
+
diff --git a/examples/sc21/run_figure_18.sh b/examples/sc21/run_figure_18.sh
new file mode 100755
index 0000000000..be93d8ae08
--- /dev/null
+++ b/examples/sc21/run_figure_18.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# Submits one configuration of the Figure 18 experiment (effect of the
+# scatter-gather communication optimization). Edit the two choices below,
+# then run this script from the directory that holds CONFIG.sh, SBATCH.sh and
+# SRUN.sh.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Scatter-gather communication optimization options = [YES, NO].
+SCATTER_GATHER=YES
+
+# Batch size (global batch size) options = [12, 24, 36, ..., 60].
+GBS=12
+
+
+
+
+
+# Set scatter-gather communication optimization options.
+case "${SCATTER_GATHER}" in
+  YES)
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations --num-layers-per-virtual-pipeline-stage 2 "
+    ;;
+  NO)
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
+    ;;
+  *)
+    echo "Invalid configuration" >&2
+    exit 1
+    ;;
+esac
+
+
+# Other params.
+TP=8
+PP=12
+MBS=1
+NLS=96
+HS=12288
+NAH=96
+DDP=local
+NNODES=12
+
+
+# Name of the job.
+export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
+
+
+# Import the configs; stop if CONFIG.sh cannot be sourced.
+. "$(pwd)/CONFIG.sh" || exit 1
+
+
+# Submit the job; stop if SBATCH.sh cannot be sourced.
+. "$(pwd)/SBATCH.sh" || exit 1
+
+
+exit 0
+
+
+
diff --git a/examples/sc21/run_table_1.sh b/examples/sc21/run_table_1.sh
new file mode 100755
index 0000000000..d233472545
--- /dev/null
+++ b/examples/sc21/run_table_1.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+
+# Submits one configuration of the Table 1 experiment (weak-scaling throughput
+# for GPT models from 1 billion to 1 trillion parameters). Pick the model size
+# below, then run this script from the directory that holds CONFIG.sh,
+# SBATCH.sh and SRUN.sh.
+
+# ================================
+# Choose the case to run.
+# ================================
+# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
+MODEL_SIZE=1.7B
+
+
+
+
+
+
+# Per-model settings. TP, PP, MBS, GBS, NLS, HS, NAH and DDP are expanded into
+# MEGATRON_PARAMS by CONFIG.sh; NNODES is the node count passed to sbatch.
+case "${MODEL_SIZE}" in
+  "1.7B")
+    TP=1
+    PP=1
+    MBS=16
+    GBS=512
+    NLS=24
+    HS=2304
+    NAH=24
+    DDP=torch
+    NNODES=4
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+    ;;
+  "3.6B")
+    TP=2
+    PP=1
+    MBS=16
+    GBS=512
+    NLS=30
+    HS=3072
+    NAH=32
+    DDP=torch
+    NNODES=8
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+    ;;
+  "7.5B")
+    TP=4
+    PP=1
+    MBS=16
+    GBS=512
+    NLS=36
+    HS=4096
+    NAH=32
+    DDP=torch
+    NNODES=16
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+    ;;
+  "18B")
+    TP=8
+    PP=1
+    MBS=8
+    GBS=1024
+    NLS=40
+    HS=6144
+    NAH=48
+    DDP=torch
+    NNODES=32
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+    ;;
+  "39B")
+    TP=8
+    PP=2
+    MBS=4
+    GBS=1536
+    NLS=48
+    HS=8192
+    NAH=64
+    DDP=local
+    NNODES=64
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+    ;;
+  "76B")
+    TP=8
+    PP=4
+    MBS=2
+    GBS=1792
+    NLS=60
+    HS=10240
+    NAH=80
+    DDP=local
+    NNODES=128
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations --num-layers-per-virtual-pipeline-stage 5"
+    ;;
+  "145B")
+    TP=8
+    PP=8
+    MBS=2
+    GBS=2304
+    NLS=80
+    HS=12288
+    NAH=96
+    DDP=local
+    NNODES=192
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations --num-layers-per-virtual-pipeline-stage 5 "
+    ;;
+  "310B")
+    TP=8
+    PP=16
+    MBS=1
+    GBS=2160
+    NLS=96
+    HS=16384
+    NAH=128
+    DDP=local
+    NNODES=240
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations --num-layers-per-virtual-pipeline-stage 3 "
+    ;;
+  "530B")
+    TP=8
+    PP=35
+    MBS=1
+    GBS=2520
+    NLS=105
+    HS=20480
+    NAH=128
+    DDP=local
+    NNODES=315
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations --num-layers-per-virtual-pipeline-stage 1 "
+    ;;
+  "1T")
+    TP=8
+    PP=64
+    MBS=1
+    GBS=3072
+    NLS=128
+    HS=25600
+    NAH=160
+    DDP=local
+    NNODES=384
+    MEGATRON_EXTRA_PARAMS="--checkpoint-activations "
+    ;;
+  *)
+    echo "Invalid configuration" >&2
+    exit 1
+    ;;
+esac
+
+
+# Name of the job
+export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
+
+
+# Import the configs; stop if CONFIG.sh cannot be sourced.
+. "$(pwd)/CONFIG.sh" || exit 1
+
+
+# Submit the job; stop if SBATCH.sh cannot be sourced.
+. "$(pwd)/SBATCH.sh" || exit 1
+
+
+exit 0
+
+
+