Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Update ALCF/helpers.sh #69

Merged
merged 1 commit into from
Nov 16, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 35 additions & 9 deletions ALCF/helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,13 @@ setup_run_cmd() {
# min_lr=$(python3 -c 'print(f"{2 / (10 ** 5):.8f}")')
# "--min-lr ${LR:-${min_lr}}" # 2e-5
# "--min-lr ${MIN_LR:-"2e-6"}" # 2e-6
export LR="${LR:-0.0002}"
export LR_DECAY_STYLE="${LR_DECAY_STYLE:-cosine}"
export LR_WARMUP_FRAC="${LR_WARMUP_FRAC:-0.05}"
lr_flags=(
"--lr ${LR:-0.0002}"
"--lr-decay-style ${LR_DECAY_STYLE:-cosine}"
"--lr-warmup-fraction ${LR_WARMUP_FRAC:-0.05}"
"--lr ${LR}"
"--lr-decay-style ${LR_DECAY_STYLE}"
"--lr-warmup-fraction ${LR_WARMUP_FRAC}"
)
if [[ -n "${LR_DECAY_ITERS:-}" ]]; then
lr_flags+=("--lr-decay-iters ${LR_DECAY_ITERS:-}")
Expand Down Expand Up @@ -225,9 +228,9 @@ setup_run_cmd() {
"${lr_flags[@]}"
"${custom_args[@]}"
"${llama_flags[@]}"
"${DATA_FLAGS}"
"${FLASH_ARG}"
"${TIMING_STR}"
"${TIMING_STR:-}"
"${DATA_FLAGS}"
"${TOKENIZER_FLAGS}"
"${tb_flags[@]}"
"${ds_args[@]}"
Expand Down Expand Up @@ -316,6 +319,8 @@ get_machine_name() {
else
machine="polaris"
fi
elif [[ $(hostname) == sophia* ]]; then
machine="sophia"
elif [[ $(hostname) == nid* ]]; then
machine="perlmutter"
else
Expand All @@ -325,6 +330,7 @@ get_machine_name() {
}

get_machine() {
machine=$(hostname)
if [[ $(hostname) == x4* ]]; then
machine="aurora"
elif [[ $(hostname) == x1* ]]; then
Expand All @@ -335,6 +341,8 @@ get_machine() {
else
machine="polaris"
fi
elif [[ $(hostname) == sophia* ]]; then
machine="sophia"
elif [[ $(hostname) == nid* ]]; then
machine="perlmutter"
else
Expand Down Expand Up @@ -366,7 +374,7 @@ setupSrun() {

printJobInfo() {
echo "++++++++++++++++++++++++++++++++++++++++++++++++++"
echo "- MPICH_DIR=${MPICH_DIR:-${MPI_ROOT}}"
echo "- MPICH_DIR=${MPICH_DIR:-${MPI_ROOT:-}}"
echo "- Using $(which python3)"
echo "- WORLD_SIZE:${WORLD_SIZE-}"
echo "- BACKEND: ${BE:-}"
Expand Down Expand Up @@ -406,6 +414,8 @@ setupLauncher() {
mn=$(get_machine_name)
if [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then
LAUNCHER="${DIST_LAUNCH} --pmi=pmix --genvall $(which python3) -Wignore ${EXEC}"
elif [[ "${mn}" == "sophia" ]]; then
LAUNCHER="${DIST_LAUNCH} $(which python3) -Wignore ${EXEC}"
else
LAUNCHER="${DIST_LAUNCH} --genvall $(which python3) -Wignore ${EXEC}"
fi
Expand Down Expand Up @@ -626,6 +636,22 @@ setParams() {
fi
echo "Setting up AWS NCCL OFI Plugin on Polaris..."
source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit
# ---- [Sophia] ----------------------
elif [[ "${mn}" == sophia* ]]; then
# export LAUNCH_CMD="${LAUNCH_CMD:-deepspeed}"
TP=${TP:-1} # TP (default: 1)
export NCCL=${NCCL:-nccl} # NCCL
export BE="${NCCL}" # BE = NCCL
export DTYPE=${DTYPE:-bf16} # DTYPE (default: bf16)
export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-8} # GRADIENT_ACC_STEPS
export MICRO_BATCH="${MICRO_BATCH:-$(get_batch_size_on_polaris)}"
if [[ -n "${NO_FLASH_ATTN-}" ]]; then
echo "Not using flash-attn!!"
else
FLASH_ARG="--use-flash-attn-v2"
fi
# echo "Setting up AWS NCCL OFI Plugin on Polaris..."
# source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit
# [Perlmutter]
elif [[ "${mn}" == login* || "${mn}" == nid* ]]; then
TP="${TP:-2}"
Expand Down Expand Up @@ -1046,7 +1072,7 @@ setup_tokenizer_and_data() {
export TOKENIZER_TYPE="GPT2"
_tokenizer_flags+=("--tokenizer-type GPT2BPETokenizer")
machine=$(get_machine_name)
if [[ ${machine} == "polaris" ]]; then
if [[ ${machine} == "polaris" || ${machine} == "sophia" ]]; then
export DATA_PARENT="${DATA_PARENT:-/eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed/dataset}"
elif [[ ${machine} == "sunspot" ]]; then
export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}"
Expand Down Expand Up @@ -1075,7 +1101,7 @@ setup_tokenizer_and_data() {
echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${DATA_FILE_LIST:-}"
setData "${dfl}" || exit
fi
export DATA_FLAGS="${_data_flags[*]}"
export DATA_FLAGS="${_data_flags[*]:-}"
export TOKENIZER_FLAGS="${_tokenizer_flags[*]}"
printf "[setData] DATA_FLAGS: %s\n" "$(printGreen "${DATA_FLAGS}")"
printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta "${TOKENIZER_FLAGS}")"
Expand Down Expand Up @@ -1113,7 +1139,7 @@ setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST]
printf "WEIGHT_SUM: %s\n" "${WEIGHT_SUM}"
printf "DFL_STEM: %s\n" "${DFL_STEM}"
printf "DATA_CACHE_PATH: %s\n" "${DATA_CACHE_PATH}"
printf "DATA_FLAGS: %s\n" "${DATA_FLAGS}"
printf "DATA_FLAGS: %s\n" "${DATA_FLAGS:-}"
echo "--------------------"
}

Expand Down