[kunlunxin] add llama3 70b patch
brianlcy123 committed Nov 21, 2024
1 parent 6b238b7 commit 8308ae6
Showing 5 changed files with 230 additions and 128 deletions.
102 changes: 46 additions & 56 deletions examples/llama/conf/config.yaml
@@ -1,63 +1,53 @@
-action: run
+# defaults:
+#   - train: train_llama2_7b
+#   - _self_
+
+# experiment:
+#   exp_name: llama2
+#   exp_dir: ./outputs_llama2
+#   task:
+#     type: train
+#     backend: megatron
+#     entrypoint: ./flagscale/train/train_llama.py
+#   runner:
+#     backend: torchrun
+#     nnodes: 1
+#     nproc_per_node: 8
+#     hostfile: hostfile
+#     envs:
+#       CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+#       CUDA_DEVICE_MAX_CONNECTIONS: 1
+
+# action: run
+
+# hydra:
+#   run:
+#     dir: ${experiment.exp_dir}/hydra
+
+
 defaults:
-  - train: train_llama3_70b_finetune
-  - _self_
+  - train: train_llama3_70b
+  - _self_
+
 experiment:
-  cmds:
-    before_start: source ~/.bashrc; export LD_LIBRARY_PATH=/share/project/PUBLIC/data/llama3-70b/xpu_output//xccl/3.0.0.4_20241107/xccl_rdma-ubuntu_x86_64/so/:/share/project/PUBLIC/data/llama3-70b/xpu_output//xhpc/20241107/xhpc-ubuntu2004_x86_64/xblas/so:/share/project/PUBLIC/data/llama3-70b/xpu_output//xhpc/20241107/xhpc-ubuntu2004_x86_64/xfa/so:/share/project/PUBLIC/data/llama3-70b/xpu_output//xhpc/20241107/xhpc-ubuntu2004_x86_64/xpudnn/so:/share/project/PUBLIC/data/llama3-70b/xpu_output//xre/5.0.21.5/xre-Linux-x86_64-5.0.21.5/so
-  envs:
-    ALLGATHER_ASYNC: false
-    ALLREDUCE_ASYNC: false
-    ALLREDUCE_FUSION: 0
-    BKCL_CCIX_BUFFER_GM: 1
-    BKCL_CCIX_RING: 1
-    BKCL_ENABLE_XDR: 1
-    BKCL_FLAT_RING: 1
-    BKCL_KL3_TURBO_MODE: 1
-    BKCL_RDMA_FORCE_TREE: 1
-    BKCL_RDMA_NICS: ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0
-    BKCL_RDMA_PROXY_DISABLE: 1
-    BKCL_RING_BUFFER_GM: 1
-    BKCL_TIMEOUT: 360000
-    BKCL_TRANS_UNSUPPORTED_DATATYPE: 8
-    BKCL_TREE_THRESHOLD: 1
-    BKCL_XLINK_C2C: 1
-    BKCL_XLINK_D2D: 0
-    BKCL_XLINK_ETH: 0
-    CUDART_DUMMY_REGISTER: 1
-    CUDA_DEVICE_MAX_CONNECTIONS: 8
-    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
-    DIST_MULTI_STREAM: true
-    FAST_SWIGLU_ENABLE: 1
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    USE_FAST_BF16_FC: true
-    USE_L3: 1
-    XBLAS_FC_HBM_VERSION: 40
-    XDNN_USE_FAST_SWISH: true
-    XMLIR_BATCH_PARALLEL: true
-    XMLIR_DISABLE_CUDA_ALLOCATOR: true
-    XMLIR_DIST_ASYNC_ISEND_IRECV: 1
-    XMLIR_DIST_SINGLETON_STREAM: true
-    XMLIR_DUMP_FALLBACK_OP_LIST_BOOL: true
-    XMLIR_ENABLE_FALLBACK_TO_CPU_BOOL: false
-    XMLIR_FA_GEMM_TYPE: float
-    XMLIR_PARALLEL_SAVE_MEMORY: false
-    XMLIR_XDNN_PYTORCH_CHECK_ENABLE_FALLBACK_BOOL: 0
-    XPU_FORCE_USERMODE_LAUNCH: 1
-    XPU_ZEBU_MODE: 1
-  exp_dir: /share/project/PUBLIC/data/llama3-70b/FlagOpen/FlagPerf/training/result/run20241120200156/llama3_70B_continuetrain:flagscale_llama:R300p:4:8:1/round1/10.1.15.7_noderank0/outputs_llama3
   exp_name: llama3
-  runner:
-    backend: torchrun
-    hostfile: /share/project/PUBLIC/data/llama3-70b/FlagOpen/FlagScale/hostfile
-    nnodes: 4
-    nproc_per_node: 8
-    ssh_port: 3702
+  exp_dir: ./outputs_llama3_70b
   task:
+    type: train
     backend: megatron
     entrypoint: ./flagscale/train/train_llama.py
-    type: train
-hydra:
+  runner:
+    backend: torchrun
+    nnodes: 4
+    nproc_per_node: 8
+    hostfile: ${hostfile??}
+    envs:
+      CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+      CUDA_DEVICE_MAX_CONNECTIONS: 1
+      NVTE_APPLY_QK_LAYER_SCALING: 0
+      NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+action: run
+
+hydra:
   run:
-    dir: ${experiment.exp_dir}/hydra
+    dir: ${experiment.exp_dir}/hydra
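Note that everything under the deleted `experiment.cmds` / `experiment.envs` keys is launcher-side configuration: it is meant to be exported into the training processes by the runner, not parsed by `train_llama.py`. The sketch below illustrates that mechanism with a few of the deleted kunlunxin values; it is an illustration only, not FlagScale's actual runner code (the `flatten_envs` helper and the exact `torchrun` command line are assumptions, and multi-node rendezvous flags are omitted):

```python
import os
import subprocess

def flatten_envs(envs: dict) -> dict:
    """Stringify YAML-typed values (bools, ints) for the child environment."""
    out = {}
    for key, value in envs.items():
        if isinstance(value, bool):
            out[key] = str(value).lower()  # True -> "true", matching the YAML style
        else:
            out[key] = str(value)
    return out

# A few values taken from the deleted kunlunxin config above.
envs = {"BKCL_TIMEOUT": 360000, "ALLGATHER_ASYNC": False, "CUDA_DEVICE_MAX_CONNECTIONS": 8}

child_env = {**os.environ, **flatten_envs(envs)}
subprocess.run(
    ["torchrun", "--nnodes", "4", "--nproc_per_node", "8",
     "./flagscale/train/train_llama.py"],  # entrypoint from the config
    env=child_env,
    check=True,
)
```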
124 changes: 64 additions & 60 deletions examples/llama/conf/train/train_llama3_70b_finetune.yaml
@@ -1,72 +1,76 @@
-data:
-  data_path: /share/project/PUBLIC/data/llama3-70b/llama3_dataset/dedup-md5-pile-pile-cc_text_document
-  split: 1
-  tokenizer:
-    tokenizer_path: /share/project/PUBLIC/data/llama3-70b/llama3_tokenizer
-    tokenizer_type: Llama3TokenizerFS
-    vocab_size: 128256
+system:
+  tensor_model_parallel_size: 8
+  pipeline_model_parallel_size: 4
+  make_vocab_size_divisible_by: 64
+  disable_bias_linear: True
+  sequence_parallel: True
+  use_flash_attn: True
+  use_distributed_optimizer: True
+  use_mcore_models: True
+  transformer_impl: transformer_engine
+  precision:
+    bf16: True
+    attention_softmax_in_fp32: True
+    accumulate_allreduce_grads_in_fp32: True
+  logging:
+    log_interval: 1
+    tensorboard_log_interval: 1
+    wandb_project: "train-llama3-70B"
+    wandb_exp_name: "train-llama3-70B"
+  checkpoint:
+    load: ${ckpt_path:??}
+    ckpt_format: torch
+    save_interval: 100
+    finetune: True
+
 model:
-  add_qkv_bias: false
-  attention_dropout: 0.0
-  clip_grad: 1.0
-  ffn_hidden_size: 28672
-  global_batch_size: 1024
-  group_query_attention: true
-  hidden_dropout: 0.0
+  num_layers: 80
   hidden_size: 8192
-  init_method_std: 0.02
+  num_attention_heads: 64
+  group_query_attention: True
+  num_query_groups: 8
+  ffn_hidden_size: 28672
+  seq_length: 8192
   max_position_embeddings: 8192
-  micro_batch_size: 1
-  no_position_embedding: true
   norm_epsilon: 1e-5
   norm_init_weight: 0.02
+  use_rotary_position_embeddings: True
+  rotary_base: 500000
+  no_position_embedding: True
+  reset_position_ids: True
+  add_qkv_bias: false
+  reset_attention_mask: True
+  swiglu: True
   normalization: RMSNorm
-  num_attention_heads: 64
-  num_layers: 80
-  num_query_groups: 8
+  untie_embeddings_and_output_weights: True
+  init_method_std: 0.02
+  attention_dropout: 0.0
+  hidden_dropout: 0.0
+  clip_grad: 1.0
+
+  train_samples: 6160066
+  micro_batch_size: 1
+  global_batch_size: 1024
+  seed: 42
+
   optimizer:
+    start_weight_decay: 0
+    end_weight_decay: 5e-7
+    weight_decay_incr_style: cosine
     adam_beta1: 0.9
     adam_beta2: 0.95
-    end_weight_decay: 5e-7
     lr_scheduler:
       lr: 5e-6
-      lr_decay_style: cosine
-      lr_warmup_samples: 51200
       min_lr: 0
-    start_weight_decay: 0
-    weight_decay_incr_style: cosine
-  reset_attention_mask: true
-  reset_position_ids: true
-  rotary_base: 500000
-  seed: 42
-  seq_length: 8192
-  swiglu: true
-  train_samples: 512000
-  transformer_impl: transformer_engine
-  untie_embeddings_and_output_weights: true
-  use_mcore_models: true
-  use_rotary_position_embeddings: true
-system:
-  checkpoint:
-    ckpt_format: torch
-    finetune: true
-    load: /share/project/PUBLIC/data/llama3-70b/llama3_ckpt
-    save_interval: 100
-  disable_bias_linear: true
-  logging:
-    log_interval: 1
-    tensorboard_log_interval: 1
-    wandb_exp_name: train-llama3-70B
-    wandb_project: train-llama3-70B
-  make_vocab_size_divisible_by: 64
-  pipeline_model_parallel_size: 4
-  precision:
-    accumulate_allreduce_grads_in_fp32: true
-    attention_softmax_in_fp32: true
-    bf16: true
-  sequence_parallel: true
-  tensor_model_parallel_size: 8
-  transformer_impl: transformer_engine
-  use_distributed_optimizer: true
-  use_flash_attn: true
-  use_mcore_models: true
+      lr_warmup_samples: 2048000
+      lr_decay_style: cosine
+
+data:
+  data_path: ${data_path:??}
+  split: 1
+  tokenizer:
+    tokenizer_type: Llama3TokenizerFS
+    tokenizer_path: ${tokenizer_path:??}
+    vocab_size: 128256
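For scale, the sample-based schedule in the added file resolves to the following iteration counts. This is a quick derivation, not values stored in the YAML; the deleted file's 512000/51200 pair resolves to 500 and 50 the same way:

```python
# Sample-based schedule from the added train_llama3_70b_finetune.yaml;
# Megatron derives iteration counts from these sample counts at startup.
train_samples = 6_160_066
lr_warmup_samples = 2_048_000
global_batch_size = 1024

train_iters = train_samples // global_batch_size       # 6015 full batches
warmup_iters = lr_warmup_samples // global_batch_size  # 2000 iterations

print(train_iters, warmup_iters)  # 6015 2000
```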


109 changes: 109 additions & 0 deletions hardware/kunlunxin_R300p/a44556c0/a44556c0.patch
@@ -0,0 +1,109 @@
From 9289f099424ba4d0dec83fb5715d4d2561f4c4d8 Mon Sep 17 00:00:00 2001
From: brianlcy123 <[email protected]>
Date: Thu, 21 Nov 2024 15:46:54 +0800
Subject: [PATCH] [kunlunxin] add updated llama3 70b patch

---
 examples/llama/conf/config.yaml             | 40 ++++++++++++++++++++-
 megatron/megatron/training/arguments.py     | 18 +++++-----
 megatron/megatron/training/checkpointing.py |  5 +--
 3 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/examples/llama/conf/config.yaml b/examples/llama/conf/config.yaml
index 592c45bf..27fb83ae 100644
--- a/examples/llama/conf/config.yaml
+++ b/examples/llama/conf/config.yaml
@@ -45,7 +45,45 @@ experiment:
       CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
       CUDA_DEVICE_MAX_CONNECTIONS: 1
       NVTE_APPLY_QK_LAYER_SCALING: 0
-      NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+      NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1
+      ALLGATHER_ASYNC: false
+      ALLREDUCE_ASYNC: false
+      ALLREDUCE_FUSION: 0
+      BKCL_CCIX_BUFFER_GM: 1
+      BKCL_CCIX_RING: 1
+      BKCL_ENABLE_XDR: 1
+      BKCL_FLAT_RING: 1
+      BKCL_KL3_TURBO_MODE: 1
+      BKCL_RDMA_FORCE_TREE: 1
+      BKCL_RDMA_NICS: ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0
+      BKCL_RDMA_PROXY_DISABLE: 1
+      BKCL_RING_BUFFER_GM: 1
+      BKCL_TIMEOUT: 360000
+      BKCL_TRANS_UNSUPPORTED_DATATYPE: 8
+      BKCL_TREE_THRESHOLD: 1
+      BKCL_XLINK_C2C: 1
+      BKCL_XLINK_D2D: 0
+      BKCL_XLINK_ETH: 0
+      CUDART_DUMMY_REGISTER: 1
+      FAST_SWIGLU_ENABLE: 1
+      USE_FAST_BF16_FC: true
+      USE_L3: 1
+      XDNN_USE_FAST_SWISH: true
+      XPU_ZEBU_MODE: 1
+      XPU_FORCE_USERMODE_LAUNCH: 1
+      DIST_MULTI_STREAM: true
+      XMLIR_DIST_SINGLETON_STREAM: true
+      XMLIR_FA_GEMM_TYPE: float
+      XBLAS_FC_HBM_VERSION: 40
+      XMLIR_PARALLEL_SAVE_MEMORY: false
+      XMLIR_DISABLE_CUDA_ALLOCATOR: true
+      XMLIR_XDNN_PYTORCH_CHECK_ENABLE_FALLBACK_BOOL: 0
+      XMLIR_ENABLE_FALLBACK_TO_CPU_BOOL: False
+      XMLIR_DUMP_FALLBACK_OP_LIST_BOOL: true
+      XMLIR_BATCH_PARALLEL: true
+      DIST_MULTI_STREAM: true
+      CUDA_DEVICE_MAX_CONNECTIONS: 8
+      XMLIR_DIST_ASYNC_ISEND_IRECV: 1
 action: run
 
 hydra:
diff --git a/megatron/megatron/training/arguments.py b/megatron/megatron/training/arguments.py
index e20f178b..7e79da2a 100644
--- a/megatron/megatron/training/arguments.py
+++ b/megatron/megatron/training/arguments.py
@@ -652,15 +652,15 @@ def validate_args(args, defaults={}):
     if args.sequence_parallel:
         args.async_tensor_model_parallel_allreduce = False
 
-    if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
-        if args.sequence_parallel:
-            raise RuntimeError(
-                "Using sequence parallelism requires setting the environment variable "
-                "CUDA_DEVICE_MAX_CONNECTIONS to 1")
-        if args.async_tensor_model_parallel_allreduce:
-            raise RuntimeError(
-                "Using async gradient all reduce requires setting the environment "
-                "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
+    # if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+    #     if args.sequence_parallel:
+    #         raise RuntimeError(
+    #             "Using sequence parallelism requires setting the environment variable "
+    #             "CUDA_DEVICE_MAX_CONNECTIONS to 1")
+    #     if args.async_tensor_model_parallel_allreduce:
+    #         raise RuntimeError(
+    #             "Using async gradient all reduce requires setting the environment "
+    #             "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
 
     # Disable bias gelu fusion if we are disabling bias altogether
     if not args.add_bias_linear:
diff --git a/megatron/megatron/training/checkpointing.py b/megatron/megatron/training/checkpointing.py
index 01425f36..80fa0254 100644
--- a/megatron/megatron/training/checkpointing.py
+++ b/megatron/megatron/training/checkpointing.py
@@ -530,8 +530,9 @@ def save_dataloader_state(train_iterator, iteration, dataloader_save_path):

     torch.distributed.barrier(group=mpu.get_data_parallel_group())
 
-    if mpu.get_data_parallel_rank() == 0:
-        ensure_directory_exists(data_state_save_path)
+    # if mpu.get_data_parallel_rank() == 0:
+    #     ensure_directory_exists(data_state_save_path)
+    ensure_directory_exists(data_state_save_path)
 
     torch.distributed.barrier(group=mpu.get_data_parallel_group())

--
2.25.1
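The directory name `a44556c0` presumably names the FlagScale base commit the patch was generated against. Assuming a checkout at that commit, applying it might look like the sketch below; the dry run with `git apply --check` exits non-zero if any hunk fails, e.g. when the working tree is not at the expected base:

```python
import subprocess

PATCH = "hardware/kunlunxin_R300p/a44556c0/a44556c0.patch"

# Dry-run first: verifies every hunk before anything is modified.
subprocess.run(["git", "apply", "--check", PATCH], check=True)
subprocess.run(["git", "apply", PATCH], check=True)
```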
18 changes: 9 additions & 9 deletions megatron/megatron/training/arguments.py
@@ -548,15 +548,15 @@ def validate_args(args, defaults={}):
     if args.sequence_parallel:
         args.async_tensor_model_parallel_allreduce = False
 
-    # if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
-    #     if args.sequence_parallel:
-    #         raise RuntimeError(
-    #             "Using sequence parallelism requires setting the environment variable "
-    #             "CUDA_DEVICE_MAX_CONNECTIONS to 1")
-    #     if args.async_tensor_model_parallel_allreduce:
-    #         raise RuntimeError(
-    #             "Using async gradient all reduce requires setting the environment "
-    #             "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
+    if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+        if args.sequence_parallel:
+            raise RuntimeError(
+                "Using sequence parallelism requires setting the environment variable "
+                "CUDA_DEVICE_MAX_CONNECTIONS to 1")
+        if args.async_tensor_model_parallel_allreduce:
+            raise RuntimeError(
+                "Using async gradient all reduce requires setting the environment "
+                "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
 
     # Disable bias gelu fusion if we are disabling bias altogether
     if not args.add_bias_linear:
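This restores the upstream guard: with sequence parallelism enabled, Megatron-LM insists on `CUDA_DEVICE_MAX_CONNECTIONS=1` so that tensor-parallel communication kernels launch in a fixed order. The kunlunxin config runs with the value 8 and relies on the patch above to comment the check back out at apply time. A standalone sketch of the restored behavior, using the values from the configs above:

```python
import os

# Kunlunxin runs with 8 connection queues (see the config above); upstream
# Megatron-LM requires "1" whenever sequence parallelism is enabled.
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "8"

sequence_parallel = True  # system.sequence_parallel in the train config
if os.environ.get("CUDA_DEVICE_MAX_CONNECTIONS") != "1" and sequence_parallel:
    # This is the branch the restored upstream check takes, and the one
    # the kunlunxin patch comments out.
    print("upstream validate_args would raise RuntimeError here")
```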
5 changes: 2 additions & 3 deletions megatron/megatron/training/checkpointing.py
@@ -578,9 +578,8 @@ def save_dataloader_state(train_iterator, iteration, dataloader_save_path):

     torch.distributed.barrier(group=mpu.get_data_parallel_group())
 
-    # if mpu.get_data_parallel_rank() == 0:
-    #     ensure_directory_exists(data_state_save_path)
-    ensure_directory_exists(data_state_save_path)
+    if mpu.get_data_parallel_rank() == 0:
+        ensure_directory_exists(data_state_save_path)
 
     torch.distributed.barrier(group=mpu.get_data_parallel_group())

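This reverts `save_dataloader_state` to creating the save directory on data-parallel rank 0 only, bracketed by barriers; the kunlunxin patch keeps the every-rank `ensure_directory_exists` call instead, which is the safe choice if ranks write to node-local rather than shared storage (an assumption; the commit itself gives no motivation). A minimal sketch of both behaviors with plain `torch.distributed` (assumes the process group is already initialized):

```python
import os
import torch.distributed as dist

def ensure_dir_for_all_ranks(path: str, shared_fs: bool) -> None:
    """Create `path` before every rank writes its dataloader state into it."""
    if shared_fs:
        # One rank creates the directory; the barrier publishes it to the rest.
        if dist.get_rank() == 0:
            os.makedirs(path, exist_ok=True)
    else:
        # Node-local storage: rank 0's mkdir is invisible to other nodes,
        # so every rank must create its own copy (what the patch does).
        os.makedirs(path, exist_ok=True)
    dist.barrier()
```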
