Commit 8308ae6 (1 parent: 6b238b7)
Showing 5 changed files with 230 additions and 128 deletions.
@@ -1,63 +1,53 @@
action: run
# defaults:
#   - train: train_llama2_7b
#   - _self_

# experiment:
#   exp_name: llama2
#   exp_dir: ./outputs_llama2
#   task:
#     type: train
#     backend: megatron
#     entrypoint: ./flagscale/train/train_llama.py
#   runner:
#     backend: torchrun
#     nnodes: 1
#     nproc_per_node: 8
#     hostfile: hostfile
#   envs:
#     CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
#     CUDA_DEVICE_MAX_CONNECTIONS: 1

# action: run

# hydra:
#   run:
#     dir: ${experiment.exp_dir}/hydra


defaults:
  - train: train_llama3_70b_finetune
  - _self_
  - train: train_llama3_70b
  - _self_

experiment:
  cmds:
    before_start: source ~/.bashrc; export LD_LIBRARY_PATH=/share/project/PUBLIC/data/llama3-70b/xpu_output//xccl/3.0.0.4_20241107/xccl_rdma-ubuntu_x86_64/so/:/share/project/PUBLIC/data/llama3-70b/xpu_output//xhpc/20241107/xhpc-ubuntu2004_x86_64/xblas/so:/share/project/PUBLIC/data/llama3-70b/xpu_output//xhpc/20241107/xhpc-ubuntu2004_x86_64/xfa/so:/share/project/PUBLIC/data/llama3-70b/xpu_output//xhpc/20241107/xhpc-ubuntu2004_x86_64/xpudnn/so:/share/project/PUBLIC/data/llama3-70b/xpu_output//xre/5.0.21.5/xre-Linux-x86_64-5.0.21.5/so
  envs:
    ALLGATHER_ASYNC: false
    ALLREDUCE_ASYNC: false
    ALLREDUCE_FUSION: 0
    BKCL_CCIX_BUFFER_GM: 1
    BKCL_CCIX_RING: 1
    BKCL_ENABLE_XDR: 1
    BKCL_FLAT_RING: 1
    BKCL_KL3_TURBO_MODE: 1
    BKCL_RDMA_FORCE_TREE: 1
    BKCL_RDMA_NICS: ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0
    BKCL_RDMA_PROXY_DISABLE: 1
    BKCL_RING_BUFFER_GM: 1
    BKCL_TIMEOUT: 360000
    BKCL_TRANS_UNSUPPORTED_DATATYPE: 8
    BKCL_TREE_THRESHOLD: 1
    BKCL_XLINK_C2C: 1
    BKCL_XLINK_D2D: 0
    BKCL_XLINK_ETH: 0
    CUDART_DUMMY_REGISTER: 1
    CUDA_DEVICE_MAX_CONNECTIONS: 8
    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
    DIST_MULTI_STREAM: true
    FAST_SWIGLU_ENABLE: 1
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    NVTE_APPLY_QK_LAYER_SCALING: 0
    USE_FAST_BF16_FC: true
    USE_L3: 1
    XBLAS_FC_HBM_VERSION: 40
    XDNN_USE_FAST_SWISH: true
    XMLIR_BATCH_PARALLEL: true
    XMLIR_DISABLE_CUDA_ALLOCATOR: true
    XMLIR_DIST_ASYNC_ISEND_IRECV: 1
    XMLIR_DIST_SINGLETON_STREAM: true
    XMLIR_DUMP_FALLBACK_OP_LIST_BOOL: true
    XMLIR_ENABLE_FALLBACK_TO_CPU_BOOL: false
    XMLIR_FA_GEMM_TYPE: float
    XMLIR_PARALLEL_SAVE_MEMORY: false
    XMLIR_XDNN_PYTORCH_CHECK_ENABLE_FALLBACK_BOOL: 0
    XPU_FORCE_USERMODE_LAUNCH: 1
    XPU_ZEBU_MODE: 1
  exp_dir: /share/project/PUBLIC/data/llama3-70b/FlagOpen/FlagPerf/training/result/run20241120200156/llama3_70B_continuetrain:flagscale_llama:R300p:4:8:1/round1/10.1.15.7_noderank0/outputs_llama3
  exp_name: llama3
  runner:
    backend: torchrun
    hostfile: /share/project/PUBLIC/data/llama3-70b/FlagOpen/FlagScale/hostfile
    nnodes: 4
    nproc_per_node: 8
    ssh_port: 3702
  exp_dir: ./outputs_llama3_70b
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_llama.py
    type: train
hydra:
  runner:
    backend: torchrun
    nnodes: 4
    nproc_per_node: 8
    hostfile: ${hostfile??}
  envs:
    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
    dir: ${experiment.exp_dir}/hydra
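
The `dir: ${experiment.exp_dir}/hydra` entry above is an OmegaConf-style interpolation that Hydra resolves against `experiment.exp_dir` at launch time, while entries such as `hostfile: ${hostfile??}` appear to be placeholders the runner fills in externally. A minimal sketch of the interpolation behaviour, assuming only a stock omegaconf install and illustrative values (not the real cluster paths):

# Sketch: how ${experiment.exp_dir}/hydra resolves (illustrative values only).
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "experiment": {"exp_dir": "./outputs_llama3_70b"},
    "hydra": {"run": {"dir": "${experiment.exp_dir}/hydra"}},
})

# hydra.run.dir is resolved lazily from experiment.exp_dir:
print(OmegaConf.to_yaml(cfg, resolve=True))
# ...
# hydra:
#   run:
#     dir: ./outputs_llama3_70b/hydra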
examples/llama/conf/train/train_llama3_70b_finetune.yaml (124 changes: 64 additions & 60 deletions)
@@ -1,72 +1,76 @@
data:
  data_path: /share/project/PUBLIC/data/llama3-70b/llama3_dataset/dedup-md5-pile-pile-cc_text_document
  split: 1
  tokenizer:
    tokenizer_path: /share/project/PUBLIC/data/llama3-70b/llama3_tokenizer
    tokenizer_type: Llama3TokenizerFS
    vocab_size: 128256
system:
  tensor_model_parallel_size: 8
  pipeline_model_parallel_size: 4
  make_vocab_size_divisible_by: 64
  disable_bias_linear: True
  sequence_parallel: True
  use_flash_attn: True
  use_distributed_optimizer: True
  use_mcore_models: True
  transformer_impl: transformer_engine
  precision:
    bf16: True
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: True
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: "train-llama3-70B"
    wandb_exp_name: "train-llama3-70B"
  checkpoint:
    load: ${ckpt_path:??}
    ckpt_format: torch
    save_interval: 100
    finetune: True

model:
  add_qkv_bias: false
  attention_dropout: 0.0
  clip_grad: 1.0
  ffn_hidden_size: 28672
  global_batch_size: 1024
  group_query_attention: true
  hidden_dropout: 0.0
  num_layers: 80
  hidden_size: 8192
  init_method_std: 0.02
  num_attention_heads: 64
  group_query_attention: True
  num_query_groups: 8
  ffn_hidden_size: 28672
  seq_length: 8192
  max_position_embeddings: 8192
  micro_batch_size: 1
  no_position_embedding: true
  norm_epsilon: 1e-5
  norm_init_weight: 0.02
  use_rotary_position_embeddings: True
  rotary_base: 500000
  no_position_embedding: True
  reset_position_ids: True
  add_qkv_bias: false
  reset_attention_mask: True
  swiglu: True
  normalization: RMSNorm
  num_attention_heads: 64
  num_layers: 80
  num_query_groups: 8
  untie_embeddings_and_output_weights: True
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  clip_grad: 1.0

  train_samples: 6160066
  micro_batch_size: 1
  global_batch_size: 1024
  seed: 42

  optimizer:
    start_weight_decay: 0
    end_weight_decay: 5e-7
    weight_decay_incr_style: cosine
    adam_beta1: 0.9
    adam_beta2: 0.95
    end_weight_decay: 5e-7
    lr_scheduler:
      lr: 5e-6
      lr_decay_style: cosine
      lr_warmup_samples: 51200
      min_lr: 0
    start_weight_decay: 0
    weight_decay_incr_style: cosine
  reset_attention_mask: true
  reset_position_ids: true
  rotary_base: 500000
  seed: 42
  seq_length: 8192
  swiglu: true
  train_samples: 512000
  transformer_impl: transformer_engine
  untie_embeddings_and_output_weights: true
  use_mcore_models: true
  use_rotary_position_embeddings: true
system:
  checkpoint:
    ckpt_format: torch
    finetune: true
    load: /share/project/PUBLIC/data/llama3-70b/llama3_ckpt
    save_interval: 100
  disable_bias_linear: true
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_exp_name: train-llama3-70B
    wandb_project: train-llama3-70B
  make_vocab_size_divisible_by: 64
  pipeline_model_parallel_size: 4
  precision:
    accumulate_allreduce_grads_in_fp32: true
    attention_softmax_in_fp32: true
    bf16: true
  sequence_parallel: true
  tensor_model_parallel_size: 8
  transformer_impl: transformer_engine
  use_distributed_optimizer: true
  use_flash_attn: true
  use_mcore_models: true
      lr_warmup_samples: 2048000
      lr_decay_style: cosine

data:
  data_path: ${data_path:??}
  split: 1
  tokenizer:
    tokenizer_type: Llama3TokenizerFS
    tokenizer_path: ${tokenizer_path:??}
    vocab_size: 128256

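As a sanity check on the model section above, the listed hyperparameters (80 layers, hidden size 8192, FFN size 28672, 64 attention heads with 8 KV groups, a 128256-token vocabulary, untied embeddings) do land at roughly 70B parameters. A back-of-the-envelope sketch, not the exact Megatron accounting:

# Rough parameter count implied by the model section above
# (grouped-query attention + SwiGLU, untied input/output embeddings).
hidden, layers, ffn = 8192, 80, 28672
heads, kv_groups, vocab = 64, 8, 128256
head_dim = hidden // heads                                      # 128

attn = 2 * hidden * hidden + 2 * hidden * kv_groups * head_dim  # Q/O + K/V projections
mlp = 3 * hidden * ffn                                          # gate, up, down (SwiGLU)
per_layer = attn + mlp + 2 * hidden                             # plus two RMSNorm weight vectors

embeddings = 2 * vocab * hidden                                 # untie_embeddings_and_output_weights: true
total = layers * per_layer + embeddings + hidden                # plus the final norm
print(f"{total / 1e9:.1f}B parameters")                         # ~70.6B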
@@ -0,0 +1,109 @@
From 9289f099424ba4d0dec83fb5715d4d2561f4c4d8 Mon Sep 17 00:00:00 2001
From: brianlcy123 <[email protected]>
Date: Thu, 21 Nov 2024 15:46:54 +0800
Subject: [PATCH] [kunlunxin] add updated llama3 70b patch

---
examples/llama/conf/config.yaml | 40 ++++++++++++++++++++-
megatron/megatron/training/arguments.py | 18 +++++-----
megatron/megatron/training/checkpointing.py | 5 +--
3 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/examples/llama/conf/config.yaml b/examples/llama/conf/config.yaml
index 592c45bf..27fb83ae 100644
--- a/examples/llama/conf/config.yaml
+++ b/examples/llama/conf/config.yaml
@@ -45,7 +45,45 @@ experiment:
CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_APPLY_QK_LAYER_SCALING: 0
- NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+ NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1
+ ALLGATHER_ASYNC: false
+ ALLREDUCE_ASYNC: false
+ ALLREDUCE_FUSION: 0
+ BKCL_CCIX_BUFFER_GM: 1
+ BKCL_CCIX_RING: 1
+ BKCL_ENABLE_XDR: 1
+ BKCL_FLAT_RING: 1
+ BKCL_KL3_TURBO_MODE: 1
+ BKCL_RDMA_FORCE_TREE: 1
+ BKCL_RDMA_NICS: ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0
+ BKCL_RDMA_PROXY_DISABLE: 1
+ BKCL_RING_BUFFER_GM: 1
+ BKCL_TIMEOUT: 360000
+ BKCL_TRANS_UNSUPPORTED_DATATYPE: 8
+ BKCL_TREE_THRESHOLD: 1
+ BKCL_XLINK_C2C: 1
+ BKCL_XLINK_D2D: 0
+ BKCL_XLINK_ETH: 0
+ CUDART_DUMMY_REGISTER: 1
+ FAST_SWIGLU_ENABLE: 1
+ USE_FAST_BF16_FC: true
+ USE_L3: 1
+ XDNN_USE_FAST_SWISH: true
+ XPU_ZEBU_MODE: 1
+ XPU_FORCE_USERMODE_LAUNCH: 1
+ DIST_MULTI_STREAM: true
+ XMLIR_DIST_SINGLETON_STREAM: true
+ XMLIR_FA_GEMM_TYPE: float
+ XBLAS_FC_HBM_VERSION: 40
+ XMLIR_PARALLEL_SAVE_MEMORY: false
+ XMLIR_DISABLE_CUDA_ALLOCATOR: true
+ XMLIR_XDNN_PYTORCH_CHECK_ENABLE_FALLBACK_BOOL: 0
+ XMLIR_ENABLE_FALLBACK_TO_CPU_BOOL: False
+ XMLIR_DUMP_FALLBACK_OP_LIST_BOOL: true
+ XMLIR_BATCH_PARALLEL: true
+ DIST_MULTI_STREAM: true
+ CUDA_DEVICE_MAX_CONNECTIONS: 8
+ XMLIR_DIST_ASYNC_ISEND_IRECV: 1
action: run

hydra:
diff --git a/megatron/megatron/training/arguments.py b/megatron/megatron/training/arguments.py
index e20f178b..7e79da2a 100644
--- a/megatron/megatron/training/arguments.py
+++ b/megatron/megatron/training/arguments.py
@@ -652,15 +652,15 @@ def validate_args(args, defaults={}):
if args.sequence_parallel:
args.async_tensor_model_parallel_allreduce = False

- if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
- if args.sequence_parallel:
- raise RuntimeError(
- "Using sequence parallelism requires setting the environment variable "
- "CUDA_DEVICE_MAX_CONNECTIONS to 1")
- if args.async_tensor_model_parallel_allreduce:
- raise RuntimeError(
- "Using async gradient all reduce requires setting the environment "
- "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
+ # if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+ #     if args.sequence_parallel:
+ #         raise RuntimeError(
+ #             "Using sequence parallelism requires setting the environment variable "
+ #             "CUDA_DEVICE_MAX_CONNECTIONS to 1")
+ #     if args.async_tensor_model_parallel_allreduce:
+ #         raise RuntimeError(
+ #             "Using async gradient all reduce requires setting the environment "
+ #             "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")

# Disable bias gelu fusion if we are disabling bias altogether
if not args.add_bias_linear:
diff --git a/megatron/megatron/training/checkpointing.py b/megatron/megatron/training/checkpointing.py
index 01425f36..80fa0254 100644
--- a/megatron/megatron/training/checkpointing.py
+++ b/megatron/megatron/training/checkpointing.py
@@ -530,8 +530,9 @@ def save_dataloader_state(train_iterator, iteration, dataloader_save_path):

torch.distributed.barrier(group=mpu.get_data_parallel_group())

- if mpu.get_data_parallel_rank() == 0:
- ensure_directory_exists(data_state_save_path)
+ # if mpu.get_data_parallel_rank() == 0:
+ #     ensure_directory_exists(data_state_save_path)
+ ensure_directory_exists(data_state_save_path)

torch.distributed.barrier(group=mpu.get_data_parallel_group())

--
2.25.1
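
The arguments.py hunk above simply comments out Megatron's check that CUDA_DEVICE_MAX_CONNECTIONS is set to 1, since the kunlunxin runs export it as 8. Purely as a hedged sketch (this is not what the patch does), the same relaxation could be kept behind an opt-out flag; the FS_SKIP_CUDA_CONN_CHECK name below is hypothetical:

# Sketch: keep the guard but allow non-NVIDIA backends to opt out,
# instead of deleting the check outright (FS_SKIP_CUDA_CONN_CHECK is hypothetical).
import os

def check_cuda_device_max_connections(sequence_parallel: bool,
                                      async_tp_allreduce: bool) -> None:
    if os.environ.get("FS_SKIP_CUDA_CONN_CHECK") == "1":
        return  # e.g. kunlunxin XPU runs with CUDA_DEVICE_MAX_CONNECTIONS=8
    if os.environ.get("CUDA_DEVICE_MAX_CONNECTIONS") != "1":
        if sequence_parallel:
            raise RuntimeError(
                "Using sequence parallelism requires setting the environment "
                "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
        if async_tp_allreduce:
            raise RuntimeError(
                "Using async gradient all reduce requires setting the "
                "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1")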