Commit 3579803
[Iluvatar] add example of training llama2-7b with tp hetero mode enabled
yu.song committed Jun 7, 2024
1 parent 27577d9 commit 3579803
Showing 2 changed files with 95 additions and 1 deletion.
29 changes: 28 additions & 1 deletion examples/llama/conf/config.yaml
@@ -49,4 +49,31 @@ action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra

# defaults:
#   - train: train_llama2_7b_tp_hetero
#   - _self_

# experiment:
#   exp_name: llama2_tp_hetero
#   exp_dir: ./outputs_llama2_tp_hetero
#   task:
#     type: train
#     backend: megatron
#     entrypoint: ./flagscale/train/hetero/train_llama.py
#   runner:
#     backend: torchrun
#     nnodes: 1
#     nproc_per_node: 8
#     hostfile: hostfile
#   envs:
#     CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
#     CUDA_DEVICE_MAX_CONNECTIONS: 1

# action: run

# hydra:
#   run:
#     dir: ${experiment.exp_dir}/hydra
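The commented-out block above shows how this example is wired up: point the hydra `defaults` at the new `train_llama2_7b_tp_hetero` group and describe the experiment and torchrun runner. Below is a minimal sketch (not part of the commit) that only checks the config composes with Hydra's compose API; it assumes hydra-core is installed, the script sits in the repository root, and the group ends up packaged under a `train` key — all assumptions about the repo layout, not facts from the diff.

```python
# Sketch only: verify that config.yaml composes with the hetero train group
# selected, mirroring the commented-out `defaults` block above.
# Assumptions: hydra-core installed, run from the repository root, group packaged
# under cfg.train; drop the leading '+' if a train group is already in defaults.
from hydra import compose, initialize

with initialize(version_base=None, config_path="examples/llama/conf"):
    cfg = compose(
        config_name="config",
        overrides=["+train=train_llama2_7b_tp_hetero"],
    )
    print(cfg.train.system.hetero_mode)              # expected: pp
    print(cfg.train.system.hetero_pipeline_stages)   # expected: [3, 16, 8, 8]
```

The actual training run would go through the repository's own launcher rather than this snippet; the snippet only verifies that the composed config resolves the hetero train group.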

67 changes: 67 additions & 0 deletions examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml
@@ -0,0 +1,67 @@
system:
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 3
  disable_bias_linear: True
  use_flash_attn: True
  sequence_parallel: True
  use_distributed_optimizer: True
  hetero_mode: pp
  hetero_device_types: A100
  hetero_current_device_type: A100
  hetero_pipeline_stages: [3,16,8,8]
  process_meshes: [4,1,1,2,1,2]
  precision:
    bf16: True
    initial_loss_scale: 16384
    min_loss_scale: 1.0
  logging:
    log_interval: 1
  checkpoint:
    save_interval: 100

model:
  use_mcore_models: True
  transformer_impl: transformer_engine
  num_layers: 32
  hidden_size: 4096
  ffn_hidden_size: 11008
  num_attention_heads: 32
  seq_length: 4096
  group_query_attention: False
  num_query_groups: 8
  max_position_embeddings: 4096
  norm_epsilon: 1e-5
  use_rotary_position_embeddings: True
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  untie_embeddings_and_output_weights: True
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  weight_decay: 0.1
  clip_grad: 1.0
  train_iters: 30
  eval_iters: 0
  eval_interval: 2000
  micro_batch_size: 1
  global_batch_size: 32

  optimizer:
    weight_decay: 1e-2
    adam_beta1: 0.9
    adam_beta2: 0.95
    lr_scheduler:
      lr: 0.00015
      min_lr: 1.0e-5
      lr_warmup_fraction: .01
      lr_decay_iters: 1
      lr_decay_style: cosine

data:
  data_path: ${data_path:??}
  split: 1
  tokenizer:
    tokenizer_type: Llama2Tokenizer
    tokenizer_model: examples/llama/tokenizer.model
    vocab_size: 32000
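The hetero settings in this file have to stay mutually consistent: the layer split in `hetero_pipeline_stages` must cover all `num_layers`, and the process meshes must account for every rank the 8-GPU runner starts. The sketch below (not part of the commit) checks that arithmetic; reading `process_meshes` as consecutive (tp, dp, pp) triples and `hetero_pipeline_stages` as `[num_stages, layers_per_stage...]` is an assumption that happens to make the numbers line up, not something the diff documents.

```python
# Sanity-check sketch for the hetero config above.
# Assumption: process_meshes = consecutive (tp, dp, pp) triples;
#             hetero_pipeline_stages = [num_stages, layers_stage0, layers_stage1, ...].

num_layers = 32
pipeline_model_parallel_size = 3
nproc_per_node = 8

hetero_pipeline_stages = [3, 16, 8, 8]
process_meshes = [4, 1, 1, 2, 1, 2]

num_stages, *layers_per_stage = hetero_pipeline_stages
assert num_stages == len(layers_per_stage) == pipeline_model_parallel_size
assert sum(layers_per_stage) == num_layers              # 16 + 8 + 8 == 32

meshes = [tuple(process_meshes[i:i + 3]) for i in range(0, len(process_meshes), 3)]
total_ranks = sum(tp * dp * pp for tp, dp, pp in meshes)  # 4*1*1 + 2*1*2 == 8
total_stages = sum(pp for _, _, pp in meshes)             # 1 + 2 == 3

assert total_ranks == nproc_per_node
assert total_stages == pipeline_model_parallel_size
print("hetero split is consistent:", meshes, layers_per_stage)
```

Under this reading, the first mesh (tp=4, one stage) would host the 16-layer stage and the second mesh (tp=2, two stages) the two 8-layer stages, which is one plausible way to balance the unequal devices this example targets.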
