diff --git a/examples/llama/conf/config.yaml b/examples/llama/conf/config.yaml
index 89ae03e9b..cc836e182 100644
--- a/examples/llama/conf/config.yaml
+++ b/examples/llama/conf/config.yaml
@@ -49,4 +49,31 @@ action: run
 
 hydra:
   run:
-    dir: ${experiment.exp_dir}/hydra
\ No newline at end of file
+    dir: ${experiment.exp_dir}/hydra
+
+# defaults:
+#   - train: train_llama2_7b_tp_hetero
+#   - _self_
+
+# experiment:
+#   exp_name: llama2_tp_hetero
+#   exp_dir: ./outputs_llama2_tp_hetero
+#   task:
+#     type: train
+#     backend: megatron
+#     entrypoint: ./flagscale/train/hetero/train_llama.py
+#   runner:
+#     backend: torchrun
+#     nnodes: 1
+#     nproc_per_node: 8
+#     hostfile: hostfile
+#   envs:
+#     CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+#     CUDA_DEVICE_MAX_CONNECTIONS: 1
+
+# action: run
+
+# hydra:
+#   run:
+#     dir: ${experiment.exp_dir}/hydra
+
diff --git a/examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml b/examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml
new file mode 100644
index 000000000..90995e2d9
--- /dev/null
+++ b/examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml
@@ -0,0 +1,67 @@
+system:
+  tensor_model_parallel_size: 4
+  pipeline_model_parallel_size: 3
+  disable_bias_linear: True
+  use_flash_attn: True
+  sequence_parallel: True
+  use_distributed_optimizer: True
+  hetero_mode: pp
+  hetero_device_types: A100
+  hetero_current_device_type: A100
+  hetero_pipeline_stages: [3,16,8,8]
+  process_meshes: [4,1,1,2,1,2]
+  precision:
+    bf16: True
+    initial_loss_scale: 16384
+    min_loss_scale: 1.0
+  logging:
+    log_interval: 1
+  checkpoint:
+    save_interval: 100
+
+model:
+  use_mcore_models: True
+  transformer_impl: transformer_engine
+  num_layers: 32
+  hidden_size: 4096
+  ffn_hidden_size: 11008
+  num_attention_heads: 32
+  seq_length: 4096
+  group_query_attention: False
+  num_query_groups: 8
+  max_position_embeddings: 4096
+  norm_epsilon: 1e-5
+  use_rotary_position_embeddings: True
+  no_position_embedding: True
+  swiglu: True
+  normalization: RMSNorm
+  untie_embeddings_and_output_weights: True
+  init_method_std: 0.02
+  attention_dropout: 0.0
+  hidden_dropout: 0.0
+  weight_decay: 0.1
+  clip_grad: 1.0
+  train_iters: 30
+  eval_iters: 0
+  eval_interval: 2000
+  micro_batch_size: 1
+  global_batch_size: 32
+
+  optimizer:
+    weight_decay: 1e-2
+    adam_beta1: 0.9
+    adam_beta2: 0.95
+    lr_scheduler:
+      lr: 0.00015
+      min_lr: 1.0e-5
+      lr_warmup_fraction: .01
+      lr_decay_iters: 1
+      lr_decay_style: cosine
+
+data:
+  data_path: ${data_path:??}
+  split: 1
+  tokenizer:
+    tokenizer_type: Llama2Tokenizer
+    tokenizer_model: examples/llama/tokenizer.model
+    vocab_size: 32000
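
As a quick consistency check on the new heterogeneous settings (not part of the patch), the Python sketch below verifies that the per-stage layer split implied by `hetero_pipeline_stages` lines up with `pipeline_model_parallel_size` and `model.num_layers`. Reading the list as `[num_stages, layers_stage0, layers_stage1, ...]` for the single A100 device type is an assumption based on how the values add up, not something stated in the diff.

```python
# Hypothetical sanity check for train_llama2_7b_tp_hetero.yaml.
# Assumption: hetero_pipeline_stages = [num_stages, layers_stage0, layers_stage1, ...]
# for the single device type (A100).

num_layers = 32                         # model.num_layers
pipeline_model_parallel_size = 3        # system.pipeline_model_parallel_size
hetero_pipeline_stages = [3, 16, 8, 8]  # system.hetero_pipeline_stages

num_stages, *layers_per_stage = hetero_pipeline_stages

assert num_stages == pipeline_model_parallel_size, "stage count should match PP size"
assert len(layers_per_stage) == num_stages, "one layer count per pipeline stage"
assert sum(layers_per_stage) == num_layers, "stage layer counts should cover all layers"

print(f"{num_stages} stages, layers per stage {layers_per_stage}, total {sum(layers_per_stage)}")
```

Under this reading, the 32 transformer layers are split 16/8/8 across the three pipeline stages, which is consistent with the other values in the config.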