[kunlunxin] add llama3 70b patch
brianlcy123 committed Nov 21, 2024
1 parent 6b238b7 commit 8308ae6
Showing 5 changed files with 230 additions and 128 deletions.
102 changes: 46 additions & 56 deletions examples/llama/conf/config.yaml
@@ -1,63 +1,53 @@
-action: run
+# defaults:
+#   - train: train_llama2_7b
+#   - _self_
+
+# experiment:
+#   exp_name: llama2
+#   exp_dir: ./outputs_llama2
+#   task:
+#     type: train
+#     backend: megatron
+#     entrypoint: ./flagscale/train/train_llama.py
+#   runner:
+#     backend: torchrun
+#     nnodes: 1
+#     nproc_per_node: 8
+#     hostfile: hostfile
+#     envs:
+#       CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+#       CUDA_DEVICE_MAX_CONNECTIONS: 1
+
+# action: run
+
+# hydra:
+#   run:
+#     dir: ${experiment.exp_dir}/hydra
+
+
 defaults:
-  - train: train_llama3_70b_finetune
-  - _self_
+  - train: train_llama3_70b
+  - _self_
+
 experiment:
-  cmds:
-    before_start: source ~/.bashrc; export LD_LIBRARY_PATH=/share/project/PUBLIC/data/llama3-70b/xpu_output//xccl/3.0.0.4_20241107/xccl_rdma-ubuntu_x86_64/so/:/share/project/PUBLIC/data/llama3-70b/xpu_output//xhpc/20241107/xhpc-ubuntu2004_x86_64/xblas/so:/share/project/PUBLIC/data/llama3-70b/xpu_output//xhpc/20241107/xhpc-ubuntu2004_x86_64/xfa/so:/share/project/PUBLIC/data/llama3-70b/xpu_output//xhpc/20241107/xhpc-ubuntu2004_x86_64/xpudnn/so:/share/project/PUBLIC/data/llama3-70b/xpu_output//xre/5.0.21.5/xre-Linux-x86_64-5.0.21.5/so
-  envs:
-    ALLGATHER_ASYNC: false
-    ALLREDUCE_ASYNC: false
-    ALLREDUCE_FUSION: 0
-    BKCL_CCIX_BUFFER_GM: 1
-    BKCL_CCIX_RING: 1
-    BKCL_ENABLE_XDR: 1
-    BKCL_FLAT_RING: 1
-    BKCL_KL3_TURBO_MODE: 1
-    BKCL_RDMA_FORCE_TREE: 1
-    BKCL_RDMA_NICS: ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0
-    BKCL_RDMA_PROXY_DISABLE: 1
-    BKCL_RING_BUFFER_GM: 1
-    BKCL_TIMEOUT: 360000
-    BKCL_TRANS_UNSUPPORTED_DATATYPE: 8
-    BKCL_TREE_THRESHOLD: 1
-    BKCL_XLINK_C2C: 1
-    BKCL_XLINK_D2D: 0
-    BKCL_XLINK_ETH: 0
-    CUDART_DUMMY_REGISTER: 1
-    CUDA_DEVICE_MAX_CONNECTIONS: 8
-    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
-    DIST_MULTI_STREAM: true
-    FAST_SWIGLU_ENABLE: 1
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    USE_FAST_BF16_FC: true
-    USE_L3: 1
-    XBLAS_FC_HBM_VERSION: 40
-    XDNN_USE_FAST_SWISH: true
-    XMLIR_BATCH_PARALLEL: true
-    XMLIR_DISABLE_CUDA_ALLOCATOR: true
-    XMLIR_DIST_ASYNC_ISEND_IRECV: 1
-    XMLIR_DIST_SINGLETON_STREAM: true
-    XMLIR_DUMP_FALLBACK_OP_LIST_BOOL: true
-    XMLIR_ENABLE_FALLBACK_TO_CPU_BOOL: false
-    XMLIR_FA_GEMM_TYPE: float
-    XMLIR_PARALLEL_SAVE_MEMORY: false
-    XMLIR_XDNN_PYTORCH_CHECK_ENABLE_FALLBACK_BOOL: 0
-    XPU_FORCE_USERMODE_LAUNCH: 1
-    XPU_ZEBU_MODE: 1
-  exp_dir: /share/project/PUBLIC/data/llama3-70b/FlagOpen/FlagPerf/training/result/run20241120200156/llama3_70B_continuetrain:flagscale_llama:R300p:4:8:1/round1/10.1.15.7_noderank0/outputs_llama3
   exp_name: llama3
-  runner:
-    backend: torchrun
-    hostfile: /share/project/PUBLIC/data/llama3-70b/FlagOpen/FlagScale/hostfile
-    nnodes: 4
-    nproc_per_node: 8
-    ssh_port: 3702
+  exp_dir: ./outputs_llama3_70b
   task:
+    type: train
     backend: megatron
     entrypoint: ./flagscale/train/train_llama.py
-    type: train
-hydra:
+  runner:
+    backend: torchrun
+    nnodes: 4
+    nproc_per_node: 8
+    hostfile: ${hostfile??}
+    envs:
+      CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+      CUDA_DEVICE_MAX_CONNECTIONS: 1
+      NVTE_APPLY_QK_LAYER_SCALING: 0
+      NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+action: run
+
+hydra:
   run:
-    dir: ${experiment.exp_dir}/hydra
+    dir: ${experiment.exp_dir}/hydra
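Note that everything under the deleted `experiment.cmds` / `experiment.envs` keys is launcher-side configuration: it is meant to be exported into the training processes by the runner, not parsed by `train_llama.py`. The sketch below illustrates that mechanism with a few of the deleted kunlunxin values; it is an illustration only, not FlagScale's actual runner code (the `flatten_envs` helper and the exact `torchrun` command line are assumptions, and multi-node rendezvous flags are omitted):

```python
import os
import subprocess

def flatten_envs(envs: dict) -> dict:
    """Stringify YAML-typed values (bools, ints) for the child environment."""
    out = {}
    for key, value in envs.items():
        if isinstance(value, bool):
            out[key] = str(value).lower()  # True -> "true", matching the YAML style
        else:
            out[key] = str(value)
    return out

# A few values taken from the deleted kunlunxin config above.
envs = {"BKCL_TIMEOUT": 360000, "ALLGATHER_ASYNC": False, "CUDA_DEVICE_MAX_CONNECTIONS": 8}

child_env = {**os.environ, **flatten_envs(envs)}
subprocess.run(
    ["torchrun", "--nnodes", "4", "--nproc_per_node", "8",
     "./flagscale/train/train_llama.py"],  # entrypoint from the config
    env=child_env,
    check=True,
)
```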
124 changes: 64 additions & 60 deletions examples/llama/conf/train/train_llama3_70b_finetune.yaml
@@ -1,72 +1,76 @@
-data:
-  data_path: /share/project/PUBLIC/data/llama3-70b/llama3_dataset/dedup-md5-pile-pile-cc_text_document
-  split: 1
-  tokenizer:
-    tokenizer_path: /share/project/PUBLIC/data/llama3-70b/llama3_tokenizer
-    tokenizer_type: Llama3TokenizerFS
-    vocab_size: 128256
+system:
+  tensor_model_parallel_size: 8
+  pipeline_model_parallel_size: 4
+  make_vocab_size_divisible_by: 64
+  disable_bias_linear: True
+  sequence_parallel: True
+  use_flash_attn: True
+  use_distributed_optimizer: True
+  use_mcore_models: True
+  transformer_impl: transformer_engine
+  precision:
+    bf16: True
+    attention_softmax_in_fp32: True
+    accumulate_allreduce_grads_in_fp32: True
+  logging:
+    log_interval: 1
+    tensorboard_log_interval: 1
+    wandb_project: "train-llama3-70B"
+    wandb_exp_name: "train-llama3-70B"
+  checkpoint:
+    load: ${ckpt_path:??}
+    ckpt_format: torch
+    save_interval: 100
+    finetune: True
+
 model:
-  add_qkv_bias: false
-  attention_dropout: 0.0
-  clip_grad: 1.0
-  ffn_hidden_size: 28672
-  global_batch_size: 1024
-  group_query_attention: true
-  hidden_dropout: 0.0
+  num_layers: 80
   hidden_size: 8192
-  init_method_std: 0.02
+  num_attention_heads: 64
+  group_query_attention: True
+  num_query_groups: 8
+  ffn_hidden_size: 28672
+  seq_length: 8192
   max_position_embeddings: 8192
-  micro_batch_size: 1
-  no_position_embedding: true
   norm_epsilon: 1e-5
   norm_init_weight: 0.02
+  use_rotary_position_embeddings: True
+  rotary_base: 500000
+  no_position_embedding: True
+  reset_position_ids: True
+  add_qkv_bias: false
+  reset_attention_mask: True
+  swiglu: True
   normalization: RMSNorm
-  num_attention_heads: 64
-  num_layers: 80
-  num_query_groups: 8
+  untie_embeddings_and_output_weights: True
+  init_method_std: 0.02
+  attention_dropout: 0.0
+  hidden_dropout: 0.0
+  clip_grad: 1.0
+
+  train_samples: 6160066
+  micro_batch_size: 1
+  global_batch_size: 1024
+  seed: 42
+
   optimizer:
+    start_weight_decay: 0
+    end_weight_decay: 5e-7
+    weight_decay_incr_style: cosine
     adam_beta1: 0.9
     adam_beta2: 0.95
-    end_weight_decay: 5e-7
     lr_scheduler:
       lr: 5e-6
-      lr_decay_style: cosine
-      lr_warmup_samples: 51200
       min_lr: 0
-    start_weight_decay: 0
-    weight_decay_incr_style: cosine
-  reset_attention_mask: true
-  reset_position_ids: true
-  rotary_base: 500000
-  seed: 42
-  seq_length: 8192
-  swiglu: true
-  train_samples: 512000
-  transformer_impl: transformer_engine
-  untie_embeddings_and_output_weights: true
-  use_mcore_models: true
-  use_rotary_position_embeddings: true
-system:
-  checkpoint:
-    ckpt_format: torch
-    finetune: true
-    load: /share/project/PUBLIC/data/llama3-70b/llama3_ckpt
-    save_interval: 100
-  disable_bias_linear: true
-  logging:
-    log_interval: 1
-    tensorboard_log_interval: 1
-    wandb_exp_name: train-llama3-70B
-    wandb_project: train-llama3-70B
-  make_vocab_size_divisible_by: 64
-  pipeline_model_parallel_size: 4
-  precision:
-    accumulate_allreduce_grads_in_fp32: true
-    attention_softmax_in_fp32: true
-    bf16: true
-  sequence_parallel: true
-  tensor_model_parallel_size: 8
-  transformer_impl: transformer_engine
-  use_distributed_optimizer: true
-  use_flash_attn: true
-  use_mcore_models: true
+      lr_warmup_samples: 2048000
+      lr_decay_style: cosine
+
+data:
+  data_path: ${data_path:??}
+  split: 1
+  tokenizer:
+    tokenizer_type: Llama3TokenizerFS
+    tokenizer_path: ${tokenizer_path:??}
+    vocab_size: 128256
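For scale, the sample-based schedule in the added file resolves to the following iteration counts. This is a quick derivation, not values stored in the YAML; the deleted file's 512000/51200 pair resolves to 500 and 50 the same way:

```python
# Sample-based schedule from the added train_llama3_70b_finetune.yaml;
# Megatron derives iteration counts from these sample counts at startup.
train_samples = 6_160_066
lr_warmup_samples = 2_048_000
global_batch_size = 1024

train_iters = train_samples // global_batch_size       # 6015 full batches
warmup_iters = lr_warmup_samples // global_batch_size  # 2000 iterations

print(train_iters, warmup_iters)  # 6015 2000
```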


109 changes: 109 additions & 0 deletions hardware/kunlunxin_R300p/a44556c0/a44556c0.patch
@@ -0,0 +1,109 @@
From 9289f099424ba4d0dec83fb5715d4d2561f4c4d8 Mon Sep 17 00:00:00 2001
From: brianlcy123 <[email protected]>
Date: Thu, 21 Nov 2024 15:46:54 +0800
Subject: [PATCH] [kunlunxin] add updated llama3 70b patch

---
 examples/llama/conf/config.yaml             | 40 ++++++++++++++++++++-
 megatron/megatron/training/arguments.py     | 18 +++++-----
 megatron/megatron/training/checkpointing.py |  5 +--
 3 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/examples/llama/conf/config.yaml b/examples/llama/conf/config.yaml
index 592c45bf..27fb83ae 100644
--- a/examples/llama/conf/config.yaml
+++ b/examples/llama/conf/config.yaml
@@ -45,7 +45,45 @@ experiment:
       CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
       CUDA_DEVICE_MAX_CONNECTIONS: 1
       NVTE_APPLY_QK_LAYER_SCALING: 0
-      NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+      NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1
+      ALLGATHER_ASYNC: false
+      ALLREDUCE_ASYNC: false
+      ALLREDUCE_FUSION: 0
+      BKCL_CCIX_BUFFER_GM: 1
+      BKCL_CCIX_RING: 1
+      BKCL_ENABLE_XDR: 1
+      BKCL_FLAT_RING: 1
+      BKCL_KL3_TURBO_MODE: 1
+      BKCL_RDMA_FORCE_TREE: 1
+      BKCL_RDMA_NICS: ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0
+      BKCL_RDMA_PROXY_DISABLE: 1
+      BKCL_RING_BUFFER_GM: 1
+      BKCL_TIMEOUT: 360000
+      BKCL_TRANS_UNSUPPORTED_DATATYPE: 8
+      BKCL_TREE_THRESHOLD: 1
+      BKCL_XLINK_C2C: 1
+      BKCL_XLINK_D2D: 0
+      BKCL_XLINK_ETH: 0
+      CUDART_DUMMY_REGISTER: 1
+      FAST_SWIGLU_ENABLE: 1
+      USE_FAST_BF16_FC: true
+      USE_L3: 1
+      XDNN_USE_FAST_SWISH: true
+      XPU_ZEBU_MODE: 1
+      XPU_FORCE_USERMODE_LAUNCH: 1
+      DIST_MULTI_STREAM: true
+      XMLIR_DIST_SINGLETON_STREAM: true
+      XMLIR_FA_GEMM_TYPE: float
+      XBLAS_FC_HBM_VERSION: 40
+      XMLIR_PARALLEL_SAVE_MEMORY: false
+      XMLIR_DISABLE_CUDA_ALLOCATOR: true
+      XMLIR_XDNN_PYTORCH_CHECK_ENABLE_FALLBACK_BOOL: 0
+      XMLIR_ENABLE_FALLBACK_TO_CPU_BOOL: False
+      XMLIR_DUMP_FALLBACK_OP_LIST_BOOL: true
+      XMLIR_BATCH_PARALLEL: true
+      DIST_MULTI_STREAM: true
+      CUDA_DEVICE_MAX_CONNECTIONS: 8
+      XMLIR_DIST_ASYNC_ISEND_IRECV: 1
 action: run
 
 hydra:
diff --git a/megatron/megatron/training/arguments.py b/megatron/megatron/training/arguments.py
index e20f178b..7e79da2a 100644
--- a/megatron/megatron/training/arguments.py
+++ b/megatron/megatron/training/arguments.py
@@ -652,15 +652,15 @@ def validate_args(args, defaults={}):
     if args.sequence_parallel:
         args.async_tensor_model_parallel_allreduce = False
 
-    if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
-        if args.sequence_parallel:
-            raise RuntimeError(
-                "Using sequence parallelism requires setting the environment variable "
-                "CUDA_DEVICE_MAX_CONNECTIONS to 1")
-        if args.async_tensor_model_parallel_allreduce:
-            raise RuntimeError(
-                "Using async gradient all reduce requires setting the environment "
-                "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
+    # if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+    #     if args.sequence_parallel:
+    #         raise RuntimeError(
+    #             "Using sequence parallelism requires setting the environment variable "
+    #             "CUDA_DEVICE_MAX_CONNECTIONS to 1")
+    #     if args.async_tensor_model_parallel_allreduce:
+    #         raise RuntimeError(
+    #             "Using async gradient all reduce requires setting the environment "
+    #             "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
 
     # Disable bias gelu fusion if we are disabling bias altogether
     if not args.add_bias_linear:
diff --git a/megatron/megatron/training/checkpointing.py b/megatron/megatron/training/checkpointing.py
index 01425f36..80fa0254 100644
--- a/megatron/megatron/training/checkpointing.py
+++ b/megatron/megatron/training/checkpointing.py
@@ -530,8 +530,9 @@ def save_dataloader_state(train_iterator, iteration, dataloader_save_path):

     torch.distributed.barrier(group=mpu.get_data_parallel_group())
 
-    if mpu.get_data_parallel_rank() == 0:
-        ensure_directory_exists(data_state_save_path)
+    # if mpu.get_data_parallel_rank() == 0:
+    #     ensure_directory_exists(data_state_save_path)
+    ensure_directory_exists(data_state_save_path)
 
     torch.distributed.barrier(group=mpu.get_data_parallel_group())

--
2.25.1
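The directory name `a44556c0` presumably names the FlagScale base commit the patch was generated against. Assuming a checkout at that commit, applying it might look like the sketch below; the dry run with `git apply --check` exits non-zero if any hunk fails, e.g. when the working tree is not at the expected base:

```python
import subprocess

PATCH = "hardware/kunlunxin_R300p/a44556c0/a44556c0.patch"

# Dry-run first: verifies every hunk before anything is modified.
subprocess.run(["git", "apply", "--check", PATCH], check=True)
subprocess.run(["git", "apply", PATCH], check=True)
```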
18 changes: 9 additions & 9 deletions megatron/megatron/training/arguments.py
@@ -548,15 +548,15 @@ def validate_args(args, defaults={}):
     if args.sequence_parallel:
         args.async_tensor_model_parallel_allreduce = False
 
-    # if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
-    #     if args.sequence_parallel:
-    #         raise RuntimeError(
-    #             "Using sequence parallelism requires setting the environment variable "
-    #             "CUDA_DEVICE_MAX_CONNECTIONS to 1")
-    #     if args.async_tensor_model_parallel_allreduce:
-    #         raise RuntimeError(
-    #             "Using async gradient all reduce requires setting the environment "
-    #             "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
+    if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+        if args.sequence_parallel:
+            raise RuntimeError(
+                "Using sequence parallelism requires setting the environment variable "
+                "CUDA_DEVICE_MAX_CONNECTIONS to 1")
+        if args.async_tensor_model_parallel_allreduce:
+            raise RuntimeError(
+                "Using async gradient all reduce requires setting the environment "
+                "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
 
     # Disable bias gelu fusion if we are disabling bias altogether
     if not args.add_bias_linear:
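This restores the upstream guard: with sequence parallelism enabled, Megatron-LM insists on `CUDA_DEVICE_MAX_CONNECTIONS=1` so that tensor-parallel communication kernels launch in a fixed order. The kunlunxin config runs with the value 8 and relies on the patch above to comment the check back out at apply time. A standalone sketch of the restored behavior, using the values from the configs above:

```python
import os

# Kunlunxin runs with 8 connection queues (see the config above); upstream
# Megatron-LM requires "1" whenever sequence parallelism is enabled.
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "8"

sequence_parallel = True  # system.sequence_parallel in the train config
if os.environ.get("CUDA_DEVICE_MAX_CONNECTIONS") != "1" and sequence_parallel:
    # This is the branch the restored upstream check takes, and the one
    # the kunlunxin patch comments out.
    print("upstream validate_args would raise RuntimeError here")
```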
5 changes: 2 additions & 3 deletions megatron/megatron/training/checkpointing.py
@@ -578,9 +578,8 @@ def save_dataloader_state(train_iterator, iteration, dataloader_save_path):

     torch.distributed.barrier(group=mpu.get_data_parallel_group())
 
-    # if mpu.get_data_parallel_rank() == 0:
-    #     ensure_directory_exists(data_state_save_path)
-    ensure_directory_exists(data_state_save_path)
+    if mpu.get_data_parallel_rank() == 0:
+        ensure_directory_exists(data_state_save_path)
 
     torch.distributed.barrier(group=mpu.get_data_parallel_group())

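This reverts `save_dataloader_state` to creating the save directory on data-parallel rank 0 only, bracketed by barriers; the kunlunxin patch keeps the every-rank `ensure_directory_exists` call instead, which is the safe choice if ranks write to node-local rather than shared storage (an assumption; the commit itself gives no motivation). A minimal sketch of both behaviors with plain `torch.distributed` (assumes the process group is already initialized):

```python
import os
import torch.distributed as dist

def ensure_dir_for_all_ranks(path: str, shared_fs: bool) -> None:
    """Create `path` before every rank writes its dataloader state into it."""
    if shared_fs:
        # One rank creates the directory; the barrier publishes it to the rest.
        if dist.get_rank() == 0:
            os.makedirs(path, exist_ok=True)
    else:
        # Node-local storage: rank 0's mkdir is invisible to other nodes,
        # so every rank must create its own copy (what the patch does).
        os.makedirs(path, exist_ok=True)
    dist.barrier()
```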
