FlagOpen · aoyulong · Nov 26, 2024 · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024
@@ -0,0 +1,107 @@
+From 9289f099424ba4d0dec83fb5715d4d2561f4c4d8 Mon Sep 17 00:00:00 2001
+From: brianlcy123 <[email protected]>
+Date: Thu, 21 Nov 2024 15:46:54 +0800
+Subject: [PATCH] [kunlunxin] add updated llama3 70b patch
+
+---
+ examples/llama/conf/config.yaml             | 40 ++++++++++++++++++++-
+ megatron/megatron/training/arguments.py     | 18 +++++-----
+ megatron/megatron/training/checkpointing.py |  5 +--
+ 3 files changed, 51 insertions(+), 12 deletions(-)
+
+diff --git a/examples/llama/conf/config.yaml b/examples/llama/conf/config.yaml
+index 592c45bf..27fb83ae 100644
+--- a/examples/llama/conf/config.yaml
++++ b/examples/llama/conf/config.yaml
+@@ -46,6 +46,44 @@ experiment:
+     CUDA_DEVICE_MAX_CONNECTIONS: 1
+     NVTE_APPLY_QK_LAYER_SCALING: 0
+     NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
++    ALLGATHER_ASYNC: false
++    ALLREDUCE_ASYNC: false
++    ALLREDUCE_FUSION: 0
++    BKCL_CCIX_BUFFER_GM: 1
++    BKCL_CCIX_RING: 1
++    BKCL_ENABLE_XDR: 1
++    BKCL_FLAT_RING: 1
++    BKCL_KL3_TURBO_MODE: 1
++    BKCL_RDMA_FORCE_TREE: 1
++    BKCL_RDMA_NICS: ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0
++    BKCL_RDMA_PROXY_DISABLE: 1
++    BKCL_RING_BUFFER_GM: 1
++    BKCL_TIMEOUT: 360000
++    BKCL_TRANS_UNSUPPORTED_DATATYPE: 8
++    BKCL_TREE_THRESHOLD: 1
++    BKCL_XLINK_C2C: 1
++    BKCL_XLINK_D2D: 0
++    BKCL_XLINK_ETH: 0
++    CUDART_DUMMY_REGISTER: 1
++    FAST_SWIGLU_ENABLE: 1
++    USE_FAST_BF16_FC: true
++    USE_L3: 1
++    XDNN_USE_FAST_SWISH: true
++    XPU_ZEBU_MODE: 1
++    XPU_FORCE_USERMODE_LAUNCH: 1
++    DIST_MULTI_STREAM: true
++    XMLIR_DIST_SINGLETON_STREAM: true
++    XMLIR_FA_GEMM_TYPE: float
++    XBLAS_FC_HBM_VERSION: 40
++    XMLIR_PARALLEL_SAVE_MEMORY: false
++    XMLIR_DISABLE_CUDA_ALLOCATOR: true
++    XMLIR_XDNN_PYTORCH_CHECK_ENABLE_FALLBACK_BOOL: 0
++    XMLIR_ENABLE_FALLBACK_TO_CPU_BOOL: False
++    XMLIR_DUMP_FALLBACK_OP_LIST_BOOL: true
++    XMLIR_BATCH_PARALLEL: true
++    DIST_MULTI_STREAM: true
++    CUDA_DEVICE_MAX_CONNECTIONS: 8
++    XMLIR_DIST_ASYNC_ISEND_IRECV: 1
+ action: run 
+
+ hydra: 
+diff --git a/megatron/megatron/training/arguments.py b/megatron/megatron/training/arguments.py
+index e20f178b..7e79da2a 100644
+--- a/megatron/megatron/training/arguments.py
++++ b/megatron/megatron/training/arguments.py
+@@ -652,15 +652,15 @@ def validate_args(args, defaults={}):
+     if args.sequence_parallel:
+         args.async_tensor_model_parallel_allreduce = False
+
+-    if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+-        if args.sequence_parallel:
+-            raise RuntimeError(
+-                "Using sequence parallelism requires setting the environment variable "
+-                "CUDA_DEVICE_MAX_CONNECTIONS to 1")
+-        if args.async_tensor_model_parallel_allreduce:
+-            raise RuntimeError(
+-                "Using async gradient all reduce requires setting the environment "
+-                "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
++    # if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
++    #     if args.sequence_parallel:
++    #         raise RuntimeError(
++    #             "Using sequence parallelism requires setting the environment variable "
++    #             "CUDA_DEVICE_MAX_CONNECTIONS to 1")
++    #     if args.async_tensor_model_parallel_allreduce:
++    #         raise RuntimeError(
++    #             "Using async gradient all reduce requires setting the environment "
++    #             "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
+
+     # Disable bias gelu fusion if we are disabling bias altogether
+     if not args.add_bias_linear:
+diff --git a/megatron/megatron/training/checkpointing.py b/megatron/megatron/training/checkpointing.py
+index 01425f36..80fa0254 100644
+--- a/megatron/megatron/training/checkpointing.py
++++ b/megatron/megatron/training/checkpointing.py
+@@ -530,8 +530,9 @@ def save_dataloader_state(train_iterator, iteration, dataloader_save_path):
+
+     torch.distributed.barrier(group=mpu.get_data_parallel_group())
+
+-    if mpu.get_data_parallel_rank() == 0:
+-        ensure_directory_exists(data_state_save_path)
++    # if mpu.get_data_parallel_rank() == 0:
++    #     ensure_directory_exists(data_state_save_path)
++    ensure_directory_exists(data_state_save_path)
+
+     torch.distributed.barrier(group=mpu.get_data_parallel_group())
+
+-- 
+2.25.1