diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml
index 1ad39efe..11ae253d 100644
--- a/.github/workflows/demo_in_readme.yaml
+++ b/.github/workflows/demo_in_readme.yaml
@@ -45,6 +45,8 @@ jobs:
       id: basic_train
       run: |
         source activate ${evo_env_torch21_flash2}
+        export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+        export CUDA_DEVICE_MAX_CONNECTIONS=1
         sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
 
     - name: load_preset_ckpt
@@ -52,18 +54,24 @@ jobs:
       run: |
         source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
+        export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+        export CUDA_DEVICE_MAX_CONNECTIONS=1
         sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
 
     - name: load_new_ckpt
       run: |
         source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
+        export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+        export CUDA_DEVICE_MAX_CONNECTIONS=1
         sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
         rm -rf $GITHUB_WORKSPACE/llm_ckpts
 
     - name: torchrun-train
      run: |
         source activate ${evo_env_torch21_flash2}
+        export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+        export CUDA_DEVICE_MAX_CONNECTIONS=1
         sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
         rm -rf $GITHUB_WORKSPACE/llm_ckpts
 
diff --git a/internlm/core/trainer_builder.py b/internlm/core/trainer_builder.py
index 2b82bc1f..80d37744 100644
--- a/internlm/core/trainer_builder.py
+++ b/internlm/core/trainer_builder.py
@@ -1,5 +1,6 @@
 import gc
 import logging
+import os
 import time
 from functools import partial
 from typing import Dict, List, Optional, Union
@@ -8,6 +9,7 @@
 import torch.distributed as dist
 from torch.utils.data import DataLoader
 
+from internlm.accelerator import AcceleratorType, get_accelerator
 from internlm.checkpoint.checkpoint_manager import CheckpointManager
 from internlm.core.context import global_context as gpc
 from internlm.core.context.process_group_initializer import ParallelMode
@@ -31,7 +33,6 @@
 )
 from internlm.utils.common import (
     BatchSkipper,
-    check_cuda_env,
     enable_pytorch_expandable_segments,
     get_current_device,
     get_megatron_flops,
@@ -47,6 +48,32 @@
 # global llm logger
 logger = logging.getLogger(__file__)
 
+internlm_accelerator = get_accelerator()
+
+
+def check_cuda_env():
+    if internlm_accelerator.get_accelerator_backend() == AcceleratorType.GPU:
+        wp_fwd_per = gpc.config.parallel.weight.get("forward_overlap_per", "layer")
+        ewp_fwd_per = gpc.config.parallel.expert_weight.get("forward_overlap_per", "layer")
+        wp_size = gpc.config.parallel.weight.get("size", 1)
+        ewp_size = gpc.config.parallel.expert_weight.get("size", 1)
+        open_max_conns = (wp_size == 1 or wp_fwd_per != "layer") and (ewp_size == 1 or ewp_fwd_per != "layer")
+        if open_max_conns:
+            max_connections = os.getenv("CUDA_DEVICE_MAX_CONNECTIONS")
+            assert (
+                max_connections is not None
+            ), "Env var CUDA_DEVICE_MAX_CONNECTIONS has not been set, please set it to 1!"
+            assert (
+                max_connections == "1"
+            ), "Env var CUDA_DEVICE_MAX_CONNECTIONS is set to {}, it should be set to 1!".format(max_connections)
+
+        avoid_record_streams = os.getenv("TORCH_NCCL_AVOID_RECORD_STREAMS")
+        assert (
+            avoid_record_streams is not None
+        ), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS has not been set, please set it to 1!"
+        assert (
+            avoid_record_streams == "1"
+        ), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS is set to {}, it should be set to 1!".format(avoid_record_streams)
 
 
 class TrainerBuilder(Trainer):
diff --git a/internlm/data/tokenized/dummy_dataset.py b/internlm/data/tokenized/dummy_dataset.py
index dcb6c027..1e64e00a 100644
--- a/internlm/data/tokenized/dummy_dataset.py
+++ b/internlm/data/tokenized/dummy_dataset.py
@@ -4,7 +4,7 @@
 import numpy as np
 from torch.utils.data import Dataset
 
-# from internlm.core.context.parallel_context import global_context as gpc
+from internlm.core.context.parallel_context import global_context as gpc
 
 
 class RandomDataset(Dataset):
@@ -30,7 +30,7 @@ def __init__(self, num_samples=10000, max_len=1024, fixed_seqlen: bool = False)
             while len(d) < max_len:
                 r *= 2
                 d = list(range(n)) * r
-            # r = r % gpc.config.model.vocab_size
+            r = r % gpc.config.model.vocab_size
             d = [n, r] + d
             d = d[:max_len]
             data.append(d)
diff --git a/internlm/utils/common.py b/internlm/utils/common.py
index 56ebcfbe..a2ae2b26 100644
--- a/internlm/utils/common.py
+++ b/internlm/utils/common.py
@@ -249,11 +249,6 @@ def enable_pytorch_expandable_segments():
         logger.warning("To support the 'expandable_segments' configuration, please upgrade torch to version 2.1.0.")
 
 
-def check_cuda_env():
-    if os.getenv("CUDA_DEVICE_MAX_CONNECTIONS") is None:
-        logger.warning("Env var CUDA_DEVICE_MAX_CONNECTIONS has not be set, please note this!")
-
-
 class DummyProfile:
     """
     Dummy Profile.
diff --git a/tests/test_data/test_batch_sampler.py b/tests/test_data/test_batch_sampler.py
index 6beeb7a7..42e61c3f 100644
--- a/tests/test_data/test_batch_sampler.py
+++ b/tests/test_data/test_batch_sampler.py
@@ -45,6 +45,7 @@ def do_warmup(args):
     rank, worldsize, init_config, should_sccuess, answer = args
     build_environment(rank, worldsize, init_config)
     gpc.config.model.num_chunks = 1 if gpc.get_world_size(ParallelMode.PIPELINE) == 1 else 2
+    gpc.config.model.vocab_size = 92544
     engine, scheduler = init_model_and_optim(
         8,
         gpc.config.model.num_chunks,
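For context on the new `check_cuda_env` gate, the snippet below is a minimal standalone sketch of the same logic that can be run outside InternEvo. The `parallel_cfg` dict, its values, and the `check_cuda_env_standalone` name are illustrative stand-ins for `gpc.config.parallel` and the accelerator-backend check, not part of this diff:

```python
import os

# Hypothetical stand-in for gpc.config.parallel; the sizes and
# "forward_overlap_per" values below are illustrative only.
parallel_cfg = {
    "weight": {"size": 1, "forward_overlap_per": "layer"},
    "expert_weight": {"size": 1, "forward_overlap_per": "layer"},
}


def check_cuda_env_standalone(cfg):
    # CUDA_DEVICE_MAX_CONNECTIONS=1 is only enforced when neither weight
    # parallelism nor expert weight parallelism overlaps communication
    # per layer (size == 1 means there is no overlap to protect).
    wp, ewp = cfg["weight"], cfg["expert_weight"]
    open_max_conns = (wp["size"] == 1 or wp["forward_overlap_per"] != "layer") and (
        ewp["size"] == 1 or ewp["forward_overlap_per"] != "layer"
    )
    if open_max_conns:
        assert (
            os.getenv("CUDA_DEVICE_MAX_CONNECTIONS") == "1"
        ), "Set CUDA_DEVICE_MAX_CONNECTIONS=1 before launching training."
    # TORCH_NCCL_AVOID_RECORD_STREAMS=1 is enforced unconditionally on GPU.
    assert (
        os.getenv("TORCH_NCCL_AVOID_RECORD_STREAMS") == "1"
    ), "Set TORCH_NCCL_AVOID_RECORD_STREAMS=1 before launching training."


if __name__ == "__main__":
    os.environ.setdefault("CUDA_DEVICE_MAX_CONNECTIONS", "1")
    os.environ.setdefault("TORCH_NCCL_AVOID_RECORD_STREAMS", "1")
    check_cuda_env_standalone(parallel_cfg)
    print("CUDA env check passed")
```

This mirrors why every affected CI step in `demo_in_readme.yaml` now exports the two variables before invoking the training scripts: with the stricter check, a missing or non-"1" value fails startup instead of emitting the old warning.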