From c48ffd5513fc052043590356fb76f9d8a0a20502 Mon Sep 17 00:00:00 2001 From: sfwang Date: Fri, 20 Sep 2024 16:35:44 +0800 Subject: [PATCH 1/3] [metax] Add llava1.5_7b_continuetrain-flagscale --- .../metax/docker_image/flagscale/Dockerfile | 45 ++++++++++++++++--- .../flagscale/flagscale_install.sh | 4 +- .../metax/docker_image/flagscale/start.sh | 3 ++ .../README.md | 1 + .../config/config_C500x4x8.py | 28 ++++++++++++ .../config/environment_variables.sh | 0 .../config/requirements.txt | 1 + training/run_benchmarks/config/test_conf.py | 1 + 8 files changed, 75 insertions(+), 8 deletions(-) create mode 100644 training/metax/docker_image/flagscale/start.sh create mode 100644 training/metax/llava1.5_7b_continuetrain-flagscale/README.md create mode 100644 training/metax/llava1.5_7b_continuetrain-flagscale/config/config_C500x4x8.py create mode 100644 training/metax/llava1.5_7b_continuetrain-flagscale/config/environment_variables.sh create mode 100644 training/metax/llava1.5_7b_continuetrain-flagscale/config/requirements.txt diff --git a/training/metax/docker_image/flagscale/Dockerfile b/training/metax/docker_image/flagscale/Dockerfile index 2470c426d..9d6f645b9 100755 --- a/training/metax/docker_image/flagscale/Dockerfile +++ b/training/metax/docker_image/flagscale/Dockerfile @@ -1,9 +1,42 @@ -FROM maca-c500-pytorch-2.19.0.9-ubuntu18.04-amd64:FlagPerf_aquila2_7b +FROM metax-flagscale:2.24.0.2-ubuntu20.04-amd64-v7 ENV PATH="/opt/conda/bin:${PATH}" + +ENV MACA_PATH="/opt/maca" +ENV MACA_CLANG_PATH="${MACA_PATH}/mxgpu_llvm/bin" +ENV MACA_CLANG="${MACA_PATH}/mxgpu_llvm" +ENV DEVINFO_ROOT="${MACA_PATH}" +ENV CUCC_PATH="${MACA_PATH}/tools/cu-bridge" +ENV CUDA_PATH="${CUCC_PATH}" + +ENV PATH="${CUCC_PATH}:${MACA_PATH}/bin:${MACA_CLANG}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}" + +# ENV GLOO_SOCKET_IFNAME=ens115f0 + ENV MACA_SMALL_PAGESIZE_ENABLE=1 -ENV MHA_USE_BLAS=ON -ENV MHA_BWD_NO_ATOMIC_F64=1 -ENV CUCC_PATH=${MACA_PATH}/tools/cu-bridge -ENV CUDA_PATH=${CUCC_PATH} +ENV MCPYTORCH_DISABLE_PRINT=1 + +ENV MCCL_NET_GDR_LEVEL=7 +ENV MCCL_P2P_LEVEL=SYS +ENV MCCL_LIMIT_RING_LL_THREADTHRESHOLDS=1 +ENV FORCE_ACTIVATE_WAIT=1 + +ENV SET_DEVICE_NUMA_PREFERRED=1 + +ENV MAX_JOBS=20 +ENV PYTORCH_ENABLE_SAME_RAND_A100=1 +ENV MCCL_IB_GID_INDEX=1 +ENV NVTE_FLASH_ATTN=1 +ENV NVTE_FUSED_ATTN=0 +ENV HYDRA_FULL_ERROR=1 + +#ENV MCBLAS_CUSTOMIZED_CONFIG_PATH=/workspace/Megatron-LM_metax/mcblas_customized_config.yaml + RUN /bin/bash -c "uname -a" -RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pybind11 regex +RUN /bin/bash -c alias python3=python +#RUN pip install pybind11 regex hydra-core wandb s3fs --trusted-host mirrors.aliyun.com +#RUN pip install braceexpand webdataset --trusted-host mirrors.aliyun.com + +COPY start.sh /start.sh +RUN chmod +x /start.sh +ENTRYPOINT ["/start.sh"] \ No newline at end of file diff --git a/training/metax/docker_image/flagscale/flagscale_install.sh b/training/metax/docker_image/flagscale/flagscale_install.sh index cd1c3b143..5849c5c8e 100755 --- a/training/metax/docker_image/flagscale/flagscale_install.sh +++ b/training/metax/docker_image/flagscale/flagscale_install.sh @@ -1,5 +1,5 @@ #!/bin/bash # using github mirrors to avoid github TTL -cp -r /root/FlagScale /workspace/ +cp -r /data/dataset/llava/FlagScale /workspace/ echo 'export PYTHONPATH=$PYTHONPATH:/workspace/FlagScale' >> /root/.bashrc -source /root/.bashrc +source /root/.bashrc \ No newline at end of file diff --git a/training/metax/docker_image/flagscale/start.sh b/training/metax/docker_image/flagscale/start.sh new file mode 100644 index 000000000..f95436293 --- /dev/null +++ b/training/metax/docker_image/flagscale/start.sh @@ -0,0 +1,3 @@ +#!/bin/bash +service ssh restart +exec "$@" \ No newline at end of file diff --git a/training/metax/llava1.5_7b_continuetrain-flagscale/README.md b/training/metax/llava1.5_7b_continuetrain-flagscale/README.md new file mode 100644 index 000000000..d44c78c53 --- /dev/null +++ b/training/metax/llava1.5_7b_continuetrain-flagscale/README.md @@ -0,0 +1 @@ +此测例为FlagScale相关项目测例 diff --git a/training/metax/llava1.5_7b_continuetrain-flagscale/config/config_C500x4x8.py b/training/metax/llava1.5_7b_continuetrain-flagscale/config/config_C500x4x8.py new file mode 100644 index 000000000..e81a2a43b --- /dev/null +++ b/training/metax/llava1.5_7b_continuetrain-flagscale/config/config_C500x4x8.py @@ -0,0 +1,28 @@ +# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage +scale_parent = "/metax/sfwang/FlagScale/build/metax_C500" +scale_home = f"{scale_parent}/FlagScale" + +# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py +#scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 085811f" +scale_download_cmd = f"cd {scale_parent}" + +# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here +scale_install_cmd = "" + +# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py +energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs" + +scale_conf_dir = f"{scale_home}/examples/llava/conf" +configyaml = f"{scale_conf_dir}/config.yaml" +trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml" +datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml" +prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json" + +cmds = {"before_start": ""} +# flagscale's requirements +flagscale_chip_type = "C500" +flagscale_ssh_port = 1234 +flops = -1 + +# for llava's algorithm +steps = 5000 \ No newline at end of file diff --git a/training/metax/llava1.5_7b_continuetrain-flagscale/config/environment_variables.sh b/training/metax/llava1.5_7b_continuetrain-flagscale/config/environment_variables.sh new file mode 100644 index 000000000..e69de29bb diff --git a/training/metax/llava1.5_7b_continuetrain-flagscale/config/requirements.txt b/training/metax/llava1.5_7b_continuetrain-flagscale/config/requirements.txt new file mode 100644 index 000000000..4f0d1d961 --- /dev/null +++ b/training/metax/llava1.5_7b_continuetrain-flagscale/config/requirements.txt @@ -0,0 +1 @@ +megatron-energon==2.2.0 diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index db6b3696a..a12774def 100755 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -174,6 +174,7 @@ # "llama2_7b:deepspeed:S4000:1:8:1": "/data/flagperf/llama/openwebtext", # metax cases + #"llava1.5_7b_continuetrain:flagscale:C500:4:8:1": "/data/dataset/llava" #"llama3_8B:megatron_core060:C500:1:8:1": "/data/llama3_8b" # "llama2_70B:megatron:C500:4:8:1": "/data/llama2-70B" #"chatglm3_6b:deepspeed:C500:1:8:1": "/raid/dataset//chatglm3-6b" From 60d0ddb5efce2816a9c9e55a3f7497f5df13deea Mon Sep 17 00:00:00 2001 From: sfwang Date: Fri, 4 Oct 2024 11:01:35 +0800 Subject: [PATCH 2/3] [metax] add llama3-70b flagscale continue train --- .../flagscale/run_pretraining.py | 2 +- .../README.md | 1 + .../config/config_C500x4x8.py | 25 +++++++++++++++++++ .../config/requirements.txt | 0 training/run_benchmarks/config/test_conf.py | 1 + 5 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 training/metax/llama3_70B_continuetrain-flagscale/README.md create mode 100644 training/metax/llama3_70B_continuetrain-flagscale/config/config_C500x4x8.py create mode 100644 training/metax/llama3_70B_continuetrain-flagscale/config/requirements.txt diff --git a/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py b/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py index 323740124..72c7f57c4 100644 --- a/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py +++ b/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py @@ -176,7 +176,7 @@ def replace_yamls(scale_home, config_module, args): print("Maybe some errors") if len(info_line) == getattr(module, "steps"): break - time.sleep(300) + time.sleep(300) if args.vendor != 'metax' else time.sleep(10) infos = [] for line in info_line: diff --git a/training/metax/llama3_70B_continuetrain-flagscale/README.md b/training/metax/llama3_70B_continuetrain-flagscale/README.md new file mode 100644 index 000000000..d44c78c53 --- /dev/null +++ b/training/metax/llama3_70B_continuetrain-flagscale/README.md @@ -0,0 +1 @@ +此测例为FlagScale相关项目测例 diff --git a/training/metax/llama3_70B_continuetrain-flagscale/config/config_C500x4x8.py b/training/metax/llama3_70B_continuetrain-flagscale/config/config_C500x4x8.py new file mode 100644 index 000000000..3a0829310 --- /dev/null +++ b/training/metax/llama3_70B_continuetrain-flagscale/config/config_C500x4x8.py @@ -0,0 +1,25 @@ +# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage +scale_parent = "/share/project/FlagPerf/FlagScale/build/metax_C500" +scale_home = f"{scale_parent}/FlagScale" + +# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py +scale_download_cmd = f"cd {scale_parent}" + +# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here +scale_install_cmd = "" + +scale_conf_dir = f"{scale_home}/examples/llama/conf" +configyaml = f"{scale_conf_dir}/config.yaml" +trainyaml = f"{scale_conf_dir}/train/train_llama3_70b_finetune.yaml" +dataset = f"SAMPLE50B/llama3/llama3_dataset" +tokenizer = f"SAMPLE50B/llama3/llama3_tokenizer" +ckpt = f"llama3_ckpt" + +cmds = {"before_start": ""} +# flagscale's requirements +flagscale_chip_type = "C500" +flagscale_ssh_port = 1234 +flops = -1 + +# for llava's algorithm +steps = 500 \ No newline at end of file diff --git a/training/metax/llama3_70B_continuetrain-flagscale/config/requirements.txt b/training/metax/llama3_70B_continuetrain-flagscale/config/requirements.txt new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index a12774def..dcb74f82b 100755 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -174,6 +174,7 @@ # "llama2_7b:deepspeed:S4000:1:8:1": "/data/flagperf/llama/openwebtext", # metax cases + #"llama3_70B_continuetrain:flagscale:C500:4:8:1": "/metax/dataset" #"llava1.5_7b_continuetrain:flagscale:C500:4:8:1": "/data/dataset/llava" #"llama3_8B:megatron_core060:C500:1:8:1": "/data/llama3_8b" # "llama2_70B:megatron:C500:4:8:1": "/data/llama2-70B" From e43fc595b4b0042492b83f33858dd4e44b9ec0f9 Mon Sep 17 00:00:00 2001 From: sfwang Date: Sun, 6 Oct 2024 00:56:26 +0000 Subject: [PATCH 3/3] [metax] work around some fail in rank0 in metax case --- .../llama3_70B_continuetrain/flagscale/run_pretraining.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py b/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py index 72c7f57c4..2910f07ad 100644 --- a/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py +++ b/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py @@ -176,7 +176,7 @@ def replace_yamls(scale_home, config_module, args): print("Maybe some errors") if len(info_line) == getattr(module, "steps"): break - time.sleep(300) if args.vendor != 'metax' else time.sleep(10) + time.sleep(300) if args.vendor != 'metax' else time.sleep(5) infos = [] for line in info_line: