-
Notifications
You must be signed in to change notification settings - Fork 104
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[cambricon] support llava1.5_7b & llama3_70b & mixtral_8x7b with
flagscale
- Loading branch information
cifar10
committed
Sep 21, 2024
1 parent
2ed227e
commit e706e3f
Showing
12 changed files
with
128 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
FROM cambricon_flagscale_2409_pt21

# Use bash for all subsequent RUN instructions (required for `echo -e` below).
SHELL ["/bin/bash", "-c"]

# Install the SSH server plus cluster diagnostic tools in a single layer and
# clean the apt cache in the same layer so it does not bloat the image
# (hadolint DL3009/DL3015). The original used two layers with a redundant
# second `apt update` and the non-scriptable `apt` frontend (DL3027).
# DEBIAN_FRONTEND is set inline so it does not leak into the runtime env.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends tzdata && \
    apt-get install -y --no-install-recommends \
        dmidecode \
        ipmitool \
        net-tools \
        openssh-server \
        sshpass \
        sudo \
        sysstat \
    && mkdir -p /run/sshd \
    && rm -rf /var/lib/apt/lists/*

# Comment out the `[ -z "$PS1" ] && return` guard in ~/.bashrc so the
# environment block appended below is sourced even in non-interactive
# (e.g. ssh remote-command) sessions.
RUN sed -i '/\[ -z "\$PS1" \] \&\& return/s/^/#/' ~/.bashrc

# Append the Cambricon MLU runtime environment (Neuware toolchain, OpenMPI,
# CNCL interconnect tuning) to ~/.bashrc. Note ${LD_LIBRARY_PATH} and ${PATH}
# are expanded at build time, freezing the build-time values into the file.
RUN echo -e "\n# Add environment variables\n\
export NEUWARE_HOME=/usr/local/neuware\n\
export LD_LIBRARY_PATH=/usr/local/neuware/lib64:/usr/local/openmpi/lib:${LD_LIBRARY_PATH}\n\
export PATH=/torch/venv3/pytorch/bin:/usr/local/neuware/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH}\n\
export CNCL_MLULINK_OVER_ROCE_DISABLE=1\n\
export CNCL_MLULINK_CROSS_HOSTS_ENABLE=0\n\
export CNCL_MLU_DIRECT_LEVEL=1" >> ~/.bashrc
7 changes: 7 additions & 0 deletions
7
training/cambricon/docker_image/flagscale_2409/flagscale_2409_install.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/bin/bash
# Install the Python dependencies FlagScale needs on Cambricon MLU.
# (The original script had a duplicated shebang line — removed.)
set -xe

# Use the Tsinghua PyPI mirror for faster, more reliable downloads in-region.
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

# Pinned core dependencies (versions matched to the FlagScale 24.09 stack).
pip3 install regex==2024.5.15 schedule==1.2.2 accelerate==0.31.0 transformers==4.40.1 protobuf==3.20.0
# Helper libraries; currently unpinned — pin these too if builds become
# non-reproducible.
pip3 install pybind11 hydra-core s3fs braceexpand webdataset wandb loguru sentencepiece datasets
# Energon dataloader used by the multimodal (LLaVA) training path.
pip3 install megatron-energon==2.2.0
1 change: 1 addition & 0 deletions
1
training/cambricon/llama3_70B_continuetrain-flagscale/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
此测例为FlagScale相关项目测例 |
34 changes: 34 additions & 0 deletions
34
training/cambricon/llama3_70B_continuetrain-flagscale/config/config_MLUx4x8.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# FlagPerf config: FlagScale llama3-70B continue-training on 4 hosts x 8 MLUs.

# scale_parent must be under FlagPerf/ or data_dir/, otherwise you cannot
# mount it into baremetal, and therefore cannot use shared storage.
scale_parent = "/share/project/zhaodeming/data_dir"  # NOTE(review): user-specific path — adjust per deployment
scale_home = f"{scale_parent}/FlagScale/build/cambricon_MLU/FlagScale"

# This cmd should install FlagScale at <scale_home>. NOTE(review): the comment
# was copy-pasted from the llava config and names
# flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py —
# presumably the llama3 benchmark's run_pretraining.py sets <scale_home> here; verify.
scale_download_cmd = f"cd {scale_parent}"

# Empty: all requirements are already baked into the base docker image.
# A vendor can put any extra install steps here.
scale_install_cmd = ""

# FlagScale example configs and data/checkpoint locations for llama3-70B.
scale_conf_dir = f"{scale_home}/examples/llama/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_llama3_70b_finetune.yaml"
dataset = f"{scale_parent}/SAMPLE50B/llama3/llama3_dataset"
tokenizer = f"{scale_parent}/SAMPLE50B/llama3/llama3_tokenizer"
ckpt = f"{scale_parent}/llama3_ckpt"

# Optional shell hooks keyed by hook name; none needed here.
#cmds = {"before_start": "source /root/miniconda3/bin/activate flagscale"}
cmds = {}
# flagscale's requirements
flagscale_chip_type = "MLU"
flagscale_ssh_port = 55623

# Create MLU_FP16_FLOPS.py under the FlagPerf root directory; the file must
# define the MLU hardware peak FLOPS value, e.g.:
#   MLU_FP16_FLOPS=1.0
FLOPS_DIR='../../../../'  # relative path from this config up to the FlagPerf root
import sys
sys.path.append(FLOPS_DIR)  # must run before the import below — order matters
from MLU_FP16_FLOPS import MLU_FP16_FLOPS
flops = float(MLU_FP16_FLOPS)

# for llama's algorithm
steps = 500
Empty file.
1 change: 1 addition & 0 deletions
1
training/cambricon/llava1.5_7b_continuetrain-flagscale/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
此测例为FlagScale相关项目测例 |
35 changes: 35 additions & 0 deletions
35
training/cambricon/llava1.5_7b_continuetrain-flagscale/config/config_MLUx4x8.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# FlagPerf config: FlagScale llava1.5-7B continue-training on 4 hosts x 8 MLUs.

# scale_parent must be under FlagPerf/ or data_dir/, otherwise you cannot
# mount it into baremetal, and therefore cannot use shared storage.
scale_parent = "/share/project/zhaodeming/data_dir"  # NOTE(review): user-specific path — adjust per deployment
scale_home = f"{scale_parent}/FlagScale/build/cambricon_MLU/FlagScale"

# This cmd should install FlagScale at <scale_home>. <scale_home> is set by
# flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
scale_download_cmd = f"cd {scale_parent}"

# Empty: all requirements are already baked into the base docker image.
# A vendor can put any extra install steps here.
scale_install_cmd = ""

# Locate the megatron-energon install path; the copy from that path into
# flagscale/megatron/ is done by flagperf...run_pretraining.py
energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"

# FlagScale example configs and multimodal data assets for llava1.5-7B.
scale_conf_dir = f"{scale_home}/examples/llava/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"

# Optional shell hooks keyed by hook name; none needed here.
cmds = {}
# flagscale's requirements
flagscale_chip_type = "MLU"
flagscale_ssh_port = 55623

# Create MLU_FP16_FLOPS.py under the FlagPerf root directory; the file must
# define the MLU hardware peak FLOPS value, e.g.:
#   MLU_FP16_FLOPS=1.0
FLOPS_DIR='../../../../'  # relative path from this config up to the FlagPerf root
import sys
sys.path.append(FLOPS_DIR)  # must run before the import below — order matters
from MLU_FP16_FLOPS import MLU_FP16_FLOPS
flops = float(MLU_FP16_FLOPS)

# for llava's algorithm
steps = 5000
1 change: 1 addition & 0 deletions
1
training/cambricon/llava1.5_7b_continuetrain-flagscale/config/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
megatron-energon==2.2.0 |
1 change: 1 addition & 0 deletions
1
training/cambricon/mixtral_8x7B_continuetrain-flagscale/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
此测例为FlagScale相关项目测例 |
33 changes: 33 additions & 0 deletions
33
training/cambricon/mixtral_8x7B_continuetrain-flagscale/config/config_MLUx4x8.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# FlagPerf config: FlagScale mixtral-8x7B continue-training on 4 hosts x 8 MLUs.

# scale_parent must be under FlagPerf/ or data_dir/, otherwise you cannot
# mount it into baremetal, and therefore cannot use shared storage.
scale_parent = "/share/project/zhaodeming/data_dir"  # NOTE(review): user-specific path — adjust per deployment
scale_home = f"{scale_parent}/FlagScale/build/cambricon_MLU/FlagScale"

# This cmd should install FlagScale at <scale_home>. NOTE(review): the comment
# was copy-pasted from the llava config and names
# flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py —
# presumably the mixtral benchmark's run_pretraining.py sets <scale_home> here; verify.
scale_download_cmd = f"cd {scale_parent}"

# Empty: all requirements are already baked into the base docker image.
# A vendor can put any extra install steps here.
scale_install_cmd = ""

# FlagScale example configs and data/checkpoint locations for mixtral-8x7B.
scale_conf_dir = f"{scale_home}/examples/mixtral/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_mixtral_8x7b.yaml"
dataset = f"{scale_parent}/SAMPLE50B/mixtral/mixtral_dataset"
tokenizer = f"{scale_parent}/SAMPLE50B/mixtral/mixtral_tokenizer"
ckpt = f"{scale_parent}/mixtral_tp2_pp4_ep4_latest"  # checkpoint name suggests TP=2, PP=4, EP=4 layout — confirm against trainyaml

# Optional shell hooks keyed by hook name; none needed here.
cmds = {}
# flagscale's requirements
flagscale_chip_type = "MLU"
flagscale_ssh_port = 55623

# Create MLU_FP16_FLOPS.py under the FlagPerf root directory; the file must
# define the MLU hardware peak FLOPS value, e.g.:
#   MLU_FP16_FLOPS=1.0
FLOPS_DIR='../../../../'  # relative path from this config up to the FlagPerf root
import sys
sys.path.append(FLOPS_DIR)  # must run before the import below — order matters
from MLU_FP16_FLOPS import MLU_FP16_FLOPS
flops = float(MLU_FP16_FLOPS)

# for mixtral's algorithm
steps = 1000
Empty file.