Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cambricon] support llava1.5_7b & llama3_70b & mixtral_8*7b with #752

Merged
merged 1 commit into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def replace_yamls(scale_home, config_module, args):
args.log_dir, "outputs_llama3", "logs", "host_" +
str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output")

info_line = []
while True:
info_line = []
try:
with open(timestamp_log_file, 'r') as f:
lines = f.readlines()
Expand Down
14 changes: 14 additions & 0 deletions training/cambricon/docker_image/flagscale_2409/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
FROM cambricon_flagscale_2409_pt21

# Use bash for every subsequent RUN instruction (required for the
# `echo -e` line-continuation trick and the sed expression below).
SHELL ["/bin/bash", "-c"]

# Install sshd plus host diagnostic tools in a single layer (one apt update,
# one cache), then drop the apt lists so the cache is not baked into the image.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends tzdata && \
    apt-get install -y openssh-server sudo dmidecode ipmitool sysstat net-tools sshpass && \
    mkdir -p /run/sshd && \
    rm -rf /var/lib/apt/lists/*

# Comment out the interactive-shell guard (`[ -z "$PS1" ] && return`) in
# ~/.bashrc so the exports appended below also take effect for
# non-interactive SSH sessions (e.g. remote training launch commands).
RUN sed -i '/\[ -z "\$PS1" \] \&\& return/s/^/#/' ~/.bashrc

# Cambricon runtime environment: Neuware SDK paths, OpenMPI, the PyTorch
# virtualenv, and CNCL (Cambricon collective comms) tuning knobs.
RUN echo -e "\n# Add environment variables\n\
export NEUWARE_HOME=/usr/local/neuware\n\
export LD_LIBRARY_PATH=/usr/local/neuware/lib64:/usr/local/openmpi/lib:${LD_LIBRARY_PATH}\n\
export PATH=/torch/venv3/pytorch/bin:/usr/local/neuware/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH}\n\
export CNCL_MLULINK_OVER_ROCE_DISABLE=1\n\
export CNCL_MLULINK_CROSS_HOSTS_ENABLE=0\n\
export CNCL_MLU_DIRECT_LEVEL=1" >> ~/.bashrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Install Python dependencies for the Cambricon FlagScale benchmarks.
# (Fixed: the original file carried a duplicated `#!/bin/bash` line; only
# the first line of a script is interpreted as a shebang.)
#
# -x: echo each command as it runs; -e: abort on the first failing command.
set -xe

# Use the Tsinghua PyPI mirror for faster, more reliable downloads
# from within mainland China.
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

# Version-pinned packages known to work with this FlagScale snapshot.
pip3 install regex==2024.5.15 schedule==1.2.2 accelerate==0.31.0 transformers==4.40.1 protobuf==3.20.0
# Unpinned helper/runtime libraries (dataset handling, config, logging).
pip3 install pybind11 hydra-core s3fs braceexpand webdataset wandb loguru sentencepiece datasets
# Multimodal data loader required by the llava benchmark.
pip3 install megatron-energon==2.2.0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
此测例为FlagScale相关项目测例
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# FlagPerf configuration for the llama3_70b FlagScale benchmark on Cambricon
# MLU. Every top-level name in this module is read by FlagPerf's
# run_pretraining.py, so identifiers and statement order must be preserved.

# scale_parent must be under FlagPerf/ or data_dir/; otherwise it cannot be
# mounted into the baremetal container and shared storage cannot be used.
scale_parent = "/share/project/zhaodeming/data_dir"
scale_home = f"{scale_parent}/FlagScale/build/cambricon_MLU/FlagScale"

# This cmd should install FlagScale at <scale_home>. <scale_home> is set by
# flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
scale_download_cmd = f"cd {scale_parent}"

# Empty because all requirements are already baked into the base docker
# image; a vendor can put any extra install steps here.
scale_install_cmd = ""

# FlagScale YAML locations for the llama3_70b finetune run.
scale_conf_dir = f"{scale_home}/examples/llama/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_llama3_70b_finetune.yaml"
dataset = f"{scale_parent}/SAMPLE50B/llama3/llama3_dataset"
tokenizer = f"{scale_parent}/SAMPLE50B/llama3/llama3_tokenizer"
ckpt = f"{scale_parent}/llama3_ckpt"

#cmds = {"before_start": "source /root/miniconda3/bin/activate flagscale"}
# Extra shell commands keyed by hook name; empty means no hooks.
cmds = {}
# flagscale's requirements
flagscale_chip_type = "MLU"
flagscale_ssh_port = 55623

# Create MLU_FP16_FLOPS.py under the FlagPerf directory; the file must
# define the MLU hardware FP16 FLOPS value, for example:
# MLU_FP16_FLOPS=1.0
# NOTE(review): the relative path below resolves against the process CWD —
# this assumes the runner executes from this config's directory; confirm.
FLOPS_DIR='../../../../'
import sys
sys.path.append(FLOPS_DIR)
from MLU_FP16_FLOPS import MLU_FP16_FLOPS
flops = float(MLU_FP16_FLOPS)

# for llama's algorithm
steps = 500
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
此测例为FlagScale相关项目测例
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage
scale_parent = "/share/project/zhaodeming/data_dir"
scale_home = f"{scale_parent}/FlagScale/build/cambricon_MLU/FlagScale"

# this cmd should install scale at <scale_home>. <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
scale_download_cmd = f"cd {scale_parent}"

# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here
scale_install_cmd = ""

# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py
energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"

scale_conf_dir = f"{scale_home}/examples/llava/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"

cmds = {}
# flagscale's requirements
flagscale_chip_type = "MLU"
flagscale_ssh_port = 55623

# 请在Flagperf目录下,创建MLU_FP16_FLOPS.py,
# 文件包含MLU硬件算力值,示例如下:
# MLU_FP16_FLOPS=1.0
FLOPS_DIR='../../../../'
import sys
sys.path.append(FLOPS_DIR)
from MLU_FP16_FLOPS import MLU_FP16_FLOPS
flops = float(MLU_FP16_FLOPS)

# for llava's algorithm
steps = 5000
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
megatron-energon==2.2.0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
此测例为FlagScale相关项目测例
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage
scale_parent = "/share/project/zhaodeming/data_dir"
scale_home = f"{scale_parent}/FlagScale/build/cambricon_MLU/FlagScale"

# this cmd should install scale at <scale_home>. <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
scale_download_cmd = f"cd {scale_parent}"

# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here
scale_install_cmd = ""

scale_conf_dir = f"{scale_home}/examples/mixtral/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_mixtral_8x7b.yaml"
dataset = f"{scale_parent}/SAMPLE50B/mixtral/mixtral_dataset"
tokenizer = f"{scale_parent}/SAMPLE50B/mixtral/mixtral_tokenizer"
ckpt = f"{scale_parent}/mixtral_tp2_pp4_ep4_latest"

cmds = {}
# flagscale's requirements
flagscale_chip_type = "MLU"
flagscale_ssh_port = 55623

# 请在Flagperf目录下,创建MLU_FP16_FLOPS.py,
# 文件包含MLU硬件算力值,示例如下:
# MLU_FP16_FLOPS=1.0
FLOPS_DIR='../../../../'
import sys
sys.path.append(FLOPS_DIR)
from MLU_FP16_FLOPS import MLU_FP16_FLOPS
flops = float(MLU_FP16_FLOPS)

# for mixtral's algorithm
steps = 1000