Skip to content

Commit

Permalink
[cambricon] support llava1.5_7b & llama3_70b & mixtral_8x7b with
Browse files Browse the repository at this point in the history
flagscale
  • Loading branch information
cifar10 committed Sep 21, 2024
1 parent 2ed227e commit e706e3f
Show file tree
Hide file tree
Showing 12 changed files with 128 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def replace_yamls(scale_home, config_module, args):
args.log_dir, "outputs_llama3", "logs", "host_" +
str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output")

info_line = []
while True:
info_line = []
try:
with open(timestamp_log_file, 'r') as f:
lines = f.readlines()
Expand Down
14 changes: 14 additions & 0 deletions training/cambricon/docker_image/flagscale_2409/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Base image: Cambricon FlagScale 2409 stack with PyTorch 2.1 preinstalled.
FROM cambricon_flagscale_2409_pt21

# Use bash for all RUN instructions so `echo -e` below behaves as expected.
SHELL ["/bin/bash", "-c"]

# Install the SSH server plus host-diagnostic tools in ONE layer:
# - apt-get (not `apt`) for stable scripting behavior (hadolint DL3027);
# - DEBIAN_FRONTEND set inline so it does not leak into the runtime env;
# - apt lists removed in the same layer so the cache never lands in the image.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends \
      dmidecode \
      ipmitool \
      net-tools \
      openssh-server \
      sshpass \
      sudo \
      sysstat \
      tzdata \
    && mkdir -p /run/sshd \
    && rm -rf /var/lib/apt/lists/*

# Comment out the "[ -z "$PS1" ] && return" guard in ~/.bashrc so the
# environment block appended below is also sourced by non-interactive shells
# (e.g. commands the benchmark harness launches over ssh).
RUN sed -i '/\[ -z "\$PS1" \] \&\& return/s/^/#/' ~/.bashrc

# Append the Cambricon runtime environment to ~/.bashrc. ${LD_LIBRARY_PATH}
# and ${PATH} are escaped (\$) so they expand when the login shell starts,
# preserving the runtime environment instead of baking in build-time values.
RUN echo -e "\n# Add environment variables\n\
export NEUWARE_HOME=/usr/local/neuware\n\
export LD_LIBRARY_PATH=/usr/local/neuware/lib64:/usr/local/openmpi/lib:\${LD_LIBRARY_PATH}\n\
export PATH=/torch/venv3/pytorch/bin:/usr/local/neuware/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:\${PATH}\n\
export CNCL_MLULINK_OVER_ROCE_DISABLE=1\n\
export CNCL_MLULINK_CROSS_HOSTS_ENABLE=0\n\
export CNCL_MLU_DIRECT_LEVEL=1" >> ~/.bashrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Install Python dependencies required by the FlagScale benchmark cases.
# -x traces each command for debuggable CI logs; -e aborts on first failure.
set -xe

# Use the Tsinghua PyPI mirror for faster, more reliable downloads in-region.
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

# Versions pinned to the combination validated with this FlagScale build.
pip3 install regex==2024.5.15 schedule==1.2.2 accelerate==0.31.0 transformers==4.40.1 protobuf==3.20.0
# Helper packages (unpinned); pin these too if reproducibility issues appear.
pip3 install pybind11 hydra-core s3fs braceexpand webdataset wandb loguru sentencepiece datasets
# Energon provides the multimodal dataloader used by Megatron's llava example.
pip3 install megatron-energon==2.2.0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
此测例为FlagScale相关项目测例
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# FlagPerf vendor config for the llama3_70b (finetune) case on Cambricon MLU.
# Every module-level name below is read by the FlagPerf/FlagScale harness.
# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage
# NOTE(review): site-specific absolute path — adjust per deployment.
scale_parent = "/share/project/zhaodeming/data_dir"
scale_home = f"{scale_parent}/FlagScale/build/cambricon_MLU/FlagScale"

# this cmd should install scale at <scale_home>. <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
scale_download_cmd = f"cd {scale_parent}"

# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here
scale_install_cmd = ""

# Hydra entry config plus the task-specific training yaml for this case.
scale_conf_dir = f"{scale_home}/examples/llama/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_llama3_70b_finetune.yaml"
# Dataset, tokenizer and checkpoint inputs for the finetune run.
dataset = f"{scale_parent}/SAMPLE50B/llama3/llama3_dataset"
tokenizer = f"{scale_parent}/SAMPLE50B/llama3/llama3_tokenizer"
ckpt = f"{scale_parent}/llama3_ckpt"

#cmds = {"before_start": "source /root/miniconda3/bin/activate flagscale"}
# No extra commands needed before starting on this vendor stack.
cmds = {}
# flagscale's requirements
flagscale_chip_type = "MLU"
flagscale_ssh_port = 55623

# Create MLU_FP16_FLOPS.py under the FlagPerf directory; the file holds the
# MLU hardware FP16 peak FLOPS value, for example:
# MLU_FP16_FLOPS=1.0
# NOTE(review): relative path — presumably resolved against the process cwd
# when the harness imports this config; verify against the caller.
FLOPS_DIR='../../../../'
import sys
sys.path.append(FLOPS_DIR)
from MLU_FP16_FLOPS import MLU_FP16_FLOPS
flops = float(MLU_FP16_FLOPS)

# for llama's algorithm
steps = 500
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
此测例为FlagScale相关项目测例
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# FlagPerf vendor config for the llava1.5_7b case on Cambricon MLU.
# Every module-level name below is read by the FlagPerf/FlagScale harness.
# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage
# NOTE(review): site-specific absolute path — adjust per deployment.
scale_parent = "/share/project/zhaodeming/data_dir"
scale_home = f"{scale_parent}/FlagScale/build/cambricon_MLU/FlagScale"

# this cmd should install scale at <scale_home>. <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
scale_download_cmd = f"cd {scale_parent}"

# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here
scale_install_cmd = ""

# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py
# (shell pipeline: extract the "Location:" field from `pip show` and trim it)
energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"

# Hydra entry config, task yaml, and multimodal dataset/prompt descriptors.
scale_conf_dir = f"{scale_home}/examples/llava/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"

# No extra commands needed before starting on this vendor stack.
cmds = {}
# flagscale's requirements
flagscale_chip_type = "MLU"
flagscale_ssh_port = 55623

# Create MLU_FP16_FLOPS.py under the FlagPerf directory; the file holds the
# MLU hardware FP16 peak FLOPS value, for example:
# MLU_FP16_FLOPS=1.0
# NOTE(review): relative path — presumably resolved against the process cwd
# when the harness imports this config; verify against the caller.
FLOPS_DIR='../../../../'
import sys
sys.path.append(FLOPS_DIR)
from MLU_FP16_FLOPS import MLU_FP16_FLOPS
flops = float(MLU_FP16_FLOPS)

# for llava's algorithm
steps = 5000
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
megatron-energon==2.2.0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
此测例为FlagScale相关项目测例
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# FlagPerf vendor config for the mixtral_8x7b case on Cambricon MLU.
# Every module-level name below is read by the FlagPerf/FlagScale harness.
# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage
# NOTE(review): site-specific absolute path — adjust per deployment.
scale_parent = "/share/project/zhaodeming/data_dir"
scale_home = f"{scale_parent}/FlagScale/build/cambricon_MLU/FlagScale"

# this cmd should install scale at <scale_home>. <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
scale_download_cmd = f"cd {scale_parent}"

# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here
scale_install_cmd = ""

# Hydra entry config plus the task-specific training yaml for this case.
scale_conf_dir = f"{scale_home}/examples/mixtral/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_mixtral_8x7b.yaml"
# Dataset, tokenizer and checkpoint inputs for the run.
# NOTE(review): ckpt dir name suggests tp2/pp4/ep4 parallelism layout — confirm
# it matches the train yaml.
dataset = f"{scale_parent}/SAMPLE50B/mixtral/mixtral_dataset"
tokenizer = f"{scale_parent}/SAMPLE50B/mixtral/mixtral_tokenizer"
ckpt = f"{scale_parent}/mixtral_tp2_pp4_ep4_latest"

# No extra commands needed before starting on this vendor stack.
cmds = {}
# flagscale's requirements
flagscale_chip_type = "MLU"
flagscale_ssh_port = 55623

# Create MLU_FP16_FLOPS.py under the FlagPerf directory; the file holds the
# MLU hardware FP16 peak FLOPS value, for example:
# MLU_FP16_FLOPS=1.0
# NOTE(review): relative path — presumably resolved against the process cwd
# when the harness imports this config; verify against the caller.
FLOPS_DIR='../../../../'
import sys
sys.path.append(FLOPS_DIR)
from MLU_FP16_FLOPS import MLU_FP16_FLOPS
flops = float(MLU_FP16_FLOPS)

# for mixtral's algorithm
steps = 1000
Empty file.

0 comments on commit e706e3f

Please sign in to comment.