[KUNLUNXIN] case config: llava1.5_7b. (#762)
zhangsanfeng2022 authored Oct 16, 2024
1 parent 7fdab21 commit f67e60c
Showing 9 changed files with 85 additions and 0 deletions.
12 changes: 12 additions & 0 deletions training/kunlunxin/docker_image/flagscale_llava/Dockerfile
@@ -0,0 +1,12 @@
FROM zhiyuan_flagscale_llava:newest
RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
RUN /bin/bash -c "uname -a"
# alias python3 to python (note: an alias set in a RUN step does not persist into later layers)
RUN /bin/bash -c "alias python3=python"
ENV PATH=/root/miniconda/envs/python39_torch201_cuda/bin:$PATH

RUN echo '#!/bin/bash\nservice ssh restart\nexec "$@"' > /docker-start.sh

RUN chmod +x /docker-start.sh
RUN apt-get update && apt-get install -y openssh-server
ENTRYPOINT ["sh", "/docker-start.sh"]

@@ -0,0 +1,9 @@
#!/bin/bash

set -x

# cd /opt/xpytorch && bash xpytorch-cp39-torch201-ubuntu2004-x64.run

CUDART_DUMMY_REGISTER=1 python -m torch_xmlir --doctor &> /tmp/xpytorch.version.out
CUDART_DUMMY_REGISTER=1 python -c "import torch; print(torch.rand(512, 128).cuda())" &> /tmp/xpytorch.test.out
/etc/init.d/ssh restart
1 change: 1 addition & 0 deletions training/kunlunxin/llava1.5_7b-flagscale/README.md
@@ -0,0 +1 @@
This test case is for FlagScale-related projects.
@@ -0,0 +1,29 @@
# scale_parent must be under FlagPerf/ or data_dir/; otherwise it cannot be mounted into the baremetal machine, and shared storage cannot be used
#scale_parent = "/share"
scale_parent = "/share/project/PUBLIC/data/llava1.5-7b"
scale_home = f"{scale_parent}/FlagScale/build/kunlunxin_R300p/FlagScale"

# this cmd should install FlagScale at <scale_home>; <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
#scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 085811f"
scale_download_cmd = f"cd {scale_home}"

# NVIDIA needs nothing here because all requirements are already installed in the base docker image; a vendor can add any related setup here
scale_install_cmd = ""

# locate energon; the copy from the energon install path to flagscale/megatron/ is done by flagperf...run_pretraining.py
energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"

scale_conf_dir = f"{scale_home}/examples/llava/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"

cmds = {"before_start": "source ~/.bashrc"}
# flagscale's requirements
flagscale_chip_type = "R300p"
flagscale_ssh_port = 4323
flops = 999

# for llava's algorithm
steps = 30
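
For context on how energon_locate_cmd is typically consumed, here is a minimal sketch, assuming the harness runs the shell pipeline with Python's subprocess module and treats the trimmed stdout as the directory containing the megatron-energon install; the helper name is illustrative, not FlagPerf's actual API.

import subprocess

# Minimal sketch: run the configured shell pipeline and capture the
# location that `pip show megatron-energon` reports.
ENERGON_LOCATE_CMD = (
    "pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"
)

def locate_energon(cmd: str = ENERGON_LOCATE_CMD) -> str:
    # shell=True because the command is a pipeline; strip the trailing newline
    return subprocess.check_output(cmd, shell=True, text=True).strip()

# e.g. locate_energon() might return something like
# "/root/miniconda/envs/python39_torch201_cuda/lib/python3.9/site-packages"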
@@ -0,0 +1 @@
megatron-energon==2.2.0
@@ -0,0 +1 @@
This test case is for FlagScale-related projects.
@@ -0,0 +1,29 @@
# scale_parent must be under FlagPerf/ or data_dir/; otherwise it cannot be mounted into the baremetal machine, and shared storage cannot be used
#scale_parent = "/share"
scale_parent = "/share/project/PUBLIC/data/llava1.5-7b"
scale_home = f"{scale_parent}/FlagScale/build/kunlunxin_R300p/FlagScale"

# this cmd should install FlagScale at <scale_home>; <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
#scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 085811f"
scale_download_cmd = f"cd {scale_home}"

# NVIDIA needs nothing here because all requirements are already installed in the base docker image; a vendor can add any related setup here
scale_install_cmd = ""

# locate energon; the copy from the energon install path to flagscale/megatron/ is done by flagperf...run_pretraining.py
energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"

scale_conf_dir = f"{scale_home}/examples/llava/conf"
configyaml = f"{scale_conf_dir}/config.yaml"
trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"

cmds = {"before_start": "source ~/.bashrc"}
# flagscale's requirements
flagscale_chip_type = "R300p"
flagscale_ssh_port = 4323
flops = 999

# for llava's algorithm
steps = 5000
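
The config exposes the paths of FlagScale's YAML files (configyaml, trainyaml, datasetyaml), presumably so that run_pretraining.py can patch fields such as the step count before launching. Below is a minimal sketch of that kind of patching, assuming PyYAML is available; the train.train_iters key is a hypothetical stand-in, not a key confirmed to exist in train_llava1.5_7b.yaml.

import yaml

def set_train_steps(trainyaml_path: str, steps: int) -> None:
    # Hedged sketch: load the train YAML, overwrite a hypothetical
    # step-count field, and write the file back. The real key layout
    # in FlagScale's train_llava1.5_7b.yaml may differ.
    with open(trainyaml_path) as f:
        conf = yaml.safe_load(f) or {}
    conf.setdefault("train", {})["train_iters"] = steps
    with open(trainyaml_path, "w") as f:
        yaml.safe_dump(conf, f)

# e.g. set_train_steps(trainyaml, steps) with the values defined above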
@@ -0,0 +1 @@
megatron-energon==2.2.0
2 changes: 2 additions & 0 deletions training/run_benchmarks/config/test_conf.py
@@ -133,6 +133,8 @@
# "mixtral_8x7B:megatron_core060:H100:4:8:1": "/raid/datasets/mistral"

# kunlunxin cases
#"llava1.5_7b:flagscale_llava:R300p:4:8:1": "/workspace/data_dir"
#"llava1.5_7b_continuetrain:flagscale_llava:R300p:4:8:1": "/workspace/data_dir"
# "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2",
# "resnet50:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
# "mask_rcnn:pytorch:R300:1:8:1": "/raid/dataset/coco2017/",
