[KUNLUNXIN] case config: llava1.5_7b.

FlagOpen · Oct 15, 2024 · 70e8988 · 70e8988
1 parent ea906ae
commit 70e8988
Show file tree

Hide file tree

Showing 9 changed files with 81 additions and 0 deletions.
diff --git a/training/kunlunxin/docker_image/flagscale_llava/Dockerfile b/training/kunlunxin/docker_image/flagscale_llava/Dockerfile
@@ -0,0 +1,12 @@
+FROM zhiyuan_flagscale_llava:newest
+RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
+RUN /bin/bash -c "uname -a"
+# Put the conda environment's bin directory first on PATH.
+ENV PATH=/root/miniconda/envs/python39_torch201_cuda/bin:$PATH
+
+# Install sshd before creating the entrypoint wrapper that restarts it.
+RUN apt-get update && apt-get install -y --no-install-recommends openssh-server && rm -rf /var/lib/apt/lists/*
+# The wrapper restarts ssh, then execs the container command; printf expands the \n escapes under any /bin/sh.
+RUN printf '#!/bin/bash\nservice ssh restart\nexec "$@"\n' > /docker-start.sh && chmod +x /docker-start.sh
+ENTRYPOINT ["sh", "/docker-start.sh"]
+
diff --git a/training/kunlunxin/docker_image/flagscale_llava/flagscale_llava_install.sh b/training/kunlunxin/docker_image/flagscale_llava/flagscale_llava_install.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Runs two Python checks with CUDART_DUMMY_REGISTER=1, saving their output under /tmp, then restarts ssh.
+set -x
+
+# cd /opt/xpytorch && bash xpytorch-cp39-torch201-ubuntu2004-x64.run
+# stdout and stderr of each check go to the same file.
+CUDART_DUMMY_REGISTER=1 python -m torch_xmlir --doctor > /tmp/xpytorch.version.out 2>&1
+CUDART_DUMMY_REGISTER=1 python -c "import torch; print(torch.rand(512, 128).cuda())" > /tmp/xpytorch.test.out 2>&1
+/etc/init.d/ssh restart
diff --git a/training/kunlunxin/llava1.5_7b-flagscale/README.md b/training/kunlunxin/llava1.5_7b-flagscale/README.md
@@ -0,0 +1 @@
+此测例为FlagScale相关项目测例
diff --git a/training/kunlunxin/llava1.5_7b-flagscale/config/config_R300px4x8.py b/training/kunlunxin/llava1.5_7b-flagscale/config/config_R300px4x8.py
@@ -0,0 +1,27 @@
+# scale_parent must be under FlagPerf/ or data_dir/; otherwise it cannot be mounted into the bare-metal environment, so shared storage cannot be used
+scale_parent = "/share/project/PUBLIC"
+scale_home = f"{scale_parent}/FlagScale"
+
+# this command should install FlagScale at <scale_home>. <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py. Here it only changes directory to scale_parent
+scale_download_cmd = f"cd {scale_parent}"
+
+# NV needs nothing here because all requirements are already provided by the base docker image. Vendors can run any related setup here
+scale_install_cmd = ""
+
+# locate energon: prints the Location field of `pip show megatron-energon`. The copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py
+energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"
+# paths of the LLaVA example config files, dataset yaml and prompt json, all located under scale_home
+scale_conf_dir = f"{scale_home}/examples/llava/conf"
+configyaml = f"{scale_conf_dir}/config.yaml"
+trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
+datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
+prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"
+
+cmds = {"before_start": "source ~/.bashrc"}
+# FlagScale's requirements
+flagscale_chip_type = "R300p"
+flagscale_ssh_port = 4323
+flops = 999  # NOTE(review): looks like a placeholder value — confirm
+
+# for LLaVA's algorithm
+steps = 30
diff --git a/training/kunlunxin/llava1.5_7b-flagscale/config/requirements.txt b/training/kunlunxin/llava1.5_7b-flagscale/config/requirements.txt
@@ -0,0 +1 @@
+megatron-energon==2.2.0
diff --git a/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/README.md b/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/README.md
@@ -0,0 +1 @@
+此测例为FlagScale相关项目测例
diff --git a/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/config_R300px4x8.py b/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/config_R300px4x8.py
@@ -0,0 +1,27 @@
+# scale_parent must be under FlagPerf/ or data_dir/; otherwise it cannot be mounted into the bare-metal environment, so shared storage cannot be used
+scale_parent = "/share/project/PUBLIC"
+scale_home = f"{scale_parent}/FlagScale"
+
+# this command should install FlagScale at <scale_home>. <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py. Here it only changes directory to scale_parent
+scale_download_cmd = f"cd {scale_parent}"
+
+# NV needs nothing here because all requirements are already provided by the base docker image. Vendors can run any related setup here
+scale_install_cmd = ""
+
+# locate energon: prints the Location field of `pip show megatron-energon`. The copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py
+energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"
+# paths of the LLaVA example config files, dataset yaml and prompt json, all located under scale_home
+scale_conf_dir = f"{scale_home}/examples/llava/conf"
+configyaml = f"{scale_conf_dir}/config.yaml"
+trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
+datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
+prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"
+
+cmds = {"before_start": "source ~/.bashrc"}
+# FlagScale's requirements
+flagscale_chip_type = "R300p"
+flagscale_ssh_port = 4323
+flops = 999  # NOTE(review): looks like a placeholder value — confirm
+
+# for LLaVA's algorithm
+steps = 5000
diff --git a/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/requirements.txt b/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/requirements.txt
@@ -0,0 +1 @@
+megatron-energon==2.2.0
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
@@ -133,6 +133,8 @@
     # "mixtral_8x7B:megatron_core060:H100:4:8:1": "/raid/datasets/mistral"
 
     # kunlunxin cases
+    # "llava1.5_7b:flagscale_llava:R300p:4:8:1": "/workspace/data_dir",
+    # "llava1.5_7b_continuetrain:flagscale_llava:R300p:4:8:1": "/workspace/data_dir",
     # "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2",
     # "resnet50:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
     # "mask_rcnn:pytorch:R300:1:8:1": "/raid/dataset/coco2017/",