Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Slc/modify cuda ci #235

Merged
merged 21 commits into from
Aug 17, 2023
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions scripts/ci/ci_one_iter.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/bin/bash

function clone_needed_repo() {

set -e
# clone some repositories

Expand All @@ -16,11 +15,12 @@ function clone_needed_repo() {
MMACTION2_VERSION=dipu_v1.0.0_one_iter_tool
MMOCR_VERSION=dipu_v1.0.0_one_iter_tool
MMAGIC=dipu_v1.0.0_one_iter_tool
SMART_VERSION=dev_for_mmcv2.0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

把这个逻辑合入到dev_for_mmcv2.0分支内。

SMART_VERSION=slc/support-eiengine-trans
MMYOLO=dipu_v0.5.0_one_iter_tool
DIENGINE=dipu_v0.4.8_one_iter_tool


rm -rf SMART && git clone -b ${SMART_VERSION} https://github.com/ParrotsDL/SMART.git
rm -rf DI-engine && git clone -b ${DIENGINE} https://github.com/DeepLink-org/DI-engine.git
rm -rf SMART && git clone -b ${SMART_VERSION} https://github.com/DeepLink-org/SMART.git
rm -rf mmpretrain && git clone -b ${MMPRETRAIN_VERSION} https://github.com/DeepLink-org/mmpretrain.git
rm -rf mmdetection && git clone -b ${MMDETECTION_VERSION} https://github.com/DeepLink-org/mmdetection.git
rm -rf mmsegmentation && git clone -b ${MMSEGMENTATION_VERSION} https://github.com/DeepLink-org/mmsegmentation.git
Expand All @@ -38,19 +38,28 @@ function clone_needed_repo() {
function build_needed_repo_cuda() {
# CUDA variant of the repo build step: only the mmcv extension build is active;
# every other install step below is currently commented out.
cd mmcv
# MMCV_WITH_DIOPI / MMCV_WITH_OPS are handed to setup.py through the environment;
# `build_ext -i` builds the extension modules in place inside the source tree.
MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
# NOTE(review): a second `cd ..` follows further down with only comments in
# between -- confirm that both are intended.
cd ..
# cd ../mmdet
# pip install -e . --no-deps
# cd ../mmyolo
# # Install albumentations
# pip install -r requirements/albu.txt --no-deps
# # Install MMYOLO
# pip install -e . --no-deps
cd ..
# cd mmagic
# pip install -e . -v --no-deps
# pip install -e . -v
# cd ../mmpretrain
# pip install -e .
# cd ..
# cd DI-engine
# pip install -e .
# cd ..
# # Install the packages needed for reinforcement learning
# pip install lz4
# pip install readerwriterlock
# pip install Flask==2.1.0
# pip install transformers
# pip install accelerate
}

function build_needed_repo_camb() {
Expand All @@ -68,6 +77,7 @@ function export_repo_pythonpath(){
export PYTHONPATH=${basic_path}/mmagic:$PYTHONPATH
export PYTHONPATH=${basic_path}/data/stable-diffusion-v1-5:$PYTHONPATH
export PYTHONPATH=${basic_path}/mmagic/mmagic/models/editors/stable_diffusion:$PYTHONPATH
export PYTHONPATH=${basic_path}/DI-engine:$PYTHONPATH
elif [ "$1" = "camb" ]; then
echo "Executing CAMB operation in pythonpath..."
export PYTHONPATH=/mnt/lustre/share/platform/env/miniconda3.8/envs/pt2.0_diopi/mmcvs/9b1209f:$PYTHONPATH
Expand Down Expand Up @@ -99,11 +109,12 @@ function build_dataset(){
ln -s /mnt/lustre/share_data/parrots.tester.s.03/dataset/data_for_ln/imagenet data/imagenet
ln -s /mnt/lustre/share_data/parrots.tester.s.03/dataset/data_for_ln/coco data/coco
ln -s /mnt/lustre/share_data/parrots.tester.s.03/dataset/data_for_ln/cityscapes data/cityscapes
# ln -s /mnt/lustre/share_data/parrots.tester.s.03/dataset/data_for_ln/kinetics400 data/kinetics400 #数据集还在迁移
ln -s /mnt/lustre/share_data/openmmlab/datasets/action/Kinetics400 data/kinetics400
ln -s /mnt/lustre/share_data/parrots.tester.s.03/dataset/data_for_ln/icdar2015 data/icdar2015
ln -s /mnt/lustre/share_data/parrots.tester.s.03/dataset/data_for_ln/mjsynth data/mjsynth
ln -s /mnt/lustre/share_data/parrots.tester.s.03/dataset/data_for_ln/kitti data/kitti
ln -s /mnt/lustre/share_data/shenliancheng/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
ln -s /mnt/lustre/share_data/parrots.tester.s.03/models_code/mmagic/stable-diffusion-v1-5 data/stable-diffusion-v1-5

elif [ "$1" = "camb" ]; then
echo "Executing CAMB operation in build dataset..."
Expand Down
32 changes: 24 additions & 8 deletions scripts/ci/ci_run_one_iter.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
error_flag = multiprocessing.Value('i',0) #if an error is encountered

if device_type == 'cuda':
random_model_num = 8
random_model_num = 100
print("we use cuda!")
else:
random_model_num = 100
Expand Down Expand Up @@ -57,10 +57,22 @@ def process_one_iter(model_info):
p3 = model_info_list[2]
p4 = model_info_list[3] if len(model_info_list) == 4 else ""

train_path = p1 + "/tools/train.py"
config_path = p1 + "/configs/" + p2
work_dir = "--work-dir=./one_iter_data/" + p3
opt_arg = p4
if("mm" in p1):
train_path = p1 + "/tools/train.py"
config_path = p1 + "/configs/" + p2
work_dir = "--work-dir=./one_iter_data/" + p3
opt_arg = p4
package_name = "mmlab"
elif("DI" in p1):
train_path = p1+"/"+p2
config_path = ""
work_dir = ""
opt_arg = ""
package_name = "diengine"
else:
print("Wrong model info in {}".format(model_info), flush = True)
exit(1)

os.environ['ONE_ITER_TOOL_STORAGE_PATH'] = os.getcwd()+"/one_iter_data/" + p3

storage_path = os.environ['ONE_ITER_TOOL_STORAGE_PATH']
Expand Down Expand Up @@ -93,11 +105,15 @@ def process_one_iter(model_info):
github_job_name = github_job #为了方便统一scancel,因此使用同样的jobname

if device_type == 'cuda':
cmd_run_one_iter = "srun --job-name={} --partition={} --gres={} --cpus-per-task=5 --mem=16G --time=40 sh SMART/tools/one_iter_tool/run_one_iter.sh {} {} {} {}".format(github_job_name, slurm_par, gpu_requests, train_path, config_path, work_dir, opt_arg)
cmd_cp_one_iter = "srun --job-name={} --partition={} --gres={} --cpus-per-task=5 --mem=16G --time=30 sh SMART/tools/one_iter_tool/compare_one_iter.sh".format(github_job_name, slurm_par, gpu_requests)
if(p2 == "stable_diffusion/stable-diffusion_ddim_denoisingunet_infer.py"):
cmd_run_one_iter = "srun --job-name={} --partition={} --gres={} --cpus-per-task=5 --mem=16G --time=40 sh mmagic/configs/stable_diffusion/stable-diffusion_ddim_denoisingunet_one_iter.sh".format(github_job_name, slurm_par, gpu_requests)
cmd_cp_one_iter = ""
else:
cmd_run_one_iter = "srun --job-name={} --partition={} --gres={} --cpus-per-task=5 --mem=16G --time=40 sh SMART/tools/one_iter_tool/run_one_iter.sh {} {} {} {}".format(github_job_name, slurm_par, gpu_requests, train_path, config_path, work_dir, opt_arg)
cmd_cp_one_iter = "srun --job-name={} --partition={} --gres={} --cpus-per-task=5 --mem=16G --time=30 sh SMART/tools/one_iter_tool/compare_one_iter.sh {}".format(github_job_name, slurm_par, gpu_requests, package_name)
else:
cmd_run_one_iter = "srun --job-name={} --partition={} --gres={} --time=40 sh SMART/tools/one_iter_tool/run_one_iter.sh {} {} {} {}".format(github_job_name, slurm_par, gpu_requests, train_path, config_path, work_dir, opt_arg)
cmd_cp_one_iter = "srun --job-name={} --partition={} --gres={} --time=30 sh SMART/tools/one_iter_tool/compare_one_iter.sh".format(github_job_name, slurm_par, gpu_requests)
cmd_cp_one_iter = "srun --job-name={} --partition={} --gres={} --time=30 sh SMART/tools/one_iter_tool/compare_one_iter.sh {}".format(github_job_name, slurm_par, gpu_requests, package_name)

run_cmd(cmd_run_one_iter)
run_cmd(cmd_cp_one_iter)
Expand Down
1 change: 1 addition & 0 deletions scripts/ci/nv/ci_nv_run_one_iter.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def handle_error(error):
print("Kill all!", flush = True)
p.terminate()
error_flag.value = 1
exit(1)

if __name__=='__main__':
curPath = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
Expand Down
33 changes: 20 additions & 13 deletions scripts/ci/test_one_iter_model_list.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,30 +50,37 @@ cuda:
- model_cfg: "mmpretrain vision_transformer/vit-base-p16_64xb64_in1k-384px.py workdirs_vit-base-p16_64xb64_in1k-384px"
- model_cfg: "mmpretrain efficientnet/efficientnet-b2_8xb32_in1k.py workdirs_efficientnet-b2_8xb32_in1k"
- model_cfg: "mmpretrain mobilenet_v3/mobilenet-v3-large_8xb128_in1k.py workdirs_mobilenet-v3-large_8xb128_in1k"
- model_cfg: "mmpretrain mobilenet_v2/mobilenet-v2_8xb32_in1k.py workdirs_mobilenet-v2_8xb32_in1k"
- model_cfg: "mmpretrain convnext/convnext-small_32xb128_in1k.py workdirs_convnext-small_32xb128_in1k"
- model_cfg: "mmpretrain shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py workdirs_shufflenet-v2-1x_16xb64_in1k"
fallback_op_list: "native_batch_norm*"
# mmdetection
- model_cfg: "mmdetection detr/detr_r50_8xb2-150e_coco.py workdirs_detr_r50_8xb2-150e_coco"
- model_cfg: "mmdetection yolo/yolov3_d53_8xb8-320-273e_coco.py workdirs_yolov3_d53_8xb8-320-273e_coco"
- model_cfg: "mmdetection ssd/ssd300_coco.py workdirs_ssd300_coco"
- model_cfg: "mmdetection fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py workdirs_fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco"
- model_cfg: "mmdetection retinanet/retinanet_r50_fpn_1x_coco.py workdirs_retinanet_r50_fpn_1x_coco"
- model_cfg: "mmdetection retinanet/retinanet_r50_fpn_1x_coco.py workdirs_retinanet_r50_fpn_1x_coco"
- model_cfg: "mmdetection mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py workdirs_mask-rcnn_r50_fpn_1x_coco"
- model_cfg: "mmdetection faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py workdirs_faster-rcnn_r101_fpn_1x_coco"
- model_cfg: "mmdetection dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py workdirs_atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco"
# mmpose
- model_cfg: "mmpose body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py workdirs_td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192"
# mmaction2
# - "mmaction2 recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py workdirs_tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb" #数据集还在迁移
- model_cfg: "mmaction2 recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py workdirs_tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb"
# mmocr
- model_cfg: "mmocr textrecog/crnn/crnn_mini-vgg_5e_mj.py workdirs_crnn_mini-vgg_5e_mj"
- model_cfg: "mmocr textdet/dbnet/dbnet_resnet50-dcnv2_fpnc_1200e_icdar2015.py workdirs_dbnet_resnet50-dcnv2_fpnc_1200e_icdar20" #smart工具对比 cpu存全量时问题消失
- model_cfg: "mmocr textdet/dbnet/dbnet_resnet50-dcnv2_fpnc_1200e_icdar2015.py workdirs_dbnet_resnet50-dcnv2_fpnc_1200e_icdar20"
# mmsegmentation
- model_cfg: "mmsegmentation deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py workdirs_r50-d8_4xb2-40k_cityscapes-512x1024"
- model_cfg: "mmsegmentation deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024.py workdirs_deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024"
# # 超时
- model_cfg: "mmpretrain convnext/convnext-small_32xb128_in1k.py workdirs_convnext-small_32xb128_in1k" #时间过长 爆显存
- model_cfg: "mmsegmentation unet/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py workdirs_unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024"
- model_cfg: "mmsegmentation pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py workdirs_pspnet_r50-d8_4xb2-40k_cityscapes-512x1024"
# mmyolo
- model_cfg: "mmyolo yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py wordir_yolov5_s-v61_syncbn_8xb16-300e_coco"
- model_cfg: "mmsegmentation unet/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py workdirs_unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024" #时间过长
- model_cfg: "mmsegmentation pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py workdirs_pspnet_r50-d8_4xb2-40k_cityscapes-512x1024" #时间过长
- model_cfg: "mmdetection mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py workdirs_mask-rcnn_r50_fpn_1x_coco"
- model_cfg: "mmdetection3d pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py workdirs_pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class" #装的库还有问题
- model_cfg: "mmdetection faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py workdirs_faster-rcnn_r101_fpn_1x_coco"
# - "mmagic configs/stable_diffusion/stable-diffusion_ddim_denoisingunet_infer.py workdirs_stable-diffusion_ddim_denoisingunet.py" #模型文件还缺少
- model_cfg: "mmpretrain mobilenet_v2/mobilenet-v2_8xb32_in1k.py workdirs_mobilenet-v2_8xb32_in1k"
- model_cfg: "mmdetection dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py workdirs_atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco" #精度问题
# mmdetection3d
- model_cfg: "mmdetection3d pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py workdirs_pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class"
# DI-engine
- model_cfg: "DI-engine ding/example/ppo.py workdirs_ppo"
- model_cfg: "DI-engine ding/example/sac.py workdirs_sac"
# mmagic
- model_cfg: "mmagic stable_diffusion/stable-diffusion_ddim_denoisingunet_infer.py workdirs_stable-diffusion_ddim_denoisingunet"