# vsort/vs_onnxruntime.cpp: replay the first dml execution #310
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Windows build of the ORT ("vsort") plugin: display name and triggers.
name: Build (Windows-ORT)

on:
  # Rebuild on pushes that touch shared code, the vsort sources, or this
  # workflow file itself.
  push:
    paths:
      - 'common/**'
      - 'vsort/**'
      - '.github/workflows/windows-ort.yml'
  # Reusable-workflow entry point; the caller must pass the tag to upload to.
  workflow_call:
    inputs:
      tag:
        description: 'which tag to upload to'
        required: true
        type: string
  # Manual run; the tag defaults to the empty string, and the release steps
  # of this workflow only run for a non-empty tag.
  workflow_dispatch:
    inputs:
      tag:
        description: 'which tag to upload to'
        default: ''
jobs:
  # Single job: build protobuf/onnx (cached), build the vsort plugin against
  # a prebuilt ONNX Runtime, smoke-test it with vspipe + x265, upload the
  # artifact and, for tagged manual runs, publish a prerelease.
  build-windows:
    runs-on: windows-2022
    defaults:
      run:
        # Steps run under cmd from the vsort directory unless a step
        # overrides the shell.
        shell: cmd
        working-directory: vsort
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          # Fetch all history; presumably so that `git describe --tags` in
          # the "Describe" step can see the tags (confirm before changing).
          fetch-depth: 0
      - name: Setup MSVC
        uses: ilammy/msvc-dev-cmd@v1
      - name: Setup Ninja
        run: pip install ninja

      # --- protobuf: restored from cache, otherwise built from source ---
      - name: Cache protobuf
        id: cache-protobuf
        uses: actions/cache@v4
        with:
          path: vsort/protobuf/install
          key: ${{ runner.os }}-vsort-protobuf-v4
      - name: Checkout protobuf
        uses: actions/checkout@v4
        if: steps.cache-protobuf.outputs.cache-hit != 'true'
        with:
          repository: protocolbuffers/protobuf
          # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L203
          # if you change this, remember to bump the version of the cache key.
          ref: v3.21.12
          fetch-depth: 1
          path: vsort/protobuf
      - name: Configure protobuf
        if: steps.cache-protobuf.outputs.cache-hit != 'true'
        # Folded scalar: the lines below are joined into one cmake command.
        run: >-
          cmake -S protobuf -B protobuf\build_rel -G Ninja -LA
          -D CMAKE_BUILD_TYPE=Release
          -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF
      - name: Build protobuf
        if: steps.cache-protobuf.outputs.cache-hit != 'true'
        run: cmake --build protobuf\build_rel --verbose
      - name: Install protobuf
        if: steps.cache-protobuf.outputs.cache-hit != 'true'
        run: cmake --install protobuf\build_rel --prefix protobuf\install

      # --- onnx: restored from cache, otherwise built from source ---
      - name: Cache onnx
        id: cache-onnx
        uses: actions/cache@v4
        with:
          path: vsort/onnx/install
          key: ${{ runner.os }}-vsort-onnx-v5
      - name: Checkout onnx
        if: steps.cache-onnx.outputs.cache-hit != 'true'
        uses: actions/checkout@v4
        with:
          repository: onnx/onnx
          # follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
          # if you change this, remember to bump the version of the cache key.
          ref: 990217f043af7222348ca8f0301e17fa7b841781
          fetch-depth: 1
          path: vsort/onnx
      - name: Configure onnx
        if: steps.cache-onnx.outputs.cache-hit != 'true'
        # Folded scalar: the lines below are joined into one cmake command.
        # It points onnx at the protobuf install produced above.
        run: >-
          cmake -S onnx -B onnx\build -G Ninja -LA
          -D CMAKE_BUILD_TYPE=Release
          -D Protobuf_PROTOC_EXECUTABLE=protobuf\install\bin\protoc
          -D Protobuf_LITE_LIBRARY=protobuf\install\lib
          -D Protobuf_LIBRARIES=protobuf\install\lib
          -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF
          -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0
          -D ONNX_USE_MSVC_STATIC_RUNTIME=1
      - name: Build onnx
        if: steps.cache-onnx.outputs.cache-hit != 'true'
        run: cmake --build onnx\build --verbose
      - name: Install onnx
        if: steps.cache-onnx.outputs.cache-hit != 'true'
        run: cmake --install onnx\build --prefix onnx\install

      # --- remaining build inputs: VapourSynth headers, ONNX Runtime, CUDA ---
      - name: Download VapourSynth headers
        run: |
          curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip
          unzip -q vs.zip
          mv vapoursynth-*/ vapoursynth/
      - name: Download ONNX Runtime Precompilation
        run: |
          curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-8036-geb41d57f21-240425-0428/onnxruntime-gpu-win64.zip
          unzip -q ortgpu.zip
      - name: Cache CUDA
        id: cache-cuda
        uses: actions/cache@v4
        with:
          path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
          key: ${{ runner.os }}-cuda-12.4.1
      - name: Setup CUDA
        if: steps.cache-cuda.outputs.cache-hit != 'true'
        # Silent network install of only the nvcc and cudart components.
        run: |
          curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe
          cuda_installer.exe -s nvcc_12.4 cudart_12.4

      # --- build the plugin with both the CUDA and DirectML options enabled ---
      - name: Configure
        # Folded scalar: the lines below are joined into one cmake command.
        run: >-
          cmake -S . -B build -G Ninja -LA
          -D CMAKE_BUILD_TYPE=Release
          -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
          -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include
          -D protobuf_DIR=protobuf\install\cmake
          -D ONNX_DIR=onnx\install\lib\cmake\ONNX
          -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime
          -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib
          -D ENABLE_CUDA=1
          -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
          -D ENABLE_DML=1
          -D CMAKE_CXX_STANDARD=20
      - name: Build
        run: cmake --build build --verbose
      - name: Install
        # Artifact layout: vsort.dll at the top level, ONNX Runtime DLLs in
        # the vsort\ subdirectory.
        run: |
          cmake --install build --prefix install
          mkdir artifact
          mkdir artifact\vsort
          copy install\bin\vsort.dll artifact\
          copy onnxruntime-gpu\bin\*.dll artifact\vsort\
          copy onnxruntime-gpu\lib\*.dll artifact\vsort\
      - name: Download DirectML Library
        # follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44
        run: |
          curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1
          unzip -q directml.nupkg -d dml
          copy dml\bin\x64-win\DirectML.dll artifact\vsort\
      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          name: VSORT-Windows-x64
          path: vsort/artifact

      # --- portable VapourSynth environment used by the smoke tests ---
      - name: Setup Python portable
        run: |
          curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.10/python-3.9.10-embed-amd64.zip
          7z x python.zip -ovs_portable
      - name: Install VapourSynth portable
        run: |
          curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z
          7z x vs.7z -ovs_portable -y
      - name: Copy plugin
        run: |
          copy artifact\*.dll vs_portable\vapoursynth64\plugins
          mkdir vs_portable\vapoursynth64\plugins\vsort\
          copy artifact\vsort\*.dll vs_portable\vapoursynth64\plugins\vsort\
      - name: Install waifu2x model
        run: |
          curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z
          7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models
      - name: Download x265
        run: |
          curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z
          7z x x265.7z -ovs_portable\

      # --- smoke tests ---
      # Each "Create script" step writes a one-line .vpy script that runs the
      # waifu2x model through core.ort.Model. Each "Run vspipe" step pipes
      # frames 0-9 into x265, exits 2 unless the log reports
      # 'encoded 10 frames', and exits 1 if the log contains 'error'
      # (case-insensitive).
      - name: Create script
        shell: bash
        run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy
      - name: Run vspipe
        shell: bash
        run: |
          set -ex
          vs_portable/vspipe -i test.vpy -
          vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
          ls -l out.hevc x265.log
          cat x265.log
          grep -F 'encoded 10 frames' x265.log || exit 2
          grep -i 'error' x265.log && exit 1
          exit 0
      # Same pipeline with fp16=True passed to the model.
      - name: Create script (fp16)
        shell: bash
        run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16.vpy
      - name: Run vspipe (fp16)
        shell: bash
        run: |
          set -ex
          vs_portable/vspipe -i test_fp16.vpy -
          vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
          ls -l out.hevc x265.log
          cat x265.log
          grep -F 'encoded 10 frames' x265.log || exit 2
          grep -i 'error' x265.log && exit 1
          exit 0
      # fp16=True with an RGBH input clip instead of RGBS.
      - name: Create script (fp16 input)
        shell: bash
        run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBH).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_input.vpy
      - name: Run vspipe (fp16 input)
        shell: bash
        run: |
          set -ex
          vs_portable/vspipe -i test_fp16_input.vpy -
          vs_portable/vspipe --y4m -p -e 9 test_fp16_input.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
          ls -l out.hevc x265.log
          cat x265.log
          grep -F 'encoded 10 frames' x265.log || exit 2
          grep -i 'error' x265.log && exit 1
          exit 0
      # fp16=True with output_format=1 on an RGBS input clip.
      - name: Create script (fp16 output)
        shell: bash
        run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True, output_format=1);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_output.vpy
      - name: Run vspipe (fp16 output)
        shell: bash
        run: |
          set -ex
          vs_portable/vspipe -i test_fp16_output.vpy -
          vs_portable/vspipe --y4m -p -e 9 test_fp16_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
          ls -l out.hevc x265.log
          cat x265.log
          grep -F 'encoded 10 frames' x265.log || exit 2
          grep -i 'error' x265.log && exit 1
          exit 0
      # flexible_output_prop: planes are read back from frame properties via
      # PropToClip and recombined with ShufflePlanes.
      - name: Create script (flexible output)
        shell: bash
        run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);prop='test';output=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output['clip'].std.PropToClip(prop=f'{prop}{i}') for i in range(output['num_planes'])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy
      - name: Run vspipe (flexible output)
        shell: bash
        run: |
          set -ex
          vs_portable/vspipe -i test_flexible_output.vpy -
          vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
          ls -l out.hevc x265.log
          cat x265.log
          grep -F 'encoded 10 frames' x265.log || exit 2
          grep -i 'error' x265.log && exit 1
          exit 0

      # --- diagnostics ---
      - name: Describe
        run: git describe --tags --long
      - name: Dump dependencies
        run: dumpbin /dependents artifact\vsort.dll

      # --- release: only for workflow_dispatch events with a non-empty tag ---
      # The tag is read from the `inputs` context throughout, so the
      # condition, archive name, tag_name and files entry all agree.
      - name: Compress artifact for release
        if: github.event_name == 'workflow_dispatch' && inputs.tag != ''
        # Runs in vsort/artifact, so ../../ places the archive two levels up
        # (the parent of the vsort directory).
        run: |
          cd artifact
          7z a -t7z -mx=7 ../../VSORT-Windows-x64.${{ inputs.tag }}.7z .
      - name: Release
        uses: softprops/action-gh-release@v2
        if: github.event_name == 'workflow_dispatch' && inputs.tag != ''
        with:
          tag_name: ${{ inputs.tag }}
          files: VSORT-Windows-x64.${{ inputs.tag }}.7z
          fail_on_unmatched_files: true
          generate_release_notes: false
          prerelease: true