This repository was archived by the owner on Nov 27, 2024. It is now read-only.

StableVideoDiffusion model converter #137

Status: Open. Wants to merge 3 commits into base `master`.
3 changes: 3 additions & 0 deletions OnnxStack.Converter/stable_diffusion_video/.gitignore
@@ -0,0 +1,3 @@
/footprints/
/cache/
/result_*.png
20 changes: 20 additions & 0 deletions OnnxStack.Converter/stable_diffusion_video/README.md
@@ -0,0 +1,20 @@
# OnnxStack.Converter

## Requirements
```bash
pip install onnxruntime-directml
pip install olive-ai[directml]
pip install -r requirements.txt
```

## Usage
```bash
python convert.py --optimize --model_input '..\stable-video-diffusion-img2vid-xt' --model_output '..\converted'
```
`--optimize` - Run model optimization after conversion

`--model_input` - Path to the safetensors model to convert

`--model_output` - Output folder for the converted ONNX model (NOTE: this folder is deleted before each run)

`--image_encoder` - Also convert the optional image encoder
8 changes: 8 additions & 0 deletions OnnxStack.Converter/stable_diffusion_video/config.py
@@ -0,0 +1,8 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

vae_sample_size = 512
unet_sample_size = 24
cross_attention_dim = 1280
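
For context, a minimal sketch of how these constants might be consumed when building dummy export inputs. The models.py that reads them is not visible in this section, so the exact usage (and the channel count below) is an assumption:

```python
# Illustrative only: plausible use of config.py's constants when sizing the
# dummy tensors for ONNX export (the real models.py is not shown here).
import torch

import config

# VAE encoder input: full-resolution RGB frames, (batch, channels, height, width).
vae_sample = torch.randn(1, 3, config.vae_sample_size, config.vae_sample_size)

# UNet conditioning: image embeddings of width cross_attention_dim.
encoder_hidden_states = torch.randn(1, 1, config.cross_attention_dim)
```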
110 changes: 110 additions & 0 deletions OnnxStack.Converter/stable_diffusion_video/config_unet.json
@@ -0,0 +1,110 @@
{
"input_model": {
"type": "PyTorchModel",
"config": {
"model_path": "stabilityai/stable-video-diffusion-img2vid-xt",
"model_loader": "unet_load",
"model_script": "models.py",
"io_config": {
"input_names": [ "sample", "timestep", "encoder_hidden_states", "added_time_ids" ],
"output_names": [ "out_sample" ],
"dynamic_axes": {
"sample": {"0": "batch", "1": "frames", "2": "channel", "3": "height", "4": "width"},
"timestep": {"0": "timestep"},
"encoder_hidden_states": {"0": "batch", "1": "sequence_length", "2": "cross_attention_dim"},
"added_time_ids": {"0": "batch", "1": "num_additional_ids" }
}
},
"dummy_inputs_func": "unet_conversion_inputs"
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"config": {
"accelerators": [
{
"device": "gpu",
"execution_providers": [
"DmlExecutionProvider"
]
}
]
}
}
},
"evaluators": {
"common_evaluator": {
"metrics": [
{
"name": "latency",
"type": "latency",
"sub_types": [{"name": "avg"}],
"user_config": {
"user_script": "models.py",
"dataloader_func": "unet_data_loader",
"batch_size": 2
}
}
]
}
},
"passes": {
"convert": {
"type": "OnnxConversion",
"config": {
"target_opset": 16,
"save_as_external_data": true,
"all_tensors_to_one_file": true
}
},
"optimize": {
"type": "OrtTransformersOptimization",
"config": {
"model_type": "unet",
"opt_level": 0,
"float16": true,
"use_gpu": true,
"keep_io_types": true,
"optimization_options": {
"enable_gelu": true,
"enable_layer_norm": true,
"enable_attention": true,
"use_multi_head_attention": true,
"enable_skip_layer_norm": false,
"enable_embed_layer_norm": true,
"enable_bias_skip_layer_norm": false,
"enable_bias_gelu": true,
"enable_gelu_approximation": false,
"enable_qordered_matmul": false,
"enable_shape_inference": true,
"enable_gemm_fast_gelu": false,
"enable_nhwc_conv": false,
"enable_group_norm": true,
"enable_bias_splitgelu": false,
"enable_packed_qkv": true,
"enable_packed_kv": true,
"enable_bias_add": false,
"group_norm_channels_last": false
},
"force_fp32_ops": ["RandomNormalLike"],
"force_fp16_inputs": {
"GroupNorm": [0, 1, 2]
}
}
}
},
"pass_flows": [
["convert", "optimize"]
],
"engine": {
"log_severity_level": 0,
"evaluator": "common_evaluator",
"evaluate_input_model": false,
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "unet",
"output_dir": "footprints"
}
}
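
Since models.py is not part of this section, the following is only a plausible sketch of the `unet_load` and `unet_conversion_inputs` hooks this config references. The shapes follow the `io_config` above and config.py; the `diffusers` class, channel count, and `added_time_ids` width are assumptions:

```python
# Hypothetical models.py excerpt matching config_unet.json; shapes and the
# diffusers class used here are assumptions, not code from this PR.
import torch
from diffusers import UNetSpatioTemporalConditionModel

import config

def unet_load(model_name):
    # Olive passes the config's "model_path" here.
    return UNetSpatioTemporalConditionModel.from_pretrained(model_name, subfolder="unet")

def unet_conversion_inputs(model=None):
    batch, frames = 1, 2
    return (
        # sample: (batch, frames, channel, height, width); SVD's UNet takes
        # 8 channels (4 latent + 4 image-conditioning) (an assumption here).
        torch.randn(batch, frames, 8, config.unet_sample_size, config.unet_sample_size),
        torch.tensor([1.0]),                                # timestep
        torch.randn(batch, 1, config.cross_attention_dim),  # encoder_hidden_states
        torch.randn(batch, 3),                              # added_time_ids
    )
```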
105 changes: 105 additions & 0 deletions OnnxStack.Converter/stable_diffusion_video/config_vae_decoder.json
@@ -0,0 +1,105 @@
{
"input_model": {
"type": "PyTorchModel",
"config": {
"model_path": "stabilityai/stable-video-diffusion-img2vid-xt",
"model_loader": "vae_decoder_load",
"model_script": "models.py",
"io_config": {
"input_names": [ "latent_sample", "num_frames" ],
"output_names": [ "sample" ],
"dynamic_axes": {
"latent_sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" }
}
},
"dummy_inputs_func": "vae_decoder_conversion_inputs"
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"config": {
"accelerators": [
{
"device": "gpu",
"execution_providers": [
"DmlExecutionProvider"
]
}
]
}
}
},
"evaluators": {
"common_evaluator": {
"metrics": [
{
"name": "latency",
"type": "latency",
"sub_types": [{"name": "avg"}],
"user_config": {
"user_script": "models.py",
"dataloader_func": "vae_decoder_data_loader",
"batch_size": 1
}
}
]
}
},
"passes": {
"convert": {
"type": "OnnxConversion",
"config": {
"target_opset": 16
}
},
"optimize": {
"type": "OrtTransformersOptimization",
"config": {
"model_type": "vae",
"opt_level": 0,
"float16": true,
"use_gpu": true,
"keep_io_types": false,
"optimization_options": {
"enable_gelu": true,
"enable_layer_norm": true,
"enable_attention": true,
"use_multi_head_attention": true,
"enable_skip_layer_norm": false,
"enable_embed_layer_norm": true,
"enable_bias_skip_layer_norm": false,
"enable_bias_gelu": true,
"enable_gelu_approximation": false,
"enable_qordered_matmul": false,
"enable_shape_inference": true,
"enable_gemm_fast_gelu": false,
"enable_nhwc_conv": false,
"enable_group_norm": true,
"enable_bias_splitgelu": false,
"enable_packed_qkv": true,
"enable_packed_kv": true,
"enable_bias_add": false,
"group_norm_channels_last": false
},
"force_fp32_ops": ["RandomNormalLike"],
"force_fp16_inputs": {
"GroupNorm": [0, 1, 2]
}
}
}
},
"pass_flows": [
["convert", "optimize"]
],
"engine": {
"log_severity_level": 0,
"evaluator": "common_evaluator",
"evaluate_input_model": false,
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "vae_decoder",
"output_dir": "footprints"
}
}
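
Again as a hedged sketch (models.py is not shown here): the decoder hooks likely wrap a temporal VAE decode so that ONNX export sees a plain `forward(latent_sample, num_frames)`, matching the two inputs declared above. The wrapper class, `AutoencoderKLTemporalDecoder`, and the tensor shapes are assumptions:

```python
# Hypothetical models.py excerpt for config_vae_decoder.json; the wrapper
# class and tensor shapes are assumptions inferred from the io_config above.
import torch
from diffusers import AutoencoderKLTemporalDecoder

import config

class VaeDecoderWrapper(torch.nn.Module):
    """Expose decode() as forward(latent_sample, num_frames) for export."""
    def __init__(self, vae):
        super().__init__()
        self.vae = vae

    def forward(self, latent_sample, num_frames):
        return self.vae.decode(latent_sample, num_frames=num_frames).sample

def vae_decoder_load(model_name):
    vae = AutoencoderKLTemporalDecoder.from_pretrained(model_name, subfolder="vae")
    return VaeDecoderWrapper(vae)

def vae_decoder_conversion_inputs(model=None):
    frames = 2
    latent_size = config.vae_sample_size // 8  # 512 -> 64 latent resolution
    return (
        torch.randn(frames, 4, latent_size, latent_size),  # latent_sample
        torch.tensor(frames, dtype=torch.int64),           # num_frames
    )
```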
103 changes: 103 additions & 0 deletions OnnxStack.Converter/stable_diffusion_video/config_vae_encoder.json
@@ -0,0 +1,103 @@
{
"input_model": {
"type": "PyTorchModel",
"config": {
"model_path": "stabilityai/stable-video-diffusion-img2vid-xt",
"model_loader": "vae_encoder_load",
"model_script": "models.py",
"io_config": {
"input_names": [ "sample" ],
"output_names": [ "latent_sample" ],
"dynamic_axes": { "sample": { "0": "batch", "1": "channels", "2": "height", "3": "width" } }
},
"dummy_inputs_func": "vae_encoder_conversion_inputs"
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"config": {
"accelerators": [
{
"device": "gpu",
"execution_providers": [
"DmlExecutionProvider"
]
}
]
}
}
},
"evaluators": {
"common_evaluator": {
"metrics": [
{
"name": "latency",
"type": "latency",
"sub_types": [{"name": "avg"}],
"user_config": {
"user_script": "models.py",
"dataloader_func": "vae_encoder_data_loader",
"batch_size": 1
}
}
]
}
},
"passes": {
"convert": {
"type": "OnnxConversion",
"config": {
"target_opset": 16
}
},
"optimize": {
"type": "OrtTransformersOptimization",
"config": {
"model_type": "vae",
"opt_level": 0,
"float16": true,
"use_gpu": true,
"keep_io_types": false,
"optimization_options": {
"enable_gelu": true,
"enable_layer_norm": true,
"enable_attention": true,
"use_multi_head_attention": true,
"enable_skip_layer_norm": false,
"enable_embed_layer_norm": true,
"enable_bias_skip_layer_norm": false,
"enable_bias_gelu": true,
"enable_gelu_approximation": false,
"enable_qordered_matmul": false,
"enable_shape_inference": true,
"enable_gemm_fast_gelu": false,
"enable_nhwc_conv": false,
"enable_group_norm": true,
"enable_bias_splitgelu": false,
"enable_packed_qkv": true,
"enable_packed_kv": true,
"enable_bias_add": false,
"group_norm_channels_last": false
},
"force_fp32_ops": ["RandomNormalLike"],
"force_fp16_inputs": {
"GroupNorm": [0, 1, 2]
}
}
}
},
"pass_flows": [
["convert", "optimize"]
],
"engine": {
"log_severity_level": 0,
"evaluator": "common_evaluator",
"evaluate_input_model": false,
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "vae_encoder",
"output_dir": "footprints"
}
}
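
Finally, a minimal sketch of how the convert.py referenced in the README might drive Olive over these three configs. The file names mirror the configs above, but the actual convert.py logic is not shown in this section, so treat the loop as an assumption:

```python
# Hypothetical driver loop; the real convert.py is not visible in this diff.
import json
from olive.workflows import run as olive_run

for config_file in ("config_unet.json", "config_vae_decoder.json", "config_vae_encoder.json"):
    with open(config_file) as f:
        olive_config = json.load(f)
    # Executes the "convert" -> "optimize" pass flow defined in each config.
    olive_run(olive_config)
```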