Add SDXL regression test on mi308 #19747

Merged
merged 9 commits on Jan 25, 2025
37 changes: 37 additions & 0 deletions .github/workflows/pkgci_regression_test.yml
@@ -42,11 +42,18 @@ jobs:
- name: amdgpu_rocm_mi250_gfx90a
rocm-chip: gfx90a
backend: rocm
sku: mi250
runs-on: nodai-amdgpu-mi250-x86-64
- name: amdgpu_rocm_mi300_gfx942
rocm-chip: gfx942
backend: rocm
sku: mi300
runs-on: nodai-amdgpu-mi300-x86-64
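# mi308 targets the same gfx942 chip as mi300; the separate `sku` field
# lets the test steps key SKU-specific golden values off the runner type.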
- name: amdgpu_rocm_mi308_gfx942
rocm-chip: gfx942
backend: rocm
sku: mi308
runs-on: nodai-amdgpu-mi308-x86-64
env:
PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite
@@ -95,6 +102,7 @@ jobs:
--durations=0
env:
ROCM_CHIP: ${{ matrix.rocm-chip }}
SKU: ${{ matrix.sku }}
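# SKU is exported so the test suite can pick per-SKU golden values
# (assumed usage, based on the per-SKU benchmark flags below).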

- name: "Running SD3 special model tests"
if: "!cancelled()"
@@ -165,3 +173,32 @@ jobs:
--timeout=600 \
--retries 7
echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY
# Note: allowing 10% deviation from observed averages here to account for
# different runner conditions.
- name: "Running SDXL rocm pipeline benchmark (mi308)"
if: contains(matrix.name, 'rocm_mi308_gfx942')
run: |
source ${VENV_DIR}/bin/activate
pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \
--goldentime-tolerance-multiplier 1.1 \
--goldentime-rocm-e2e-ms 800.0 \
--goldentime-rocm-unet-ms 195.0 \
--goldentime-rocm-clip-ms 15.0 \
--goldentime-rocm-vae-ms 190.0 \
--goldendispatch-rocm-unet 1602 \
--goldendispatch-rocm-clip 1139 \
--goldendispatch-rocm-vae 246 \
--goldensize-rocm-unet-bytes 2270000 \
--goldensize-rocm-clip-bytes 860000 \
--goldensize-rocm-vae-bytes 840000 \
--goldentime-rocm-punet-int8-fp16-ms 140.0 \
--goldentime-rocm-punet-int8-fp8-ms 150.0 \
--goldendispatch-rocm-punet-int8-fp16 1424 \
--goldendispatch-rocm-punet-int8-fp8 1704 \
--goldensize-rocm-punet-int8-fp8-bytes 2800000 \
--goldensize-rocm-punet-int8-fp16-bytes 2560000 \
--rocm-chip gfx942 \
--log-cli-level=info \
--timeout=600 \
--retries 7
echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY
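
Note: the 1.1 multiplier above appears to gate only the goldentime-* values; the dispatch-count and binary-size goldens are passed as separate flags. A minimal sketch of the implied timing check, with within_tolerance as a hypothetical helper rather than the benchmark script's actual code:

# Hypothetical helper illustrating the gate implied by
# --goldentime-tolerance-multiplier; not the benchmark's actual code.
def within_tolerance(measured_ms: float, golden_ms: float,
                     multiplier: float = 1.1) -> bool:
    # Pass while the measured time stays within golden * multiplier.
    return measured_ms <= golden_ms * multiplier

# With the mi308 e2e golden of 800.0 ms, up to 880.0 ms passes.
assert within_tolerance(850.0, golden_ms=800.0)
assert not within_tolerance(900.0, golden_ms=800.0)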
@@ -0,0 +1,255 @@
module attributes { transform.with_named_sequence } {
//===----------------------------------------------------------------------===//
// Tuning infra
//===----------------------------------------------------------------------===//

transform.named_sequence @apply_op_config(%op: !transform.any_op {transform.readonly},
%config: !transform.any_param {transform.readonly}) {
transform.annotate %op "compilation_info" = %config : !transform.any_op, !transform.any_param
// transform.print %op {name = "Applied"} : !transform.any_op
transform.yield
}

transform.named_sequence @apply_attn_op_config(%attention: !transform.any_op {transform.readonly},
%config: !transform.any_param {transform.readonly},
%decomposition_config: !transform.any_param {transform.readonly}) {
transform.annotate %attention "compilation_info" = %config : !transform.any_op, !transform.any_param
transform.annotate %attention "decomposition_config" = %decomposition_config : !transform.any_op, !transform.any_param
// transform.print %attention {name = "Applied attention config"} : !transform.any_op
transform.yield
}

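// Matches any iree_linalg_ext.attention op whose first input is a 4-D f16
// tensor, and pairs it with both a compilation_info and a decomposition
// config (separate MMA layouts for the QK and PV matmuls).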
transform.named_sequence @match_attention_f16(%attention: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param, !transform.any_param) {
transform.match.operation_name %attention ["iree_linalg_ext.attention"] : !transform.any_op
%in0 = transform.get_operand %attention[0] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %in0 = tensor<?x?x?x?xf16> : !transform.any_value

%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 0, 0, 0], reduction=[0, 0, 0, 0, 0, 64], promote_operands = [1, 2]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [64, 4]
subgroup_size = 64,
{llvm_func_attrs = { "amdgpu-waves-per-eu" = "2", "denormal-fp-math-f32" = "preserve-sign" }}>>
-> !transform.any_param

%decomposition_config = transform.param.constant {
qk_attrs = {attention_qk_matmul,
lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.virtual_mma_layout<intrinsic = VMFMA_F32_32x32x16_F16>,
subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [1] }>},
pv_attrs = {attention_pv_matmul,
lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [1] }>}
} -> !transform.any_param

transform.yield %attention, %config, %decomposition_config : !transform.any_op, !transform.any_param, !transform.any_param
}

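// Matches a linalg.generic computing C = A * B^T (a transposed-RHS matmul)
// with f16 inputs extended to and accumulated in f32. The shape-specific
// matchers below include this and then pin the operand types.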
transform.named_sequence @match_mmt_f16_f16_f32(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
transform.match.operation_name %root ["linalg.generic"] : !transform.any_op
// transform.print %root {name = "Generic"} : !transform.any_op
%ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root {
^bb0(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %out: tensor<?x?xf32>):
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d1, d2)>,
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"]}
ins(%lhs, %rhs : tensor<?x?xf16>, tensor<?x?xf16>) outs(%out : tensor<?x?xf32>) {
^bb0(%in: f16, %in_0: f16, %acc: f32):
%18 = arith.extf %in : f16 to f32
%19 = arith.extf %in_0 : f16 to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %acc, %20 : f32
linalg.yield %21 : f32
} -> tensor<?x?xf32>
} : (!transform.any_op) -> (!transform.any_value, !transform.any_value)
transform.yield %root : !transform.any_op
}

// TUNING_SPEC_BEGIN DO NOT REMOVE

//===----------------------------------------------------------------------===//
// Matmul tuning
//===----------------------------------------------------------------------===//

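// Each @match_mmt_MxNxK reuses @match_mmt_f16_f16_f32, constrains the
// LHS/RHS to one concrete shape, and returns a hand-tuned lowering and
// translation config for that shape.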
transform.named_sequence @match_mmt_1920x10240x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<1920x1280xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<10240x1280xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 4, subgroup_n_count = 2,
reduction = [0, 0, 32],
workgroup = [128, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_1920x1280x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<1920x1280xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<1280x1280xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 4, subgroup_n_count = 2,
reduction = [0, 0, 32],
workgroup = [128, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_1920x1280x5120(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<1920x5120xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<1280x5120xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 4, subgroup_n_count = 2,
reduction = [0, 0, 32],
workgroup = [128, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_7680x5120x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<7680x640xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<5120x640xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 2, subgroup_n_count = 4,
reduction = [0, 0, 32],
workgroup = [128, 256, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [256, 2, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_128x1280x2048(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<1280x2048xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<1280x2048xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 2, subgroup_n_count = 1,
reduction = [0, 0, 128],
workgroup = [64, 16, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [64, 2, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_7680x640x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<7680x640xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<640x640xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
subgroup_m_count = 1, subgroup_n_count = 4,
reduction = [0, 0, 32],
workgroup = [256, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [256, 1, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_7680x640x2560(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<7680x2560xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<640x2560xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
subgroup_m_count = 4, subgroup_n_count = 2,
reduction = [0, 0, 32],
workgroup = [256, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

//===----------------------------------------------------------------------===//
// Convolution tuning
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Batch matmul tuning
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Broadcast rhs mmt tuning
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Contraction tuning
//===----------------------------------------------------------------------===//

// TUNING_SPEC_END DO NOT REMOVE

//===----------------------------------------------------------------------===//
// Entry point
//===----------------------------------------------------------------------===//

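// foreach_match tries each matcher on every op in the variant; the first
// matcher that succeeds triggers its paired action, which annotates the op
// with the chosen compilation info.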
transform.named_sequence @__kernel_config(%variant_op: !transform.any_op {transform.consumed}) {
transform.foreach_match in %variant_op
@match_attention_f16 -> @apply_attn_op_config

// TUNING_MATCH_BEGIN DO NOT REMOVE

// MMT.
, @match_mmt_1920x10240x1280 -> @apply_op_config
, @match_mmt_1920x1280x1280 -> @apply_op_config
, @match_mmt_1920x1280x5120 -> @apply_op_config
, @match_mmt_7680x5120x640 -> @apply_op_config
, @match_mmt_128x1280x2048 -> @apply_op_config
, @match_mmt_7680x640x640 -> @apply_op_config
, @match_mmt_7680x640x2560 -> @apply_op_config

// TUNING_MATCH_END DO NOT REMOVE
: (!transform.any_op) -> (!transform.any_op)
transform.yield
}
} //// module