From 9e1f9767a5727f41d819f01815d1e520f612ce8b Mon Sep 17 00:00:00 2001 From: Sasha Lopoukhine Date: Wed, 25 Dec 2024 20:30:28 +0100 Subject: [PATCH] testing: split bottom_up into f32 and f64 kernels (#3676) There's still a bit of work to do on the f32 front, this PR just splits the test file, and removes erroneous f32 mentions from the f64 kernels. --- .../riscv-backend-paper/bottom_up_f32.mlir | 210 ++++++++++++++++++ .../{bottom_up.mlir => bottom_up_f64.mlir} | 108 +-------- 2 files changed, 216 insertions(+), 102 deletions(-) create mode 100644 tests/filecheck/projects/riscv-backend-paper/bottom_up_f32.mlir rename tests/filecheck/projects/riscv-backend-paper/{bottom_up.mlir => bottom_up_f64.mlir} (86%) diff --git a/tests/filecheck/projects/riscv-backend-paper/bottom_up_f32.mlir b/tests/filecheck/projects/riscv-backend-paper/bottom_up_f32.mlir new file mode 100644 index 0000000000..6cc1b42887 --- /dev/null +++ b/tests/filecheck/projects/riscv-backend-paper/bottom_up_f32.mlir @@ -0,0 +1,210 @@ +// RUN: xdsl-opt -p test-lower-linalg-to-snitch -t riscv-asm %s | filecheck %s + +func.func public @ssum( + %X: memref<8x16xf32>, + %Y: memref<8x16xf32>, + %Z: memref<8x16xf32> +) { + linalg.generic { + indexing_maps = [ + affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0, d1)> + ], + iterator_types = ["parallel", "parallel"] + } ins(%X, %Y : memref<8x16xf32>, memref<8x16xf32>) outs(%Z : memref<8x16xf32>) { + ^1(%in : f32, %in_1 : f32, %out : f32): + %3 = arith.addf %in, %in_1 : f32 + linalg.yield %3 : f32 + } + func.return +} + +// CHECK: .text +// CHECK-NEXT: .globl ssum +// CHECK-NEXT: .p2align 2 +// CHECK-NEXT: # Regalloc stats: {"preallocated_float": ["ft0", "ft1", "ft2"], "preallocated_int": ["a0", "a1", "a2", "zero"], "allocated_float": ["ft0", "ft1", "ft2"], "allocated_int": ["a0", "a1", "a2", "t0", "t1", "t2", "t3", "zero"]} +// CHECK-NEXT: ssum: +// CHECK-NEXT: mv t2, a0 +// CHECK-NEXT: mv t1, a1 +// CHECK-NEXT: mv 
t0, a2 +// CHECK-NEXT: li t3, 63 +// CHECK-NEXT: scfgwi t3, 95 # dm 31 dim 0 bound +// CHECK-NEXT: li t3, 8 +// CHECK-NEXT: scfgwi t3, 223 # dm 31 dim 0 stride +// CHECK-NEXT: scfgwi zero, 63 # dm 31 repeat +// CHECK-NEXT: scfgwi t2, 768 # dm 0 dim 0 source +// CHECK-NEXT: scfgwi t1, 769 # dm 1 dim 0 source +// CHECK-NEXT: scfgwi t0, 898 # dm 2 dim 0 destination +// CHECK-NEXT: csrrsi zero, 1984, 1 # SSR enable +// CHECK-NEXT: li t0, 63 +// CHECK-NEXT: frep.o t0, 1, 0, 0 +// CHECK-NEXT: vfadd.s ft2, ft0, ft1 +// CHECK-NEXT: csrrci zero, 1984, 1 # SSR disable +// CHECK-NEXT: ret + +// x[ M x K ] +// y[ K x N ] +// g[ M x N ] +func.func public @pooling_nchw_max_d1_s2_3x3( + %X: memref<1x1x18x18xf64>, + %Y: memref<1x1x8x8xf64> +) -> () { + %min_val = arith.constant -10000 : f64 + linalg.generic { + indexing_maps = [ + affine_map<(d0, d1, d2, d3) -> ()>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + ], + iterator_types = ["parallel", "parallel", "parallel", "parallel"] + } ins(%min_val : f64) outs(%Y : memref<1x1x8x8xf64>) { + ^bb0(%in: f64, %out: f64): + linalg.yield %in : f64 + } + %alloc = memref.alloc() {alignment = 64 : i64} : memref<3x3xf32> + linalg.generic { + bounds = [#builtin.int<1>, #builtin.int<1>, #builtin.int<7>, #builtin.int<7>, #builtin.int<3>, #builtin.int<3>], + indexing_maps = [ + affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2 * 2 + d4, d3 * 2 + d5)>, + affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + ], + iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"] + } ins(%X, %alloc : memref<1x1x18x18xf64>, memref<3x3xf32>) outs(%Y : memref<1x1x8x8xf64>) { + ^0(%x : f64, %alloc_val: f64, %acc : f64): + %res = arith.maximumf %x, %acc : f64 + linalg.yield %res : f64 + } + memref.dealloc %alloc : memref<3x3xf32> + func.return + } + + +// CHECK-NEXT: .text +// CHECK-NEXT: .globl pooling_nchw_max_d1_s2_3x3 +// CHECK-NEXT: .p2align 2 +// CHECK-NEXT: # 
Regalloc stats: {"preallocated_float": ["ft0", "ft1", "ft2"], "preallocated_int": ["a0", "a1", "zero"], "allocated_float": ["ft0", "ft1", "ft3", "ft4", "ft5", "ft6", "ft7"], "allocated_int": ["a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "zero"]} +// CHECK-NEXT: pooling_nchw_max_d1_s2_3x3: +// CHECK-NEXT: mv t1, a0 +// CHECK-NEXT: mv t0, a1 +// CHECK-NEXT: li t4, -10000 +// CHECK-NEXT: fcvt.d.w ft3, t4 +// CHECK-NEXT: li t3, 8 +// CHECK-NEXT: mv t2, zero +// CHECK-NEXT: # Constant folded riscv_cf.bge +// CHECK-NEXT: scf_body_{{\d}}_for: +// CHECK-NEXT: li a2, 2 +// CHECK-NEXT: mul a2, t2, a2 +// CHECK-NEXT: li t6, 18 +// CHECK-NEXT: mul a2, a2, t6 +// CHECK-NEXT: li t6, 8 +// CHECK-NEXT: mul a2, a2, t6 # multiply by element size +// CHECK-NEXT: add a2, t1, a2 +// CHECK-NEXT: li t6, 8 +// CHECK-NEXT: mul t6, t2, t6 +// CHECK-NEXT: li t5, 8 +// CHECK-NEXT: mul t6, t6, t5 # multiply by element size +// CHECK-NEXT: add t6, t0, t6 +// CHECK-NEXT: li t5, 3 +// CHECK-NEXT: scfgwi t5, 64 # dm 0 dim 0 bound +// CHECK-NEXT: li t5, 2 +// CHECK-NEXT: scfgwi t5, 96 # dm 0 dim 1 bound +// CHECK-NEXT: li t5, 2 +// CHECK-NEXT: scfgwi t5, 128 # dm 0 dim 2 bound +// CHECK-NEXT: li t5, 1 +// CHECK-NEXT: scfgwi t5, 160 # dm 0 dim 3 bound +// CHECK-NEXT: li t5, 16 +// CHECK-NEXT: scfgwi t5, 192 # dm 0 dim 0 stride +// CHECK-NEXT: li t5, -40 +// CHECK-NEXT: scfgwi t5, 224 # dm 0 dim 1 stride +// CHECK-NEXT: li t5, 80 +// CHECK-NEXT: scfgwi t5, 256 # dm 0 dim 2 stride +// CHECK-NEXT: li t5, -288 +// CHECK-NEXT: scfgwi t5, 288 # dm 0 dim 3 stride +// CHECK-NEXT: scfgwi zero, 32 # dm 0 repeat +// CHECK-NEXT: li t5, 7 +// CHECK-NEXT: scfgwi t5, 65 # dm 1 dim 0 bound +// CHECK-NEXT: li t5, 8 +// CHECK-NEXT: scfgwi t5, 193 # dm 1 dim 0 stride +// CHECK-NEXT: scfgwi zero, 33 # dm 1 repeat +// CHECK-NEXT: scfgwi a2, 864 # dm 0 dim 3 source +// CHECK-NEXT: scfgwi t6, 897 # dm 1 dim 0 destination +// CHECK-NEXT: csrrsi zero, 1984, 1 # SSR enable +// CHECK-NEXT: li t6, 2 +// 
CHECK-NEXT: mv t5, zero +// CHECK-NEXT: # Constant folded riscv_cf.bge +// CHECK-NEXT: scf_body_{{\d}}_for: +// CHECK-NEXT: fmv.d ft7, ft3 +// CHECK-NEXT: fmv.d ft6, ft3 +// CHECK-NEXT: fmv.d ft5, ft3 +// CHECK-NEXT: fmv.d ft4, ft3 +// CHECK-NEXT: li a3, 8 +// CHECK-NEXT: frep.o a3, 4, 0, 0 +// CHECK-NEXT: fmax.d ft7, ft0, ft7 +// CHECK-NEXT: fmax.d ft6, ft0, ft6 +// CHECK-NEXT: fmax.d ft5, ft0, ft5 +// CHECK-NEXT: fmax.d ft4, ft0, ft4 +// CHECK-NEXT: fmv.d ft1, ft7 +// CHECK-NEXT: fmv.d ft1, ft6 +// CHECK-NEXT: fmv.d ft1, ft5 +// CHECK-NEXT: fmv.d ft1, ft4 +// CHECK-NEXT: addi t5, t5, 1 +// CHECK-NEXT: blt t5, t6, scf_body_{{\d}}_for +// CHECK-NEXT: scf_body_end_{{\d}}_for: +// CHECK-NEXT: csrrci zero, 1984, 1 # SSR disable +// CHECK-NEXT: addi t2, t2, 1 +// CHECK-NEXT: blt t2, t3, scf_body_{{\d}}_for +// CHECK-NEXT: scf_body_end_{{\d}}_for: +// CHECK-NEXT: ret + + + riscv.assembly_section ".text" { + riscv.directive ".globl" "reluf32" + riscv.directive ".p2align" "2" + riscv_func.func @reluf32(%X : !riscv.reg<a0>, %Y : !riscv.reg<a1>) { + %X_1 = riscv.mv %X : (!riscv.reg<a0>) -> !riscv.reg + %Y_1 = riscv.mv %Y : (!riscv.reg<a1>) -> !riscv.reg + %zero = riscv.get_register : !riscv.reg<zero> + %zero_float = riscv.fcvt.d.w %zero : (!riscv.reg<zero>) -> !riscv.freg + %zero_vector = riscv_snitch.vfcpka.s.s %zero_float, %zero_float : (!riscv.freg, !riscv.freg) -> !riscv.freg + snitch_stream.streaming_region { + patterns = [ + #snitch_stream.stride_pattern<ub = [128], strides = [8]> + ] + } ins(%X_1 : !riscv.reg) outs(%Y_1 : !riscv.reg) { + ^0(%x : !snitch.readable<!riscv.freg<ft0>>, %0 : !snitch.writable<!riscv.freg<ft1>>): + %c128 = riscv.li 128 : !riscv.reg + %c0 = riscv.li 0 : !riscv.reg + %c1 = riscv.li 1 : !riscv.reg + riscv_scf.for %i : !riscv.reg = %c0 to %c128 step %c1 { + %x_1 = riscv_snitch.read from %x : !riscv.freg<ft0> + %y = riscv_snitch.vfmax.s %x_1, %zero_vector : (!riscv.freg<ft0>, !riscv.freg) -> !riscv.freg<ft1> + riscv_snitch.write %y to %0 : !riscv.freg<ft1> + } + } + riscv_func.return + } + } + +// CHECK: .text +// CHECK-NEXT: .globl reluf32 +//
CHECK-NEXT: .p2align 2 +// CHECK-NEXT: # Regalloc stats: {"preallocated_float": ["ft0", "ft1", "ft2"], "preallocated_int": ["a0", "a1", "zero"], "allocated_float": ["ft0", "ft1", "ft3"], "allocated_int": ["a0", "a1", "t0", "t1", "t2", "zero"]} +// CHECK-NEXT: reluf32: +// CHECK-NEXT: mv t1, a0 +// CHECK-NEXT: mv t0, a1 +// CHECK-NEXT: fcvt.d.w ft3, zero +// CHECK-NEXT: vfcpka.s.s ft3, ft3, ft3 +// CHECK-NEXT: li t2, 127 +// CHECK-NEXT: scfgwi t2, 95 # dm 31 dim 0 bound +// CHECK-NEXT: li t2, 8 +// CHECK-NEXT: scfgwi t2, 223 # dm 31 dim 0 stride +// CHECK-NEXT: scfgwi zero, 63 # dm 31 repeat +// CHECK-NEXT: scfgwi t1, 768 # dm 0 dim 0 source +// CHECK-NEXT: scfgwi t0, 897 # dm 1 dim 0 destination +// CHECK-NEXT: csrrsi zero, 1984, 1 # SSR enable +// CHECK-NEXT: li t0, 127 +// CHECK-NEXT: frep.o t0, 1, 0, 0 +// CHECK-NEXT: vfmax.s ft1, ft0, ft3 +// CHECK-NEXT: csrrci zero, 1984, 1 # SSR disable +// CHECK-NEXT: ret diff --git a/tests/filecheck/projects/riscv-backend-paper/bottom_up.mlir b/tests/filecheck/projects/riscv-backend-paper/bottom_up_f64.mlir similarity index 86% rename from tests/filecheck/projects/riscv-backend-paper/bottom_up.mlir rename to tests/filecheck/projects/riscv-backend-paper/bottom_up_f64.mlir index 5aba84d4a7..a394108326 100644 --- a/tests/filecheck/projects/riscv-backend-paper/bottom_up.mlir +++ b/tests/filecheck/projects/riscv-backend-paper/bottom_up_f64.mlir @@ -1,49 +1,5 @@ // RUN: xdsl-opt -p test-lower-linalg-to-snitch -t riscv-asm %s | filecheck %s - -func.func public @ssum( - %X: memref<8x16xf32>, - %Y: memref<8x16xf32>, - %Z: memref<8x16xf32> -) { - linalg.generic { - indexing_maps = [ - affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)> - ], - iterator_types = ["parallel", "parallel"] - } ins(%X, %Y : memref<8x16xf32>, memref<8x16xf32>) outs(%Z : memref<8x16xf32>) { - ^1(%in : f32, %in_1 : f32, %out : f32): - %3 = arith.addf %in, %in_1 : f32 - linalg.yield %3 : f32 - } - func.return 
-} - -// CHECK: .text -// CHECK-NEXT: .globl ssum -// CHECK-NEXT: .p2align 2 -// CHECK-NEXT: # Regalloc stats: {"preallocated_float": ["ft0", "ft1", "ft2"], "preallocated_int": ["a0", "a1", "a2", "zero"], "allocated_float": ["ft0", "ft1", "ft2"], "allocated_int": ["a0", "a1", "a2", "t0", "t1", "t2", "t3", "zero"]} -// CHECK-NEXT: ssum: -// CHECK-NEXT: mv t2, a0 -// CHECK-NEXT: mv t1, a1 -// CHECK-NEXT: mv t0, a2 -// CHECK-NEXT: li t3, 63 -// CHECK-NEXT: scfgwi t3, 95 # dm 31 dim 0 bound -// CHECK-NEXT: li t3, 8 -// CHECK-NEXT: scfgwi t3, 223 # dm 31 dim 0 stride -// CHECK-NEXT: scfgwi zero, 63 # dm 31 repeat -// CHECK-NEXT: scfgwi t2, 768 # dm 0 dim 0 source -// CHECK-NEXT: scfgwi t1, 769 # dm 1 dim 0 source -// CHECK-NEXT: scfgwi t0, 898 # dm 2 dim 0 destination -// CHECK-NEXT: csrrsi zero, 1984, 1 # SSR enable -// CHECK-NEXT: li t0, 63 -// CHECK-NEXT: frep.o t0, 1, 0, 0 -// CHECK-NEXT: vfadd.s ft2, ft0, ft1 -// CHECK-NEXT: csrrci zero, 1984, 1 # SSR disable -// CHECK-NEXT: ret - func.func public @conv_2d_nchw_fchw_d1_s1_3x3( %X: memref<1x1x10x10xf64>, %Y: memref<1x1x3x3xf64>, @@ -441,7 +397,7 @@ func.func public @pooling_nchw_max_d1_s2_3x3( ^bb0(%in: f64, %out: f64): linalg.yield %in : f64 } - %alloc = memref.alloc() {alignment = 64 : i64} : memref<3x3xf32> + %alloc = memref.alloc() {alignment = 64 : i64} : memref<3x3xf64> linalg.generic { bounds = [#builtin.int<1>, #builtin.int<1>, #builtin.int<7>, #builtin.int<7>, #builtin.int<3>, #builtin.int<3>], indexing_maps = [ @@ -450,12 +406,12 @@ func.func public @pooling_nchw_max_d1_s2_3x3( affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> ], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"] - } ins(%X, %alloc : memref<1x1x18x18xf64>, memref<3x3xf32>) outs(%Y : memref<1x1x8x8xf64>) { + } ins(%X, %alloc : memref<1x1x18x18xf64>, memref<3x3xf64>) outs(%Y : memref<1x1x8x8xf64>) { ^0(%x : f64, %alloc_val: f64, %acc : f64): %res = arith.maximumf %x, %acc : f64 linalg.yield %res : f64 
} - memref.dealloc %alloc : memref<3x3xf32> + memref.dealloc %alloc : memref<3x3xf64> func.return } @@ -579,58 +535,6 @@ func.func public @pooling_nchw_max_d1_s2_3x3( // CHECK-NEXT: ret - riscv.assembly_section ".text" { - riscv.directive ".globl" "reluf32" - riscv.directive ".p2align" "2" - riscv_func.func @reluf32(%X : !riscv.reg<a0>, %Y : !riscv.reg<a1>) { - %X_1 = riscv.mv %X : (!riscv.reg<a0>) -> !riscv.reg - %Y_1 = riscv.mv %Y : (!riscv.reg<a1>) -> !riscv.reg - %zero = riscv.get_register : !riscv.reg<zero> - %zero_float = riscv.fcvt.d.w %zero : (!riscv.reg<zero>) -> !riscv.freg - %zero_vector = riscv_snitch.vfcpka.s.s %zero_float, %zero_float : (!riscv.freg, !riscv.freg) -> !riscv.freg - snitch_stream.streaming_region { - patterns = [ - #snitch_stream.stride_pattern<ub = [128], strides = [8]> - ] - } ins(%X_1 : !riscv.reg) outs(%Y_1 : !riscv.reg) { - ^0(%x : !snitch.readable<!riscv.freg<ft0>>, %0 : !snitch.writable<!riscv.freg<ft1>>): - %c128 = riscv.li 128 : !riscv.reg - %c0 = riscv.li 0 : !riscv.reg - %c1 = riscv.li 1 : !riscv.reg - riscv_scf.for %i : !riscv.reg = %c0 to %c128 step %c1 { - %x_1 = riscv_snitch.read from %x : !riscv.freg<ft0> - %y = riscv_snitch.vfmax.s %x_1, %zero_vector : (!riscv.freg<ft0>, !riscv.freg) -> !riscv.freg<ft1> - riscv_snitch.write %y to %0 : !riscv.freg<ft1> - } - } - riscv_func.return - } - } - -// CHECK: .text -// CHECK-NEXT: .globl reluf32 -// CHECK-NEXT: .p2align 2 -// CHECK-NEXT: # Regalloc stats: {"preallocated_float": ["ft0", "ft1", "ft2"], "preallocated_int": ["a0", "a1", "zero"], "allocated_float": ["ft0", "ft1", "ft3"], "allocated_int": ["a0", "a1", "t0", "t1", "t2", "zero"]} -// CHECK-NEXT: reluf32: -// CHECK-NEXT: mv t1, a0 -// CHECK-NEXT: mv t0, a1 -// CHECK-NEXT: fcvt.d.w ft3, zero -// CHECK-NEXT: vfcpka.s.s ft3, ft3, ft3 -// CHECK-NEXT: li t2, 127 -// CHECK-NEXT: scfgwi t2, 95 # dm 31 dim 0 bound -// CHECK-NEXT: li t2, 8 -// CHECK-NEXT: scfgwi t2, 223 # dm 31 dim 0 stride -// CHECK-NEXT: scfgwi zero, 63 # dm 31 repeat -// CHECK-NEXT: scfgwi t1, 768 # dm 0 dim 0 source -// CHECK-NEXT: scfgwi t0, 897 # dm 1 dim 0
destination -// CHECK-NEXT: csrrsi zero, 1984, 1 # SSR enable -// CHECK-NEXT: li t0, 127 -// CHECK-NEXT: frep.o t0, 1, 0, 0 -// CHECK-NEXT: vfmax.s ft1, ft0, ft3 -// CHECK-NEXT: csrrci zero, 1984, 1 # SSR disable -// CHECK-NEXT: ret - - // x[ M x K ] // y[ K x N ] // g[ M x N ] @@ -649,7 +553,7 @@ func.func public @pooling_nchw_sum_d1_s2_3x3( ^bb0(%in: f64, %out: f64): linalg.yield %in : f64 } - %alloc = memref.alloc() {alignment = 64 : i64} : memref<3x3xf32> + %alloc = memref.alloc() {alignment = 64 : i64} : memref<3x3xf64> linalg.generic { indexing_maps = [ affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2 * 2 + d4, d3 * 2 + d5)>, @@ -657,12 +561,12 @@ func.func public @pooling_nchw_sum_d1_s2_3x3( affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)> ], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"] - } ins(%X, %alloc : memref<1x1x18x18xf64>, memref<3x3xf32>) outs(%Y : memref<1x1x8x8xf64>) { + } ins(%X, %alloc : memref<1x1x18x18xf64>, memref<3x3xf64>) outs(%Y : memref<1x1x8x8xf64>) { ^0(%x : f64, %alloc_val: f64, %acc : f64): %res = arith.addf %x, %acc : f64 linalg.yield %res : f64 } - memref.dealloc %alloc : memref<3x3xf32> + memref.dealloc %alloc : memref<3x3xf64> func.return }