diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fae95179..642f427c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,9 @@ set(BENCH_OMP_CFGS ${CONFIG_DIR}/omp/dnn-bf16.json ${CONFIG_DIR}/omp/mlir-fp32.json ${CONFIG_DIR}/omp/mlir-bf16.json + ${CONFIG_DIR}/omp/mlir-fp32-vector-to-kernel.json ${CONFIG_DIR}/omp/torch-dynamo.json + ${CONFIG_DIR}/omp/torch-dynamo-vector-to-kernel.json ) string(JOIN ',' BENCH_OMP_CFGS_STR ${BENCH_OMP_CFGS}) add_custom_target(benchmarks-omp ${BENCHMARK_DIR}/driver.py -v --build ${PROJECT_BINARY_DIR} -n 10 diff --git a/benchmarks/config/base/base.json b/benchmarks/config/base/base.json index 48e86205e..0251bbf78 100644 --- a/benchmarks/config/base/base.json +++ b/benchmarks/config/base/base.json @@ -36,6 +36,13 @@ "flags": [ "-n", "100" ], "extensions": [] }, + "gemm_fp32_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [] + }, "gemm_bf16_dp2_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], @@ -57,6 +64,13 @@ "flags": [ "-n", "100" ], "extensions": [] }, + "mlp_fp32_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [] + }, "mlp_bf16_dp2_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32 --vnni=2" ], @@ -81,12 +95,26 @@ "flags": [ "-n", "100" ], "extensions": [ "(avx2|asimd)" ] }, + 
"fp32_3x1024_const_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, "fp32_3x1024_args_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], "environment": {}, "flags": [ "-n", "100" ], "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_args_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] }, "bf16_3x1024_const_mlir": { "type": "IR-GEN", @@ -112,6 +140,13 @@ "flags": [ "-n", "100" ], "extensions": [ "(avx2|asimd)" ] }, + "fp32_3x1024_const_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, "fp32_3x1024_args_mlir": { "type": "IR-GEN", "benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], @@ -119,6 +154,13 @@ "flags": [ "-n", "100" ], "extensions": [ "(avx2|asimd)" ] }, + "fp32_3x1024_args_mlir_vector": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ], + "environment": {}, + "flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, "bf16_3x1024_const_mlir": { "type": "IR-GEN", "benchmark": [ 
"mlir-gen", "--kernel=const --bias --relu --float-type=bf16 --batch=256 --layers=1024,1024,1024,1024" ], diff --git a/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json b/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json new file mode 100644 index 000000000..6bed81a47 --- /dev/null +++ b/benchmarks/config/omp/mlir-fp32-vector-to-kernel.json @@ -0,0 +1,128 @@ + +[ + { + "gemm_fp32_mlir_vector_kernel_32": { + "fp32_3x1024_omp_2_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": 
"granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + } + }}, + { + "mlp_fp32_mlir_vector_kernel_32": { + "fp32_3x1024_omp_2_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ], + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel 
--parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ "(avx2|asimd)" ] + } + }}, + { + "gemm_fp32_mlir_vector_kernel_64": { + "fp32_3x1024_omp_2_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=1,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + } + }}, + { + 
"mlp_fp32_mlir_vector_kernel_64": { + "fp32_3x1024_omp_2_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "IR-GEN", + "benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=64,64,64" ], + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=1,4 --vector-to-kernels --lhsTile=16,64 --rhsTile=64,1'" ], + "extensions": [ "(avx2|asimd)" ] + } + }} +] + diff --git a/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json 
b/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json new file mode 100644 index 000000000..74e2b5fa9 --- /dev/null +++ b/benchmarks/config/omp/torch-dynamo-vector-to-kernel.json @@ -0,0 +1,64 @@ +[ + { + "gemm_fp32_torch_vector_kernel" : { + "fp32_3x1024_omp_2_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + } + }}, + { + "mlp_fp32_torch_vector_kernel" : { + "fp32_3x1024_omp_2_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", 
"100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_4_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_8_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + }, + "fp32_3x1024_omp_16_mlir": { + "type": "MLIR", + "benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir", + "environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" }, + "flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ], + "extensions": [ ] + } + }} +] diff --git a/lib/TPP/DefaultPipeline.cpp b/lib/TPP/DefaultPipeline.cpp index 3078eedf1..ebab1af8d 100644 --- a/lib/TPP/DefaultPipeline.cpp +++ b/lib/TPP/DefaultPipeline.cpp @@ -57,6 +57,10 @@ llvm::cl::opt linalgToVector("linalg-to-vector", llvm::cl::desc("Lower linalg to vector"), llvm::cl::init(false)); +llvm::cl::opt vectorToKernel("vector-to-kernels", + llvm::cl::desc("Lower vector to micro-kernels"), + llvm::cl::init(false)); + llvm::cl::opt lowerPackUnpackWithoutTranspose( "lower-pack-unpack-without-transpose", llvm::cl::desc("Lower packs and unpacks reverting any dim permutations"), @@ -158,6 +162,7 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase, lowerPackUnpackWithoutTranspose; tppDefaultOptions.lhsTile = 
lhsTile; tppDefaultOptions.rhsTile = rhsTile; + tppDefaultOptions.vectorToKernel = vectorToKernel; pm.addPass(createDefaultTppPasses(tppDefaultOptions)); } diff --git a/lib/TPP/DefaultTppPasses.cpp b/lib/TPP/DefaultTppPasses.cpp index 2790b8fdc..b5cc4e635 100644 --- a/lib/TPP/DefaultTppPasses.cpp +++ b/lib/TPP/DefaultTppPasses.cpp @@ -87,7 +87,7 @@ struct DefaultTppPasses SmallVector skipOperations; // General "linalg-to-vector" choice needs to skip all XSMM matching at // linalg level. - if (linalgToVector) { + if (linalgToVector || vectorToKernel) { skipOperations.push_back("all"); } if (vectorToXSMM) { @@ -95,8 +95,6 @@ struct DefaultTppPasses skipOperations.push_back("transpose"); skipOperations.push_back("vnni"); } - if (vectorToKernel) - skipOperations.clear(); // Pipeline building starts here. pm.addPass(createFoldAddIntoDest()); @@ -141,8 +139,12 @@ struct DefaultTppPasses BrgemmLinalgTilingOptions{lhsTile, rhsTile})); pm.addNestedPass(createLoopInvariantCodeMotionPass()); pm.addNestedPass(createVectorizationPass()); - pm.addNestedPass(createCanonicalizerPass()); - if (vectorToXSMM) { + + //Please note, canonicalizer should be after hoisting pass because + //it fuses outer tiling loops and it results in no pattern + //matching for hoisting pass. Moved inside VectorToKernel Path. 
+ + if (vectorToXSMM) { pm.addPass(createVectorToXSMM()); } if (vectorToKernel) { @@ -187,3 +189,4 @@ struct DefaultTppPasses }; } // namespace + diff --git a/lib/TPP/PassBundles/VectorToKernel.cpp b/lib/TPP/PassBundles/VectorToKernel.cpp index cf0f8c634..8335b3c46 100644 --- a/lib/TPP/PassBundles/VectorToKernel.cpp +++ b/lib/TPP/PassBundles/VectorToKernel.cpp @@ -13,6 +13,7 @@ #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "llvm/Support/Debug.h" +#include "mlir/Transforms/Passes.h" #include "TPP/PassBundles.h" #include "TPP/PassUtils.h" @@ -48,7 +49,8 @@ struct VectorToKernel : public tpp::impl::VectorToKernelBase, private: void constructPipeline() override { - LLVM_DEBUG(llvm::dbgs() << "Adding vector-to-kernel passes\n"); - // Not Implemented Yet. + pm.addNestedPass(createHoistVectorTransfers()); + pm.addNestedPass(createCanonicalizerPass()); + pm.addNestedPass(createVectorContractToFMA()); } }; diff --git a/scripts/benchmarks/build_and_run.sh b/scripts/benchmarks/build_and_run.sh index 2afdbd596..02eba0e57 100755 --- a/scripts/benchmarks/build_and_run.sh +++ b/scripts/benchmarks/build_and_run.sh @@ -64,7 +64,7 @@ echo_run ./driver.py -vv \ --build "${BUILD_DIR}" echo " ========= OpenMP Benchmarks ===========" -for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16; do +for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16 mlir-fp32-vector-to-kernel; do echo_run ./driver.py -vv \ -n ${NUM_ITER} \ -c "${CONFIG_DIR}/omp/${cfg}.json" \ diff --git a/scripts/github/benchmark.sh b/scripts/github/benchmark.sh index cabba9121..207831042 100755 --- a/scripts/github/benchmark.sh +++ b/scripts/github/benchmark.sh @@ -111,8 +111,10 @@ if [ "$BENCH_OMP" ]; then benchmark omp/dnn-fp32.json "OpenMP XSMM-DNN FP32" benchmark omp/dnn-bf16.json "OpenMP XSMM-DNN BF16" benchmark omp/mlir-fp32.json "OpenMP TPP-MLIR FP32" + benchmark omp/mlir-fp32-vector-to-kernel.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL FP32" benchmark omp/mlir-bf16.json "OpenMP TPP-MLIR BF16" benchmark 
omp/torch-dynamo.json "OpenMP TPP-MLIR PyTorch" + benchmark omp/torch-dynamo-vector-to-kernel.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL PyTorch" fi # Matmul Benchmarks