Vector-to-kernel pipeline bundle #987

Merged
merged 14 commits on Dec 6, 2024
Changes from 13 commits
3 changes: 3 additions & 0 deletions CMakeLists.txt
@@ -82,6 +82,7 @@ set(CONFIG_DIR "${BENCHMARK_DIR}/config")
# Run baseline benchmarks with default iterations to track simple performance
set(BENCH_CFGS
${CONFIG_DIR}/base/base.json
${CONFIG_DIR}/base/vector-to-kernel.json
${CONFIG_DIR}/base/pack.json
${CONFIG_DIR}/base/mha.json
${CONFIG_DIR}/base/named-ops.json
@@ -100,7 +101,9 @@ set(BENCH_OMP_CFGS
${CONFIG_DIR}/omp/dnn-bf16.json
${CONFIG_DIR}/omp/mlir-fp32.json
${CONFIG_DIR}/omp/mlir-bf16.json
${CONFIG_DIR}/omp/mlir-fp32-vector-to-kernel.json
${CONFIG_DIR}/omp/torch-dynamo.json
${CONFIG_DIR}/omp/torch-dynamo-vector-to-kernel.json
)
string(JOIN ',' BENCH_OMP_CFGS_STR ${BENCH_OMP_CFGS})
add_custom_target(benchmarks-omp ${BENCHMARK_DIR}/driver.py -v --build ${PROJECT_BINARY_DIR} -n 10
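A minimal sketch of exercising the updated target, assuming an already-configured build tree (the benchmarks-omp custom target defined above drives driver.py over BENCH_OMP_CFGS, which now includes the vector-to-kernel configs):

  # Sketch: run the aggregate OpenMP benchmark target from the build tree.
  cmake --build "${PROJECT_BINARY_DIR}" --target benchmarks-omp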
34 changes: 34 additions & 0 deletions benchmarks/config/base/base.json
@@ -133,5 +133,39 @@
"flags": [ "-n", "100"],
"extensions": [ "(avx2|asimd)" ]
}
}},
{
"gemm_models_vector_kernel": {
"fp32_3x1024_const_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_args_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=args --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
}
}},
{
"mlp_models_vector_kernel": {
"fp32_3x1024_const_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_args_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=args --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024" ],
"environment": {},
"flags": [ "-n", "100", "-run-args=' --def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
}
}}
]
19 changes: 19 additions & 0 deletions benchmarks/config/base/vector-to-kernel.json
@@ -0,0 +1,19 @@
[
{
"prepacked_targets_vector_kernel": {
"gemm_fp32_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": []
},
"mlp_fp32_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": {},
"flags": [ "-n", "100", "-run-args='--def-parallel --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": []
}
}}
]
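As a quick orientation, this new base config can also be run on its own with the benchmark driver; a minimal sketch mirroring the invocation added to scripts/benchmarks/build_and_run.sh below (CONFIG_DIR and BUILD_DIR as set up by that script):

  # Sketch: run only the vector-to-kernel base benchmarks.
  ./driver.py -vv \
    -n 100 \
    -c "${CONFIG_DIR}/base/vector-to-kernel.json" \
    --build "${BUILD_DIR}"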
64 changes: 64 additions & 0 deletions benchmarks/config/omp/mlir-fp32-vector-to-kernel.json
@@ -0,0 +1,64 @@
[
{
"gemm_fp32_mlir_vector_kernel": {
"fp32_3x1024_omp_2_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
}
}},
{
"mlp_fp32_mlir_vector_kernel": {
"fp32_3x1024_omp_2_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "IR-GEN",
"benchmark": [ "mlir-gen", "--kernel=const --bias --relu --float-type=f32 --batch=256 --layers=1024,1024,1024,1024 --tiles=32,32,32" ],
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ "(avx2|asimd)" ]
}
}}
]
64 changes: 64 additions & 0 deletions benchmarks/config/omp/torch-dynamo-vector-to-kernel.json
@@ -0,0 +1,64 @@
[
{
"gemm_fp32_torch_vector_kernel" : {
"fp32_3x1024_omp_2_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-gemm-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
}
}},
{
"mlp_fp32_torch_vector_kernel" : {
"fp32_3x1024_omp_2_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "2", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,16 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_4_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "4", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=8,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_8_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "8", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=4,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
},
"fp32_3x1024_omp_16_mlir": {
"type": "MLIR",
"benchmark": "pytorch/torch-dynamo-mlp-fp32-3x1024.mlir",
"environment": { "OMP_NUM_THREADS": "16", "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" },
"flags": [ "-n", "100", "-run-args='--def-parallel --parallel-task-grid=2,8 --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1'" ],
"extensions": [ ]
}
}}
]
5 changes: 5 additions & 0 deletions lib/TPP/DefaultPipeline.cpp
@@ -57,6 +57,10 @@ llvm::cl::opt<bool> linalgToVector("linalg-to-vector",
llvm::cl::desc("Lower linalg to vector"),
llvm::cl::init(false));

llvm::cl::opt<bool> vectorToKernel("vector-to-kernels",
llvm::cl::desc("Lower vector to micro-kernels"),
llvm::cl::init(false));

llvm::cl::opt<bool> lowerPackUnpackWithoutTranspose(
"lower-pack-unpack-without-transpose",
llvm::cl::desc("Lower packs and unpacks reverting any dim permutations"),
@@ -158,6 +162,7 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
lowerPackUnpackWithoutTranspose;
tppDefaultOptions.lhsTile = lhsTile;
tppDefaultOptions.rhsTile = rhsTile;
tppDefaultOptions.vectorToKernel = vectorToKernel;

pm.addPass(createDefaultTppPasses(tppDefaultOptions));
}
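For reference, the new --vector-to-kernels option composes with the existing --lhsTile/--rhsTile tiling flags; the benchmark configs forward exactly these via -run-args. A rough sketch of a direct invocation (the tpp-run binary name and the entry-point flags are assumptions; only the pipeline flags come from this PR):

  # Sketch only: runner name and entry-point flags are assumed; the pipeline
  # flags match the ones passed via -run-args in the benchmark configs.
  tpp-run gemm.mlir -e entry -entry-point-result=void \
    --def-parallel --parallel-task-grid=8,8 \
    --vector-to-kernels --lhsTile=4,32 --rhsTile=32,1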
13 changes: 8 additions & 5 deletions lib/TPP/DefaultTppPasses.cpp
@@ -87,16 +87,14 @@ struct DefaultTppPasses
SmallVector<std::string> skipOperations;
// General "linalg-to-vector" choice needs to skip all XSMM matching at
// linalg level.
if (linalgToVector) {
if (linalgToVector || vectorToKernel) {
skipOperations.push_back("all");
}
if (vectorToXSMM) {
skipOperations.clear();
skipOperations.push_back("transpose");
skipOperations.push_back("vnni");
}
if (vectorToKernel)
skipOperations.clear();

// Pipeline building starts here.
pm.addPass(createFoldAddIntoDest());
@@ -141,8 +139,12 @@ struct DefaultTppPasses
BrgemmLinalgTilingOptions{lhsTile, rhsTile}));
pm.addNestedPass<func::FuncOp>(createLoopInvariantCodeMotionPass());
pm.addNestedPass<func::FuncOp>(createVectorizationPass());
pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
if (vectorToXSMM) {

// Note: the canonicalizer must run after the hoisting pass; otherwise it
// fuses the outer tiling loops and leaves no patterns for the hoisting
// pass to match. It has been moved inside the VectorToKernel path.

if (vectorToXSMM) {
pm.addPass(createVectorToXSMM());
}
if (vectorToKernel) {
@@ -187,3 +189,4 @@ struct DefaultTppPasses
};

} // namespace

6 changes: 4 additions & 2 deletions lib/TPP/PassBundles/VectorToKernel.cpp
@@ -13,6 +13,7 @@
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "llvm/Support/Debug.h"
#include "mlir/Transforms/Passes.h"

#include "TPP/PassBundles.h"
#include "TPP/PassUtils.h"
@@ -48,7 +49,8 @@ struct VectorToKernel : public tpp::impl::VectorToKernelBase<VectorToKernel>,

private:
void constructPipeline() override {
LLVM_DEBUG(llvm::dbgs() << "Adding vector-to-kernel passes\n");
// Not Implemented Yet.
pm.addNestedPass<func::FuncOp>(createHoistVectorTransfers());
pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
pm.addNestedPass<func::FuncOp>(createVectorContractToFMA());
}
};
8 changes: 7 additions & 1 deletion scripts/benchmarks/build_and_run.sh
@@ -57,14 +57,20 @@ echo_run ./driver.py -vv \
-c "${CONFIG_DIR}/base/base.json" \
--build "${BUILD_DIR}"

echo " ========= Vector-to-kernel Base Benchmarks ==========="
echo_run ./driver.py -vv \
-n ${NUM_ITER} \
-c "${CONFIG_DIR}/base/vector-to-kernel.json" \
--build "${BUILD_DIR}"

echo " ========= PyTorch Benchmarks ==========="
echo_run ./driver.py -vv \
-n ${NUM_ITER} \
-c "${CONFIG_DIR}/pytorch/torch_dynamo.json" \
--build "${BUILD_DIR}"

echo " ========= OpenMP Benchmarks ==========="
for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16; do
for cfg in dnn-fp32 dnn-bf16 mlir-fp32 mlir-bf16 mlir-fp32-vector-to-kernel; do
echo_run ./driver.py -vv \
-n ${NUM_ITER} \
-c "${CONFIG_DIR}/omp/${cfg}.json" \
3 changes: 3 additions & 0 deletions scripts/github/benchmark.sh
@@ -96,6 +96,7 @@ benchmark () {
# Base Benchmarks
if [ "$BENCH_BASE" ]; then
benchmark base/base.json "Base Benchmarks"
benchmark base/vector-to-kernel.json "Base Vector-to-kernel Benchmarks"
benchmark base/pack.json "Pack Benchmarks"
benchmark base/mha.json "MHA Benchmarks"
benchmark base/named-ops.json "Named Ops Benchmarks"
@@ -111,8 +112,10 @@ if [ "$BENCH_OMP" ]; then
benchmark omp/dnn-fp32.json "OpenMP XSMM-DNN FP32"
benchmark omp/dnn-bf16.json "OpenMP XSMM-DNN BF16"
benchmark omp/mlir-fp32.json "OpenMP TPP-MLIR FP32"
benchmark omp/mlir-fp32-vector-to-kernel.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL FP32"
benchmark omp/mlir-bf16.json "OpenMP TPP-MLIR BF16"
benchmark omp/torch-dynamo.json "OpenMP TPP-MLIR PyTorch"
benchmark omp/torch-dynamo-vector-to-kernel.json "OpenMP TPP-MLIR VECTOR-TO-KERNEL PyTorch"
fi

# Matmul Benchmarks