Linalg to WMMA lowering rework (#871)
Reworks the linalg lowering to WMMA ops to support computations on
memory tiles larger than the hardware WMMA tiles.

The new lowering effectively applies warp (or subgroup) tiling to 2D
tiled linalg GEMM-like operations. The reduction dimension is tiled at
the same time to ensure that the computation fits within the limits of
the hardware resources. Elementwise consumers are fused with the GEMM
when possible.

The warp tiling is done at linalg lowering time because the parameter
decisions, such as tile sizes, are driven by the compute workload.
Mapping first to individual WMMA operations and splitting them later
into multiple ops operating on smaller sub-tiles would be more complex
due to the need for use-chain analysis and extra validation.
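
The decomposition is easiest to see with concrete numbers. Below is a
minimal standalone sketch of the sub-tiling arithmetic; the warp-tile
and k-tile sizes are illustrative, and only the 16x16x16 WMMA shape
matches the pass defaults introduced in this patch.

// Minimal sketch of the sub-tiling arithmetic behind the new lowering;
// not the pass implementation. Warp-tile numbers are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t warpM = 32, warpN = 32, warpK = 64; // hypothetical warp tile
  const int64_t wmmaM = 16, wmmaN = 16, wmmaK = 16; // hardware WMMA tile
  const int64_t kTile = 32;                         // reduction tile size

  // A warp owns a (warpM x warpN) output tile covered by a grid of WMMA
  // accumulators; the K dimension is stepped in kTile chunks so that live
  // fragments stay within hardware resource limits.
  const int64_t accumulators = (warpM / wmmaM) * (warpN / wmmaN); // 4
  const int64_t wmmaPerKTile = accumulators * (kTile / wmmaK);    // 8
  const int64_t kTileSteps = warpK / kTile;                       // 2
  std::printf("WMMA compute ops per warp: %lld\n",
              static_cast<long long>(wmmaPerKTile * kTileSteps));
  return 0;
}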
adam-smnk authored Jan 25, 2024
1 parent 5b26ef9 commit 6833392
Showing 9 changed files with 685 additions and 249 deletions.
11 changes: 8 additions & 3 deletions include/TPP/Passes.td
@@ -336,7 +336,8 @@ def GpuConversion : Pass<"gpu-conversion", "ModuleOp"> {
   let options = [
     Option<"useWmma", "wmma",
            "bool", /*default=*/"false",
-           "Use WMMA operations">
+           "Use WMMA operations">,
+    ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
   ];
 }

@@ -347,7 +348,7 @@ def GpuToCuda : Pass<"gpu-to-cuda", "ModuleOp"> {
            /*default=*/"\"nvptx64-nvidia-cuda\"",
            "GPU target triple.">,
     Option<"gpuChip", "chip", "std::string",
-           /*default=*/"\"sm_35\"",
+           /*default=*/"\"sm_70\"",
            "GPU target architecture.">,
     Option<"gpuFeatures", "features", "std::string",
            /*default=*/"\"+ptx60\"",
@@ -458,7 +459,11 @@ def LinalgToGpu : Pass<"linalg-to-gpu", "func::FuncOp"> {
   let options = [
     Option<"useWmma", "wmma",
            "bool", /*default=*/"false",
-           "Use WMMA operations">
+           "Use WMMA operations">,
+    ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
+    Option<"kTile", "k-tile", "int64_t",
+           /*default=*/"32",
+           "GEMM tile size for reduction dimension.">,
   ];
 }
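
For orientation, the new options would be wired up roughly as follows
when building a pipeline in C++. This is a hedged sketch, not code from
the patch: the TableGen-generated LinalgToGpuOptions field names follow
the option declarations above, and the TPP/Passes.h header path and
mlir::tpp namespace qualification are assumptions.

// Hypothetical usage sketch, not part of this patch: configure the
// linalg-to-gpu pass with an explicit warp tile and reduction tile.
#include "TPP/Passes.h"                   // assumed header path
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"

void buildExamplePipeline(mlir::OpPassManager &pm) {
  mlir::tpp::LinalgToGpuOptions options;
  options.useWmma = true;          // lower GEMM-like ops to WMMA
  options.warpTile = {32, 32, 32}; // MxNxK tile per warp/subgroup
  options.kTile = 32;              // reduction-dimension tile size
  pm.addNestedPass<mlir::func::FuncOp>(mlir::tpp::createLinalgToGpu(options));
}

Keeping these parameters in an explicit options struct leaves tile-size
decisions at the pipeline level, matching the commit's rationale that
they are driven by the compute workload.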

2 changes: 1 addition & 1 deletion lib/TPP/GPU/GpuConversion.cpp
@@ -69,7 +69,7 @@ struct GpuConversion : public tpp::impl::GpuConversionBase<GpuConversion>,
     // the default lowering for any remaining ops.
     pm.addNestedPass<func::FuncOp>(createLinalgDeGeneralize());
     pm.addNestedPass<func::FuncOp>(
-        createLinalgToGpu(LinalgToGpuOptions{useWmma}));
+        createLinalgToGpu(LinalgToGpuOptions{useWmma, warpTile}));
     pm.addNestedPass<func::FuncOp>(createConvertLinalgToParallelLoopsPass());
 
     // Map loops into GPU kernels.
42 changes: 35 additions & 7 deletions lib/TPP/GPU/GpuPipeline.cpp
@@ -46,6 +46,11 @@ llvm::cl::opt<bool> gpuWmma("gpu-wmma",
                             llvm::cl::desc("Enable GPU WMMA support"),
                             llvm::cl::init(false));
 
+llvm::cl::list<int64_t> wmmaTileSizes(
+    "wmma-tile-sizes", llvm::cl::desc("GPU WMMA tile sizes MxNxK"),
+    llvm::cl::list_init<int64_t>(SmallVector<int64_t>{16, 16, 16}),
+    llvm::cl::CommaSeparated);
+
 namespace mlir {
 namespace tpp {
 #define GEN_PASS_DEF_GPUPIPELINE
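
As an aside, the comma-separated list flag style added above parses as
follows. A minimal standalone sketch with a hypothetical flag name:

// Standalone sketch of the flag style used above (hypothetical option
// name). With llvm::cl::CommaSeparated, "--tile-sizes=32,32,16" yields
// {32, 32, 16}; llvm::cl::list_init supplies the {16, 16, 16} default
// when the flag is absent.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

static llvm::cl::list<int64_t> tileSizes(
    "tile-sizes", llvm::cl::desc("Tile sizes MxNxK"),
    llvm::cl::list_init<int64_t>(llvm::SmallVector<int64_t>{16, 16, 16}),
    llvm::cl::CommaSeparated);

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  for (int64_t size : tileSizes)
    llvm::outs() << size << " ";
  llvm::outs() << "\n";
  return 0;
}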
@@ -70,6 +75,31 @@ GpuType parseGpuOption(StringRef gpuStr) {
   return *type;
 }
 
+struct GpuOptions {
+  std::string triple;
+  std::string chip;
+  std::string features;
+};
+
+GpuOptions getGpuOptions(GpuType gpuType) {
+  GpuOptions options;
+
+  switch (gpuType) {
+  case GpuType::Cuda: {
+    options.triple = "nvptx64-nvidia-cuda";
+    options.chip = "sm_70";
+    options.features = "+ptx60";
+    break;
+  }
+  case GpuType::Vulkan: {
+    // No options needed at the moment.
+    break;
+  }
+  }
+
+  return options;
+}
+
 // GPU pipeline - map and lower operations to enable execution on a GPU.
 struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
                      UtilityPassBase<ModuleOp> {
@@ -112,6 +142,7 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
     pm.clear();
 
     GpuType gpuType = parseGpuOption(this->gpuBackend);
+    GpuOptions gpuOptions = getGpuOptions(gpuType);
 
     // Tile to split the kernel into threads and blocks.
     // Use default tiling to handle both packed and unpacked ops.
@@ -128,21 +159,18 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
     pm.addNestedPass<func::FuncOp>(createCleanup());
 
     // Convert to generic GPU ops.
-    pm.addPass(createGpuConversion(GpuConversionOptions{gpuWmma}));
+    pm.addPass(
+        createGpuConversion(GpuConversionOptions{gpuWmma, wmmaTileSizes}));
 
     // Lower GPU ops to the chosen GPU backend.
     switch (gpuType) {
     case GpuType::Cuda: {
-      std::string gpuTriple = "nvptx64-nvidia-cuda";
-      std::string gpuChip = "sm_70";
-      std::string gpuFeatures = "+ptx60";
-
       // Perform explicit GPU data transfers only for CUDA as the unified
       // memory is not currently used here.
       // Vulkan runner assumes usage of GPU unified memory.
       pm.addNestedPass<func::FuncOp>(createGpuDataTransfer());
-      pm.addPass(
-          createGpuToCuda(GpuToCudaOptions{gpuTriple, gpuChip, gpuFeatures}));
+      pm.addPass(createGpuToCuda(GpuToCudaOptions{
+          gpuOptions.triple, gpuOptions.chip, gpuOptions.features}));
       break;
     }
     case GpuType::Vulkan: {
