Linalg to WMMA lowering rework (#871)
Reworks the linalg lowering to WMMA ops to support computations on
memory tiles larger than the hardware WMMA tiles.

The new lowering effectively applies warp (or subgroup) tiling to 2D
tiled linalg GEMM-like operations. The reduction dimension is tiled at
the same time to ensure that the computation fits within the limits of
the hardware resources. Elementwise consumers are fused with the GEMM
when possible.

The warp tiling is done at linalg lowering time because the parameter
decisions, such as tile sizes, are driven by the compute workload.
Mapping first to individual WMMA operations and splitting them later
into multiple ops operating on smaller sub-tiles would be more complex
due to the need for use-chain analysis and extra validation.
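
The decomposition is easiest to see with concrete numbers. Below is a
minimal standalone sketch of the sub-tiling arithmetic; the warp-tile
and k-tile sizes are illustrative, and only the 16x16x16 WMMA shape
matches the pass defaults introduced in this patch.

// Minimal sketch of the sub-tiling arithmetic behind the new lowering;
// not the pass implementation. Warp-tile numbers are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t warpM = 32, warpN = 32, warpK = 64; // hypothetical warp tile
  const int64_t wmmaM = 16, wmmaN = 16, wmmaK = 16; // hardware WMMA tile
  const int64_t kTile = 32;                         // reduction tile size

  // A warp owns a (warpM x warpN) output tile covered by a grid of WMMA
  // accumulators; the K dimension is stepped in kTile chunks so that live
  // fragments stay within hardware resource limits.
  const int64_t accumulators = (warpM / wmmaM) * (warpN / wmmaN); // 4
  const int64_t wmmaPerKTile = accumulators * (kTile / wmmaK);    // 8
  const int64_t kTileSteps = warpK / kTile;                       // 2
  std::printf("WMMA compute ops per warp: %lld\n",
              static_cast<long long>(wmmaPerKTile * kTileSteps));
  return 0;
}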
adam-smnk authored Jan 25, 2024
1 parent 5b26ef9 commit 6833392
Showing 9 changed files with 685 additions and 249 deletions.
11 changes: 8 additions & 3 deletions include/TPP/Passes.td
@@ -336,7 +336,8 @@ def GpuConversion : Pass<"gpu-conversion", "ModuleOp"> {
   let options = [
     Option<"useWmma", "wmma",
            "bool", /*default=*/"false",
-           "Use WMMA operations">
+           "Use WMMA operations">,
+    ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
   ];
 }

@@ -347,7 +348,7 @@ def GpuToCuda : Pass<"gpu-to-cuda", "ModuleOp"> {
            /*default=*/"\"nvptx64-nvidia-cuda\"",
            "GPU target triple.">,
     Option<"gpuChip", "chip", "std::string",
-           /*default=*/"\"sm_35\"",
+           /*default=*/"\"sm_70\"",
            "GPU target architecture.">,
     Option<"gpuFeatures", "features", "std::string",
            /*default=*/"\"+ptx60\"",
@@ -458,7 +459,11 @@ def LinalgToGpu : Pass<"linalg-to-gpu", "func::FuncOp"> {
   let options = [
     Option<"useWmma", "wmma",
            "bool", /*default=*/"false",
-           "Use WMMA operations">
+           "Use WMMA operations">,
+    ListOption<"warpTile", "warp-tile", "int64_t", "Warp tile sizes MxNxK">,
+    Option<"kTile", "k-tile", "int64_t",
+           /*default=*/"32",
+           "GEMM tile size for reduction dimension.">,
   ];
 }
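
For orientation, the new options would be wired up roughly as follows
when building a pipeline in C++. This is a hedged sketch, not code from
the patch: the TableGen-generated LinalgToGpuOptions field names follow
the option declarations above, and the TPP/Passes.h header path and
mlir::tpp namespace qualification are assumptions.

// Hypothetical usage sketch, not part of this patch: configure the
// linalg-to-gpu pass with an explicit warp tile and reduction tile.
#include "TPP/Passes.h"                   // assumed header path
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"

void buildExamplePipeline(mlir::OpPassManager &pm) {
  mlir::tpp::LinalgToGpuOptions options;
  options.useWmma = true;          // lower GEMM-like ops to WMMA
  options.warpTile = {32, 32, 32}; // MxNxK tile per warp/subgroup
  options.kTile = 32;              // reduction-dimension tile size
  pm.addNestedPass<mlir::func::FuncOp>(mlir::tpp::createLinalgToGpu(options));
}

Keeping these parameters in an explicit options struct leaves tile-size
decisions at the pipeline level, matching the commit's rationale that
they are driven by the compute workload.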

2 changes: 1 addition & 1 deletion lib/TPP/GPU/GpuConversion.cpp
@@ -69,7 +69,7 @@ struct GpuConversion : public tpp::impl::GpuConversionBase<GpuConversion>,
     // the default lowering for any remaining ops.
     pm.addNestedPass<func::FuncOp>(createLinalgDeGeneralize());
     pm.addNestedPass<func::FuncOp>(
-        createLinalgToGpu(LinalgToGpuOptions{useWmma}));
+        createLinalgToGpu(LinalgToGpuOptions{useWmma, warpTile}));
     pm.addNestedPass<func::FuncOp>(createConvertLinalgToParallelLoopsPass());
 
     // Map loops into GPU kernels.
42 changes: 35 additions & 7 deletions lib/TPP/GPU/GpuPipeline.cpp
@@ -46,6 +46,11 @@ llvm::cl::opt<bool> gpuWmma("gpu-wmma",
                             llvm::cl::desc("Enable GPU WMMA support"),
                             llvm::cl::init(false));
 
+llvm::cl::list<int64_t> wmmaTileSizes(
+    "wmma-tile-sizes", llvm::cl::desc("GPU WMMA tile sizes MxNxK"),
+    llvm::cl::list_init<int64_t>(SmallVector<int64_t>{16, 16, 16}),
+    llvm::cl::CommaSeparated);
+
 namespace mlir {
 namespace tpp {
 #define GEN_PASS_DEF_GPUPIPELINE
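
As an aside, the comma-separated list flag style added above parses as
follows. A minimal standalone sketch with a hypothetical flag name:

// Standalone sketch of the flag style used above (hypothetical option
// name). With llvm::cl::CommaSeparated, "--tile-sizes=32,32,16" yields
// {32, 32, 16}; llvm::cl::list_init supplies the {16, 16, 16} default
// when the flag is absent.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

static llvm::cl::list<int64_t> tileSizes(
    "tile-sizes", llvm::cl::desc("Tile sizes MxNxK"),
    llvm::cl::list_init<int64_t>(llvm::SmallVector<int64_t>{16, 16, 16}),
    llvm::cl::CommaSeparated);

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  for (int64_t size : tileSizes)
    llvm::outs() << size << " ";
  llvm::outs() << "\n";
  return 0;
}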
@@ -70,6 +75,31 @@ GpuType parseGpuOption(StringRef gpuStr) {
   return *type;
 }
 
+struct GpuOptions {
+  std::string triple;
+  std::string chip;
+  std::string features;
+};
+
+GpuOptions getGpuOptions(GpuType gpuType) {
+  GpuOptions options;
+
+  switch (gpuType) {
+  case GpuType::Cuda: {
+    options.triple = "nvptx64-nvidia-cuda";
+    options.chip = "sm_70";
+    options.features = "+ptx60";
+    break;
+  }
+  case GpuType::Vulkan: {
+    // No options needed at the moment.
+    break;
+  }
+  }
+
+  return options;
+}
+
 // GPU pipeline - map and lower operations to enable execution on a GPU.
 struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
                      UtilityPassBase<ModuleOp> {
@@ -112,6 +142,7 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
     pm.clear();
 
     GpuType gpuType = parseGpuOption(this->gpuBackend);
+    GpuOptions gpuOptions = getGpuOptions(gpuType);
 
     // Tile to split the kernel into threads and blocks.
     // Use default tiling to handle both packed and unpacked ops.
@@ -128,21 +159,18 @@ struct GpuPipeline : public tpp::impl::GpuPipelineBase<GpuPipeline>,
     pm.addNestedPass<func::FuncOp>(createCleanup());
 
     // Convert to generic GPU ops.
-    pm.addPass(createGpuConversion(GpuConversionOptions{gpuWmma}));
+    pm.addPass(
+        createGpuConversion(GpuConversionOptions{gpuWmma, wmmaTileSizes}));
 
     // Lower GPU ops to the chosen GPU backend.
     switch (gpuType) {
     case GpuType::Cuda: {
-      std::string gpuTriple = "nvptx64-nvidia-cuda";
-      std::string gpuChip = "sm_70";
-      std::string gpuFeatures = "+ptx60";
-
       // Perform explicit GPU data transfers only for CUDA as the unified
       // memory is not currently used here.
       // Vulkan runner assumes usage of GPU unified memory.
       pm.addNestedPass<func::FuncOp>(createGpuDataTransfer());
-      pm.addPass(
-          createGpuToCuda(GpuToCudaOptions{gpuTriple, gpuChip, gpuFeatures}));
+      pm.addPass(createGpuToCuda(GpuToCudaOptions{
+          gpuOptions.triple, gpuOptions.chip, gpuOptions.features}));
       break;
     }
     case GpuType::Vulkan: {
