
[Triton] Add tl.gather with a naive codegen implementation #5262

Merged: 20 commits, merged Nov 28, 2024
Changes from 11 commits
13 changes: 13 additions & 0 deletions include/triton/Analysis/Utility.h
@@ -153,6 +153,19 @@ class ScanLoweringHelper {
SmallVector<Type> srcElementTypes;
};

// Helper class for lowering `tt.gather` operations. This class shares lowering
// logic between shared memory allocation and LLVM codegen.
class GatherLoweringHelper {
public:
GatherLoweringHelper(triton::GatherOp gatherOp);

// Get the shared memory scratch size required by this op.
unsigned getScratchSizeInBytes();

private:
triton::GatherOp gatherOp;
};

// Decomposes a reshape into simpler pieces.
//
// As an example, suppose we have a reshape from [4,4,4] to [2,2,8,2].
@@ -92,6 +92,10 @@ void populateScanOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
RewritePatternSet &patterns,
const TargetInfoBase &targetInfo,
PatternBenefit benefit);
void populateGatherOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
RewritePatternSet &patterns,
const TargetInfoBase &targetInfo,
PatternBenefit benefit);

void populateConvertLayoutOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
const TargetInfoBase &targetInfo,
5 changes: 5 additions & 0 deletions include/triton/Conversion/TritonGPUToLLVM/Utility.h
@@ -1125,6 +1125,11 @@ emitBaseIndexForLayout(Location loc, RewriterBase &rewriter,

// Emit indices calculation within each ConversionPattern, and returns a
// [elemsPerThread X rank] index matrix.
//
// For example, if a thread owns `elemsPerThread` elements of a tensor with
// type `type` and layout `layout`, the result will contain `elemsPerThread`
// vectors. Each vector contains the SSA values of the indices required to
// access the corresponding element, starting from the inner dimension.
SmallVector<SmallVector<Value>>
emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
Attribute layout, RankedTensorType type, bool withCTAOffset);
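As a purely illustrative picture of the structure that comment describes (the SSA value names below are invented placeholders, not Triton API), the result for a thread owning four elements of a rank-2 tensor would have this shape:

```python
# Illustration only: the [elemsPerThread x rank] layout of the emitIndices result,
# with made-up placeholder names standing in for per-dimension SSA index values.
elems_per_thread, rank = 4, 2
src_indices = [[f"%idx{e}_{d}" for d in range(rank)] for e in range(elems_per_thread)]
# => [['%idx0_0', '%idx0_1'], ['%idx1_0', '%idx1_1'], ...]
assert len(src_indices) == elems_per_thread
assert all(len(row) == rank for row in src_indices)
```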
26 changes: 26 additions & 0 deletions include/triton/Dialect/Triton/IR/TritonOps.td
@@ -869,6 +869,32 @@ def TT_HistogramOp : TT_Op<"histogram", [Pure]> {
}];
}

//
// Gather Op
//
def TT_GatherOp : TT_Op<"gather", [Pure,
DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
let summary = "local gather operation";
let description = [{
Gather elements from the input tensor using the indices tensor along a
single specified axis. The output tensor has the same shape as the indices
    tensor. The input and indices tensors must have the same number of
    dimensions, and each dimension of the indices tensor that is not the gather
dimension cannot be greater than the corresponding dimension in the input
tensor.
}];

let arguments = (ins TT_Tensor:$src, TT_IntTensor:$indices, I32Attr:$axis);
let results = (outs TT_Tensor:$result);

let assemblyFormat = [{
$src `[` $indices `]` attr-dict `:`
functional-type(operands, results)
}];

let hasVerifier = 1;
}
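To make the semantics above concrete, here is a small host-side sketch (PyTorch-based, with made-up shapes; it mirrors the torch.gather convention that the unit test added in this PR compares against). For axis = 0, out[i, j] = src[idx[i, j], j].

```python
# Illustrative sketch of the gather semantics above (not part of the PR).
import torch

src = torch.arange(12.0).reshape(4, 3)   # input tensor, shape [4, 3]
idx = torch.tensor([[3, 0, 1],           # indices tensor, shape [2, 3]
                    [2, 2, 0]])
axis = 0

out = torch.empty(idx.shape, dtype=src.dtype)
for i in range(idx.shape[0]):
    for j in range(idx.shape[1]):
        # Replace the coordinate along `axis` with the gathered index.
        out[i, j] = src[idx[i, j], j]

# Matches torch.gather, which the unit test in this PR uses as the reference.
assert torch.equal(out, torch.gather(src, axis, idx))
```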

//
// Print Op
//
4 changes: 4 additions & 0 deletions lib/Analysis/Allocation.cpp
@@ -125,6 +125,10 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
ScanLoweringHelper helper(scanOp);
return helper.getScratchSizeInBytes();
}
if (auto gatherOp = dyn_cast<GatherOp>(op)) {
GatherLoweringHelper helper(gatherOp);
return helper.getScratchSizeInBytes();
}
if (auto histogram = dyn_cast<HistogramOp>(op)) {
auto dstTy = histogram.getType();
int threadsPerWarp = gpu::TritonGPUDialect::getThreadsPerWarp(
11 changes: 11 additions & 0 deletions lib/Analysis/Utility.cpp
@@ -408,6 +408,17 @@ unsigned ScanLoweringHelper::getAxisBlockStride() {
llvm_unreachable("Axis not found in order");
}

GatherLoweringHelper::GatherLoweringHelper(triton::GatherOp gatherOp)
: gatherOp(gatherOp) {}

unsigned GatherLoweringHelper::getScratchSizeInBytes() {
// For now, lower the gather op by writing the source tensor to shared memory.
// TODO(jeff): Leverage locality to avoid using scratch space when possible.
RankedTensorType srcType = gatherOp.getSrc().getType();
return product(srcType.getShape()) *
ceil<unsigned>(srcType.getElementTypeBitWidth(), 8);
}
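A quick worked example of the formula above, with an arbitrarily chosen 128x64 source tensor of 16-bit elements (numbers are illustrative, not from the PR):

```python
# Worked example of the scratch-size formula: every element of the source tensor
# is staged in shared memory, with the element size rounded up to whole bytes.
import math

src_shape = (128, 64)          # srcType.getShape()
element_bit_width = 16         # e.g. f16

num_elements = math.prod(src_shape)                  # product(srcType.getShape())
bytes_per_element = math.ceil(element_bit_width / 8)  # ceil<unsigned>(bitwidth, 8)
scratch_size_in_bytes = num_elements * bytes_per_element
assert scratch_size_in_bytes == 16384
```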

unsigned getNumScratchElements(ArrayRef<unsigned> shape) {
if (shape.empty())
return 0;
1 change: 1 addition & 0 deletions lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
@@ -13,6 +13,7 @@ add_triton_library(TritonGPUToLLVM
AllocateSharedMemory.cpp
ReduceOpToLLVM.cpp
ScanOpToLLVM.cpp
GatherOpToLLVM.cpp
ConvertLayoutOpToLLVM.cpp
ControlFlowOpToLLVM.cpp
FuncOpToLLVM.cpp
109 changes: 109 additions & 0 deletions lib/Conversion/TritonGPUToLLVM/GatherOpToLLVM.cpp
@@ -0,0 +1,109 @@
#include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"

using namespace mlir;
using namespace mlir::triton;

namespace {
class GatherOpConversion : public ConvertOpToLLVMPattern<GatherOp> {
public:
GatherOpConversion(LLVMTypeConverter &typeConverter,
const TargetInfoBase &targetInfo, PatternBenefit benefit)
: ConvertOpToLLVMPattern(typeConverter, benefit), targetInfo(targetInfo) {
}

LogicalResult
matchAndRewrite(GatherOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;

private:
const TargetInfoBase &targetInfo;
};

LogicalResult
GatherOpConversion::matchAndRewrite(GatherOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
RankedTensorType srcType = op.getSrc().getType();

// Compute the src subtensor shape owned by this CTA.
SmallVector<unsigned> srcShapePerCTA =
convertType<unsigned>(triton::gpu::getShapePerCTA(srcType));

// Grab the src values in this thread.
SmallVector<Value> srcValues =
unpackLLElements(loc, adaptor.getSrc(), rewriter);

// Emit the indices of the src values owned by this thread.
SmallVector<SmallVector<Value>> srcIndices =
emitIndices(loc, rewriter, targetInfo, srcType.getEncoding(),
op.getSrc().getType(), /*withCTAOffset=*/true);

  // Store the src values owned by the thread into their respective locations in
// the scratch memory.
assert(srcValues.size() == srcIndices.size());

// Get the base pointer to the scratch memory.
Value smemBase = LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, op);

// For each src element owned by the thread, index into the scratch memory and
// then store it.
Type elemType = getTypeConverter()->convertType(srcType.getElementType());
for (auto [value, indices] : llvm::zip(srcValues, srcIndices)) {
// Convert the index at each dim into a single offset given the shape of the
// tensor.
Value offset = LLVM::linearize(rewriter, loc, indices, srcShapePerCTA);
    // Compute the pointer into shared memory from the offset and store the value.
Value ptr = gep(smemBase.getType(), elemType, smemBase, offset);
store(value, ptr);
[Review thread on this line]

Contributor: Can be left as a TODO, but we should do a masked store with getRedundantDataMask:

    Value getRedundantDataMask(ModuleOp moduleOp, Type valueTy,

(TBH I think there are quite a few places where we need to clean up redundant operations.)

Collaborator: I don't think a masked store is better; in general there isn't a performance penalty for storing multiple times to the same address in shared memory. Using a store without a mask allows better code generation in general.

Contributor: > in general there isn't a performance penalty for storing multiple times to the same address in shared memory.

Would I be right to say this is only true if there are no bank conflicts?

Collaborator: No, as far as I know it is orthogonal to bank conflicts.

Contributor: I'm confused. Are you saying that @p st.shared where p is false on all threads in the warp can still take multiple cycles?

Collaborator: Ah no, that's not what I meant. I meant that when doing a st.shared with duplicated addresses there isn't a penalty for storing multiple times; the HW will pick one of them and store only once.

Contributor: Yes, I agree that at the single-instruction level redundancy is okay. I guess that could be expressed by ignoring redundancy within a warp.

The problematic cases are:

  • within a thread, i.e. calling st.shared more times than necessary;
  • between warps, i.e. multiple warps calling st.shared to transfer the same data.

Collaborator: Ah, fair point. I do wonder if it is always better to predicate, as redundant data should not be the common case and there are some downsides to using predicates. The main downside is that it prevents the backend from using a larger element bitwidth. Might be worth measuring at some point.

Contributor: Fair enough. I'll note that for thread-level redundancy we shouldn't actually need a masked store, though; we could just not emit the instructions.

Collaborator: True. Also, another problem I remember running into with predicates (although not this exact case) is that they tend to throw off ptxas a lot in terms of scheduling and live ranges. This applies more to loads than to stores (the liveness problem doesn't exist with loads; the scheduling I'm not sure), but it's one thing to keep in mind.
}

// Synchronize the whole CTA.
// TODO(jeff): Should we teach Membar that gather synchronizes?
barrier();

// Grab the index values owned by this thread.
SmallVector<Value> idxValues =
unpackLLElements(loc, adaptor.getIndices(), rewriter);

// I = LL(pid)
// idx = indices[I]
// I_gather = [I[d] if d != axis else idx for d in range(len(I))]
// out[I] = src[I_gather]
RankedTensorType dstType = op.getType();
SmallVector<SmallVector<Value>> dstIndices =
emitIndices(loc, rewriter, targetInfo, dstType.getEncoding(), dstType,
/*withCTAOffset=*/true);

unsigned idxWidth = op.getIndices().getType().getElementTypeBitWidth();
unsigned axis = op.getAxis();
SmallVector<Value> results(dstIndices.size());
for (auto [i, idx, indices] : llvm::enumerate(idxValues, dstIndices)) {
// The LL index computations are performed with 32 bit integers. If the
// indices are something else, cast them to i32.
if (idxWidth > 32) {
idx = trunc(i32_ty, idx);
} else if (idxWidth < 32) {
// Negative indices don't make sense, so zero-extend.
idx = zext(i32_ty, idx);
}
indices[axis] = idx;
Value offset = LLVM::linearize(rewriter, loc, indices, srcShapePerCTA);
Value ptr = gep(smemBase.getType(), elemType, smemBase, offset);
results[i] = load(elemType, ptr);
}

Value packed =
packLLElements(loc, getTypeConverter(), results, rewriter, dstType);
rewriter.replaceOp(op, packed);
return success();
}

} // namespace

void triton::populateGatherOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
RewritePatternSet &patterns,
const TargetInfoBase &targetInfo,
PatternBenefit benefit) {
patterns.insert<GatherOpConversion>(typeConverter, targetInfo, benefit);
}
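For readers following the conversion above, the NumPy sketch below models the same strategy on the host (illustrative only, not the PR's code): write every source element to a flat scratch buffer at its linearized offset, synchronize, then have each output element read back from the coordinate whose gather-axis component is replaced by the index value.

```python
# A rough host-side model of the naive shared-memory lowering (illustration only).
import numpy as np

def naive_gather_via_scratch(src: np.ndarray, indices: np.ndarray, axis: int) -> np.ndarray:
    shape = src.shape
    # Store phase: each source element goes to scratch at its linearized offset
    # (the role LLVM::linearize plays in the conversion above).
    scratch = np.empty(src.size, dtype=src.dtype)
    for coord in np.ndindex(*shape):
        scratch[np.ravel_multi_index(coord, shape)] = src[coord]

    # On the GPU, a CTA-wide barrier separates the store and load phases here.

    # Load phase: replace the coordinate along `axis` with the gathered index,
    # linearize it, and read the element back from scratch.
    out = np.empty(indices.shape, dtype=src.dtype)
    for coord in np.ndindex(*indices.shape):
        gather_coord = list(coord)
        gather_coord[axis] = int(indices[coord])
        out[coord] = scratch[np.ravel_multi_index(tuple(gather_coord), shape)]
    return out
```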
@@ -537,6 +537,7 @@ void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
GenericOpPattern<triton::MakeRangeOp>, TritonExpandDimsPattern,
TritonTransPattern, TritonDotPattern, GenericOpPattern<triton::LoadOp>,
GenericOpPattern<triton::StoreOp>, GenericOpPattern<triton::HistogramOp>,
GenericOpPattern<triton::GatherOp>,
GenericOpPattern<triton::ExternElementwiseOp>,
GenericOpPattern<triton::PrintOp>, GenericOpPattern<triton::AssertOp>,
GenericOpPattern<triton::AtomicCASOp>,
49 changes: 49 additions & 0 deletions lib/Dialect/Triton/IR/Ops.cpp
@@ -1073,6 +1073,55 @@ Speculation::Speculatability ExternElementwiseOp::getSpeculatability() {
return Speculation::NotSpeculatable;
}

// -- GatherOp --
LogicalResult GatherOp::verify() {
RankedTensorType indicesTy = getIndices().getType();
RankedTensorType srcTy = getSrc().getType();
RankedTensorType resTy = getResult().getType();

if (indicesTy.getShape() != resTy.getShape()) {
return emitOpError("indices and output shapes must match");
}
if (indicesTy.getEncoding() != resTy.getEncoding()) {
return emitOpError("indices and output encodings must match");
}
if (srcTy.getElementType() != resTy.getElementType()) {
return emitOpError("input and output element types must match");
}
if (srcTy.getRank() != indicesTy.getRank()) {
return emitOpError("input and indices ranks must match");
}
if (getAxis() >= srcTy.getRank()) {
return emitOpError("gather dimension must be less than the input rank");
}
for (int dim = 0; dim < indicesTy.getRank(); ++dim) {
if (dim == getAxis())
continue;
if (indicesTy.getShape()[dim] > srcTy.getShape()[dim]) {
return emitOpError("indices dimension ")
<< dim
<< " cannot be greater than the corresponding input dimension";
}
}

return success();
}

LogicalResult GatherOp::inferReturnTypes(
MLIRContext *context, std::optional<Location> location, ValueRange operands,
DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions,
SmallVectorImpl<Type> &inferredReturnTypes) {
GatherOpAdaptor adaptor(operands, attributes, properties, regions);
auto indicesType = cast<RankedTensorType>(adaptor.getIndices().getType());
auto srcType = cast<RankedTensorType>(adaptor.getSrc().getType());

// Shape and encoding of the indices with the element type of the src.
inferredReturnTypes.push_back(
RankedTensorType::get(indicesType.getShape(), srcType.getElementType(),
indicesType.getEncoding()));
return success();
}

// -- ExperimentalTensormapCreateOp --
LogicalResult ExperimentalTensormapCreateOp::verify() {
auto rank = getBoxDim().size();
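The inferred return type can be summarized with made-up shapes (a sketch, not code from the PR): the result borrows its shape and encoding from the indices tensor and its element type from the source tensor.

```python
# Sketch of the return-type rule above with illustrative shapes and dtypes.
src_shape, src_dtype = (128, 64), "f32"
indices_shape, indices_dtype = (256, 64), "i32"

result_shape, result_dtype = indices_shape, src_dtype   # shape/encoding from indices, dtype from src
assert (result_shape, result_dtype) == ((256, 64), "f32")
```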
@@ -282,6 +282,7 @@ SmallVector<Value> LayoutPropagation::propagateToUsers(Value value,
setEncoding(user->getResults(), info, changed, user);
continue;
}
// TODO(jeff): Propagate tt.gather indices layout to dst.
}
return changed;
}
@@ -709,6 +710,7 @@ Operation *LayoutPropagation::rewriteOp(Operation *op) {
}
return newOp;
}
// TODO(jeff): Handle tt.gather once it supports layout propagation.
llvm::report_fatal_error("unexpected op in rewrite");
return nullptr;
}
3 changes: 3 additions & 0 deletions lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -472,6 +472,8 @@ std::optional<Attribute> inferSrcEncoding(Operation *op, Attribute encoding) {
return inferSrcEncoding(trans, encoding);
if (auto reshape = dyn_cast<triton::ReshapeOp>(op))
return inferSrcEncoding(reshape, encoding);
// TODO(jeff): Handle propagating tt.gather indices -> dst layout.
// This requires updating the API to specify the exact operands and results.

return std::nullopt;
}
@@ -499,6 +501,7 @@ std::optional<Attribute> inferDstEncoding(Operation *op, Attribute encoding) {
return inferDstEncoding(trans, encoding);
if (auto reshape = dyn_cast<triton::ReshapeOp>(op))
return inferDstEncoding(reshape, encoding);
// TODO(jeff): Handle propagating tt.gather indices -> dst layout.

return std::nullopt;
}
3 changes: 3 additions & 0 deletions python/src/ir.cc
@@ -1625,6 +1625,9 @@ void init_triton_ir(py::module &&m) {
IntegerType::get(operand.getContext(), 32)),
operand);
})
.def("create_gather",
[](TritonOpBuilder &self, Value src, Value indices, int axis)
-> Value { return self.create<GatherOp>(src, indices, axis); })
// Force GPU barrier
.def("create_barrier",
[](TritonOpBuilder &self) { self.create<mlir::gpu::BarrierOp>(); })
40 changes: 40 additions & 0 deletions python/test/unit/language/test_core.py
@@ -6087,3 +6087,43 @@ def kernel(In, Out, #
perm[0], perm[1], perm[2], perm[3], perm[4], red_dims[0], red_dims[1], red_dims[2])

assert torch.all(ref == result)


@pytest.mark.parametrize("src_shape, indices_shape, axis", [
([4, 4], [8, 2], 0),
([128, 64], [256, 32], 0),
([128, 64], [128, 128], 1),
])
def test_gather(src_shape, indices_shape, axis):

@triton.jit
def gather_kernel(src_ptr, idx_ptr, out_ptr, axis: tl.constexpr, src_dim0: tl.constexpr, src_dim1: tl.constexpr,
src_stride0: tl.constexpr, src_stride1: tl.constexpr, idx_dim0: tl.constexpr,
idx_dim1: tl.constexpr, idx_stride0: tl.constexpr, idx_stride1: tl.constexpr,
out_dim0: tl.constexpr, out_dim1: tl.constexpr, out_stride0: tl.constexpr,
out_stride1: tl.constexpr):
src_offs = (tl.arange(0, src_dim0)[:, None] * src_stride0 + tl.arange(0, src_dim1)[None, :] * src_stride1)
src = tl.load(src_ptr + src_offs)

idx_offs = (tl.arange(0, idx_dim0)[:, None] * idx_stride0 + tl.arange(0, idx_dim1)[None, :] * idx_stride1)
idx = tl.load(idx_ptr + idx_offs)

out = tl.gather(src, idx, axis)

out_offs = (tl.arange(0, out_dim0)[:, None] * out_stride0 + tl.arange(0, out_dim1)[None, :] * out_stride1)
tl.store(out_ptr + out_offs, out)

def triton_gather(src: torch.Tensor, axis: int, indices: torch.Tensor):
output = torch.empty(indices.shape, dtype=src.dtype, device=src.device)

gather_kernel[(1, )](src, indices, output, axis, src.shape[0], src.shape[1],
src.stride(0), src.stride(1), indices.shape[0], indices.shape[1], indices.stride(0),
indices.stride(1), output.shape[0], output.shape[1], output.stride(0), output.stride(1))

return output

src = torch.randn(src_shape, device='cuda')
indices = torch.randint(0, src.shape[axis], indices_shape, device='cuda')
ref = torch.gather(src, axis, indices)
result = triton_gather(src, axis, indices)
assert torch.all(ref == result)
2 changes: 2 additions & 0 deletions python/triton/language/__init__.py
@@ -70,6 +70,7 @@
float8e5b16,
full,
function_type,
gather,
histogram,
inline_asm_elementwise,
int1,
@@ -188,6 +189,7 @@
"fma",
"full",
"function_type",
"gather",
"histogram",
"inline_asm_elementwise",
"interleave",