diff --git a/.github/workflows/tpp-llvm.yml b/.github/workflows/tpp-llvm.yml
index 431a4c2b2..c12c0bfaa 100644
--- a/.github/workflows/tpp-llvm.yml
+++ b/.github/workflows/tpp-llvm.yml
@@ -26,7 +26,7 @@ jobs:
     - uses: actions/checkout@v4
     - name: LLVM CUDA
       run: |-
-        scripts/buildkite/check_llvm.sh || \
+        GPU=cuda scripts/buildkite/check_llvm.sh || \
         ${{ env.SRUN }} --partition=a100,v100 --time=0:30:00 -- \
         'KIND=RelWithDebInfo COMPILER=clang GPU=cuda \
         ${{ github.workspace }}/scripts/buildkite/build_llvm.sh'
@@ -37,8 +37,7 @@ jobs:
     - uses: actions/checkout@v4
     - name: LLVM Vulkan
       run: |-
-        scripts/buildkite/check_llvm.sh || \
+        GPU=vulkan scripts/buildkite/check_llvm.sh || \
         ${{ env.SRUN }} --partition=spr-all --time=0:30:00 -- \
         'KIND=RelWithDebInfo COMPILER=clang GPU=vulkan \
         ${{ github.workspace }}/scripts/buildkite/build_llvm.sh'
-
diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt
index 385d4451a..ef1e144e5 100644
--- a/build_tools/llvm_version.txt
+++ b/build_tools/llvm_version.txt
@@ -1 +1 @@
-1846523bb77275de954ac573110171bd39bfa930
+2641ed7d26198557a63ce447a0584a96494a5ac4
diff --git a/lib/TPP/GPU/LinalgToXeGPU.cpp b/lib/TPP/GPU/LinalgToXeGPU.cpp
index 983c04728..33bf2e470 100644
--- a/lib/TPP/GPU/LinalgToXeGPU.cpp
+++ b/lib/TPP/GPU/LinalgToXeGPU.cpp
@@ -344,7 +344,7 @@ static std::optional<Value> lowerEltwiseOp(linalg::LinalgOp linalgOp,
             // Unhandled type. Bail out.
             return std::nullopt;
           })
-          .Case([&](linalg::NegfOp negfOp) -> std::optional<Value> {
+          .Case([&](linalg::NegFOp negfOp) -> std::optional<Value> {
             assert(operands.size() == 1 && "Invalid number of operands for negf");
             return rewriter.create<arith::NegFOp>(loc, resType, operands[0])
                 .getResult();
@@ -724,9 +724,9 @@ loadNdDescTiles(PatternRewriter &rewriter, Location loc, ValueRange loadTiles,
   VectorType vecLoadType =
       VectorType::get(tileType.getShape(), tileType.getElementType());
-  IntegerAttr vnniAxisAttr = nullptr;
+  UnitAttr vnniPackedAttr = nullptr;
   if (vnniConf) {
-    vnniAxisAttr = IntegerAttr::get(rewriter.getI64Type(), vnniConf->vnniAxis);
+    vnniPackedAttr = rewriter.getUnitAttr();
     vecLoadType = getVnniVector(tileType.getShape(), tileType.getElementType(),
                                 *vnniConf);
   }
@@ -734,7 +734,7 @@ loadNdDescTiles(PatternRewriter &rewriter, Location loc, ValueRange loadTiles,
   SmallVector<Value> loadVec;
   for (auto tile : loadTiles) {
     auto loadOp = rewriter.create<xegpu::LoadNdOp>(
-        loc, vecLoadType, tile, vnniAxisAttr, transpose,
+        loc, vecLoadType, tile, vnniPackedAttr, transpose,
         /*l1_hint=*/hint,
         /*l2_hint=*/hint, /*l3_hint=*/hint);
     loadVec.push_back(loadOp);
@@ -1043,12 +1043,11 @@ static LogicalResult createDPASKernel(linalg::LinalgOp linalgOp,
   if (vnniFactor == -1)
     return failure();
 
-  VnniConfig vnniConfA{.vnniFactor = vnniFactor, .vnniAxis = 1};
   VnniConfig vnniConfB{.vnniFactor = vnniFactor, .vnniAxis = 0};
 
   // Load A sub-tiles.
-  SmallVector<Value> loadVecA =
-      loadNdDescTiles(rewriter, loc, tilesA, readCacheHint, vnniConfA);
+  SmallVector<Value> loadVecA = loadNdDescTiles(
+      rewriter, loc, tilesA, readCacheHint, /*vnniConf=*/std::nullopt);
   auto tileTypeA = cast<xegpu::TensorDescType>(tilesA[0].getType());
 
   // Load B sub-tiles.
@@ -1077,9 +1076,9 @@ static LogicalResult createDPASKernel(linalg::LinalgOp linalgOp,
   }
 
   // Extract DPAS tiles from loaded sub-tiles.
-  TilesArray dpasVecA = extractVecSubTiles(rewriter, loc, loadVecA,
-                                           {dimM, kTile}, tileTypeA.getShape(),
-                                           {dpasTileM, dpasTileK}, vnniConfA);
+  TilesArray dpasVecA = extractVecSubTiles(
+      rewriter, loc, loadVecA, {dimM, kTile}, tileTypeA.getShape(),
+      {dpasTileM, dpasTileK}, /*vnniConf=*/std::nullopt);
   TilesArray dpasVecB = extractVecSubTiles(rewriter, loc, loadVecB,
                                            {kTile, dimN}, tileTypeB.getShape(),
                                            {dpasTileK, dpasTileN}, vnniConfB);
@@ -1378,7 +1377,7 @@ void populateLinalgEltwiseToXeGPUPatterns(RewritePatternSet &patterns,
                ConvertNamedEltwiseToXeGPU<linalg::FloorOp>,
                ConvertNamedEltwiseToXeGPU<linalg::MaxOp>,
                ConvertNamedEltwiseToXeGPU<linalg::MulOp>,
-               ConvertNamedEltwiseToXeGPU<linalg::NegfOp>,
+               ConvertNamedEltwiseToXeGPU<linalg::NegFOp>,
                ConvertNamedEltwiseToXeGPU<linalg::SubOp>>(patterns.getContext(),
                                                           options);
 }
diff --git a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
index bc678c07b..bb13c16a5 100644
--- a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
+++ b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
@@ -652,7 +652,7 @@ struct PropagatePackUnPack
     MLIRContext *ctx = getOperation().getContext();
     RewritePatternSet patterns(ctx);
     linalg::populateDataLayoutPropagationPatterns(
-        patterns, [](Operation *op) { return true; });
+        patterns, [](OpOperand *operand) { return true; });
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
   }
 };
@@ -813,7 +813,9 @@ void mlir::tpp::populateSimplifyPacking(RewritePatternSet &patterns) {
   // Propagate packs/unpacks only through expand shapes at this point.
   // This captures the transformation scope of the replaced downstream pass.
   linalg::populateDataLayoutPropagationPatterns(
-      patterns, [](Operation *op) { return isa<tensor::ExpandShapeOp>(op); });
+      patterns, [](OpOperand *operand) {
+        return isa<tensor::ExpandShapeOp>(operand->get().getDefiningOp());
+      });
   ctx->getLoadedDialect<tensor::TensorDialect>()->getCanonicalizationPatterns(
       patterns);
   patterns.add(ctx);
diff --git a/scripts/buildkite/build_llvm.sh b/scripts/buildkite/build_llvm.sh
index 968987945..864bcb195 100755
--- a/scripts/buildkite/build_llvm.sh
+++ b/scripts/buildkite/build_llvm.sh
@@ -33,7 +33,7 @@ fi
 
 # Destination for tar balls
 if [ ! "${LLVM_TAR_DIR}" ]; then
-  LLVM_TAR_DIR="/tmp/tpp-llvm-tar"
+  LLVM_TAR_DIR="/scratch/tpp-llvm-tar"
 fi
 LLVM_TAR_DIR=$(add_device_extensions ${LLVM_TAR_DIR} ${GPU})
 mkdir -p ${LLVM_TAR_DIR}
@@ -90,8 +90,9 @@ fi
 check_program ${LINKER}
 
 if [ ! "${LLVM_BUILD_DIR}" ]; then
-  LLVM_BUILD_DIR="/tmp/tpp-llvm"
+  LLVM_BUILD_DIR="/scratch/tpp-llvm"
 fi
+LLVM_BUILD_DIR=$(add_device_extensions ${LLVM_BUILD_DIR} ${GPU})
 LLVM_BUILD_DIR=$(realpath ${LLVM_BUILD_DIR})
 LLVM_BUILD_DIR=${LLVM_BUILD_DIR:-build-${COMPILER}}
 mkdir -p ${LLVM_BUILD_DIR}
@@ -140,6 +141,7 @@ echo "--- BUILD"
 echo_run ninja -C ${LLVM_BUILD_DIR} all
 if [ $? != 0 ]; then
   rm -r ${LLVM_INSTALL_DIR}
+  rm -r ${LLVM_BUILD_DIR}
   exit 1
 fi
@@ -148,14 +150,20 @@ echo "--- CHECK"
 echo_run ninja -C ${LLVM_BUILD_DIR} check-all
 if [ $? != 0 ]; then
   rm -r ${LLVM_INSTALL_DIR}
+  rm -r ${LLVM_BUILD_DIR}
   exit 1
 fi
 
-  # Install LLVM
-  echo "--- INSTALL"
-  mkdir -p ${LLVM_INSTALL_DIR}
-  echo_run ninja -C ${LLVM_BUILD_DIR} install
+# Install LLVM
+echo "--- INSTALL"
+mkdir -p ${LLVM_INSTALL_DIR}
+echo_run ninja -C ${LLVM_BUILD_DIR} install
 if [ $? != 0 ]; then
   rm -r ${LLVM_INSTALL_DIR}
+  rm -r ${LLVM_BUILD_DIR}
   exit 1
 fi
+
+# Cleanup
+echo "--- CLEANUP"
+rm -r ${LLVM_BUILD_DIR}
diff --git a/test/GPU/linalg-to-xegpu-dpas.mlir b/test/GPU/linalg-to-xegpu-dpas.mlir
index a64a93f7a..cd976e64f 100644
--- a/test/GPU/linalg-to-xegpu-dpas.mlir
+++ b/test/GPU/linalg-to-xegpu-dpas.mlir
@@ -63,9 +63,9 @@ func.func @matmul(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>, %arg2: mem
 
 // Extract DPAS-sized chunks from larger loaded tile A.
 // Tile B is already in the correct shape.
-// CHECK: %[[vA_flat:.+]] = vector.shape_cast %[[vA]] : vector<32x8x2xf16> to vector<512xf16>
+// CHECK: %[[vA_flat:.+]] = vector.shape_cast %[[vA]] : vector<32x16xf16> to vector<512xf16>
 // CHECK: %[[vA_dpas_flat:.+]] = vector.extract_strided_slice{{.*}}: vector<512xf16> to vector<128xf16>
-// CHECK: %[[vA_dpas:.+]] = vector.shape_cast %[[vA_dpas_flat]] : vector<128xf16> to vector<8x8x2xf16>
+// CHECK: %[[vA_dpas:.+]] = vector.shape_cast %[[vA_dpas_flat]] : vector<128xf16> to vector<8x16xf16>
 // CHECK-COUNT-3: vector.extract_strided_slice
 
 // Perform DPAS computation.
diff --git a/test/Passes/tile-and-fuse-default.mlir b/test/Passes/tile-and-fuse-default.mlir
index d1e5f1079..bb4f9dfd9 100644
--- a/test/Passes/tile-and-fuse-default.mlir
+++ b/test/Passes/tile-and-fuse-default.mlir
@@ -595,7 +595,7 @@ func.func @check_tile_propagation_to_eltwise_consumer(%arg0: tensor<2x2x2x4xf32>
 // CHECK-LABEL: check_tile_propagation_to_eltwise_consumer
 // CHECK-SAME: %[[ARG0:.+]]: tensor<2x2x2x4xf32>, %[[ARG1:.+]]: tensor<2x4x8x2xf32>,
 // CHECK-SAME: %[[ARG2:.+]]: tensor<2x2x8x2xf32>, %[[ARG3:.+]]: tensor<2x2x8x2xf32>
-// CHECK: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
 // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
diff --git a/test/Passes/tile-and-fuse.mlir b/test/Passes/tile-and-fuse.mlir
index 7a95761d1..daf9ad426 100644
--- a/test/Passes/tile-and-fuse.mlir
+++ b/test/Passes/tile-and-fuse.mlir
@@ -342,7 +342,7 @@ func.func @mlp(%arg0: tensor<8x112x32x32xbf16>, %arg1: tensor<112x112x32x32xbf16
     %max = arith.maximumf %in, %cst : bf16
     linalg.yield %max : bf16
   } -> tensor<8x112x32x32xbf16>
-  // CHECK: %[[C112:.+]] = arith.constant 112 : index
+  // CHECK-DAG: %[[C112:.+]] = arith.constant 112 : index
   // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
   // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
   // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
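
Note: the ToBlockLayoutAndBack.cpp hunks above adapt to an upstream MLIR API change in this LLVM bump: linalg::populateDataLayoutPropagationPatterns now takes its control callback as a predicate over OpOperand* instead of Operation*. The following is a minimal standalone sketch of such a callback, not part of the patch; the helper name is hypothetical, and it adds a null check that the patch's lambda omits (block arguments have no defining op, and isa<> asserts on null).

#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

// Restrict pack/unpack propagation to operands produced by tensor.expand_shape,
// mirroring the filter used in populateSimplifyPacking above.
static void addExpandShapeOnlyPropagation(RewritePatternSet &patterns) {
  linalg::populateDataLayoutPropagationPatterns(
      patterns, [](OpOperand *operand) {
        // The callback now receives the operand being propagated through, so
        // the producer must be fetched explicitly; guard against block
        // arguments, whose defining op is null.
        Operation *producer = operand->get().getDefiningOp();
        return producer && isa<tensor::ExpandShapeOp>(producer);
      });
}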