From 9c53a05fb56128cda8869a563b999e90b31f2e17 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk
Date: Tue, 20 Aug 2024 20:58:01 +0200
Subject: [PATCH] Bump LLVM (#957)

Fixes upstream API changes and improves test checks. Improves the CI
build scripts and their recovery from LLVM build failures. Moves the
build directory to scratch storage to prevent running out of local
disk space during concurrent builds.
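
One upstream change worth spelling out: the control callback for
linalg data-layout propagation now receives the consumed operand
instead of the candidate operation. A minimal sketch of the
adaptation (simplified; patterns is the RewritePatternSet used at
the call sites in this patch):

    // Before: the control function was queried per operation.
    linalg::populateDataLayoutPropagationPatterns(
        patterns, [](Operation *op) { return true; });

    // After: it is queried per OpOperand being propagated through.
    linalg::populateDataLayoutPropagationPatterns(
        patterns, [](OpOperand *operand) { return true; });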
---
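The XeGPU changes follow the upstream switch of xegpu.load_nd from an
integer vnni_axis attribute to a unit packed attribute; A tiles are
now loaded in plain (non-VNNI) layout and only B keeps a VNNI
configuration. A simplified sketch of the adapted builder call, with
vecLoadType, tile, transpose, and hint taken from the surrounding
code in the diff below:

    // Before: the VNNI axis was passed as an i64 attribute.
    IntegerAttr vnniAxisAttr =
        IntegerAttr::get(rewriter.getI64Type(), vnniConf->vnniAxis);

    // After: a unit attribute marks the packed (VNNI) load layout.
    UnitAttr vnniPackedAttr = rewriter.getUnitAttr();
    auto loadOp = rewriter.create<xegpu::LoadNdOp>(
        loc, vecLoadType, tile, vnniPackedAttr, transpose,
        /*l1_hint=*/hint, /*l2_hint=*/hint, /*l3_hint=*/hint);
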
 .github/workflows/tpp-llvm.yml              |  5 ++---
 build_tools/llvm_version.txt                |  2 +-
 lib/TPP/GPU/LinalgToXeGPU.cpp               | 21 ++++++++++-----------
 lib/TPP/Transforms/ToBlockLayoutAndBack.cpp |  6 ++++--
 scripts/buildkite/build_llvm.sh             | 20 ++++++++++++++------
 test/GPU/linalg-to-xegpu-dpas.mlir          |  4 ++--
 test/Passes/tile-and-fuse-default.mlir      |  2 +-
 test/Passes/tile-and-fuse.mlir              |  2 +-
 8 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/tpp-llvm.yml b/.github/workflows/tpp-llvm.yml
index 431a4c2b2..c12c0bfaa 100644
--- a/.github/workflows/tpp-llvm.yml
+++ b/.github/workflows/tpp-llvm.yml
@@ -26,7 +26,7 @@ jobs:
     - uses: actions/checkout@v4
     - name: LLVM CUDA
       run: |-
-        scripts/buildkite/check_llvm.sh || \
+        GPU=cuda scripts/buildkite/check_llvm.sh || \
         ${{ env.SRUN }} --partition=a100,v100 --time=0:30:00 -- \
         'KIND=RelWithDebInfo COMPILER=clang GPU=cuda \
         ${{ github.workspace }}/scripts/buildkite/build_llvm.sh'
@@ -37,8 +37,7 @@ jobs:
     - uses: actions/checkout@v4
     - name: LLVM Vulkan
       run: |-
-        scripts/buildkite/check_llvm.sh || \
+        GPU=vulkan scripts/buildkite/check_llvm.sh || \
         ${{ env.SRUN }} --partition=spr-all --time=0:30:00 -- \
         'KIND=RelWithDebInfo COMPILER=clang GPU=vulkan \
         ${{ github.workspace }}/scripts/buildkite/build_llvm.sh'
-
diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt
index 385d4451a..ef1e144e5 100644
--- a/build_tools/llvm_version.txt
+++ b/build_tools/llvm_version.txt
@@ -1 +1 @@
-1846523bb77275de954ac573110171bd39bfa930
+2641ed7d26198557a63ce447a0584a96494a5ac4
diff --git a/lib/TPP/GPU/LinalgToXeGPU.cpp b/lib/TPP/GPU/LinalgToXeGPU.cpp
index 983c04728..33bf2e470 100644
--- a/lib/TPP/GPU/LinalgToXeGPU.cpp
+++ b/lib/TPP/GPU/LinalgToXeGPU.cpp
@@ -344,7 +344,7 @@ static std::optional<Value> lowerEltwiseOp(linalg::LinalgOp linalgOp,
         // Unhandled type. Bail out.
         return std::nullopt;
       })
-      .Case([&](linalg::NegfOp negfOp) -> std::optional<Value> {
+      .Case([&](linalg::NegFOp negfOp) -> std::optional<Value> {
         assert(operands.size() == 1 && "Invalid number of operands for negf");
         return rewriter.create<arith::NegFOp>(loc, resType, operands[0])
             .getResult();
@@ -724,9 +724,9 @@ loadNdDescTiles(PatternRewriter &rewriter, Location loc, ValueRange loadTiles,
   VectorType vecLoadType =
       VectorType::get(tileType.getShape(), tileType.getElementType());
-  IntegerAttr vnniAxisAttr = nullptr;
+  UnitAttr vnniPackedAttr = nullptr;
   if (vnniConf) {
-    vnniAxisAttr = IntegerAttr::get(rewriter.getI64Type(), vnniConf->vnniAxis);
+    vnniPackedAttr = rewriter.getUnitAttr();
     vecLoadType = getVnniVector(tileType.getShape(), tileType.getElementType(),
                                 *vnniConf);
   }
 
@@ -734,7 +734,7 @@ loadNdDescTiles(PatternRewriter &rewriter, Location loc, ValueRange loadTiles,
   SmallVector<Value> loadVec;
   for (auto tile : loadTiles) {
     auto loadOp = rewriter.create<xegpu::LoadNdOp>(
-        loc, vecLoadType, tile, vnniAxisAttr, transpose,
+        loc, vecLoadType, tile, vnniPackedAttr, transpose,
         /*l1_hint=*/hint,
         /*l2_hint=*/hint, /*l3_hint=*/hint);
     loadVec.push_back(loadOp);
@@ -1043,12 +1043,11 @@ static LogicalResult createDPASKernel(linalg::LinalgOp linalgOp,
   if (vnniFactor == -1)
     return failure();
 
-  VnniConfig vnniConfA{.vnniFactor = vnniFactor, .vnniAxis = 1};
   VnniConfig vnniConfB{.vnniFactor = vnniFactor, .vnniAxis = 0};
 
   // Load A sub-tiles.
-  SmallVector<Value> loadVecA =
-      loadNdDescTiles(rewriter, loc, tilesA, readCacheHint, vnniConfA);
+  SmallVector<Value> loadVecA = loadNdDescTiles(
+      rewriter, loc, tilesA, readCacheHint, /*vnniConf=*/std::nullopt);
   auto tileTypeA = cast<xegpu::TensorDescType>(tilesA[0].getType());
 
   // Load B sub-tiles.
@@ -1077,9 +1076,9 @@ static LogicalResult createDPASKernel(linalg::LinalgOp linalgOp,
   }
 
   // Extract DPAS tiles from loaded sub-tiles.
-  TilesArray dpasVecA = extractVecSubTiles(rewriter, loc, loadVecA,
-                                           {dimM, kTile}, tileTypeA.getShape(),
-                                           {dpasTileM, dpasTileK}, vnniConfA);
+  TilesArray dpasVecA = extractVecSubTiles(
+      rewriter, loc, loadVecA, {dimM, kTile}, tileTypeA.getShape(),
+      {dpasTileM, dpasTileK}, /*vnniConf=*/std::nullopt);
   TilesArray dpasVecB = extractVecSubTiles(rewriter, loc, loadVecB,
                                            {kTile, dimN}, tileTypeB.getShape(),
                                            {dpasTileK, dpasTileN}, vnniConfB);
@@ -1378,7 +1377,7 @@ void populateLinalgEltwiseToXeGPUPatterns(RewritePatternSet &patterns,
                ConvertNamedEltwiseToXeGPU<linalg::FloorOp>,
                ConvertNamedEltwiseToXeGPU<linalg::MaxOp>,
                ConvertNamedEltwiseToXeGPU<linalg::MulOp>,
-               ConvertNamedEltwiseToXeGPU<linalg::NegfOp>,
+               ConvertNamedEltwiseToXeGPU<linalg::NegFOp>,
                ConvertNamedEltwiseToXeGPU<linalg::SubOp>>(patterns.getContext(),
                                                           options);
 }
diff --git a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
index bc678c07b..bb13c16a5 100644
--- a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
+++ b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp
@@ -652,7 +652,7 @@ struct PropagatePackUnPack
     MLIRContext *ctx = getOperation().getContext();
     RewritePatternSet patterns(ctx);
     linalg::populateDataLayoutPropagationPatterns(
-        patterns, [](Operation *op) { return true; });
+        patterns, [](OpOperand *operand) { return true; });
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
   }
 };
@@ -813,7 +813,9 @@ void mlir::tpp::populateSimplifyPacking(RewritePatternSet &patterns) {
   // Propagate packs/unpacks only through expand shapes at this point.
   // This captures the transformation scope of the replaced downstream pass.
   linalg::populateDataLayoutPropagationPatterns(
-      patterns, [](Operation *op) { return isa<tensor::ExpandShapeOp>(op); });
+      patterns, [](OpOperand *operand) {
+        return isa<tensor::ExpandShapeOp>(operand->get().getDefiningOp());
+      });
   ctx->getLoadedDialect<tensor::TensorDialect>()->getCanonicalizationPatterns(
       patterns);
   patterns.add(ctx);
diff --git a/scripts/buildkite/build_llvm.sh b/scripts/buildkite/build_llvm.sh
index 968987945..864bcb195 100755
--- a/scripts/buildkite/build_llvm.sh
+++ b/scripts/buildkite/build_llvm.sh
@@ -33,7 +33,7 @@ fi
 
 # Destination for tar balls
 if [ ! "${LLVM_TAR_DIR}" ]; then
-  LLVM_TAR_DIR="/tmp/tpp-llvm-tar"
+  LLVM_TAR_DIR="/scratch/tpp-llvm-tar"
 fi
 LLVM_TAR_DIR=$(add_device_extensions ${LLVM_TAR_DIR} ${GPU})
 mkdir -p ${LLVM_TAR_DIR}
@@ -90,8 +90,9 @@ fi
 check_program ${LINKER}
 
 if [ ! "${LLVM_BUILD_DIR}" ]; then
-  LLVM_BUILD_DIR="/tmp/tpp-llvm"
+  LLVM_BUILD_DIR="/scratch/tpp-llvm"
 fi
+LLVM_BUILD_DIR=$(add_device_extensions ${LLVM_BUILD_DIR} ${GPU})
 LLVM_BUILD_DIR=$(realpath ${LLVM_BUILD_DIR})
 LLVM_BUILD_DIR=${LLVM_BUILD_DIR:-build-${COMPILER}}
 mkdir -p ${LLVM_BUILD_DIR}
@@ -140,6 +141,7 @@
 echo "--- BUILD"
 echo_run ninja -C ${LLVM_BUILD_DIR} all
 if [ $? != 0 ]; then
   rm -r ${LLVM_INSTALL_DIR}
+  rm -r ${LLVM_BUILD_DIR}
   exit 1
 fi
@@ -148,14 +150,20 @@
 echo "--- CHECK"
 echo_run ninja -C ${LLVM_BUILD_DIR} check-all
 if [ $? != 0 ]; then
   rm -r ${LLVM_INSTALL_DIR}
+  rm -r ${LLVM_BUILD_DIR}
   exit 1
 fi
-  # Install LLVM
-  echo "--- INSTALL"
-  mkdir -p ${LLVM_INSTALL_DIR}
-  echo_run ninja -C ${LLVM_BUILD_DIR} install
+# Install LLVM
+echo "--- INSTALL"
+mkdir -p ${LLVM_INSTALL_DIR}
+echo_run ninja -C ${LLVM_BUILD_DIR} install
 if [ $? != 0 ]; then
   rm -r ${LLVM_INSTALL_DIR}
+  rm -r ${LLVM_BUILD_DIR}
   exit 1
 fi
+
+# Cleanup
+echo "--- CLEANUP"
+rm -r ${LLVM_BUILD_DIR}
diff --git a/test/GPU/linalg-to-xegpu-dpas.mlir b/test/GPU/linalg-to-xegpu-dpas.mlir
index a64a93f7a..cd976e64f 100644
--- a/test/GPU/linalg-to-xegpu-dpas.mlir
+++ b/test/GPU/linalg-to-xegpu-dpas.mlir
@@ -63,9 +63,9 @@ func.func @matmul(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>, %arg2: mem
 
 // Extract DPAS-sized chunks from larger loaded tile A.
 // Tile B is already in the correct shape.
-// CHECK: %[[vA_flat:.+]] = vector.shape_cast %[[vA]] : vector<32x8x2xf16> to vector<512xf16>
+// CHECK: %[[vA_flat:.+]] = vector.shape_cast %[[vA]] : vector<32x16xf16> to vector<512xf16>
 // CHECK: %[[vA_dpas_flat:.+]] = vector.extract_strided_slice{{.*}}: vector<512xf16> to vector<128xf16>
-// CHECK: %[[vA_dpas:.+]] = vector.shape_cast %[[vA_dpas_flat]] : vector<128xf16> to vector<8x8x2xf16>
+// CHECK: %[[vA_dpas:.+]] = vector.shape_cast %[[vA_dpas_flat]] : vector<128xf16> to vector<8x16xf16>
 // CHECK-COUNT-3: vector.extract_strided_slice
 
 // Perform DPAS computation.
diff --git a/test/Passes/tile-and-fuse-default.mlir b/test/Passes/tile-and-fuse-default.mlir
index d1e5f1079..bb4f9dfd9 100644
--- a/test/Passes/tile-and-fuse-default.mlir
+++ b/test/Passes/tile-and-fuse-default.mlir
@@ -595,7 +595,7 @@ func.func @check_tile_propagation_to_eltwise_consumer(%arg0: tensor<2x2x2x4xf32>
 // CHECK-LABEL: check_tile_propagation_to_eltwise_consumer
 // CHECK-SAME: %[[ARG0:.+]]: tensor<2x2x2x4xf32>, %[[ARG1:.+]]: tensor<2x4x8x2xf32>,
 // CHECK-SAME: %[[ARG2:.+]]: tensor<2x2x8x2xf32>, %[[ARG3:.+]]: tensor<2x2x8x2xf32>
-// CHECK: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
 // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
diff --git a/test/Passes/tile-and-fuse.mlir b/test/Passes/tile-and-fuse.mlir
index 7a95761d1..daf9ad426 100644
--- a/test/Passes/tile-and-fuse.mlir
+++ b/test/Passes/tile-and-fuse.mlir
@@ -342,7 +342,7 @@ func.func @mlp(%arg0: tensor<8x112x32x32xbf16>, %arg1: tensor<112x112x32x32xbf16
     %max = arith.maximumf %in, %cst : bf16
     linalg.yield %max : bf16
   } -> tensor<8x112x32x32xbf16>
-  // CHECK: %[[C112:.+]] = arith.constant 112 : index
+  // CHECK-DAG: %[[C112:.+]] = arith.constant 112 : index
   // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
   // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
   // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index