From 49a0052a080c9fab2b0af6fbcc858e2078baa2c5 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Mon, 17 Jun 2024 12:09:10 +0200 Subject: [PATCH] [flang][OpenMP] Add support for multi-range `do concurrent` loops (#89) * [flang][OpenMP] Add support for multi-range `do concurrent` loops Extends `do concurrent` to OpenMP mapping by adding support for multi-range loops. The current implementation only works for perfectly nested loops. So taking this input: ```fortran do concurrent(i=1:n, j=1:m) a(i,j,k) = i * j end do ``` will behave in exactly the same way as this input: ``` do concurrent(i=1:n) do concurrent(j=1:m) a(i,j,k) = i * j end do end do ``` --- .../Transforms/DoConcurrentConversion.cpp | 528 ++++++++++++++---- .../multiple_iteration_ranges.f90 | 122 ++++ .../DoConcurrent/not_perfectly_nested.f90 | 63 +++ 3 files changed, 610 insertions(+), 103 deletions(-) create mode 100644 flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 create mode 100644 flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index e099b74ddbe9e1..b30379da272ea6 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -24,6 +24,7 @@ #include "mlir/Transforms/RegionUtils.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" +#include #include #include @@ -149,12 +150,334 @@ mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, } // namespace Fortran namespace { +namespace looputils { +/// Stores info needed about the induction/iteration variable for each `do +/// concurrent` in a loop nest. This includes: +/// * the operation allocating memory for iteration variable, +/// * the operation(s) updating the iteration variable with the current +/// iteration number. 
+struct InductionVariableInfo { + mlir::Operation *iterVarMemDef; + llvm::SetVector indVarUpdateOps; +}; + +using LoopNestToIndVarMap = + llvm::MapVector; + +/// Given an operation `op`, this returns true if `op`'s operand is ultimately +/// the loop's induction variable. Detecting this helps finding the live-in +/// value corresponding to the induction variable in case the induction variable +/// is indirectly used in the loop (e.g. through a cast op). +bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) { + while (op != nullptr && op->getNumOperands() > 0) { + auto ivIt = llvm::find_if(op->getOperands(), [&](mlir::Value operand) { + return operand == doLoop.getInductionVar(); + }); + + if (ivIt != op->getOperands().end()) + return true; + + op = op->getOperand(0).getDefiningOp(); + } + + return false; +}; + +/// Collect the list of values used inside the loop but defined outside of it. +/// The first item in the returned list is always the loop's induction +/// variable. +void collectLoopLiveIns(fir::DoLoopOp doLoop, + llvm::SmallVectorImpl &liveIns) { + llvm::SmallDenseSet seenValues; + llvm::SmallDenseSet seenOps; + + mlir::visitUsedValuesDefinedAbove( + doLoop.getRegion(), [&](mlir::OpOperand *operand) { + if (!seenValues.insert(operand->get()).second) + return; + + mlir::Operation *definingOp = operand->get().getDefiningOp(); + // We want to collect ops corresponding to live-ins only once. + if (definingOp && !seenOps.insert(definingOp).second) + return; + + liveIns.push_back(operand->get()); + + if (isIndVarUltimateOperand(operand->getOwner(), doLoop)) + std::swap(*liveIns.begin(), *liveIns.rbegin()); + }); +} + +/// Collects the op(s) responsible for updating a loop's iteration variable with +/// the current iteration number. For example, for the input IR: +/// ``` +/// %i = fir.alloca i32 {bindc_name = "i"} +/// %i_decl:2 = hlfir.declare %i ... +/// ... 
+/// fir.do_loop %i_iv = %lb to %ub step %step unordered { +/// %1 = fir.convert %i_iv : (index) -> i32 +/// fir.store %1 to %i_decl#1 : !fir.ref +/// ... +/// } +/// ``` +/// this function would return the first 2 ops in the `fir.do_loop`'s region. +llvm::SetVector +extractIndVarUpdateOps(fir::DoLoopOp doLoop) { + mlir::Value indVar = doLoop.getInductionVar(); + llvm::SetVector indVarUpdateOps; + + llvm::SmallVector toProcess; + toProcess.push_back(indVar); + + llvm::DenseSet done; + + while (!toProcess.empty()) { + mlir::Value val = toProcess.back(); + toProcess.pop_back(); + + if (!done.insert(val).second) + continue; + + for (mlir::Operation *user : val.getUsers()) { + indVarUpdateOps.insert(user); + + for (mlir::Value result : user->getResults()) + toProcess.push_back(result); + } + } + + return std::move(indVarUpdateOps); +} + +/// Starting with a value at the end of a definition/conversion chain, walk the +/// chain backwards and collect all the visited ops along the way. For example, +/// given this IR: +/// ``` +/// %c10 = arith.constant 10 : i32 +/// %10 = fir.convert %c10 : (i32) -> index +/// ``` +/// and giving `%10` as the starting input: `link`, `opChain` would contain +/// both of the above ops. +mlir::LogicalResult +collectIndirectOpChain(mlir::Operation *link, + llvm::SmallVectorImpl &opChain) { + while (!mlir::isa_and_present(link)) { + if (auto convertOp = mlir::dyn_cast_if_present(link)) { + opChain.push_back(link); + link = convertOp.getValue().getDefiningOp(); + continue; + } + + std::string opStr; + llvm::raw_string_ostream opOs(opStr); + opOs << "Unexpected operation: " << *link; + return mlir::emitError(link->getLoc(), opOs.str()); + } + + opChain.push_back(link); + std::reverse(opChain.begin(), opChain.end()); + return mlir::success(); +} + +/// Starting with `outerLoop` collect a perfectly nested loop nest, if any. 
This +/// function collects as many loops as possible in the nest; in case it fails to +/// recognize a certain nested loop as part of the nest it just returns the +/// parent loops it discovered before. +mlir::LogicalResult collectLoopNest(fir::DoLoopOp outerLoop, + LoopNestToIndVarMap &loopNest) { + assert(outerLoop.getUnordered()); + llvm::SmallVector outerLoopLiveIns; + collectLoopLiveIns(outerLoop, outerLoopLiveIns); + + while (true) { + loopNest.try_emplace( + outerLoop, + InductionVariableInfo{ + outerLoopLiveIns.front().getDefiningOp(), + std::move(looputils::extractIndVarUpdateOps(outerLoop))}); + + auto directlyNestedLoops = outerLoop.getRegion().getOps(); + llvm::SmallVector unorderedLoops; + + for (auto nestedLoop : directlyNestedLoops) + if (nestedLoop.getUnordered()) + unorderedLoops.push_back(nestedLoop); + + if (unorderedLoops.empty()) + break; + + if (unorderedLoops.size() > 1) + return mlir::failure(); + + fir::DoLoopOp nestedUnorderedLoop = unorderedLoops.front(); + + if ((nestedUnorderedLoop.getLowerBound().getDefiningOp() == nullptr) || + (nestedUnorderedLoop.getUpperBound().getDefiningOp() == nullptr) || + (nestedUnorderedLoop.getStep().getDefiningOp() == nullptr)) + return mlir::failure(); + + llvm::SmallVector nestedLiveIns; + collectLoopLiveIns(nestedUnorderedLoop, nestedLiveIns); + + llvm::DenseSet outerLiveInsSet; + llvm::DenseSet nestedLiveInsSet; + + // Returns a "unified" view of an mlir::Value. This utility checks if the + // value is defined by an op, and if so, return the first value defined by + // that op (if there are many), otherwise just returns the value. + // + // This serves the purpose that if, for example, `%op_res#0` is used in the + // outer loop and `%op_res#1` is used in the nested loop (or vice versa), + // that we detect both as the same value. If we did not do so, we might + // falsely detect that the 2 loops are not perfectly nested since they use + // "different" sets of values. 
+ auto getUnifiedLiveInView = [](mlir::Value liveIn) { + return liveIn.getDefiningOp() != nullptr + ? liveIn.getDefiningOp()->getResult(0) + : liveIn; + }; + + // Re-package both lists of live-ins into sets so that we can use set + // equality to compare the values used in the outerloop vs. the nested one. + + for (auto liveIn : nestedLiveIns) + nestedLiveInsSet.insert(getUnifiedLiveInView(liveIn)); + + mlir::Value outerLoopIV; + for (auto liveIn : outerLoopLiveIns) { + outerLiveInsSet.insert(getUnifiedLiveInView(liveIn)); + + // Keep track of the IV of the outerloop. See `isPerfectlyNested` for more + // info on the reason. + if (outerLoopIV == nullptr) + outerLoopIV = getUnifiedLiveInView(liveIn); + } + + // For the 2 loops to be perfectly nested, either: + // * both would have exactly the same set of live-in values or, + // * the outer loop would have exactly 1 extra live-in value: the outer + // loop's induction variable; this happens when the outer loop's IV is + // *not* referenced in the nested loop. + bool isPerfectlyNested = [&]() { + if (outerLiveInsSet == nestedLiveInsSet) + return true; + + if ((outerLiveInsSet.size() == nestedLiveIns.size() + 1) && + !nestedLiveInsSet.contains(outerLoopIV)) + return true; + + return false; + }(); + + if (!isPerfectlyNested) + return mlir::failure(); + + outerLoop = nestedUnorderedLoop; + outerLoopLiveIns = std::move(nestedLiveIns); + } + + return mlir::success(); +} + +/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In +/// particular, this function would take this input IR: +/// ``` +/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered { +/// fir.store %i_iv to %i#1 : !fir.ref +/// %j_lb = arith.constant 1 : i32 +/// %j_ub = arith.constant 10 : i32 +/// %j_step = arith.constant 1 : index +/// +/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered { +/// fir.store %j_iv to %j#1 : !fir.ref +/// ... 
+/// } +/// } +/// ``` +/// +/// into the following form (using generic op form since the result is +/// technically an invalid `fir.do_loop` op): +/// +/// ``` +/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({ +/// ^bb0(%i_iv: index): +/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32 +/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32 +/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index +/// +/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({ +/// ^bb0(%new_i_iv: index, %new_j_iv: index): +/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref) -> () +/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref) -> () +/// ... +/// }) +/// ``` +/// +/// What happened to the loop nest is the following: +/// +/// * the innermost loop's entry block was updated from having one argument to +/// having `n` arguments where `n` is the number of loops in the nest, +/// +/// * the outer loop(s)' ops that update the IVs were sunk inside the innermost +/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above), +/// +/// * the innermost loop's entry block's arguments were mapped in order from the +/// outermost to the innermost IV. +/// +/// With this IR change, we can directly inline the innermost loop's region into +/// the newly generated `omp.loop_nest` op. +/// +/// Note that this function has a pre-condition that \p loopNest consists of +/// perfectly nested loops; i.e. there are no in-between ops between 2 nested +/// loops except for the ops to set up the inner loop's LB, UB, and step. These +/// ops are handled/cloned by `genLoopNestClauseOps(..)`. 
+void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter, + looputils::LoopNestToIndVarMap &loopNest) { + if (loopNest.size() <= 1) + return; + + fir::DoLoopOp innermostLoop = loopNest.back().first; + mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front(); + + llvm::SmallVector argTypes; + llvm::SmallVector argLocs; + + for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) { + // Sink the IV update ops to the innermost loop. We need to do for all loops + // except for the innermost one, hence the `drop_end` usage above. + for (mlir::Operation *op : indVarInfo.indVarUpdateOps) + op->moveBefore(&innermostFirstOp); + + argTypes.push_back(doLoop.getInductionVar().getType()); + argLocs.push_back(doLoop.getInductionVar().getLoc()); + } + + mlir::Region &innermmostRegion = innermostLoop.getRegion(); + // Extend the innermost entry block with arguments to represent the outer IVs. + innermmostRegion.addArguments(argTypes, argLocs); + + unsigned idx = 1; + // In reverse, remap the IVs of the loop nest from the old values to the new + // ones. We do that in reverse since the first argument before this loop is + // the old IV for the innermost loop. Therefore, we want to replace it first + // before the old value (1st argument in the block) is remapped to be the IV + // of the outermost loop in the nest. 
+ for (auto &[doLoop, _] : llvm::reverse(loopNest)) { + doLoop.getInductionVar().replaceAllUsesWith( + innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx)); + ++idx; + } +} +} // namespace looputils + class DoConcurrentConversion : public mlir::OpConversionPattern { public: using mlir::OpConversionPattern::OpConversionPattern; - DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice) - : OpConversionPattern(context), mapToDevice(mapToDevice) {} + DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice, + llvm::DenseSet &concurrentLoopsToSkip) + : OpConversionPattern(context), mapToDevice(mapToDevice), + concurrentLoopsToSkip(concurrentLoopsToSkip) {} mlir::LogicalResult matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor, @@ -188,9 +511,15 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { "constant LB, UB, and step values."); } - llvm::SmallVector liveIns; - collectLoopLiveIns(doLoop, liveIns); - assert(!liveIns.empty()); + llvm::SmallVector outermostLoopLives; + looputils::collectLoopLiveIns(doLoop, outermostLoopLives); + assert(!outermostLoopLives.empty()); + + looputils::LoopNestToIndVarMap loopNest; + bool hasRemainingNestedLoops = + failed(looputils::collectLoopNest(doLoop, loopNest)); + + looputils::sinkLoopIVArgs(rewriter, loopNest); mlir::IRMapping mapper; mlir::omp::TargetOp targetOp; @@ -198,18 +527,24 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { if (mapToDevice) { mlir::omp::TargetClauseOps clauseOps; - for (mlir::Value liveIn : liveIns) + + // The outermost loop will contain all the live-in values in all nested + // loops since live-in values are collected recursively for all nested + // ops. 
+ for (mlir::Value liveIn : outermostLoopLives) clauseOps.mapVars.push_back(genMapInfoOpForLiveIn(rewriter, liveIn)); - targetOp = - genTargetOp(doLoop.getLoc(), rewriter, mapper, liveIns, clauseOps); - genTeamsOp(doLoop.getLoc(), rewriter, doLoop, liveIns, mapper, + + targetOp = genTargetOp(doLoop.getLoc(), rewriter, mapper, + outermostLoopLives, clauseOps); + genTeamsOp(doLoop.getLoc(), rewriter, loopNest, mapper, loopNestClauseOps); genDistributeOp(doLoop.getLoc(), rewriter); } - genParallelOp(doLoop.getLoc(), rewriter, doLoop, liveIns, mapper, + genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper, loopNestClauseOps); - genWsLoopOp(rewriter, doLoop, mapper, loopNestClauseOps); + mlir::omp::LoopNestOp ompLoopNest = + genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps); // Now that we created the nested `ws.loop` op, we set can the `target` op's // trip count. @@ -229,56 +564,22 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { } rewriter.eraseOp(doLoop); - return mlir::success(); - } -private: - /// Collect the list of values used inside the loop but defined outside of it. - /// The first item in the returned list is always the loop's induction - /// variable. - void collectLoopLiveIns(fir::DoLoopOp doLoop, - llvm::SmallVectorImpl &liveIns) const { - // Given an operation `op`, this lambda returns true if `op`'s operand is - // ultimately the loop's induction variable. Detecting this helps finding - // the live-in value corresponding to the induction variable in case the - // induction variable is indirectly used in the loop (e.g. throught a cast - // op). 
- std::function isIndVarUltimateOperand = - [&](mlir::Operation *op) { - if (auto storeOp = mlir::dyn_cast_if_present(op)) { - return (storeOp.getValue() == doLoop.getInductionVar()) || - isIndVarUltimateOperand(storeOp.getValue().getDefiningOp()); - } - - if (auto convertOp = mlir::dyn_cast_if_present(op)) { - return convertOp.getOperand() == doLoop.getInductionVar() || - isIndVarUltimateOperand( - convertOp.getValue().getDefiningOp()); - } - - return false; - }; - - llvm::SmallDenseSet seenValues; - llvm::SmallDenseSet seenOps; - - mlir::visitUsedValuesDefinedAbove( - doLoop.getRegion(), [&](mlir::OpOperand *operand) { - if (!seenValues.insert(operand->get()).second) - return; - - mlir::Operation *definingOp = operand->get().getDefiningOp(); - // We want to collect ops corresponding to live-ins only once. - if (definingOp && !seenOps.insert(definingOp).second) - return; - - liveIns.push_back(operand->get()); + if (hasRemainingNestedLoops) { + // Mark `unordered` loops that are not perfectly nested to be skipped from + // the legality check of the `ConversionTarget` since we are not + // interested in mapping them to OpenMP. 
+ ompLoopNest->walk([&](fir::DoLoopOp doLoop) { + if (doLoop.getUnordered()) { + concurrentLoopsToSkip.insert(doLoop); + } + }); + } - if (isIndVarUltimateOperand(operand->getOwner())) - std::swap(*liveIns.begin(), *liveIns.rbegin()); - }); + return mlir::success(); } +private: void genBoundsOps(mlir::ConversionPatternRewriter &rewriter, mlir::Location loc, hlfir::DeclareOp declareOp, llvm::SmallVectorImpl &boundsOps) const { @@ -375,7 +676,13 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { llvm::zip_equal(region.getArguments(), clauseOps.mapVars)) { auto miOp = mlir::cast(mapInfoOp.getDefiningOp()); hlfir::DeclareOp liveInDeclare = genLiveInDeclare(rewriter, arg, miOp); - mapper.map(miOp.getVariableOperand(0), liveInDeclare.getBase()); + mlir::Value miOperand = miOp.getVariableOperand(0); + mapper.map(miOperand, liveInDeclare.getBase()); + + if (auto origDeclareOp = mlir::dyn_cast_if_present( + miOperand.getDefiningOp())) + mapper.map(origDeclareOp.getOriginalBase(), + liveInDeclare.getOriginalBase()); } rewriter.setInsertionPoint( @@ -428,8 +735,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { mlir::omp::TeamsOp genTeamsOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, - fir::DoLoopOp doLoop, llvm::ArrayRef liveIns, - mlir::IRMapping &mapper, + looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper, mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { auto teamsOp = rewriter.create( loc, /*clauses=*/mlir::omp::TeamsClauseOps{}); @@ -437,17 +743,16 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { rewriter.createBlock(&teamsOp.getRegion()); rewriter.setInsertionPoint(rewriter.create(loc)); - genInductionVariableAlloc(rewriter, liveIns, mapper); - genLoopNestClauseOps(loc, rewriter, doLoop, mapper, loopNestClauseOps); + genLoopNestIndVarAllocs(rewriter, loopNest, mapper); + genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps); return teamsOp; } - void - 
genLoopNestClauseOps(mlir::Location loc, - mlir::ConversionPatternRewriter &rewriter, - fir::DoLoopOp doLoop, mlir::IRMapping &mapper, - mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { + void genLoopNestClauseOps( + mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, + looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper, + mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { assert(loopNestClauseOps.loopLBVar.empty() && "Loop nest bounds were already emitted!"); @@ -456,33 +761,37 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { // `fir.convert`op, this lambda clones the `fir.convert` as well as the // value it converts from. We do this since `omp.target` regions are // isolated from above. - std::function - cloneBoundOrStepDefChain = [&](mlir::Operation *operation) { - if (mlir::isa_and_present(operation)) - return rewriter.clone(*operation, mapper); + auto cloneBoundOrStepOpChain = + [&](mlir::Operation *operation) -> mlir::Operation * { + llvm::SmallVector opChain; + mlir::LogicalResult extractResult = + looputils::collectIndirectOpChain(operation, opChain); + + if (failed(extractResult)) { + return nullptr; + } - if (auto convertOp = - mlir::dyn_cast_if_present(operation)) { - cloneBoundOrStepDefChain(convertOp.getValue().getDefiningOp()); - return rewriter.clone(*operation, mapper); - } - - std::string opStr; - llvm::raw_string_ostream opOs(opStr); - opOs << "Unexpected operation: " << *operation; - llvm_unreachable(opOs.str().c_str()); - }; + mlir::Operation *result; + for (mlir::Operation *link : opChain) + result = rewriter.clone(*link, mapper); - mlir::Operation *lbOp = doLoop.getLowerBound().getDefiningOp(); - mlir::Operation *ubOp = doLoop.getUpperBound().getDefiningOp(); - mlir::Operation *stepOp = doLoop.getStep().getDefiningOp(); + return result; + }; + + for (auto &[doLoop, _] : loopNest) { + mlir::Operation *lbOp = doLoop.getLowerBound().getDefiningOp(); + 
loopNestClauseOps.loopLBVar.push_back( + cloneBoundOrStepOpChain(lbOp)->getResult(0)); + + mlir::Operation *ubOp = doLoop.getUpperBound().getDefiningOp(); + loopNestClauseOps.loopUBVar.push_back( + cloneBoundOrStepOpChain(ubOp)->getResult(0)); + + mlir::Operation *stepOp = doLoop.getStep().getDefiningOp(); + loopNestClauseOps.loopStepVar.push_back( + cloneBoundOrStepOpChain(stepOp)->getResult(0)); + } - loopNestClauseOps.loopLBVar.push_back( - cloneBoundOrStepDefChain(lbOp)->getResult(0)); - loopNestClauseOps.loopLBVar.push_back( - cloneBoundOrStepDefChain(ubOp)->getResult(0)); - loopNestClauseOps.loopLBVar.push_back( - cloneBoundOrStepDefChain(stepOp)->getResult(0)); loopNestClauseOps.loopInclusiveAttr = rewriter.getUnitAttr(); } @@ -498,11 +807,18 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { return distOp; } - void genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter, - llvm::ArrayRef liveIns, - mlir::IRMapping &mapper) const { - mlir::Operation *indVarMemDef = liveIns.front().getDefiningOp(); + void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter, + looputils::LoopNestToIndVarMap &loopNest, + mlir::IRMapping &mapper) const { + for (auto &[_, indVarInfo] : loopNest) + genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper); + } + + mlir::Operation * + genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter, + mlir::Operation *indVarMemDef, + mlir::IRMapping &mapper) const { assert( indVarMemDef != nullptr && "Induction variable memdef is expected to have a defining operation."); @@ -512,13 +828,16 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { indVarDeclareAndAlloc.insert(operand.getDefiningOp()); indVarDeclareAndAlloc.insert(indVarMemDef); + mlir::Operation *result; for (mlir::Operation *opToClone : indVarDeclareAndAlloc) - rewriter.clone(*opToClone, mapper); + result = rewriter.clone(*opToClone, mapper); + + return result; } mlir::omp::ParallelOp 
genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, - fir::DoLoopOp doLoop, llvm::ArrayRef liveIns, + looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper, mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { auto parallelOp = rewriter.create(loc); @@ -528,8 +847,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { // If mapping to host, the local induction variable and loop bounds need to // be emitted as part of the `omp.parallel` op. if (!mapToDevice) { - genInductionVariableAlloc(rewriter, liveIns, mapper); - genLoopNestClauseOps(loc, rewriter, doLoop, mapper, loopNestClauseOps); + genLoopNestIndVarAllocs(rewriter, loopNest, mapper); + genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps); } return parallelOp; @@ -562,6 +881,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { } bool mapToDevice; + llvm::DenseSet &concurrentLoopsToSkip; }; class DoConcurrentConversionPass @@ -593,17 +913,19 @@ class DoConcurrentConversionPass "Valid values are: `host` or `device`"); return; } - + llvm::DenseSet concurrentLoopsToSkip; mlir::RewritePatternSet patterns(context); patterns.insert( - context, mapTo == fir::omp::DoConcurrentMappingKind::DCMK_Device); + context, mapTo == fir::omp::DoConcurrentMappingKind::DCMK_Device, + concurrentLoopsToSkip); mlir::ConversionTarget target(*context); target.addLegalDialect(); - target.addDynamicallyLegalOp( - [](fir::DoLoopOp op) { return !op.getUnordered(); }); + target.addDynamicallyLegalOp([&](fir::DoLoopOp op) { + return !op.getUnordered() || concurrentLoopsToSkip.contains(op); + }); if (mlir::failed(mlir::applyFullConversion(getOperation(), target, std::move(patterns)))) { diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 new file mode 100644 index 00000000000000..a0364612976bcb --- /dev/null +++ 
b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 @@ -0,0 +1,122 @@ +! Tests mapping of a `do concurrent` loop with multiple iteration ranges. + +! RUN: split-file %s %t + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %t/multi_range.f90 -o - \ +! RUN: | FileCheck %s --check-prefixes=HOST,COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %t/multi_range.f90 -o - \ +! RUN: | FileCheck %s --check-prefixes=DEVICE,COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %t/perfectly_nested.f90 -o - \ +! RUN: | FileCheck %s --check-prefixes=HOST,COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %t/perfectly_nested.f90 -o - \ +! RUN: | FileCheck %s --check-prefixes=DEVICE,COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %t/partially_nested.f90 -o - \ +! RUN: | FileCheck %s --check-prefixes=HOST,COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %t/partially_nested.f90 -o - \ +! RUN: | FileCheck %s --check-prefixes=DEVICE,COMMON + +!--- multi_range.f90 +program main + integer, parameter :: n = 10 + integer, parameter :: m = 20 + integer, parameter :: l = 30 + integer :: a(n, m, l) + + do concurrent(i=1:n, j=1:m, k=1:l) + a(i,j,k) = i * j + k + end do +end + +!--- perfectly_nested.f90 +program main + integer, parameter :: n = 10 + integer, parameter :: m = 20 + integer, parameter :: l = 30 + integer :: a(n, m, l) + + do concurrent(i=1:n) + do concurrent(j=1:m) + do concurrent(k=1:l) + a(i,j,k) = i * j + k + end do + end do + end do +end + +!--- partially_nested.f90 +program main + integer, parameter :: n = 10 + integer, parameter :: m = 20 + integer, parameter :: l = 30 + integer :: a(n, m, l) + + do concurrent(i=1:n, j=1:m) + do concurrent(k=1:l) + a(i,j,k) = i * j + k + end do + end do +end + +! DEVICE: omp.target +! DEVICE: omp.teams + +! HOST: omp.parallel { + +! 
COMMON-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"} +! COMMON-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"} + +! COMMON-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"} +! COMMON-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"} + +! COMMON-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"} +! COMMON-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"} + +! COMMON: %[[C1_1:.*]] = arith.constant 1 : i32 +! COMMON: %[[LB_I:.*]] = fir.convert %[[C1_1]] : (i32) -> index +! COMMON: %[[C10:.*]] = arith.constant 10 : i32 +! COMMON: %[[UB_I:.*]] = fir.convert %[[C10]] : (i32) -> index +! COMMON: %[[STEP_I:.*]] = arith.constant 1 : index + +! COMMON: %[[C1_2:.*]] = arith.constant 1 : i32 +! COMMON: %[[LB_J:.*]] = fir.convert %[[C1_2]] : (i32) -> index +! COMMON: %[[C20:.*]] = arith.constant 20 : i32 +! COMMON: %[[UB_J:.*]] = fir.convert %[[C20]] : (i32) -> index +! COMMON: %[[STEP_J:.*]] = arith.constant 1 : index + +! COMMON: %[[C1_3:.*]] = arith.constant 1 : i32 +! COMMON: %[[LB_K:.*]] = fir.convert %[[C1_3]] : (i32) -> index +! COMMON: %[[C30:.*]] = arith.constant 30 : i32 +! COMMON: %[[UB_K:.*]] = fir.convert %[[C30]] : (i32) -> index +! COMMON: %[[STEP_K:.*]] = arith.constant 1 : index + +! DEVICE: omp.distribute +! DEVICE-NEXT: omp.parallel + +! COMMON: omp.wsloop { +! COMMON-NEXT: omp.loop_nest +! COMMON-SAME: (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]]) +! COMMON-SAME: : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]]) +! COMMON-SAME: to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive +! COMMON-SAME: step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) { + +! COMMON-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]] +! COMMON-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#1 + +! COMMON-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]] +! COMMON-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#1 + +! 
COMMON-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]] +! COMMON-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#1 + +! COMMON: omp.yield +! COMMON-NEXT: } +! COMMON-NEXT: omp.terminator +! COMMON-NEXT: } + +! HOST-NEXT: omp.terminator +! HOST-NEXT: } diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 new file mode 100644 index 00000000000000..559d26c39cba55 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 @@ -0,0 +1,63 @@ +! Tests that if `do concurrent` is not perfectly nested in its parent loop, that +! we skip converting the not-perfectly nested `do concurrent` loop. + + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ +! RUN: | FileCheck %s --check-prefixes=HOST,COMMON + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %s -o - \ +! RUN: | FileCheck %s --check-prefixes=DEVICE,COMMON + +program main + integer, parameter :: n = 10 + integer, parameter :: m = 20 + integer, parameter :: l = 30 + integer x; + integer :: a(n, m, l) + + do concurrent(i=1:n) + x = 10 + do concurrent(j=1:m, k=1:l) + a(i,j,k) = i * j + k + end do + end do +end + +! HOST: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"} +! HOST: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]] + +! HOST: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"} +! HOST: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]] + +! DEVICE: omp.target + +! DEVICE: ^bb0(%[[I_ARG:[^[:space:]]+]]: !fir.ref, %[[X_ARG:[^[:space:]]+]]: !fir.ref, +! DEVICE-SAME: %[[J_ARG:[^[:space:]]+]]: !fir.ref, %[[K_ARG:[^[:space:]]+]]: !fir.ref, +! DEVICE-SAME: %[[A_ARG:[^[:space:]]+]]: !fir.ref>): + +! DEVICE: %[[TARGET_J_DECL:.*]]:2 = hlfir.declare %[[J_ARG]] {uniq_name = "_QFEj"} +! DEVICE: %[[TARGET_K_DECL:.*]]:2 = hlfir.declare %[[K_ARG]] {uniq_name = "_QFEk"} + +! DEVICE: omp.teams +! DEVICE: omp.distribute + +! COMMON: omp.parallel { +! 
COMMON: omp.wsloop { +! COMMON: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} { +! COMMON: fir.do_loop %[[J_IV:.*]] = {{.*}} { +! COMMON: %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32 +! HOST: fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#1 +! DEVICE: fir.store %[[J_IV_CONV]] to %[[TARGET_J_DECL]]#1 + +! COMMON: fir.do_loop %[[K_IV:.*]] = {{.*}} { +! COMMON: %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32 +! HOST: fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#1 +! DEVICE: fir.store %[[K_IV_CONV]] to %[[TARGET_K_DECL]]#1 +! COMMON: } +! COMMON: } +! COMMON: omp.yield +! COMMON: } +! COMMON: omp.terminator +! COMMON: } +! COMMON: omp.terminator +! COMMON: }