| 1 | //===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "flang/Optimizer/Builder/FIRBuilder.h" |
| 10 | #include "flang/Optimizer/Builder/Todo.h" |
| 11 | #include "flang/Optimizer/Dialect/FIROps.h" |
| 12 | #include "flang/Optimizer/OpenMP/Passes.h" |
| 13 | #include "flang/Optimizer/OpenMP/Utils.h" |
| 14 | #include "flang/Support/OpenMP-utils.h" |
| 15 | #include "mlir/Analysis/SliceAnalysis.h" |
| 16 | #include "mlir/Dialect/OpenMP/OpenMPDialect.h" |
| 17 | #include "mlir/IR/IRMapping.h" |
| 18 | #include "mlir/Transforms/DialectConversion.h" |
| 19 | #include "mlir/Transforms/RegionUtils.h" |
| 20 | |
| 21 | namespace flangomp { |
| 22 | #define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS |
| 23 | #include "flang/Optimizer/OpenMP/Passes.h.inc" |
| 24 | } // namespace flangomp |
| 25 | |
| 26 | #define DEBUG_TYPE "do-concurrent-conversion" |
| 27 | #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ") |
| 28 | |
| 29 | namespace { |
| 30 | namespace looputils { |
| 31 | /// Stores info needed about the induction/iteration variable for each `do |
| 32 | /// concurrent` in a loop nest. |
| 33 | struct InductionVariableInfo { |
| 34 | InductionVariableInfo(fir::DoConcurrentLoopOp loop, |
| 35 | mlir::Value inductionVar) { |
| 36 | populateInfo(loop, inductionVar); |
| 37 | } |
| 38 | /// The operation allocating memory for iteration variable. |
| 39 | mlir::Operation *iterVarMemDef; |
| 40 | /// the operation(s) updating the iteration variable with the current |
| 41 | /// iteration number. |
| 42 | llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps; |
| 43 | |
| 44 | private: |
| 45 | /// For the \p doLoop parameter, find the following: |
| 46 | /// |
| 47 | /// 1. The operation that declares its iteration variable or allocates memory |
| 48 | /// for it. For example, give the following loop: |
| 49 | /// ``` |
| 50 | /// ... |
| 51 | /// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ... |
| 52 | /// ... |
| 53 | /// fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) { |
| 54 | /// %ind_var_conv = fir.convert %ind_var : (index) -> i32 |
| 55 | /// fir.store %ind_var_conv to %i#1 : !fir.ref<i32> |
| 56 | /// ... |
| 57 | /// } |
| 58 | /// ``` |
| 59 | /// |
| 60 | /// This function sets the `iterVarMemDef` member to the `hlfir.declare` op |
| 61 | /// for `%i`. |
| 62 | /// |
| 63 | /// 2. The operation(s) that update the loop's iteration variable from its |
| 64 | /// induction variable. For the above example, the `indVarUpdateOps` is |
| 65 | /// populated with the first 2 ops in the loop's body. |
| 66 | /// |
| 67 | /// Note: The current implementation is dependent on how flang emits loop |
| 68 | /// bodies; which is sufficient for the current simple test/use cases. If this |
| 69 | /// proves to be insufficient, this should be made more generic. |
| 70 | void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) { |
| 71 | mlir::Value result = nullptr; |
| 72 | |
| 73 | // Checks if a StoreOp is updating the memref of the loop's iteration |
| 74 | // variable. |
| 75 | auto isStoringIV = [&](fir::StoreOp storeOp) { |
| 76 | // Direct store into the IV memref. |
| 77 | if (storeOp.getValue() == inductionVar) { |
| 78 | indVarUpdateOps.push_back(storeOp); |
| 79 | return true; |
| 80 | } |
| 81 | |
| 82 | // Indirect store into the IV memref. |
| 83 | if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>( |
| 84 | storeOp.getValue().getDefiningOp())) { |
| 85 | if (convertOp.getOperand() == inductionVar) { |
| 86 | indVarUpdateOps.push_back(convertOp); |
| 87 | indVarUpdateOps.push_back(storeOp); |
| 88 | return true; |
| 89 | } |
| 90 | } |
| 91 | |
| 92 | return false; |
| 93 | }; |
| 94 | |
| 95 | for (mlir::Operation &op : loop) { |
| 96 | if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op)) |
| 97 | if (isStoringIV(storeOp)) { |
| 98 | result = storeOp.getMemref(); |
| 99 | break; |
| 100 | } |
| 101 | } |
| 102 | |
| 103 | assert(result != nullptr && result.getDefiningOp() != nullptr); |
| 104 | iterVarMemDef = result.getDefiningOp(); |
| 105 | } |
| 106 | }; |
| 107 | |
| 108 | using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>; |
| 109 | |
| 110 | /// Collects values that are local to a loop: "loop-local values". A loop-local |
| 111 | /// value is one that is used exclusively inside the loop but allocated outside |
| 112 | /// of it. This usually corresponds to temporary values that are used inside the |
| 113 | /// loop body for initialzing other variables for example. |
| 114 | /// |
| 115 | /// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an |
| 116 | /// example of why we need this. |
| 117 | /// |
| 118 | /// \param [in] doLoop - the loop within which the function searches for values |
| 119 | /// used exclusively inside. |
| 120 | /// |
| 121 | /// \param [out] locals - the list of loop-local values detected for \p doLoop. |
| 122 | void collectLoopLocalValues(fir::DoConcurrentLoopOp loop, |
| 123 | llvm::SetVector<mlir::Value> &locals) { |
| 124 | loop.walk([&](mlir::Operation *op) { |
| 125 | for (mlir::Value operand : op->getOperands()) { |
| 126 | if (locals.contains(operand)) |
| 127 | continue; |
| 128 | |
| 129 | bool isLocal = true; |
| 130 | |
| 131 | if (!mlir::isa_and_present<fir::AllocaOp>(operand.getDefiningOp())) |
| 132 | continue; |
| 133 | |
| 134 | // Values defined inside the loop are not interesting since they do not |
| 135 | // need to be localized. |
| 136 | if (loop->isAncestor(operand.getDefiningOp())) |
| 137 | continue; |
| 138 | |
| 139 | for (auto *user : operand.getUsers()) { |
| 140 | if (!loop->isAncestor(user)) { |
| 141 | isLocal = false; |
| 142 | break; |
| 143 | } |
| 144 | } |
| 145 | |
| 146 | if (isLocal) |
| 147 | locals.insert(operand); |
| 148 | } |
| 149 | }); |
| 150 | } |
| 151 | |
| 152 | /// For a "loop-local" value \p local within a loop's scope, localizes that |
| 153 | /// value within the scope of the parallel region the loop maps to. Towards that |
| 154 | /// end, this function moves the allocation of \p local within \p allocRegion. |
| 155 | /// |
| 156 | /// \param local - the value used exclusively within a loop's scope (see |
| 157 | /// collectLoopLocalValues). |
| 158 | /// |
| 159 | /// \param allocRegion - the parallel region where \p local's allocation will be |
| 160 | /// privatized. |
| 161 | /// |
| 162 | /// \param rewriter - builder used for updating \p allocRegion. |
| 163 | static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion, |
| 164 | mlir::ConversionPatternRewriter &rewriter) { |
| 165 | rewriter.moveOpBefore(local.getDefiningOp(), &allocRegion.front().front()); |
| 166 | } |
| 167 | } // namespace looputils |
| 168 | |
| 169 | class DoConcurrentConversion |
| 170 | : public mlir::OpConversionPattern<fir::DoConcurrentOp> { |
| 171 | public: |
| 172 | using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern; |
| 173 | |
| 174 | DoConcurrentConversion( |
| 175 | mlir::MLIRContext *context, bool mapToDevice, |
| 176 | llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip) |
| 177 | : OpConversionPattern(context), mapToDevice(mapToDevice), |
| 178 | concurrentLoopsToSkip(concurrentLoopsToSkip) {} |
| 179 | |
| 180 | mlir::LogicalResult |
| 181 | matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor, |
| 182 | mlir::ConversionPatternRewriter &rewriter) const override { |
| 183 | if (mapToDevice) |
| 184 | return doLoop.emitError( |
| 185 | "not yet implemented: Mapping `do concurrent` loops to device" ); |
| 186 | |
| 187 | looputils::InductionVariableInfos ivInfos; |
| 188 | auto loop = mlir::cast<fir::DoConcurrentLoopOp>( |
| 189 | doLoop.getRegion().back().getTerminator()); |
| 190 | |
| 191 | auto indVars = loop.getLoopInductionVars(); |
| 192 | assert(indVars.has_value()); |
| 193 | |
| 194 | for (mlir::Value indVar : *indVars) |
| 195 | ivInfos.emplace_back(loop, indVar); |
| 196 | |
| 197 | llvm::SetVector<mlir::Value> locals; |
| 198 | looputils::collectLoopLocalValues(loop, locals); |
| 199 | |
| 200 | mlir::IRMapping mapper; |
| 201 | mlir::omp::ParallelOp parallelOp = |
| 202 | genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper); |
| 203 | mlir::omp::LoopNestOperands loopNestClauseOps; |
| 204 | genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper, |
| 205 | loopNestClauseOps); |
| 206 | |
| 207 | for (mlir::Value local : locals) |
| 208 | looputils::localizeLoopLocalValue(local, parallelOp.getRegion(), |
| 209 | rewriter); |
| 210 | |
| 211 | mlir::omp::LoopNestOp ompLoopNest = |
| 212 | genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps, |
| 213 | /*isComposite=*/mapToDevice); |
| 214 | |
| 215 | rewriter.setInsertionPoint(doLoop); |
| 216 | fir::FirOpBuilder builder( |
| 217 | rewriter, |
| 218 | fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>())); |
| 219 | |
| 220 | // Collect iteration variable(s) allocations so that we can move them |
| 221 | // outside the `fir.do_concurrent` wrapper (before erasing it). |
| 222 | llvm::SmallVector<mlir::Operation *> opsToMove; |
| 223 | for (mlir::Operation &op : llvm::drop_end(doLoop)) |
| 224 | opsToMove.push_back(&op); |
| 225 | |
| 226 | mlir::Block *allocBlock = builder.getAllocaBlock(); |
| 227 | |
| 228 | for (mlir::Operation *op : llvm::reverse(opsToMove)) { |
| 229 | rewriter.moveOpBefore(op, allocBlock, allocBlock->begin()); |
| 230 | } |
| 231 | |
| 232 | // Mark `unordered` loops that are not perfectly nested to be skipped from |
| 233 | // the legality check of the `ConversionTarget` since we are not interested |
| 234 | // in mapping them to OpenMP. |
| 235 | ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) { |
| 236 | concurrentLoopsToSkip.insert(doLoop); |
| 237 | }); |
| 238 | |
| 239 | rewriter.eraseOp(doLoop); |
| 240 | |
| 241 | return mlir::success(); |
| 242 | } |
| 243 | |
| 244 | private: |
| 245 | mlir::omp::ParallelOp |
| 246 | genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, |
| 247 | looputils::InductionVariableInfos &ivInfos, |
| 248 | mlir::IRMapping &mapper) const { |
| 249 | auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc); |
| 250 | rewriter.createBlock(¶llelOp.getRegion()); |
| 251 | rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc)); |
| 252 | |
| 253 | genLoopNestIndVarAllocs(rewriter, ivInfos, mapper); |
| 254 | return parallelOp; |
| 255 | } |
| 256 | |
| 257 | void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter, |
| 258 | looputils::InductionVariableInfos &ivInfos, |
| 259 | mlir::IRMapping &mapper) const { |
| 260 | |
| 261 | for (auto &indVarInfo : ivInfos) |
| 262 | genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper); |
| 263 | } |
| 264 | |
| 265 | mlir::Operation * |
| 266 | genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter, |
| 267 | mlir::Operation *indVarMemDef, |
| 268 | mlir::IRMapping &mapper) const { |
| 269 | assert( |
| 270 | indVarMemDef != nullptr && |
| 271 | "Induction variable memdef is expected to have a defining operation." ); |
| 272 | |
| 273 | llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc; |
| 274 | for (auto operand : indVarMemDef->getOperands()) |
| 275 | indVarDeclareAndAlloc.insert(operand.getDefiningOp()); |
| 276 | indVarDeclareAndAlloc.insert(indVarMemDef); |
| 277 | |
| 278 | mlir::Operation *result; |
| 279 | for (mlir::Operation *opToClone : indVarDeclareAndAlloc) |
| 280 | result = rewriter.clone(*opToClone, mapper); |
| 281 | |
| 282 | return result; |
| 283 | } |
| 284 | |
| 285 | void |
| 286 | genLoopNestClauseOps(mlir::Location loc, |
| 287 | mlir::ConversionPatternRewriter &rewriter, |
| 288 | fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper, |
| 289 | mlir::omp::LoopNestOperands &loopNestClauseOps) const { |
| 290 | assert(loopNestClauseOps.loopLowerBounds.empty() && |
| 291 | "Loop nest bounds were already emitted!" ); |
| 292 | |
| 293 | auto populateBounds = [](mlir::Value var, |
| 294 | llvm::SmallVectorImpl<mlir::Value> &bounds) { |
| 295 | bounds.push_back(var.getDefiningOp()->getResult(0)); |
| 296 | }; |
| 297 | |
| 298 | for (auto [lb, ub, st] : llvm::zip_equal( |
| 299 | loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) { |
| 300 | populateBounds(lb, loopNestClauseOps.loopLowerBounds); |
| 301 | populateBounds(ub, loopNestClauseOps.loopUpperBounds); |
| 302 | populateBounds(st, loopNestClauseOps.loopSteps); |
| 303 | } |
| 304 | |
| 305 | loopNestClauseOps.loopInclusive = rewriter.getUnitAttr(); |
| 306 | } |
| 307 | |
| 308 | mlir::omp::LoopNestOp |
| 309 | genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, |
| 310 | fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper, |
| 311 | const mlir::omp::LoopNestOperands &clauseOps, |
| 312 | bool isComposite) const { |
| 313 | mlir::omp::WsloopOperands wsloopClauseOps; |
| 314 | |
| 315 | // For `local` (and `local_init`) opernads, emit corresponding `private` |
| 316 | // clauses and attach these clauses to the workshare loop. |
| 317 | if (!loop.getLocalOperands().empty()) |
| 318 | for (auto [op, sym, arg] : llvm::zip_equal( |
| 319 | loop.getLocalOperands(), |
| 320 | loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(), |
| 321 | loop.getRegionLocalArgs())) { |
| 322 | auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom< |
| 323 | fir::LocalitySpecifierOp>(loop, sym); |
| 324 | if (localizer.getLocalitySpecifierType() == |
| 325 | fir::LocalitySpecifierType::LocalInit) |
| 326 | TODO(localizer.getLoc(), |
| 327 | "local_init conversion is not supported yet" ); |
| 328 | |
| 329 | auto oldIP = rewriter.saveInsertionPoint(); |
| 330 | rewriter.setInsertionPointAfter(localizer); |
| 331 | auto privatizer = rewriter.create<mlir::omp::PrivateClauseOp>( |
| 332 | localizer.getLoc(), sym.getLeafReference().str() + ".omp" , |
| 333 | localizer.getTypeAttr().getValue(), |
| 334 | mlir::omp::DataSharingClauseType::Private); |
| 335 | |
| 336 | if (!localizer.getInitRegion().empty()) { |
| 337 | rewriter.cloneRegionBefore(localizer.getInitRegion(), |
| 338 | privatizer.getInitRegion(), |
| 339 | privatizer.getInitRegion().begin()); |
| 340 | auto firYield = mlir::cast<fir::YieldOp>( |
| 341 | privatizer.getInitRegion().back().getTerminator()); |
| 342 | rewriter.setInsertionPoint(firYield); |
| 343 | rewriter.create<mlir::omp::YieldOp>(firYield.getLoc(), |
| 344 | firYield.getOperands()); |
| 345 | rewriter.eraseOp(firYield); |
| 346 | } |
| 347 | |
| 348 | if (!localizer.getDeallocRegion().empty()) { |
| 349 | rewriter.cloneRegionBefore(localizer.getDeallocRegion(), |
| 350 | privatizer.getDeallocRegion(), |
| 351 | privatizer.getDeallocRegion().begin()); |
| 352 | auto firYield = mlir::cast<fir::YieldOp>( |
| 353 | privatizer.getDeallocRegion().back().getTerminator()); |
| 354 | rewriter.setInsertionPoint(firYield); |
| 355 | rewriter.create<mlir::omp::YieldOp>(firYield.getLoc(), |
| 356 | firYield.getOperands()); |
| 357 | rewriter.eraseOp(firYield); |
| 358 | } |
| 359 | |
| 360 | rewriter.restoreInsertionPoint(oldIP); |
| 361 | |
| 362 | wsloopClauseOps.privateVars.push_back(op); |
| 363 | wsloopClauseOps.privateSyms.push_back( |
| 364 | mlir::SymbolRefAttr::get(privatizer)); |
| 365 | } |
| 366 | |
| 367 | auto wsloopOp = |
| 368 | rewriter.create<mlir::omp::WsloopOp>(loop.getLoc(), wsloopClauseOps); |
| 369 | wsloopOp.setComposite(isComposite); |
| 370 | |
| 371 | Fortran::common::openmp::EntryBlockArgs wsloopArgs; |
| 372 | wsloopArgs.priv.vars = wsloopClauseOps.privateVars; |
| 373 | Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs, |
| 374 | wsloopOp.getRegion()); |
| 375 | |
| 376 | auto loopNestOp = |
| 377 | rewriter.create<mlir::omp::LoopNestOp>(loop.getLoc(), clauseOps); |
| 378 | |
| 379 | // Clone the loop's body inside the loop nest construct using the |
| 380 | // mapped values. |
| 381 | rewriter.cloneRegionBefore(loop.getRegion(), loopNestOp.getRegion(), |
| 382 | loopNestOp.getRegion().begin(), mapper); |
| 383 | |
| 384 | rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back()); |
| 385 | rewriter.create<mlir::omp::YieldOp>(loop->getLoc()); |
| 386 | |
| 387 | // `local` region arguments are transferred/cloned from the `do concurrent` |
| 388 | // loop to the loopnest op when the region is cloned above. Instead, these |
| 389 | // region arguments should be on the workshare loop's region. |
| 390 | for (auto [wsloopArg, loopNestArg] : |
| 391 | llvm::zip_equal(wsloopOp.getRegion().getArguments(), |
| 392 | loopNestOp.getRegion().getArguments().drop_front( |
| 393 | clauseOps.loopLowerBounds.size()))) |
| 394 | rewriter.replaceAllUsesWith(loopNestArg, wsloopArg); |
| 395 | |
| 396 | for (unsigned i = 0; i < loop.getLocalVars().size(); ++i) |
| 397 | loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size()); |
| 398 | |
| 399 | return loopNestOp; |
| 400 | } |
| 401 | |
| 402 | bool mapToDevice; |
| 403 | llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip; |
| 404 | }; |
| 405 | |
| 406 | class DoConcurrentConversionPass |
| 407 | : public flangomp::impl::DoConcurrentConversionPassBase< |
| 408 | DoConcurrentConversionPass> { |
| 409 | public: |
| 410 | DoConcurrentConversionPass() = default; |
| 411 | |
| 412 | DoConcurrentConversionPass( |
| 413 | const flangomp::DoConcurrentConversionPassOptions &options) |
| 414 | : DoConcurrentConversionPassBase(options) {} |
| 415 | |
| 416 | void runOnOperation() override { |
| 417 | mlir::func::FuncOp func = getOperation(); |
| 418 | |
| 419 | if (func.isDeclaration()) |
| 420 | return; |
| 421 | |
| 422 | mlir::MLIRContext *context = &getContext(); |
| 423 | |
| 424 | if (mapTo != flangomp::DoConcurrentMappingKind::DCMK_Host && |
| 425 | mapTo != flangomp::DoConcurrentMappingKind::DCMK_Device) { |
| 426 | mlir::emitWarning(mlir::UnknownLoc::get(context), |
| 427 | "DoConcurrentConversionPass: invalid `map-to` value. " |
| 428 | "Valid values are: `host` or `device`" ); |
| 429 | return; |
| 430 | } |
| 431 | |
| 432 | llvm::DenseSet<fir::DoConcurrentOp> concurrentLoopsToSkip; |
| 433 | mlir::RewritePatternSet patterns(context); |
| 434 | patterns.insert<DoConcurrentConversion>( |
| 435 | context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device, |
| 436 | concurrentLoopsToSkip); |
| 437 | mlir::ConversionTarget target(*context); |
| 438 | target.addDynamicallyLegalOp<fir::DoConcurrentOp>( |
| 439 | [&](fir::DoConcurrentOp op) { |
| 440 | return concurrentLoopsToSkip.contains(op); |
| 441 | }); |
| 442 | target.markUnknownOpDynamicallyLegal( |
| 443 | [](mlir::Operation *) { return true; }); |
| 444 | |
| 445 | if (mlir::failed(mlir::applyFullConversion(getOperation(), target, |
| 446 | std::move(patterns)))) { |
| 447 | signalPassFailure(); |
| 448 | } |
| 449 | } |
| 450 | }; |
| 451 | } // namespace |
| 452 | |
| 453 | std::unique_ptr<mlir::Pass> |
| 454 | flangomp::createDoConcurrentConversionPass(bool mapToDevice) { |
| 455 | DoConcurrentConversionPassOptions options; |
| 456 | options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device |
| 457 | : flangomp::DoConcurrentMappingKind::DCMK_Host; |
| 458 | |
| 459 | return std::make_unique<DoConcurrentConversionPass>(options); |
| 460 | } |
| 461 | |