1//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "flang/Optimizer/Builder/FIRBuilder.h"
10#include "flang/Optimizer/Builder/Todo.h"
11#include "flang/Optimizer/Dialect/FIROps.h"
12#include "flang/Optimizer/OpenMP/Passes.h"
13#include "flang/Optimizer/OpenMP/Utils.h"
14#include "flang/Support/OpenMP-utils.h"
15#include "mlir/Analysis/SliceAnalysis.h"
16#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
17#include "mlir/IR/IRMapping.h"
18#include "mlir/Transforms/DialectConversion.h"
19#include "mlir/Transforms/RegionUtils.h"
20
21namespace flangomp {
22#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
23#include "flang/Optimizer/OpenMP/Passes.h.inc"
24} // namespace flangomp
25
26#define DEBUG_TYPE "do-concurrent-conversion"
27#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")
28
29namespace {
30namespace looputils {
31/// Stores info needed about the induction/iteration variable for each `do
32/// concurrent` in a loop nest.
33struct InductionVariableInfo {
34 InductionVariableInfo(fir::DoConcurrentLoopOp loop,
35 mlir::Value inductionVar) {
36 populateInfo(loop, inductionVar);
37 }
38 /// The operation allocating memory for iteration variable.
39 mlir::Operation *iterVarMemDef;
40 /// the operation(s) updating the iteration variable with the current
41 /// iteration number.
42 llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps;
43
44private:
45 /// For the \p doLoop parameter, find the following:
46 ///
47 /// 1. The operation that declares its iteration variable or allocates memory
48 /// for it. For example, give the following loop:
49 /// ```
50 /// ...
51 /// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
52 /// ...
53 /// fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) {
54 /// %ind_var_conv = fir.convert %ind_var : (index) -> i32
55 /// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
56 /// ...
57 /// }
58 /// ```
59 ///
60 /// This function sets the `iterVarMemDef` member to the `hlfir.declare` op
61 /// for `%i`.
62 ///
63 /// 2. The operation(s) that update the loop's iteration variable from its
64 /// induction variable. For the above example, the `indVarUpdateOps` is
65 /// populated with the first 2 ops in the loop's body.
66 ///
67 /// Note: The current implementation is dependent on how flang emits loop
68 /// bodies; which is sufficient for the current simple test/use cases. If this
69 /// proves to be insufficient, this should be made more generic.
70 void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) {
71 mlir::Value result = nullptr;
72
73 // Checks if a StoreOp is updating the memref of the loop's iteration
74 // variable.
75 auto isStoringIV = [&](fir::StoreOp storeOp) {
76 // Direct store into the IV memref.
77 if (storeOp.getValue() == inductionVar) {
78 indVarUpdateOps.push_back(storeOp);
79 return true;
80 }
81
82 // Indirect store into the IV memref.
83 if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
84 storeOp.getValue().getDefiningOp())) {
85 if (convertOp.getOperand() == inductionVar) {
86 indVarUpdateOps.push_back(convertOp);
87 indVarUpdateOps.push_back(storeOp);
88 return true;
89 }
90 }
91
92 return false;
93 };
94
95 for (mlir::Operation &op : loop) {
96 if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
97 if (isStoringIV(storeOp)) {
98 result = storeOp.getMemref();
99 break;
100 }
101 }
102
103 assert(result != nullptr && result.getDefiningOp() != nullptr);
104 iterVarMemDef = result.getDefiningOp();
105 }
106};
107
108using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>;
109
110/// Collects values that are local to a loop: "loop-local values". A loop-local
111/// value is one that is used exclusively inside the loop but allocated outside
112/// of it. This usually corresponds to temporary values that are used inside the
113/// loop body for initialzing other variables for example.
114///
115/// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an
116/// example of why we need this.
117///
118/// \param [in] doLoop - the loop within which the function searches for values
119/// used exclusively inside.
120///
121/// \param [out] locals - the list of loop-local values detected for \p doLoop.
122void collectLoopLocalValues(fir::DoConcurrentLoopOp loop,
123 llvm::SetVector<mlir::Value> &locals) {
124 loop.walk([&](mlir::Operation *op) {
125 for (mlir::Value operand : op->getOperands()) {
126 if (locals.contains(operand))
127 continue;
128
129 bool isLocal = true;
130
131 if (!mlir::isa_and_present<fir::AllocaOp>(operand.getDefiningOp()))
132 continue;
133
134 // Values defined inside the loop are not interesting since they do not
135 // need to be localized.
136 if (loop->isAncestor(operand.getDefiningOp()))
137 continue;
138
139 for (auto *user : operand.getUsers()) {
140 if (!loop->isAncestor(user)) {
141 isLocal = false;
142 break;
143 }
144 }
145
146 if (isLocal)
147 locals.insert(operand);
148 }
149 });
150}
151
152/// For a "loop-local" value \p local within a loop's scope, localizes that
153/// value within the scope of the parallel region the loop maps to. Towards that
154/// end, this function moves the allocation of \p local within \p allocRegion.
155///
156/// \param local - the value used exclusively within a loop's scope (see
157/// collectLoopLocalValues).
158///
159/// \param allocRegion - the parallel region where \p local's allocation will be
160/// privatized.
161///
162/// \param rewriter - builder used for updating \p allocRegion.
163static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
164 mlir::ConversionPatternRewriter &rewriter) {
165 rewriter.moveOpBefore(local.getDefiningOp(), &allocRegion.front().front());
166}
167} // namespace looputils
168
169class DoConcurrentConversion
170 : public mlir::OpConversionPattern<fir::DoConcurrentOp> {
171public:
172 using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern;
173
174 DoConcurrentConversion(
175 mlir::MLIRContext *context, bool mapToDevice,
176 llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip)
177 : OpConversionPattern(context), mapToDevice(mapToDevice),
178 concurrentLoopsToSkip(concurrentLoopsToSkip) {}
179
180 mlir::LogicalResult
181 matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
182 mlir::ConversionPatternRewriter &rewriter) const override {
183 if (mapToDevice)
184 return doLoop.emitError(
185 "not yet implemented: Mapping `do concurrent` loops to device");
186
187 looputils::InductionVariableInfos ivInfos;
188 auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
189 doLoop.getRegion().back().getTerminator());
190
191 auto indVars = loop.getLoopInductionVars();
192 assert(indVars.has_value());
193
194 for (mlir::Value indVar : *indVars)
195 ivInfos.emplace_back(loop, indVar);
196
197 llvm::SetVector<mlir::Value> locals;
198 looputils::collectLoopLocalValues(loop, locals);
199
200 mlir::IRMapping mapper;
201 mlir::omp::ParallelOp parallelOp =
202 genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
203 mlir::omp::LoopNestOperands loopNestClauseOps;
204 genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
205 loopNestClauseOps);
206
207 for (mlir::Value local : locals)
208 looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
209 rewriter);
210
211 mlir::omp::LoopNestOp ompLoopNest =
212 genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
213 /*isComposite=*/mapToDevice);
214
215 rewriter.setInsertionPoint(doLoop);
216 fir::FirOpBuilder builder(
217 rewriter,
218 fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));
219
220 // Collect iteration variable(s) allocations so that we can move them
221 // outside the `fir.do_concurrent` wrapper (before erasing it).
222 llvm::SmallVector<mlir::Operation *> opsToMove;
223 for (mlir::Operation &op : llvm::drop_end(doLoop))
224 opsToMove.push_back(&op);
225
226 mlir::Block *allocBlock = builder.getAllocaBlock();
227
228 for (mlir::Operation *op : llvm::reverse(opsToMove)) {
229 rewriter.moveOpBefore(op, allocBlock, allocBlock->begin());
230 }
231
232 // Mark `unordered` loops that are not perfectly nested to be skipped from
233 // the legality check of the `ConversionTarget` since we are not interested
234 // in mapping them to OpenMP.
235 ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) {
236 concurrentLoopsToSkip.insert(doLoop);
237 });
238
239 rewriter.eraseOp(doLoop);
240
241 return mlir::success();
242 }
243
244private:
245 mlir::omp::ParallelOp
246 genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
247 looputils::InductionVariableInfos &ivInfos,
248 mlir::IRMapping &mapper) const {
249 auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
250 rewriter.createBlock(&parallelOp.getRegion());
251 rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
252
253 genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
254 return parallelOp;
255 }
256
257 void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
258 looputils::InductionVariableInfos &ivInfos,
259 mlir::IRMapping &mapper) const {
260
261 for (auto &indVarInfo : ivInfos)
262 genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
263 }
264
265 mlir::Operation *
266 genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
267 mlir::Operation *indVarMemDef,
268 mlir::IRMapping &mapper) const {
269 assert(
270 indVarMemDef != nullptr &&
271 "Induction variable memdef is expected to have a defining operation.");
272
273 llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
274 for (auto operand : indVarMemDef->getOperands())
275 indVarDeclareAndAlloc.insert(operand.getDefiningOp());
276 indVarDeclareAndAlloc.insert(indVarMemDef);
277
278 mlir::Operation *result;
279 for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
280 result = rewriter.clone(*opToClone, mapper);
281
282 return result;
283 }
284
285 void
286 genLoopNestClauseOps(mlir::Location loc,
287 mlir::ConversionPatternRewriter &rewriter,
288 fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
289 mlir::omp::LoopNestOperands &loopNestClauseOps) const {
290 assert(loopNestClauseOps.loopLowerBounds.empty() &&
291 "Loop nest bounds were already emitted!");
292
293 auto populateBounds = [](mlir::Value var,
294 llvm::SmallVectorImpl<mlir::Value> &bounds) {
295 bounds.push_back(var.getDefiningOp()->getResult(0));
296 };
297
298 for (auto [lb, ub, st] : llvm::zip_equal(
299 loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
300 populateBounds(lb, loopNestClauseOps.loopLowerBounds);
301 populateBounds(ub, loopNestClauseOps.loopUpperBounds);
302 populateBounds(st, loopNestClauseOps.loopSteps);
303 }
304
305 loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
306 }
307
308 mlir::omp::LoopNestOp
309 genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
310 fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
311 const mlir::omp::LoopNestOperands &clauseOps,
312 bool isComposite) const {
313 mlir::omp::WsloopOperands wsloopClauseOps;
314
315 auto cloneFIRRegionToOMP = [&rewriter](mlir::Region &firRegion,
316 mlir::Region &ompRegion) {
317 if (!firRegion.empty()) {
318 rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());
319 auto firYield =
320 mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator());
321 rewriter.setInsertionPoint(firYield);
322 rewriter.create<mlir::omp::YieldOp>(firYield.getLoc(),
323 firYield.getOperands());
324 rewriter.eraseOp(firYield);
325 }
326 };
327
328 // For `local` (and `local_init`) opernads, emit corresponding `private`
329 // clauses and attach these clauses to the workshare loop.
330 if (!loop.getLocalVars().empty())
331 for (auto [op, sym, arg] : llvm::zip_equal(
332 loop.getLocalVars(),
333 loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
334 loop.getRegionLocalArgs())) {
335 auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom<
336 fir::LocalitySpecifierOp>(loop, sym);
337 if (localizer.getLocalitySpecifierType() ==
338 fir::LocalitySpecifierType::LocalInit)
339 TODO(localizer.getLoc(),
340 "local_init conversion is not supported yet");
341
342 mlir::OpBuilder::InsertionGuard guard(rewriter);
343 rewriter.setInsertionPointAfter(localizer);
344
345 auto privatizer = rewriter.create<mlir::omp::PrivateClauseOp>(
346 localizer.getLoc(), sym.getLeafReference().str() + ".omp",
347 localizer.getTypeAttr().getValue(),
348 mlir::omp::DataSharingClauseType::Private);
349
350 cloneFIRRegionToOMP(localizer.getInitRegion(),
351 privatizer.getInitRegion());
352 cloneFIRRegionToOMP(localizer.getDeallocRegion(),
353 privatizer.getDeallocRegion());
354
355 wsloopClauseOps.privateVars.push_back(op);
356 wsloopClauseOps.privateSyms.push_back(
357 mlir::SymbolRefAttr::get(privatizer));
358 }
359
360 if (!loop.getReduceVars().empty()) {
361 for (auto [op, byRef, sym, arg] : llvm::zip_equal(
362 loop.getReduceVars(), loop.getReduceByrefAttr().asArrayRef(),
363 loop.getReduceSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
364 loop.getRegionReduceArgs())) {
365 auto firReducer =
366 mlir::SymbolTable::lookupNearestSymbolFrom<fir::DeclareReductionOp>(
367 loop, sym);
368
369 mlir::OpBuilder::InsertionGuard guard(rewriter);
370 rewriter.setInsertionPointAfter(firReducer);
371
372 auto ompReducer = rewriter.create<mlir::omp::DeclareReductionOp>(
373 firReducer.getLoc(), sym.getLeafReference().str() + ".omp",
374 firReducer.getTypeAttr().getValue());
375
376 cloneFIRRegionToOMP(firReducer.getAllocRegion(),
377 ompReducer.getAllocRegion());
378 cloneFIRRegionToOMP(firReducer.getInitializerRegion(),
379 ompReducer.getInitializerRegion());
380 cloneFIRRegionToOMP(firReducer.getReductionRegion(),
381 ompReducer.getReductionRegion());
382 cloneFIRRegionToOMP(firReducer.getAtomicReductionRegion(),
383 ompReducer.getAtomicReductionRegion());
384 cloneFIRRegionToOMP(firReducer.getCleanupRegion(),
385 ompReducer.getCleanupRegion());
386
387 wsloopClauseOps.reductionVars.push_back(op);
388 wsloopClauseOps.reductionByref.push_back(byRef);
389 wsloopClauseOps.reductionSyms.push_back(
390 mlir::SymbolRefAttr::get(ompReducer));
391 }
392 }
393
394 auto wsloopOp =
395 rewriter.create<mlir::omp::WsloopOp>(loop.getLoc(), wsloopClauseOps);
396 wsloopOp.setComposite(isComposite);
397
398 Fortran::common::openmp::EntryBlockArgs wsloopArgs;
399 wsloopArgs.priv.vars = wsloopClauseOps.privateVars;
400 wsloopArgs.reduction.vars = wsloopClauseOps.reductionVars;
401 Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs,
402 wsloopOp.getRegion());
403
404 auto loopNestOp =
405 rewriter.create<mlir::omp::LoopNestOp>(loop.getLoc(), clauseOps);
406
407 // Clone the loop's body inside the loop nest construct using the
408 // mapped values.
409 rewriter.cloneRegionBefore(loop.getRegion(), loopNestOp.getRegion(),
410 loopNestOp.getRegion().begin(), mapper);
411
412 rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
413 rewriter.create<mlir::omp::YieldOp>(loop->getLoc());
414
415 // `local` region arguments are transferred/cloned from the `do concurrent`
416 // loop to the loopnest op when the region is cloned above. Instead, these
417 // region arguments should be on the workshare loop's region.
418 for (auto [wsloopArg, loopNestArg] :
419 llvm::zip_equal(wsloopOp.getRegion().getArguments(),
420 loopNestOp.getRegion().getArguments().drop_front(
421 clauseOps.loopLowerBounds.size())))
422 rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
423
424 for (unsigned i = 0;
425 i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
426 loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size());
427
428 return loopNestOp;
429 }
430
431 bool mapToDevice;
432 llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
433};
434
435class DoConcurrentConversionPass
436 : public flangomp::impl::DoConcurrentConversionPassBase<
437 DoConcurrentConversionPass> {
438public:
439 DoConcurrentConversionPass() = default;
440
441 DoConcurrentConversionPass(
442 const flangomp::DoConcurrentConversionPassOptions &options)
443 : DoConcurrentConversionPassBase(options) {}
444
445 void runOnOperation() override {
446 mlir::func::FuncOp func = getOperation();
447
448 if (func.isDeclaration())
449 return;
450
451 mlir::MLIRContext *context = &getContext();
452
453 if (mapTo != flangomp::DoConcurrentMappingKind::DCMK_Host &&
454 mapTo != flangomp::DoConcurrentMappingKind::DCMK_Device) {
455 mlir::emitWarning(mlir::UnknownLoc::get(context),
456 "DoConcurrentConversionPass: invalid `map-to` value. "
457 "Valid values are: `host` or `device`");
458 return;
459 }
460
461 llvm::DenseSet<fir::DoConcurrentOp> concurrentLoopsToSkip;
462 mlir::RewritePatternSet patterns(context);
463 patterns.insert<DoConcurrentConversion>(
464 context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
465 concurrentLoopsToSkip);
466 mlir::ConversionTarget target(*context);
467 target.addDynamicallyLegalOp<fir::DoConcurrentOp>(
468 [&](fir::DoConcurrentOp op) {
469 return concurrentLoopsToSkip.contains(op);
470 });
471 target.markUnknownOpDynamicallyLegal(
472 [](mlir::Operation *) { return true; });
473
474 if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
475 std::move(patterns)))) {
476 signalPassFailure();
477 }
478 }
479};
480} // namespace
481
482std::unique_ptr<mlir::Pass>
483flangomp::createDoConcurrentConversionPass(bool mapToDevice) {
484 DoConcurrentConversionPassOptions options;
485 options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device
486 : flangomp::DoConcurrentMappingKind::DCMK_Host;
487
488 return std::make_unique<DoConcurrentConversionPass>(options);
489}
490

// Source: flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp