//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/OpenMP/Passes.h"
#include "flang/Optimizer/OpenMP/Utils.h"
#include "flang/Support/OpenMP-utils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/RegionUtils.h"

namespace flangomp {
#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
#include "flang/Optimizer/OpenMP/Passes.h.inc"
} // namespace flangomp

#define DEBUG_TYPE "do-concurrent-conversion"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")

namespace {
namespace looputils {
/// Stores info needed about the induction/iteration variable of each `do
/// concurrent` loop in a loop nest.
struct InductionVariableInfo {
  InductionVariableInfo(fir::DoConcurrentLoopOp loop,
                        mlir::Value inductionVar) {
    populateInfo(loop, inductionVar);
  }
  /// The operation allocating memory for the iteration variable.
  mlir::Operation *iterVarMemDef;
  /// The operation(s) updating the iteration variable with the current
  /// iteration number.
  llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps;

private:
  /// For the \p loop parameter, find the following:
  ///
  /// 1. The operation that declares its iteration variable or allocates memory
  /// for it. For example, given the following loop:
  /// ```
  /// ...
  /// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
  /// ...
  /// fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) {
  ///   %ind_var_conv = fir.convert %ind_var : (index) -> i32
  ///   fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
  ///   ...
  /// }
  /// ```
  ///
  /// This function sets the `iterVarMemDef` member to the `hlfir.declare` op
  /// for `%i`.
  ///
  /// 2. The operation(s) that update the loop's iteration variable from its
  /// induction variable. For the above example, `indVarUpdateOps` is populated
  /// with the first 2 ops in the loop's body.
  ///
  /// Note: The current implementation depends on how flang emits loop bodies,
  /// which is sufficient for the current simple test/use cases. If this proves
  /// to be insufficient, this should be made more generic.
  void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) {
    mlir::Value result = nullptr;

    // Checks if a StoreOp is updating the memref of the loop's iteration
    // variable.
    auto isStoringIV = [&](fir::StoreOp storeOp) {
      // Direct store into the IV memref.
      if (storeOp.getValue() == inductionVar) {
        indVarUpdateOps.push_back(storeOp);
        return true;
      }

      // Indirect store into the IV memref.
      if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
              storeOp.getValue().getDefiningOp())) {
        if (convertOp.getOperand() == inductionVar) {
          indVarUpdateOps.push_back(convertOp);
          indVarUpdateOps.push_back(storeOp);
          return true;
        }
      }

      return false;
    };

    for (mlir::Operation &op : loop) {
      if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
        if (isStoringIV(storeOp)) {
          result = storeOp.getMemref();
          break;
        }
    }

    assert(result != nullptr && result.getDefiningOp() != nullptr);
    iterVarMemDef = result.getDefiningOp();
  }
};

using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>;

/// Collects values that are local to a loop: "loop-local values". A loop-local
/// value is one that is used exclusively inside the loop but allocated outside
/// of it. This usually corresponds to temporary values that are used inside
/// the loop body for initializing other variables, for example.
///
/// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an
/// example of why we need this.
///
/// \param [in] loop - the loop within which the function searches for values
/// used exclusively inside.
///
/// \param [out] locals - the list of loop-local values detected for \p loop.
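///
/// As a rough illustration (a sketch with made-up SSA names, not IR taken from
/// any specific test), a loop-local value looks like `%temp` below: it is
/// defined by a `fir.alloca` outside the loop, yet all of its users live
/// inside the loop's region:
/// ```
/// %temp = fir.alloca i32
/// fir.do_concurrent.loop (%iv) = (%lb) to (%ub) step (%st) {
///   fir.store %something to %temp : !fir.ref<i32>
///   ...
/// }
/// ```
/// Such a value is recorded so that its allocation can later be moved into the
/// generated parallel region (see `localizeLoopLocalValue`).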
void collectLoopLocalValues(fir::DoConcurrentLoopOp loop,
                            llvm::SetVector<mlir::Value> &locals) {
  loop.walk([&](mlir::Operation *op) {
    for (mlir::Value operand : op->getOperands()) {
      if (locals.contains(operand))
        continue;

      bool isLocal = true;

      if (!mlir::isa_and_present<fir::AllocaOp>(operand.getDefiningOp()))
        continue;

      // Values defined inside the loop are not interesting since they do not
      // need to be localized.
      if (loop->isAncestor(operand.getDefiningOp()))
        continue;

      for (auto *user : operand.getUsers()) {
        if (!loop->isAncestor(user)) {
          isLocal = false;
          break;
        }
      }

      if (isLocal)
        locals.insert(operand);
    }
  });
}

/// For a "loop-local" value \p local within a loop's scope, localizes that
/// value within the scope of the parallel region the loop maps to. Towards
/// that end, this function moves the allocation of \p local within
/// \p allocRegion.
///
/// \param local - the value used exclusively within a loop's scope (see
/// collectLoopLocalValues).
///
/// \param allocRegion - the parallel region where \p local's allocation will
/// be privatized.
///
/// \param rewriter - builder used for updating \p allocRegion.
static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
                                   mlir::ConversionPatternRewriter &rewriter) {
  rewriter.moveOpBefore(local.getDefiningOp(), &allocRegion.front().front());
}
} // namespace looputils

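/// Pattern that converts a host-mapped `fir.do_concurrent` op into an
/// equivalent OpenMP worksharing construct. Roughly speaking (a simplified
/// sketch that leaves out induction-variable allocation, loop-local values,
/// and `local` specifier handling), input of the form:
/// ```
/// fir.do_concurrent {
///   %i = fir.alloca i32 {bindc_name = "i"}
///   fir.do_concurrent.loop (%iv) = (%lb) to (%ub) step (%st) {
///     ...
///   }
/// }
/// ```
/// is rewritten into something like:
/// ```
/// omp.parallel {
///   // Cloned allocation/declaration of the iteration variable(s).
///   omp.wsloop {
///     omp.loop_nest (%iv) : index = (%lb) to (%ub) inclusive step (%st) {
///       ...  // cloned loop body
///       omp.yield
///     }
///   }
///   omp.terminator
/// }
/// ```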
class DoConcurrentConversion
    : public mlir::OpConversionPattern<fir::DoConcurrentOp> {
public:
  using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern;

  DoConcurrentConversion(
      mlir::MLIRContext *context, bool mapToDevice,
      llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip)
      : OpConversionPattern(context), mapToDevice(mapToDevice),
        concurrentLoopsToSkip(concurrentLoopsToSkip) {}

  mlir::LogicalResult
  matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    if (mapToDevice)
      return doLoop.emitError(
          "not yet implemented: Mapping `do concurrent` loops to device");

    looputils::InductionVariableInfos ivInfos;
    auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
        doLoop.getRegion().back().getTerminator());

    auto indVars = loop.getLoopInductionVars();
    assert(indVars.has_value());

    for (mlir::Value indVar : *indVars)
      ivInfos.emplace_back(loop, indVar);

    llvm::SetVector<mlir::Value> locals;
    looputils::collectLoopLocalValues(loop, locals);

    mlir::IRMapping mapper;
    mlir::omp::ParallelOp parallelOp =
        genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
    mlir::omp::LoopNestOperands loopNestClauseOps;
    genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
                         loopNestClauseOps);

    for (mlir::Value local : locals)
      looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
                                        rewriter);

    mlir::omp::LoopNestOp ompLoopNest =
        genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
                    /*isComposite=*/mapToDevice);

    rewriter.setInsertionPoint(doLoop);
    fir::FirOpBuilder builder(
        rewriter,
        fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));

    // Collect the allocations of the iteration variable(s) so that we can move
    // them outside the `fir.do_concurrent` wrapper (before erasing it).
    llvm::SmallVector<mlir::Operation *> opsToMove;
    for (mlir::Operation &op : llvm::drop_end(doLoop))
      opsToMove.push_back(&op);

    mlir::Block *allocBlock = builder.getAllocaBlock();

    for (mlir::Operation *op : llvm::reverse(opsToMove)) {
      rewriter.moveOpBefore(op, allocBlock, allocBlock->begin());
    }

    // Mark `unordered` loops that are not perfectly nested to be skipped from
    // the legality check of the `ConversionTarget` since we are not interested
    // in mapping them to OpenMP.
    ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) {
      concurrentLoopsToSkip.insert(doLoop);
    });

    rewriter.eraseOp(doLoop);

    return mlir::success();
  }

private:
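  /// Creates the `omp.parallel` op that will host the converted loop nest. The
  /// allocation/declaration ops of the iteration variable(s) recorded in
  /// \p ivInfos are cloned into the parallel region (using \p mapper), and the
  /// rewriter's insertion point is left inside the region, just before its
  /// terminator, so that the workshare loop can be emitted there next.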
  mlir::omp::ParallelOp
  genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
                looputils::InductionVariableInfos &ivInfos,
                mlir::IRMapping &mapper) const {
    auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
    rewriter.createBlock(&parallelOp.getRegion());
    rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));

    genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
    return parallelOp;
  }

  void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
                               looputils::InductionVariableInfos &ivInfos,
                               mlir::IRMapping &mapper) const {
    for (auto &indVarInfo : ivInfos)
      genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
  }

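  /// Clones the op defining the memory of an induction variable (e.g. an
  /// `hlfir.declare`), together with the defining ops of its operands (e.g.
  /// the underlying `fir.alloca`), at the current insertion point. The
  /// old-to-new value mapping is recorded in \p mapper, and the last cloned op
  /// is returned.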
  mlir::Operation *
  genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
                            mlir::Operation *indVarMemDef,
                            mlir::IRMapping &mapper) const {
    assert(
        indVarMemDef != nullptr &&
        "Induction variable memdef is expected to have a defining operation.");

    llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
    for (auto operand : indVarMemDef->getOperands())
      indVarDeclareAndAlloc.insert(operand.getDefiningOp());
    indVarDeclareAndAlloc.insert(indVarMemDef);

    mlir::Operation *result;
    for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
      result = rewriter.clone(*opToClone, mapper);

    return result;
  }

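  /// Populates \p loopNestClauseOps with the bounds and steps of \p loop so
  /// they can be reused as the bounds of the generated `omp.loop_nest`, and
  /// marks the upper bounds as inclusive to match `do concurrent` semantics.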
  void
  genLoopNestClauseOps(mlir::Location loc,
                       mlir::ConversionPatternRewriter &rewriter,
                       fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
                       mlir::omp::LoopNestOperands &loopNestClauseOps) const {
    assert(loopNestClauseOps.loopLowerBounds.empty() &&
           "Loop nest bounds were already emitted!");

    auto populateBounds = [](mlir::Value var,
                             llvm::SmallVectorImpl<mlir::Value> &bounds) {
      bounds.push_back(var.getDefiningOp()->getResult(0));
    };

    for (auto [lb, ub, st] : llvm::zip_equal(
             loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
      populateBounds(lb, loopNestClauseOps.loopLowerBounds);
      populateBounds(ub, loopNestClauseOps.loopUpperBounds);
      populateBounds(st, loopNestClauseOps.loopSteps);
    }

    loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
  }

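  /// Generates an `omp.wsloop` wrapping an `omp.loop_nest` for \p loop. For
  /// each `local` specifier attached to the `do concurrent` loop, an
  /// equivalent `omp.private` op is created (by cloning the specifier's init
  /// and dealloc regions) and attached to the workshare loop as a `private`
  /// clause; the loop's body is then cloned into the loop nest using
  /// \p mapper.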
  mlir::omp::LoopNestOp
  genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
              fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
              const mlir::omp::LoopNestOperands &clauseOps,
              bool isComposite) const {
    mlir::omp::WsloopOperands wsloopClauseOps;

    // For `local` (and `local_init`) operands, emit corresponding `private`
    // clauses and attach these clauses to the workshare loop.
    if (!loop.getLocalOperands().empty())
      for (auto [op, sym, arg] : llvm::zip_equal(
               loop.getLocalOperands(),
               loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
               loop.getRegionLocalArgs())) {
        auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom<
            fir::LocalitySpecifierOp>(loop, sym);
        if (localizer.getLocalitySpecifierType() ==
            fir::LocalitySpecifierType::LocalInit)
          TODO(localizer.getLoc(),
               "local_init conversion is not supported yet");

        auto oldIP = rewriter.saveInsertionPoint();
        rewriter.setInsertionPointAfter(localizer);
        auto privatizer = rewriter.create<mlir::omp::PrivateClauseOp>(
            localizer.getLoc(), sym.getLeafReference().str() + ".omp",
            localizer.getTypeAttr().getValue(),
            mlir::omp::DataSharingClauseType::Private);

        if (!localizer.getInitRegion().empty()) {
          rewriter.cloneRegionBefore(localizer.getInitRegion(),
                                     privatizer.getInitRegion(),
                                     privatizer.getInitRegion().begin());
          auto firYield = mlir::cast<fir::YieldOp>(
              privatizer.getInitRegion().back().getTerminator());
          rewriter.setInsertionPoint(firYield);
          rewriter.create<mlir::omp::YieldOp>(firYield.getLoc(),
                                              firYield.getOperands());
          rewriter.eraseOp(firYield);
        }

        if (!localizer.getDeallocRegion().empty()) {
          rewriter.cloneRegionBefore(localizer.getDeallocRegion(),
                                     privatizer.getDeallocRegion(),
                                     privatizer.getDeallocRegion().begin());
          auto firYield = mlir::cast<fir::YieldOp>(
              privatizer.getDeallocRegion().back().getTerminator());
          rewriter.setInsertionPoint(firYield);
          rewriter.create<mlir::omp::YieldOp>(firYield.getLoc(),
                                              firYield.getOperands());
          rewriter.eraseOp(firYield);
        }

        rewriter.restoreInsertionPoint(oldIP);

        wsloopClauseOps.privateVars.push_back(op);
        wsloopClauseOps.privateSyms.push_back(
            mlir::SymbolRefAttr::get(privatizer));
      }

    auto wsloopOp =
        rewriter.create<mlir::omp::WsloopOp>(loop.getLoc(), wsloopClauseOps);
    wsloopOp.setComposite(isComposite);

    Fortran::common::openmp::EntryBlockArgs wsloopArgs;
    wsloopArgs.priv.vars = wsloopClauseOps.privateVars;
    Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs,
                                           wsloopOp.getRegion());

    auto loopNestOp =
        rewriter.create<mlir::omp::LoopNestOp>(loop.getLoc(), clauseOps);

    // Clone the loop's body inside the loop nest construct using the
    // mapped values.
    rewriter.cloneRegionBefore(loop.getRegion(), loopNestOp.getRegion(),
                               loopNestOp.getRegion().begin(), mapper);

    rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
    rewriter.create<mlir::omp::YieldOp>(loop->getLoc());

    // `local` region arguments are transferred/cloned from the `do concurrent`
    // loop to the loop nest op when the region is cloned above. Instead, these
    // region arguments should be on the workshare loop's region.
    for (auto [wsloopArg, loopNestArg] :
         llvm::zip_equal(wsloopOp.getRegion().getArguments(),
                         loopNestOp.getRegion().getArguments().drop_front(
                             clauseOps.loopLowerBounds.size())))
      rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);

    for (unsigned i = 0; i < loop.getLocalVars().size(); ++i)
      loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size());

    return loopNestOp;
  }

  bool mapToDevice;
  llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
};

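/// Pass driver for the conversion: registers the `DoConcurrentConversion`
/// pattern, tracks the nested `fir.do_concurrent` ops that are deliberately
/// left untouched (so the `ConversionTarget` treats them as legal), and runs a
/// full dialect conversion over each function.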
class DoConcurrentConversionPass
    : public flangomp::impl::DoConcurrentConversionPassBase<
          DoConcurrentConversionPass> {
public:
  DoConcurrentConversionPass() = default;

  DoConcurrentConversionPass(
      const flangomp::DoConcurrentConversionPassOptions &options)
      : DoConcurrentConversionPassBase(options) {}

  void runOnOperation() override {
    mlir::func::FuncOp func = getOperation();

    if (func.isDeclaration())
      return;

    mlir::MLIRContext *context = &getContext();

    if (mapTo != flangomp::DoConcurrentMappingKind::DCMK_Host &&
        mapTo != flangomp::DoConcurrentMappingKind::DCMK_Device) {
      mlir::emitWarning(mlir::UnknownLoc::get(context),
                        "DoConcurrentConversionPass: invalid `map-to` value. "
                        "Valid values are: `host` or `device`");
      return;
    }

    llvm::DenseSet<fir::DoConcurrentOp> concurrentLoopsToSkip;
    mlir::RewritePatternSet patterns(context);
    patterns.insert<DoConcurrentConversion>(
        context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
        concurrentLoopsToSkip);
    mlir::ConversionTarget target(*context);
    target.addDynamicallyLegalOp<fir::DoConcurrentOp>(
        [&](fir::DoConcurrentOp op) {
          return concurrentLoopsToSkip.contains(op);
        });
    target.markUnknownOpDynamicallyLegal(
        [](mlir::Operation *) { return true; });

    if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
                                               std::move(patterns)))) {
      signalPassFailure();
    }
  }
};
} // namespace

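// A minimal usage sketch (an illustration, not part of this file's API
// surface): since the pass operates on `func::FuncOp`, it would typically be
// scheduled as a nested pass on a module-level pass manager, e.g.:
//
//   mlir::PassManager pm(&context);
//   pm.addNestedPass<mlir::func::FuncOp>(
//       flangomp::createDoConcurrentConversionPass(/*mapToDevice=*/false));
//   if (mlir::failed(pm.run(module)))
//     /* handle failure */;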
std::unique_ptr<mlir::Pass>
flangomp::createDoConcurrentConversionPass(bool mapToDevice) {
  DoConcurrentConversionPassOptions options;
  options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device
                              : flangomp::DoConcurrentMappingKind::DCMK_Host;

  return std::make_unique<DoConcurrentConversionPass>(options);
}