//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/Debug.h"
#include <optional>

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::affine;
using namespace mlir::scf;

// Name of internal attribute to mark visited operations during conversion.
//
// NOTE: The conversion originally used the following legality criteria:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the provided pattern might reject some cases based on more detailed
// analysis of the `mapping` attribute.
// To avoid dialect conversion failure due to a non-converted illegal operation,
// we use this extra Unit attribute as a marker that the operation was checked
// by the pattern and should be considered legal in the following legality
// checks. The `finalizeParallelLoopToGPUConversion` function performs cleanup
// of these extra attributes and is supposed to be called after the dialect
// conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<arith::ConstantIndexOp>(forOp.getLoc(),
                                                forOp.getStepAsInt());
}

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
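//
// As an illustrative sketch (hypothetical IR, not taken from a test), a nest
// whose inner bound depends on the outer induction variable, e.g.
//
//   affine.for %i = 0 to %n {
//     affine.for %j = 0 to affine_map<(d0) -> (d0)>(%i) { ... }
//   }
//
// fails the `areValuesDefinedAbove` check below when both loops are mapped,
// because %i is not defined above the outermost loop being converted.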
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.getRegion();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body, skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter {
  std::optional<AffineForOp> collectBounds(AffineForOp forOp,
                                           unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, std::nullopt on failure.
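//
// As a rough sketch of what this emits (value names are illustrative only), a
// single loop with bounds (%lb, %ub) and constant step 4 produces:
//
//   %diff  = arith.subi %ub, %lb : index
//   %c4    = arith.constant 4 : index
//   %range = arith.ceildivsi %diff, %c4 : index
//
// and records %range, %lb, the induction variable and %c4 in `dims`, `lbs`,
// `ivs` and `steps`, respectively.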
std::optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return std::nullopt;
    }

    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
                                                upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      range = builder.create<arith::CeilDivSIOp>(currentLoop.getLoc(), range,
                                                 step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed
// to the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, TypeRange());
  launchOp.getBody().front().getOperations().splice(
      launchOp.getBody().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always iterate
  // from 0 to N with step 1. Therefore, loop induction variables are replaced
  // with (gpu-thread/block-id * S) + LB.
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
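///
/// For example (illustrative), for an upper bound defined as
///   %ub = affine.min affine_map<(d0) -> (d0, 128)>(%n)
/// the function materializes and returns an `arith.constant 128 : index`,
/// since 128 is the first constant result of the min map.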
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = dyn_cast<AffineConstantExpr>(result)) {
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(),
                                                       constExpr.getValue());
      }
    }
  }

  if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
    for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
      if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
        return staticBound;
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by mixed signs, so abort in this case.
        if ((lhs.value() < 0) != (rhs.value() < 0))
          return {};

        return rewriter.create<arith::ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.value() * rhs.value());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}

/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations to
/// be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and affine map from the
///   mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
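///
/// As a summary of the index computation in the code below (not additional
/// behavior): the hardware id d0, step s0 and lower bound s1 are first
/// combined through the fixed map `(d0)[s0, s1] -> (d0 * s0 + s1)`, which is
/// then composed with the `map` from the annotation, so the loop index is
/// computed as `map(id * step + lowerBound)` by a single affine.apply.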
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
    SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support multiple reductions.
  if (!mapping || parallelOp.getNumResults() > 1)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation =
        dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop iv.
      Value operand =
          launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
      // Take the indexmap and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.getMap().compose(lowerAndStep),
          ValueRange{operand, ensureLaunchIndependent(step),
                     ensureLaunchIndependent(lowerBound)});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.getBound()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression ceilDiv (upperBound - lowerBound) step. We use
          // affine.apply here so that it composes nicely with the provided map.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.getBound().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (!bounds.try_emplace(processor, launchBound).second) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of the
          // if body again. We use the launchOp here, as that cannot be part of
          // the body's instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the loop
      // body again. We use the launchOp here, as that cannot be part of the
      // body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  // Include the scf.reduce terminator if it exists and has an operand.
  if (auto terminator = body->getTerminator();
      isa<scf::ReduceOp>(terminator) && terminator->getOperands().size() == 1) {
    worklist.push_back(terminator);
  }
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via an array attribute named `mapping`
/// with one entry per loop dimension; each entry has three parameters:
///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///    thread dimensions and 6 is sequential.
///  - map : An affine map that is used to pre-process hardware ids before
///    substitution.
///  - bound : An affine map that is used to compute the bound of the hardware
///    id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
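///
/// For illustration, an `scf.parallel` annotated along these lines (the
/// attribute syntax is shown approximately and may differ between MLIR
/// versions):
///
///   scf.parallel (%i, %j) = (%c0, %c0) to (%n, %m) step (%c1, %c1) {
///     ...
///   } {mapping = [#gpu.loop_dim_map<processor = block_x,
///                                    map = (d0) -> (d0),
///                                    bound = (d0) -> (d0)>,
///                 #gpu.loop_dim_map<processor = thread_x,
///                                   map = (d0) -> (d0),
///                                   bound = (d0) -> (d0)>]}
///
/// is rewritten into a `gpu.launch` whose grid size along x is derived from
/// %n, whose block size along x is derived from %m, and whose body uses the
/// block and thread ids (adjusted by lower bound and step) in place of %i and
/// %j.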
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne =
      rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.getBody().front());

  IRMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    //       nested or this code is side-effect free. Otherwise we might need
    //       predication. We are overly conservative for now and only allow
    //       side-effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // sideeffects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
      // Convert the scf.reduce op.
      auto parentLoop = op->getParentOfType<ParallelOp>();
      if (!parentLoop || op->getOperands().size() != 1)
        return failure();
      auto operand = op->getOperands().front();
      auto newValue = cloningMap.lookupOrNull(operand);
      if (!newValue || !operand.getType().isSignlessIntOrFloat())
        return failure();
      // Ensure the reduction region is isolated from above.
      llvm::SetVector<Value> externalValues;
      getUsedValuesDefinedAbove(reduceOp.getRegion(0), externalValues);
      if (externalValues.size())
        return failure();
      // Replace by gpu.all_reduce.
      auto gpuRedOp = rewriter.create<gpu::AllReduceOp>(loc, newValue);
      cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult());
      // Copy region.
      rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(),
                                  gpuRedOp.getRegion().begin());
      // Replace scf.reduce.return with gpu.yield.
      auto scfReturn = gpuRedOp.getRegion().front().getTerminator();
      auto ip = rewriter.saveInsertionPoint();
      rewriter.setInsertionPointToEnd(&gpuRedOp.getRegion().front());
      rewriter.replaceOpWithNewOp<gpu::YieldOp>(
          scfReturn, scfReturn->getOperands().front());
      rewriter.restoreInsertionPoint(ip);
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |=
          !isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, sideeffects are disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we succeeded creating the launch operation, also update the
  // bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
           parallelOp->hasAttr(kVisitedAttrName);
  });
}

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}
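
// A minimal usage sketch (illustrative, not part of this file's interface)
// showing how the pattern, legality configuration and finalization above are
// meant to be combined by a pass; `op` and `ctx` stand for the pass's root
// operation and its MLIRContext:
//
//   RewritePatternSet patterns(ctx);
//   populateParallelLoopToGPUPatterns(patterns);
//   ConversionTarget target(*ctx);
//   target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
//   configureParallelLoopToGPULegality(target);
//   if (failed(applyPartialConversion(op, target, std::move(patterns))))
//     return; // a real pass would call signalPassFailure() here
//   finalizeParallelLoopToGPUConversion(op);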