1//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements miscellaneous loop transformation routines.
10//
11//===----------------------------------------------------------------------===//
12
13#include "mlir/Dialect/Affine/LoopUtils.h"
14#include "mlir/Analysis/SliceAnalysis.h"
15#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
16#include "mlir/Dialect/Affine/Analysis/Utils.h"
17#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
18#include "mlir/Dialect/Affine/Utils.h"
19#include "mlir/Dialect/Func/IR/FuncOps.h"
20#include "mlir/Dialect/MemRef/IR/MemRef.h"
21#include "mlir/Dialect/SCF/IR/SCF.h"
22#include "mlir/IR/IRMapping.h"
23#include "mlir/IR/IntegerSet.h"
24#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
25#include "llvm/ADT/MapVector.h"
26#include "llvm/ADT/SmallPtrSet.h"
27#include "llvm/Support/Debug.h"
28#include "llvm/Support/raw_ostream.h"
29#include <optional>
30
31#define DEBUG_TYPE "loop-utils"
32
33using namespace mlir;
34using namespace affine;
35using namespace presburger;
36using llvm::SmallMapVector;
37
/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be the upper bound of the
/// main part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands, or a null map when the trip count can't be expressed as an affine
/// expression.
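///
/// As an illustrative sketch (the loop below is hypothetical), unrolling
///   affine.for %i = 0 to 17
/// by a factor of 4 has trip count 17, so the cleanup lower bound evaluates to
/// lb + (17 - 17 mod 4) * step = 0 + 16 * 1 = 16; the main unrolled loop then
/// covers [0, 16) and the cleanup loop covers [16, 17).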
43static void
44getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
45 AffineMap &cleanupLbMap,
46 SmallVectorImpl<Value> &cleanupLbOperands) {
47 AffineMap tripCountMap;
48 SmallVector<Value, 4> tripCountOperands;
49 getTripCountMapAndOperands(forOp, map: &tripCountMap, operands: &tripCountOperands);
50 // Trip count can't be computed.
51 if (!tripCountMap) {
52 cleanupLbMap = AffineMap();
53 return;
54 }
55
56 OpBuilder b(forOp);
57 auto lbMap = forOp.getLowerBoundMap();
58 auto lb = b.create<AffineApplyOp>(location: forOp.getLoc(), args&: lbMap,
59 args: forOp.getLowerBoundOperands());
60
61 // For each upper bound expr, get the range.
62 // Eg: affine.for %i = lb to min (ub1, ub2),
63 // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
64 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
65 // these affine.apply's make up the cleanup loop lower bound.
66 SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
67 SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
68 int64_t step = forOp.getStepAsInt();
69 for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
70 auto tripCountExpr = tripCountMap.getResult(idx: i);
71 bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
72 auto bumpMap = AffineMap::get(dimCount: tripCountMap.getNumDims(),
73 symbolCount: tripCountMap.getNumSymbols(), result: bumpExprs[i]);
74 bumpValues[i] =
75 b.create<AffineApplyOp>(location: forOp.getLoc(), args&: bumpMap, args&: tripCountOperands);
76 }
77
78 SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
79 for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
80 newUbExprs[i] = b.getAffineDimExpr(position: 0) + b.getAffineDimExpr(position: i + 1);
81
82 cleanupLbOperands.clear();
83 cleanupLbOperands.push_back(Elt: lb);
84 cleanupLbOperands.append(in_start: bumpValues.begin(), in_end: bumpValues.end());
85 cleanupLbMap = AffineMap::get(dimCount: 1 + tripCountMap.getNumResults(), symbolCount: 0, results: newUbExprs,
86 context: b.getContext());
87 // Simplify the cleanupLbMap + cleanupLbOperands.
88 fullyComposeAffineMapAndOperands(map: &cleanupLbMap, operands: &cleanupLbOperands);
89 cleanupLbMap = simplifyAffineMap(map: cleanupLbMap);
90 canonicalizeMapAndOperands(map: &cleanupLbMap, operands: &cleanupLbOperands);
91 // Remove any affine.apply's that became dead from the simplification above.
92 for (auto v : bumpValues)
93 if (v.use_empty())
94 v.getDefiningOp()->erase();
95
96 if (lb.use_empty())
97 lb.erase();
98}
99
100/// Helper to replace uses of loop carried values (iter_args) and loop
101/// yield values while promoting single iteration affine.for ops.
102static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
103 // Replace uses of iter arguments with iter operands (initial values).
104 auto iterOperands = forOp.getInits();
105 auto iterArgs = forOp.getRegionIterArgs();
106 for (auto e : llvm::zip(t&: iterOperands, u&: iterArgs))
107 std::get<1>(t&: e).replaceAllUsesWith(newValue: std::get<0>(t&: e));
108
109 // Replace uses of loop results with the values yielded by the loop.
110 auto outerResults = forOp.getResults();
111 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
112 for (auto e : llvm::zip(t&: outerResults, u&: innerResults))
113 std::get<0>(t&: e).replaceAllUsesWith(newValue: std::get<1>(t&: e));
114}
115
/// Promotes the loop body of a forOp to its containing block if the forOp
/// is known to have a single iteration.
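///
/// For example (sketch), given the hypothetical loop
///   affine.for %i = 4 to 5 { "foo"(%i) : (index) -> () }
/// the body is moved into the enclosing block and %i is replaced with a
/// constant index value of 4 (or with an affine.apply of the lower bound map
/// when the lower bound is not constant).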
118LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
119 std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
120 if (!tripCount || *tripCount != 1)
121 return failure();
122
123 // TODO: extend this for arbitrary affine bounds.
124 if (forOp.getLowerBoundMap().getNumResults() != 1)
125 return failure();
126
  // Replace all uses of the IV with its single-iteration value.
128 auto iv = forOp.getInductionVar();
129 auto *parentBlock = forOp->getBlock();
130 if (!iv.use_empty()) {
131 if (forOp.hasConstantLowerBound()) {
132 auto func = forOp->getParentOfType<FunctionOpInterface>();
133 OpBuilder builder(forOp->getContext());
134 if (func)
135 builder.setInsertionPointToStart(&func.getFunctionBody().front());
136 else
137 builder.setInsertionPoint(forOp);
138 auto constOp = builder.create<arith::ConstantIndexOp>(
139 location: forOp.getLoc(), args: forOp.getConstantLowerBound());
140 iv.replaceAllUsesWith(newValue: constOp);
141 } else {
142 auto lbOperands = forOp.getLowerBoundOperands();
143 auto lbMap = forOp.getLowerBoundMap();
144 OpBuilder builder(forOp);
145 if (lbMap == builder.getDimIdentityMap()) {
146 // No need of generating an affine.apply.
147 iv.replaceAllUsesWith(newValue: lbOperands[0]);
148 } else {
149 auto affineApplyOp =
150 builder.create<AffineApplyOp>(location: forOp.getLoc(), args&: lbMap, args&: lbOperands);
151 iv.replaceAllUsesWith(newValue: affineApplyOp);
152 }
153 }
154 }
155
156 replaceIterArgsAndYieldResults(forOp);
157
158 // Move the loop body operations, except for its terminator, to the loop's
159 // containing block.
160 forOp.getBody()->back().erase();
161 parentBlock->getOperations().splice(where: Block::iterator(forOp),
162 L2&: forOp.getBody()->getOperations());
163 forOp.erase();
164 return success();
165}
166
167/// Generates an affine.for op with the specified lower and upper bounds
168/// while generating the right IV remappings to realize shifts for operations in
169/// its body. The operations that go into the loop body are specified in
170/// opGroupQueue starting from the specified offset, and in that order. The
171/// first element of the pair specifies the shift applied to that group of
172/// operations; the shift is multiplied by the loop step before being applied.
173/// Returns nullptr if the generated loop simplifies to a single iteration one.
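///
/// For instance (sketch), an op group queued with shift 2 in a source loop of
/// step 1 is cloned into the new loop with its IV remapped through an
/// affine.apply computing %newIV - 2, so the group executes two iterations
/// "later" in the shifted iteration space.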
174static AffineForOp generateShiftedLoop(
175 AffineMap lbMap, AffineMap ubMap,
176 const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
177 unsigned offset, AffineForOp srcForOp, OpBuilder b) {
178 auto lbOperands = srcForOp.getLowerBoundOperands();
179 auto ubOperands = srcForOp.getUpperBoundOperands();
180
181 assert(lbMap.getNumInputs() == lbOperands.size());
182 assert(ubMap.getNumInputs() == ubOperands.size());
183
184 auto loopChunk =
185 b.create<AffineForOp>(location: srcForOp.getLoc(), args&: lbOperands, args&: lbMap, args&: ubOperands,
186 args&: ubMap, args: srcForOp.getStepAsInt());
187 auto loopChunkIV = loopChunk.getInductionVar();
188 auto srcIV = srcForOp.getInductionVar();
189
190 IRMapping operandMap;
191
192 auto bodyBuilder = OpBuilder::atBlockTerminator(block: loopChunk.getBody());
193 for (const auto &it : llvm::drop_begin(RangeOrContainer: opGroupQueue, N: offset)) {
194 uint64_t shift = it.first;
195 auto ops = it.second;
196 // All 'same shift' operations get added with their operands being
197 // remapped to results of cloned operations, and their IV used remapped.
198 // Generate the remapping if the shift is not zero: remappedIV = newIV -
199 // shift.
200 if (!srcIV.use_empty() && shift != 0) {
201 auto ivRemap = bodyBuilder.create<AffineApplyOp>(
202 location: srcForOp.getLoc(),
203 args: bodyBuilder.getSingleDimShiftAffineMap(
204 shift: -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
205 args&: loopChunkIV);
206 operandMap.map(from: srcIV, to: ivRemap);
207 } else {
208 operandMap.map(from: srcIV, to: loopChunkIV);
209 }
210 for (auto *op : ops)
211 bodyBuilder.clone(op&: *op, mapper&: operandMap);
212 };
213 if (succeeded(Result: promoteIfSingleIteration(forOp: loopChunk)))
214 return AffineForOp();
215 return loopChunk;
216}
217
// The skewing of operations with respect to one another can be used, for
// example, to allow overlap of asynchronous operations (such as DMA
// communication) with computation, or just relative shifting of operations
// for better register reuse, locality, or parallelism. As such, the shifts are
// typically expected to be at most of the order of the number of operations.
// This method should not be used as a substitute for loop
// distribution/fission. It uses an algorithm that runs in time linear in the
// number of operations in the body of the for loop (the 'sweep line'
// paradigm). This method asserts preservation of SSA dominance; checks for
// that, as well as for memory-based dependence preservation, rest with the
// users of this method.
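//
// As a hedged sketch of the typical use: with shifts [0, 1] on a two-operation
// body consisting of a DMA start followed by a dependent compute op, the
// compute is delayed by one iteration relative to the DMA, yielding a prologue
// loop (DMA only), a steady-state loop where the DMA for iteration i overlaps
// with the compute for iteration i - 1, and an epilogue loop (compute only).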
229LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
230 ArrayRef<uint64_t> shifts,
231 bool unrollPrologueEpilogue) {
232 assert(forOp.getBody()->getOperations().size() == shifts.size() &&
233 "too few/many shifts");
234 if (forOp.getBody()->begin() == std::prev(x: forOp.getBody()->end()))
235 return success();
236
237 // If the trip counts aren't constant, we would need versioning and
238 // conditional guards (or context information to prevent such versioning). The
239 // better way to pipeline for such loops is to first tile them and extract
240 // constant trip count "full tiles" before applying this.
241 auto mayBeConstTripCount = getConstantTripCount(forOp);
242 if (!mayBeConstTripCount) {
243 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
244 return success();
245 }
246 uint64_t tripCount = *mayBeConstTripCount;
247
248 assert(isOpwiseShiftValid(forOp, shifts) &&
249 "shifts will lead to an invalid transformation\n");
250
251 int64_t step = forOp.getStepAsInt();
252
253 unsigned numChildOps = shifts.size();
254
255 // Do a linear time (counting) sort for the shifts.
256 uint64_t maxShift = *llvm::max_element(Range&: shifts);
257 if (maxShift >= numChildOps) {
258 // Large shifts are not the typical use case.
259 forOp.emitWarning(message: "not shifting because shifts are unrealistically large");
260 return success();
261 }
262
263 // An array of operation groups sorted by shift amount; each group has all
264 // operations with the same shift in the order in which they appear in the
265 // body of the 'affine.for' op.
266 std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
267 unsigned pos = 0;
268 for (auto &op : forOp.getBody()->without_terminator()) {
269 auto shift = shifts[pos++];
270 sortedOpGroups[shift].push_back(x: &op);
271 }
272
273 // Unless the shifts have a specific pattern (which actually would be the
274 // common use case), prologue and epilogue are not meaningfully defined.
275 // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
276 // loop generated as the prologue and the last as epilogue and unroll these
277 // fully.
278 AffineForOp prologue, epilogue;
279
280 // Do a sweep over the sorted shifts while storing open groups in a
281 // vector, and generating loop portions as necessary during the sweep. A block
282 // of operations is paired with its shift.
283 std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
284
285 auto origLbMap = forOp.getLowerBoundMap();
286 uint64_t lbShift = 0;
287 OpBuilder b(forOp);
288 for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
289 // If nothing is shifted by d, continue.
290 if (sortedOpGroups[d].empty())
291 continue;
292 if (!opGroupQueue.empty()) {
293 assert(d > 0 &&
294 "Queue expected to be empty when the first block is found");
295 // The interval for which the loop needs to be generated here is:
296 // [lbShift, min(lbShift + tripCount, d)) and the body of the
297 // loop needs to have all operations in opQueue in that order.
298 AffineForOp res;
299 if (lbShift + tripCount * step < d * step) {
300 res = generateShiftedLoop(
301 lbMap: b.getShiftedAffineMap(map: origLbMap, shift: lbShift),
302 ubMap: b.getShiftedAffineMap(map: origLbMap, shift: lbShift + tripCount * step),
303 opGroupQueue, /*offset=*/0, srcForOp: forOp, b);
304 // Entire loop for the queued op groups generated, empty it.
305 opGroupQueue.clear();
306 lbShift += tripCount * step;
307 } else {
308 res = generateShiftedLoop(lbMap: b.getShiftedAffineMap(map: origLbMap, shift: lbShift),
309 ubMap: b.getShiftedAffineMap(map: origLbMap, shift: d),
310 opGroupQueue, /*offset=*/0, srcForOp: forOp, b);
311 lbShift = d * step;
312 }
313
314 if (res) {
315 // Simplify/canonicalize the affine.for.
316 RewritePatternSet patterns(res.getContext());
317 AffineForOp::getCanonicalizationPatterns(results&: patterns, context: res.getContext());
318 bool erased;
319 (void)applyOpPatternsGreedily(
320 ops: res.getOperation(), patterns: std::move(patterns),
321 config: GreedyRewriteConfig().setStrictness(
322 GreedyRewriteStrictness::ExistingAndNewOps),
323 /*changed=*/nullptr, allErased: &erased);
324 if (!erased && !prologue)
325 prologue = res;
326 if (!erased)
327 epilogue = res;
328 }
329 } else {
330 // Start of first interval.
331 lbShift = d * step;
332 }
333 // Augment the list of operations that get into the current open interval.
334 opGroupQueue.emplace_back(args&: d, args&: sortedOpGroups[d]);
335 }
336
337 // Those operations groups left in the queue now need to be processed (FIFO)
338 // and their loops completed.
339 for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
340 uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
341 epilogue = generateShiftedLoop(lbMap: b.getShiftedAffineMap(map: origLbMap, shift: lbShift),
342 ubMap: b.getShiftedAffineMap(map: origLbMap, shift: ubShift),
343 opGroupQueue, /*offset=*/i, srcForOp: forOp, b);
344 lbShift = ubShift;
345 if (!prologue)
346 prologue = epilogue;
347 }
348
349 // Erase the original for op.
350 forOp.erase();
351
  if (unrollPrologueEpilogue && prologue)
    (void)loopUnrollFull(prologue);
  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
    (void)loopUnrollFull(epilogue);
356
357 return success();
358}
359
360/// Checks whether a loop nest is hyper-rectangular or not.
361static LogicalResult
362checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
363 FlatAffineValueConstraints cst;
364 SmallVector<Operation *, 8> ops(input.begin(), input.end());
365 // 0-d or 1-d is trivially hyper-rectangular.
366 if (input.size() <= 1)
367 return success();
368 if (failed(Result: getIndexSet(ops, domain: &cst))) {
369 LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
370 return failure();
371 }
372 if (!cst.isHyperRectangular(pos: 0, num: input.size())) {
373 LLVM_DEBUG(llvm::dbgs()
374 << "Non-hyperrectangular nests not supported for tiling!\n");
375 return failure();
376 }
377 return success();
378}
379
380/// Check if the input nest is supported for tiling and whether tiling would be
381/// legal or not.
382template <typename t>
383static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384 ArrayRef<t> tileSizes) {
385 assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386
387 if (llvm::any_of(input,
388 [](AffineForOp op) { return op.getNumResults() > 0; })) {
389 LLVM_DEBUG(llvm::dbgs()
390 << "Cannot tile nest where a loop has yield values\n");
391 return failure();
392 }
393
394 // Check if the supplied `for` ops are all successively nested.
395 if (!isPerfectlyNested(loops: input)) {
396 LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
397 return failure();
398 }
399
400 // TODO: handle non hyper-rectangular spaces.
401 if (failed(Result: checkIfHyperRectangular(input)))
402 return failure();
403
404 return success();
405}
406
407/// Move the loop body of AffineForOp 'src' from 'src' into the specified
408/// location in destination's body, ignoring the terminator.
409static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
410 Block::iterator loc) {
411 auto &ops = src.getBody()->getOperations();
412 dest.getBody()->getOperations().splice(where: loc, L2&: ops, first: ops.begin(),
413 last: std::prev(x: ops.end()));
414}
415
416/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
417/// body.
418static void moveLoopBody(AffineForOp src, AffineForOp dest) {
419 moveLoopBodyImpl(src, dest, loc: dest.getBody()->begin());
420}
421
/// Constructs a tiled loop nest without setting the loop bounds, and moves the
/// body of the original loop nest into the innermost loop of the tiled nest.
424static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
425 AffineForOp rootAffineForOp, unsigned width,
426 MutableArrayRef<AffineForOp> tiledLoops) {
427 Location loc = rootAffineForOp.getLoc();
428
429 // The outermost among the loops as we add more..
430 Operation *topLoop = rootAffineForOp.getOperation();
431 AffineForOp innermostPointLoop;
432
433 // Add intra-tile (or point) loops.
434 for (unsigned i = 0; i < width; i++) {
435 OpBuilder b(topLoop);
436 // Loop bounds will be set later.
437 AffineForOp pointLoop = b.create<AffineForOp>(location: loc, args: 0, args: 0);
438 pointLoop.getBody()->getOperations().splice(
439 where: pointLoop.getBody()->begin(), L2&: topLoop->getBlock()->getOperations(),
440 N: topLoop);
441 tiledLoops[2 * width - 1 - i] = pointLoop;
442 topLoop = pointLoop.getOperation();
443 if (i == 0)
444 innermostPointLoop = pointLoop;
445 }
446
447 // Add tile space loops;
448 for (unsigned i = width; i < 2 * width; i++) {
449 OpBuilder b(topLoop);
450 // Loop bounds will be set later.
451 AffineForOp tileSpaceLoop = b.create<AffineForOp>(location: loc, args: 0, args: 0);
452 tileSpaceLoop.getBody()->getOperations().splice(
453 where: tileSpaceLoop.getBody()->begin(), L2&: topLoop->getBlock()->getOperations(),
454 N: topLoop);
455 tiledLoops[2 * width - i - 1] = tileSpaceLoop;
456 topLoop = tileSpaceLoop.getOperation();
457 }
458
459 // Move the loop body of the original nest to the new one.
460 moveLoopBody(src: origLoops.back(), dest: innermostPointLoop);
461}
462
463/// Set lower and upper bounds of intra-tile loops for parametric tiling.
464// TODO: Handle non-constant lower bounds.
465static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
466 AffineForOp newInterTileLoop,
467 AffineForOp newIntraTileLoop,
468 Value tileSize) {
  // The lower bound for the intra-tile loop is represented by an affine map
  // as (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, the upper
  // bound for the intra-tile loop is represented by an affine map as
  // (%i, %t0) -> ((%i - %origlb) * %t0 + (%t0 * %origLoopStep) + %origlb),
  // where %i is the loop IV of the corresponding inter-tile loop, %t0 is the
  // corresponding tiling parameter, %origlb is the lower bound, and
  // %origLoopStep is the loop step of the corresponding inter-tile loop.
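  //
  // As a concrete (illustrative) instance of the maps above, with %origlb = 0
  // and an original step of 1, the bounds reduce to
  //   lb: (%i, %t0) -> (%i * %t0)
  //   ub: (%i, %t0) -> (%i * %t0 + %t0)
  // i.e. each intra-tile loop scans one tile of parametric size %t0.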
476
477 assert(origLoop.hasConstantLowerBound() &&
478 "expected input loops to have constant lower bound.");
479
480 // Get lower bound of original loop as an affine expression.
481 AffineExpr origLowerBoundExpr;
482 origLowerBoundExpr =
483 b.getAffineConstantExpr(constant: origLoop.getConstantLowerBound());
484
485 // Add dim operands from original lower/upper bound.
486 SmallVector<Value, 4> lbOperands, ubOperands;
487 AffineBound lb = origLoop.getLowerBound();
488 AffineBound ub = origLoop.getUpperBound();
489 lbOperands.reserve(N: lb.getNumOperands() + 2);
490 ubOperands.reserve(N: ub.getNumOperands() + 2);
491 AffineMap origLbMap = lb.getMap();
492 AffineMap origUbMap = ub.getMap();
493 for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
494 lbOperands.push_back(Elt: lb.getOperand(idx: j));
495 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
496 ubOperands.push_back(Elt: ub.getOperand(idx: j));
497
498 // Add a new dim operand in lb/ubOperands corresponding to the origLoop
499 // IV.
500 lbOperands.push_back(Elt: newInterTileLoop.getInductionVar());
501 ubOperands.push_back(Elt: newInterTileLoop.getInductionVar());
502
  // Get the loop IV as an affine expression for the lower/upper bound. The
  // size of lb/ubOperands is guaranteed to be at least one.
505 AffineExpr lbLoopIvExpr = b.getAffineDimExpr(position: lbOperands.size() - 1);
506 AffineExpr ubLoopIvExpr = b.getAffineDimExpr(position: ubOperands.size() - 1);
507
508 // Add symbol operands from original lower/upper bound.
509 for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
510 lbOperands.push_back(Elt: lb.getOperand(idx: origLbMap.getNumDims() + j));
511 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
512 ubOperands.push_back(Elt: ub.getOperand(idx: origUbMap.getNumDims() + j));
513
514 // Add a new symbol operand which is the tile size for this loop.
515 lbOperands.push_back(Elt: tileSize);
516 ubOperands.push_back(Elt: tileSize);
517
518 SmallVector<AffineExpr, 4> lbBoundExprs;
519 SmallVector<AffineExpr, 4> ubBoundExprs;
520 lbBoundExprs.reserve(N: origLbMap.getNumResults());
521 ubBoundExprs.reserve(N: origUbMap.getNumResults());
522
523 // Get tiling parameter as an affine expression for lb/ub.
524 AffineExpr lbTileParameter = b.getAffineSymbolExpr(position: origLbMap.getNumSymbols());
525 AffineExpr ubTileParameter = b.getAffineSymbolExpr(position: origUbMap.getNumSymbols());
526
527 // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
528 lbBoundExprs.push_back(
529 Elt: ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
530 origLowerBoundExpr);
531
532 // Get the origLoopStep as an affine expression.
533 AffineExpr origLoopStep = b.getAffineConstantExpr(constant: origLoop.getStepAsInt());
534
535 // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
536 // (tilingParameter * origLoopStep) + origlb.
537 ubBoundExprs.push_back(
538 Elt: ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
539 (ubTileParameter * origLoopStep) + origLowerBoundExpr);
540
541 ubBoundExprs.append(in_start: origUbMap.getResults().begin(),
542 in_end: origUbMap.getResults().end());
543
544 AffineMap lbMap =
545 AffineMap::get(dimCount: origLbMap.getNumDims() + 1, symbolCount: origLbMap.getNumSymbols() + 1,
546 results: lbBoundExprs, context: b.getContext());
547 newIntraTileLoop.setLowerBound(operands: lbOperands, map: lbMap);
548
549 AffineMap ubMap =
550 AffineMap::get(dimCount: origUbMap.getNumDims() + 1, symbolCount: origUbMap.getNumSymbols() + 1,
551 results: ubBoundExprs, context: b.getContext());
552 newIntraTileLoop.setUpperBound(operands: ubOperands, map: ubMap);
553
554 // Original loop step must be preserved.
555 newIntraTileLoop.setStep(origLoop.getStepAsInt());
556}
557
558/// Set lower and upper bounds of inter-tile loops for parametric tiling.
559// TODO: Handle non-constant lower bounds.
560static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
561 AffineForOp newLoop, Value tileSize) {
562 OperandRange newLbOperands = origLoop.getLowerBoundOperands();
563
564 // The lower bounds for inter-tile loops are same as the corresponding lower
565 // bounds of original loops.
566 newLoop.setLowerBound(operands: newLbOperands, map: origLoop.getLowerBoundMap());
567
  // The new upper bound map for inter-tile loops, assuming constant lower
  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
  // originalLowerBound), tiling parameter), where the tiling parameter is the
  // respective tile size for that loop. For e.g., if the original ubMap was
  // () -> (1024) and the lower bound was 0, the new map will be
  // ()[s0] -> (1024 ceildiv s0), where s0 is the tiling parameter.
  // Therefore a new symbol operand is inserted in the map and the result
  // expression is overwritten.
576
577 assert(origLoop.hasConstantLowerBound() &&
578 "expected input loops to have constant lower bound.");
579
580 // Get lower bound of original loop as an affine expression.
581 AffineExpr origLowerBoundExpr;
582 origLowerBoundExpr =
583 b.getAffineConstantExpr(constant: origLoop.getConstantLowerBound());
584
585 // Add dim operands from original upper bound.
586 SmallVector<Value, 4> ubOperands;
587 AffineBound ub = origLoop.getUpperBound();
588 ubOperands.reserve(N: ub.getNumOperands() + 1);
589 AffineMap origUbMap = ub.getMap();
590 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
591 ubOperands.push_back(Elt: ub.getOperand(idx: j));
592
593 // Add symbol operands from original upper bound.
594 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
595 ubOperands.push_back(Elt: ub.getOperand(idx: origUbMap.getNumDims() + j));
596
597 // Add a new symbol operand which is the tile size for this loop.
598 ubOperands.push_back(Elt: tileSize);
599
600 // Get tiling parameter as an affine expression.
601 AffineExpr tileParameter = b.getAffineSymbolExpr(position: origUbMap.getNumSymbols());
602
603 SmallVector<AffineExpr, 4> boundExprs;
604 boundExprs.reserve(N: origUbMap.getNumResults());
605 int64_t origUpperBound;
606 AffineExpr origUpperBoundExpr;
607
608 // If upper bound for the original loop is constant, then the constant can
609 // be obtained as an affine expression straight away.
610 if (origLoop.hasConstantUpperBound()) {
611 origUpperBound = origLoop.getConstantUpperBound();
612
613 // Get original constant upper bound as an affine expression.
614 origUpperBoundExpr = b.getAffineConstantExpr(constant: origUpperBound);
615
    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
618 boundExprs.push_back(
619 Elt: origLowerBoundExpr +
620 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(other: tileParameter));
621 } else {
    // If the upper bound of the original loop is not constant, then two cases
    // are possible, although their handling is the same: 1) ubMap has only one
    // result expression. For e.g.
625 // affine.for %i = 5 to %ub
626 //
627 // A symbol operand is added which represents the tiling parameter. The
628 // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
629 // where 's0' is the original upper bound and 's1' is the tiling
    // parameter. 2) When ubMap has more than one result expression. For e.g.
631 // #map0 = affine_map<()[s0, s1] -> (s0, s1)
632 // affine.for %i = 5 to min #map0()[%s0, %s1]
633 //
634 // A symbol operand is added which represents the tiling parameter. The
635 // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
636 // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
637
638 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
639 // originalLowerBound), tilingParameter).
640 for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
641 boundExprs.push_back(
642 Elt: origLowerBoundExpr +
643 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(other: tileParameter));
644 }
645
646 AffineMap ubMap =
647 AffineMap::get(dimCount: origUbMap.getNumDims(), symbolCount: origUbMap.getNumSymbols() + 1,
648 results: boundExprs, context: b.getContext());
649 newLoop.setUpperBound(operands: ubOperands, map: ubMap);
650
651 // Original loop step must be preserved.
652 newLoop.setStep(origLoop.getStepAsInt());
653}
654
655/// Constructs and sets new loop bounds after tiling for the case of
656/// hyper-rectangular index sets, where the bounds of one dimension do not
657/// depend on other dimensions and tiling parameters are captured from SSA
658/// values. Bounds of each dimension can thus be treated independently,
659/// and deriving the new bounds is much simpler and faster than for the case of
660/// tiling arbitrary polyhedral shapes.
661static void constructParametricallyTiledIndexSetHyperRect(
662 MutableArrayRef<AffineForOp> origLoops,
663 MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
  assert(!origLoops.empty() && "expected at least one loop in band");
665 assert(origLoops.size() == tileSizes.size() &&
666 "expected tiling parameter for each loop in band.");
667
668 OpBuilder b(origLoops[0].getOperation());
669 unsigned width = origLoops.size();
670
671 // Set bounds for tile space loops.
672 for (unsigned i = 0; i < width; ++i) {
673 setInterTileBoundsParametric(b, origLoop: origLoops[i], newLoop: newLoops[i], tileSize: tileSizes[i]);
674 }
675
676 // Set bounds for intra-tile loops.
677 for (unsigned i = 0; i < width; ++i) {
678 setIntraTileBoundsParametric(b, origLoop: origLoops[i], newInterTileLoop: newLoops[i],
679 newIntraTileLoop: newLoops[i + width], tileSize: tileSizes[i]);
680 }
681}
682
683/// Constructs and sets new loop bounds after tiling for the case of
684/// hyper-rectangular index sets, where the bounds of one dimension do not
685/// depend on other dimensions. Bounds of each dimension can thus be treated
686/// independently, and deriving the new bounds is much simpler and faster
687/// than for the case of tiling arbitrary polyhedral shapes.
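///
/// For example (illustrative only), tiling
///   affine.for %i = 0 to 256 step 1
/// with a tile size of 32 produces a tile-space loop iterating from 0 to 256
/// with step 32 and an intra-tile loop iterating from the tile-space IV %ii to
/// %ii + 32 with step 1 (a min with the original upper bound is added when the
/// tile size does not evenly divide the trip count).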
688static void
689constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
690 MutableArrayRef<AffineForOp> newLoops,
691 ArrayRef<unsigned> tileSizes) {
692 assert(!origLoops.empty());
693 assert(origLoops.size() == tileSizes.size());
694
695 OpBuilder b(origLoops[0].getOperation());
696 unsigned width = origLoops.size();
697
698 // Bounds for tile space loops.
699 for (unsigned i = 0; i < width; i++) {
700 OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
701 OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
702 newLoops[i].setLowerBound(operands: newLbOperands, map: origLoops[i].getLowerBoundMap());
703 newLoops[i].setUpperBound(operands: newUbOperands, map: origLoops[i].getUpperBoundMap());
    // If the step size of the original loop is x and the tile size is y, then
    // after tiling the tile-space loop's step size becomes x * y.
706 newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
707 }
708 // Bounds for intra-tile loops.
709 for (unsigned i = 0; i < width; i++) {
710 int64_t largestDiv = getLargestDivisorOfTripCount(forOp: origLoops[i]);
711 std::optional<uint64_t> mayBeConstantCount =
712 getConstantTripCount(forOp: origLoops[i]);
    // The lower bound is just the tile-space loop's induction variable.
714 AffineMap lbMap = b.getDimIdentityMap();
715 newLoops[width + i].setLowerBound(
716 /*operands=*/newLoops[i].getInductionVar(), map: lbMap);
    // The step size of an intra-tile loop is just the original loop's step size.
718 newLoops[width + i].setStep(origLoops[i].getStepAsInt());
719
720 // Set the upper bound.
721 if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
722 // Trip count is less than the tile size: upper bound is lower bound +
723 // trip count * stepSize.
724 AffineMap ubMap = b.getSingleDimShiftAffineMap(
725 shift: *mayBeConstantCount * origLoops[i].getStepAsInt());
726 newLoops[width + i].setUpperBound(
727 /*operands=*/newLoops[i].getInductionVar(), map: ubMap);
728 } else if (largestDiv % tileSizes[i] != 0) {
729 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
730 // Construct the upper bound map; the operands are the original operands
731 // with 'i' (tile-space loop) appended to it. The new upper bound map is
732 // the original one with an additional expression i + tileSize * stepSize
733 // appended.
734
735 // Add dim operands from original upper bound.
736 SmallVector<Value, 4> ubOperands;
737 AffineBound ub = origLoops[i].getUpperBound();
738 ubOperands.reserve(N: ub.getNumOperands() + 1);
739 AffineMap origUbMap = ub.getMap();
740 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
741 ubOperands.push_back(Elt: ub.getOperand(idx: j));
742
743 // Add dim operand for new loop upper bound.
744 ubOperands.push_back(Elt: newLoops[i].getInductionVar());
745
746 // Add symbol operands from original upper bound.
747 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
748 ubOperands.push_back(Elt: ub.getOperand(idx: origUbMap.getNumDims() + j));
749
750 SmallVector<AffineExpr, 4> boundExprs;
751 boundExprs.reserve(N: 1 + origUbMap.getNumResults());
752 AffineExpr dim = b.getAffineDimExpr(position: origUbMap.getNumDims());
753 // The new upper bound map is the original one with an additional
754 // expression i + tileSize * stepSize (of original loop) appended.
755 boundExprs.push_back(Elt: dim + tileSizes[i] * origLoops[i].getStepAsInt());
756 boundExprs.append(in_start: origUbMap.getResults().begin(),
757 in_end: origUbMap.getResults().end());
758 AffineMap ubMap =
759 AffineMap::get(dimCount: origUbMap.getNumDims() + 1, symbolCount: origUbMap.getNumSymbols(),
760 results: boundExprs, context: b.getContext());
761 newLoops[width + i].setUpperBound(/*operands=*/ubOperands, map: ubMap);
762 } else {
763 // No need of the min expression.
764 AffineExpr dim = b.getAffineDimExpr(position: 0);
765 AffineMap ubMap = AffineMap::get(
766 dimCount: 1, symbolCount: 0, result: dim + tileSizes[i] * origLoops[i].getStepAsInt());
767 newLoops[width + i].setUpperBound(operands: newLoops[i].getInductionVar(), map: ubMap);
768 }
769 }
770}
771
772LogicalResult
773mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
774 ArrayRef<unsigned> tileSizes,
775 SmallVectorImpl<AffineForOp> *tiledNest) {
776 if (input.empty())
777 return success();
778
779 if (failed(Result: performPreTilingChecks(input, tileSizes)))
780 return failure();
781
782 MutableArrayRef<AffineForOp> origLoops = input;
783 AffineForOp rootAffineForOp = origLoops[0];
784
785 // Note that width is at least one since the band isn't empty.
786 unsigned width = input.size();
787 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
788
789 // Construct a tiled loop nest without setting their bounds. Bounds are
790 // set later.
791 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
792
793 SmallVector<Value, 8> origLoopIVs;
794 extractForInductionVars(forInsts: input, ivs: &origLoopIVs);
795
796 // Set loop bounds for the tiled loop nest.
797 constructTiledIndexSetHyperRect(origLoops, newLoops: tiledLoops, tileSizes);
798
799 // Replace original IVs with intra-tile loop IVs.
800 for (unsigned i = 0; i < width; i++)
801 origLoopIVs[i].replaceAllUsesWith(newValue: tiledLoops[i + width].getInductionVar());
802
803 // Erase the old loop nest.
804 rootAffineForOp.erase();
805
806 if (tiledNest)
807 *tiledNest = std::move(tiledLoops);
808
809 return success();
810}
811
812/// Tiles the specified band of perfectly nested loops creating tile-space
813/// loops and intra-tile loops, using SSA values as tiling parameters. A band
814/// is a contiguous set of loops.
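///
/// For example (a sketch with a hypothetical SSA tile size %ts), tiling
///   affine.for %i = 0 to %N
/// produces an inter-tile loop whose upper bound map computes %N ceildiv %ts
/// (given the constant lower bound 0) and an intra-tile loop whose bounds scale
/// the inter-tile IV by %ts, mirroring setInterTileBoundsParametric and
/// setIntraTileBoundsParametric above.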
815LogicalResult mlir::affine::tilePerfectlyNestedParametric(
816 MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
817 SmallVectorImpl<AffineForOp> *tiledNest) {
818 if (input.empty())
819 return success();
820
821 if (failed(Result: performPreTilingChecks(input, tileSizes)))
822 return failure();
823
824 MutableArrayRef<AffineForOp> origLoops = input;
825 AffineForOp rootAffineForOp = origLoops[0];
826 unsigned width = input.size();
827 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
828
829 // Construct a tiled loop nest without setting their bounds. Bounds are
830 // set later.
831 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
832
833 SmallVector<Value, 8> origLoopIVs;
834 extractForInductionVars(forInsts: input, ivs: &origLoopIVs);
835
836 // Set loop bounds for the tiled loop nest.
837 constructParametricallyTiledIndexSetHyperRect(origLoops, newLoops: tiledLoops,
838 tileSizes);
839
840 // Replace original IVs with intra-tile loop IVs.
841 for (unsigned i = 0; i < width; i++)
842 origLoopIVs[i].replaceAllUsesWith(newValue: tiledLoops[i + width].getInductionVar());
843
844 // Erase the old loop nest.
845 rootAffineForOp.erase();
846
847 if (tiledNest)
848 *tiledNest = std::move(tiledLoops);
849
850 return success();
851}
852
/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested within its parent iff the first op in
/// the parent's body is that loop and the second op is the parent's
/// terminator.
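///
/// For example (sketch, bounds elided), for a nest
///   affine.for %i { affine.for %j { affine.for %k { ... } "op"() : () -> () } }
/// rooted at %i, this collects the %i and %j loops but stops before %k's loop,
/// because %j's body holds more than one op besides the terminator.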
857void mlir::affine::getPerfectlyNestedLoops(
858 SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
859 for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
860 nestedLoops.push_back(Elt: root);
861 Block &body = root.getRegion().front();
862 if (body.begin() != std::prev(x: body.end(), n: 2))
863 return;
864
865 root = dyn_cast<AffineForOp>(Val: &body.front());
866 if (!root)
867 return;
868 }
869}
870
871/// Identify valid and profitable bands of loops to tile. This is currently just
872/// a temporary placeholder to test the mechanics of tiled code generation.
873/// Returns all maximal outermost perfect loop nests to tile.
874void mlir::affine::getTileableBands(
875 func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
876 // Get maximal perfect nest of 'affine.for' insts starting from root
877 // (inclusive).
878 for (AffineForOp forOp : f.getOps<AffineForOp>()) {
879 SmallVector<AffineForOp, 6> band;
880 getPerfectlyNestedLoops(nestedLoops&: band, root: forOp);
881 bands->push_back(x: band);
882 }
883}
884
885/// Unrolls this loop completely.
886LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
887 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
888 if (mayBeConstantTripCount.has_value()) {
889 uint64_t tripCount = *mayBeConstantTripCount;
890 if (tripCount == 0)
891 return success();
892 if (tripCount == 1)
893 return promoteIfSingleIteration(forOp);
894 return loopUnrollByFactor(forOp, unrollFactor: tripCount);
895 }
896 return failure();
897}
898
/// Unrolls this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
901LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
902 uint64_t unrollFactor) {
903 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
904 if (mayBeConstantTripCount.has_value() &&
905 *mayBeConstantTripCount < unrollFactor)
906 return loopUnrollByFactor(forOp, unrollFactor: *mayBeConstantTripCount);
907 return loopUnrollByFactor(forOp, unrollFactor);
908}
909
/// Generates 'unrollFactor' unrolled copies of the body in 'loopBodyBlock',
/// whose induction variable is 'forOpIV', calling 'ivRemapFn' to remap
/// 'forOpIV' for each unrolled body. If specified, annotates the ops in each
/// unrolled iteration using 'annotateFn'.
914static void generateUnrolledLoop(
915 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
916 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
917 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
918 ValueRange iterArgs, ValueRange yieldedValues) {
919 // Builder to insert unrolled bodies just before the terminator of the body of
920 // 'forOp'.
921 auto builder = OpBuilder::atBlockTerminator(block: loopBodyBlock);
922
923 constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
924 if (!annotateFn)
925 annotateFn = defaultAnnotateFn;
926
927 // Keep a pointer to the last non-terminator operation in the original block
928 // so that we know what to clone (since we are doing this in-place).
929 Block::iterator srcBlockEnd = std::prev(x: loopBodyBlock->end(), n: 2);
930
931 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
932 SmallVector<Value, 4> lastYielded(yieldedValues);
933
934 for (unsigned i = 1; i < unrollFactor; i++) {
935 IRMapping operandMap;
936
937 // Prepare operand map.
938 operandMap.map(from&: iterArgs, to&: lastYielded);
939
940 // If the induction variable is used, create a remapping to the value for
941 // this unrolled instance.
942 if (!forOpIV.use_empty()) {
943 Value ivUnroll = ivRemapFn(i, forOpIV, builder);
944 operandMap.map(from: forOpIV, to: ivUnroll);
945 }
946
947 // Clone the original body of 'forOp'.
948 for (auto it = loopBodyBlock->begin(); it != std::next(x: srcBlockEnd); it++) {
949 Operation *clonedOp = builder.clone(op&: *it, mapper&: operandMap);
950 annotateFn(i, clonedOp, builder);
951 }
952
953 // Update yielded values. If the yielded value is defined outside the
954 // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
955 // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
956 // value with the clone corresponding to the yielded value.
957 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
958 Operation *defOp = yieldedValues[i].getDefiningOp();
959 if (defOp && defOp->getBlock() == loopBodyBlock)
960 lastYielded[i] = operandMap.lookup(from: yieldedValues[i]);
961 }
962 }
963
964 // Make sure we annotate the Ops in the original body. We do this last so that
965 // any annotations are not copied into the cloned Ops above.
966 for (auto it = loopBodyBlock->begin(); it != std::next(x: srcBlockEnd); it++)
967 annotateFn(0, &*it, builder);
968
969 // Update operands of the yield statement.
970 loopBodyBlock->getTerminator()->setOperands(lastYielded);
971}
972
973/// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
974/// count is not a multiple of `unrollFactor`.
975static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
976 uint64_t unrollFactor) {
977 // Insert the cleanup loop right after 'forOp'.
978 OpBuilder builder(forOp->getBlock(), std::next(x: Block::iterator(forOp)));
979 auto cleanupForOp = cast<AffineForOp>(Val: builder.clone(op&: *forOp));
980
981 // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
982 // and produce results for the original users of `forOp` results.
983 auto results = forOp.getResults();
984 auto cleanupResults = cleanupForOp.getResults();
985 auto cleanupIterOperands = cleanupForOp.getInits();
986
987 for (auto e : llvm::zip(t&: results, u&: cleanupResults, args&: cleanupIterOperands)) {
988 std::get<0>(t&: e).replaceAllUsesWith(newValue: std::get<1>(t&: e));
989 cleanupForOp->replaceUsesOfWith(from: std::get<2>(t&: e), to: std::get<0>(t&: e));
990 }
991
992 AffineMap cleanupMap;
993 SmallVector<Value, 4> cleanupOperands;
994 getCleanupLoopLowerBound(forOp, unrollFactor, cleanupLbMap&: cleanupMap, cleanupLbOperands&: cleanupOperands);
995 if (!cleanupMap)
996 return failure();
997
998 cleanupForOp.setLowerBound(operands: cleanupOperands, map: cleanupMap);
999 // Promote the loop body up if this has turned into a single iteration loop.
1000 (void)promoteIfSingleIteration(forOp: cleanupForOp);
1001
1002 // Adjust upper bound of the original loop; this is the same as the lower
1003 // bound of the cleanup loop.
1004 forOp.setUpperBound(operands: cleanupOperands, map: cleanupMap);
1005 return success();
1006}
1007
1008/// Unrolls this loop by the specified factor. Returns success if the loop
1009/// is successfully unrolled.
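///
/// For example (sketch), unrolling the hypothetical loop
///   affine.for %i = 0 to 10
/// by a factor of 4 scales the step to 4, clones the body three extra times
/// with the IV remapped to %i + 1, %i + 2 and %i + 3 via affine.apply ops, and
/// emits a cleanup loop for the two remaining iterations [8, 10).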
1010LogicalResult mlir::affine::loopUnrollByFactor(
1011 AffineForOp forOp, uint64_t unrollFactor,
1012 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1013 bool cleanUpUnroll) {
1014 assert(unrollFactor > 0 && "unroll factor should be positive");
1015
1016 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1017 if (unrollFactor == 1) {
1018 if (mayBeConstantTripCount == 1 && failed(Result: promoteIfSingleIteration(forOp)))
1019 return failure();
1020 return success();
1021 }
1022
1023 // Nothing in the loop body other than the terminator.
1024 if (llvm::hasSingleElement(C&: forOp.getBody()->getOperations()))
1025 return success();
1026
1027 // If the trip count is lower than the unroll factor, no unrolled body.
1028 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1029 if (cleanUpUnroll) {
1030 // Unroll the cleanup loop if cleanUpUnroll is specified.
1031 return loopUnrollFull(forOp);
1032 }
1033
1034 return failure();
1035 }
1036
1037 // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1038 if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
    // Loops where the lower bound is a max expression or the upper bound is
    // a min expression and the trip count isn't a multiple of the unroll
    // factor can't be unrolled, since the lower bound of the cleanup loop in
    // such cases cannot be expressed as an affine function or a max over
    // affine functions.
1043 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1044 forOp.getUpperBoundMap().getNumResults() != 1)
1045 return failure();
1046 if (cleanUpUnroll)
1047 // Force unroll including cleanup loop
1048 return loopUnrollFull(forOp);
1049 if (failed(Result: generateCleanupLoopForUnroll(forOp, unrollFactor)))
1050 assert(false && "cleanup loop lower bound map for single result lower "
1051 "and upper bound maps can always be determined");
1052 }
1053
1054 ValueRange iterArgs(forOp.getRegionIterArgs());
1055 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1056
1057 // Scale the step of loop being unrolled by unroll factor.
1058 int64_t step = forOp.getStepAsInt();
1059 forOp.setStep(step * unrollFactor);
1060 generateUnrolledLoop(
1061 loopBodyBlock: forOp.getBody(), forOpIV: forOp.getInductionVar(), unrollFactor,
1062 ivRemapFn: [&](unsigned i, Value iv, OpBuilder b) {
1063 // iv' = iv + i * step
1064 auto d0 = b.getAffineDimExpr(position: 0);
1065 auto bumpMap = AffineMap::get(dimCount: 1, symbolCount: 0, result: d0 + i * step);
1066 return b.create<AffineApplyOp>(location: forOp.getLoc(), args&: bumpMap, args&: iv);
1067 },
1068 /*annotateFn=*/annotateFn,
1069 /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1070
1071 // Promote the loop body up if this has turned into a single iteration loop.
1072 (void)promoteIfSingleIteration(forOp);
1073 return success();
1074}
1075
1076LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1077 uint64_t unrollJamFactor) {
1078 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1079 if (mayBeConstantTripCount.has_value() &&
1080 *mayBeConstantTripCount < unrollJamFactor)
1081 return loopUnrollJamByFactor(forOp, unrollJamFactor: *mayBeConstantTripCount);
1082 return loopUnrollJamByFactor(forOp, unrollJamFactor);
1083}
1084
1085/// Check if all control operands of all loops are defined outside of `forOp`
1086/// and return false if not.
1087static bool areInnerBoundsInvariant(AffineForOp forOp) {
1088 auto walkResult = forOp.walk(callback: [&](AffineForOp aForOp) {
1089 for (auto controlOperand : aForOp.getControlOperands()) {
1090 if (!forOp.isDefinedOutsideOfLoop(value: controlOperand))
1091 return WalkResult::interrupt();
1092 }
1093 return WalkResult::advance();
1094 });
1095 return !walkResult.wasInterrupted();
1096}
1097
1098/// Unrolls and jams this loop by the specified factor.
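///
/// For example (an illustrative sketch), unroll-and-jamming
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to %N { "op"(%i, %j) : (index, index) -> () }
///   }
/// by a factor of 2 scales the outer step to 2 and duplicates the inner loop
/// (the jammed sub-block) once, with the second copy using %i + 1 through an
/// affine.apply, so both copies share the outer loop's control overhead.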
1099LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1100 uint64_t unrollJamFactor) {
1101 assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1102
1103 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1104 if (unrollJamFactor == 1) {
1105 if (mayBeConstantTripCount == 1 && failed(Result: promoteIfSingleIteration(forOp)))
1106 return failure();
1107 return success();
1108 }
1109
1110 // Nothing in the loop body other than the terminator.
1111 if (llvm::hasSingleElement(C&: forOp.getBody()->getOperations()))
1112 return success();
1113
1114 // If the trip count is lower than the unroll jam factor, no unroll jam.
1115 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1116 LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1117 return failure();
1118 }
1119
1120 // If any control operand of any inner loop of `forOp` is defined within
1121 // `forOp`, no unroll jam.
1122 if (!areInnerBoundsInvariant(forOp))
1123 return failure();
1124
1125 // Gather all sub-blocks to jam upon the loop being unrolled.
1126 JamBlockGatherer<AffineForOp> jbg;
1127 jbg.walk(op: forOp);
1128 auto &subBlocks = jbg.subBlocks;
1129
1130 // Collect loops with iter_args.
1131 SmallVector<AffineForOp, 4> loopsWithIterArgs;
1132 forOp.walk(callback: [&](AffineForOp aForOp) {
1133 if (aForOp.getNumIterOperands() > 0)
1134 loopsWithIterArgs.push_back(Elt: aForOp);
1135 });
1136
1137 // Get supported reductions to be used for creating reduction ops at the end.
1138 SmallVector<LoopReduction> reductions;
1139 if (forOp.getNumIterOperands() > 0)
1140 getSupportedReductions(forOp, supportedReductions&: reductions);
1141
1142 // Generate the cleanup loop if trip count isn't a multiple of
1143 // unrollJamFactor.
1144 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
    // Loops where the lower bound is a max expression or the upper bound is
    // a min expression and the trip count isn't a multiple of the unroll-jam
    // factor can't be unroll-jammed, since the lower bound of the cleanup loop
    // in such cases cannot be expressed as an affine function or a max over
    // affine functions.
1149 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1150 forOp.getUpperBoundMap().getNumResults() != 1)
1151 return failure();
1152 if (failed(Result: generateCleanupLoopForUnroll(forOp, unrollFactor: unrollJamFactor)))
1153 assert(false && "cleanup loop lower bound map for single result lower "
1154 "and upper bound maps can always be determined");
1155 }
1156
1157 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1158 // iteration. There are (`unrollJamFactor` - 1) iterations.
1159 SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1160
1161 // For any loop with iter_args, replace it with a new loop that has
1162 // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1163 // operands.
1164 SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1165 IRRewriter rewriter(forOp.getContext());
1166 for (AffineForOp oldForOp : loopsWithIterArgs) {
1167 SmallVector<Value> dupIterOperands, dupYieldOperands;
1168 ValueRange oldIterOperands = oldForOp.getInits();
1169 ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1170 ValueRange oldYieldOperands =
1171 cast<AffineYieldOp>(Val: oldForOp.getBody()->getTerminator()).getOperands();
1172 // Get additional iterOperands, iterArgs, and yield operands. We will
1173 // fix iterOperands and yield operands after cloning of sub-blocks.
1174 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1175 dupIterOperands.append(in_start: oldIterOperands.begin(), in_end: oldIterOperands.end());
1176 dupYieldOperands.append(in_start: oldYieldOperands.begin(), in_end: oldYieldOperands.end());
1177 }
1178 // Create a new loop with additional iterOperands, iter_args and yield
1179 // operands. This new loop will take the loop body of the original loop.
1180 bool forOpReplaced = oldForOp == forOp;
1181 AffineForOp newForOp =
1182 cast<AffineForOp>(Val: *oldForOp.replaceWithAdditionalYields(
1183 rewriter, newInitOperands: dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1184 newYieldValuesFn: [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1185 return dupYieldOperands;
1186 }));
1187 newLoopsWithIterArgs.push_back(Elt: newForOp);
1188 // `forOp` has been replaced with a new loop.
1189 if (forOpReplaced)
1190 forOp = newForOp;
1191 // Update `operandMaps` for `newForOp` iterArgs and results.
1192 ValueRange newIterArgs = newForOp.getRegionIterArgs();
1193 unsigned oldNumIterArgs = oldIterArgs.size();
1194 ValueRange newResults = newForOp.getResults();
1195 unsigned oldNumResults = newResults.size() / unrollJamFactor;
1196 assert(oldNumIterArgs == oldNumResults &&
1197 "oldNumIterArgs must be the same as oldNumResults");
1198 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1199 for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1200 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1201 // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1202 // to those in the `i`th new set.
1203 operandMaps[i - 1].map(from: newIterArgs[j],
1204 to: newIterArgs[i * oldNumIterArgs + j]);
1205 operandMaps[i - 1].map(from: newResults[j],
1206 to: newResults[i * oldNumResults + j]);
1207 }
1208 }
1209 }
1210
1211 // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1212 int64_t step = forOp.getStepAsInt();
1213 forOp.setStep(step * unrollJamFactor);
1214
1215 auto forOpIV = forOp.getInductionVar();
1216 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1217 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1218 for (auto &subBlock : subBlocks) {
1219 // Builder to insert unroll-jammed bodies. Insert right at the end of
1220 // sub-block.
1221 OpBuilder builder(subBlock.first->getBlock(), std::next(x: subBlock.second));
1222
1223 // If the induction variable is used, create a remapping to the value for
1224 // this unrolled instance.
1225 if (!forOpIV.use_empty()) {
1226 // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1227 auto d0 = builder.getAffineDimExpr(position: 0);
1228 auto bumpMap = AffineMap::get(dimCount: 1, symbolCount: 0, result: d0 + i * step);
1229 auto ivUnroll =
1230 builder.create<AffineApplyOp>(location: forOp.getLoc(), args&: bumpMap, args&: forOpIV);
1231 operandMaps[i - 1].map(from: forOpIV, to: ivUnroll);
1232 }
1233 // Clone the sub-block being unroll-jammed.
1234 for (auto it = subBlock.first; it != std::next(x: subBlock.second); ++it)
1235 builder.clone(op&: *it, mapper&: operandMaps[i - 1]);
1236 }
1237 // Fix iterOperands and yield op operands of newly created loops.
1238 for (auto newForOp : newLoopsWithIterArgs) {
1239 unsigned oldNumIterOperands =
1240 newForOp.getNumIterOperands() / unrollJamFactor;
1241 unsigned numControlOperands = newForOp.getNumControlOperands();
1242 auto yieldOp = cast<AffineYieldOp>(Val: newForOp.getBody()->getTerminator());
1243 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1244 assert(oldNumIterOperands == oldNumYieldOperands &&
1245 "oldNumIterOperands must be the same as oldNumYieldOperands");
1246 for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1247 // The `i`th duplication of an old iterOperand or yield op operand
1248 // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1249 // if such mapped value exists.
1250 newForOp.setOperand(i: numControlOperands + i * oldNumIterOperands + j,
1251 value: operandMaps[i - 1].lookupOrDefault(
1252 from: newForOp.getOperand(i: numControlOperands + j)));
1253 yieldOp.setOperand(
1254 i: i * oldNumYieldOperands + j,
1255 value: operandMaps[i - 1].lookupOrDefault(from: yieldOp.getOperand(i: j)));
1256 }
1257 }
1258 }
1259 if (forOp.getNumResults() > 0) {
1260 // Create reduction ops to combine every `unrollJamFactor` related results
1261 // into one value. For example, for %0:2 = affine.for ... and addf, we add
1262 // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1263 // %1.
1264 rewriter.setInsertionPointAfter(forOp);
1265 auto loc = forOp.getLoc();
1266 unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1267 for (LoopReduction &reduction : reductions) {
1268 unsigned pos = reduction.iterArgPosition;
1269 Value lhs = forOp.getResult(i: pos);
1270 Value rhs;
1271 SmallPtrSet<Operation *, 4> newOps;
1272 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1273 rhs = forOp.getResult(i: i * oldNumResults + pos);
1274 // Create ops based on reduction type.
1275 lhs = arith::getReductionOp(op: reduction.kind, builder&: rewriter, loc, lhs, rhs);
1276 if (!lhs)
1277 return failure();
1278 Operation *op = lhs.getDefiningOp();
1279 assert(op && "Reduction op should have been created");
1280 newOps.insert(Ptr: op);
1281 }
1282 // Replace all uses except those in newly created reduction ops.
1283 forOp.getResult(i: pos).replaceAllUsesExcept(newValue: lhs, exceptions: newOps);
1284 }
1285 }
1286
1287 // Promote the loop body up if this has turned into a single iteration loop.
1288 (void)promoteIfSingleIteration(forOp);
1289 return success();
1290}
1291
1292/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1293/// nested within 'forOpA' as the only non-terminator operation in its block.
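/// For illustration, a minimal sketch (bounds and the body op are arbitrary):
///
/// ```mlir
/// // Before: forOpA is the %i loop, forOpB the %j loop.
/// affine.for %i = 0 to 16 {
///   affine.for %j = 0 to 32 {
///     "test.use"(%i, %j) : (index, index) -> ()
///   }
/// }
/// // After interchangeLoops(forOpA, forOpB):
/// affine.for %j = 0 to 32 {
///   affine.for %i = 0 to 16 {
///     "test.use"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```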
1294void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1295 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1296 auto &forOpABody = forOpA.getBody()->getOperations();
1297 auto &forOpBBody = forOpB.getBody()->getOperations();
1298
1299 // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1300 // before forOpA (in forOpA's parent's block); this should leave forOpA's
1301 // body containing only the terminator.
1302 forOpA->getBlock()->getOperations().splice(where: Block::iterator(forOpA),
1303 L2&: forOpABody, first: forOpABody.begin(),
1304 last: std::prev(x: forOpABody.end()));
1305 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1306 // body (this leaves forOpB's body containing only the terminator).
1307 forOpABody.splice(where: forOpABody.begin(), L2&: forOpBBody, first: forOpBBody.begin(),
1308 last: std::prev(x: forOpBBody.end()));
1309 // 3) Splice forOpA into the beginning of forOpB's body.
1310 forOpBBody.splice(where: forOpBBody.begin(), L2&: forOpA->getBlock()->getOperations(),
1311 first: Block::iterator(forOpA));
1312}
1313
1314// Checks each dependence component against the permutation to see if the
1315// desired loop interchange would violate dependences by making the
1316// dependence component lexicographically negative.
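// For instance (a sketch), with two loops (%i, %j) and a dependence whose
// components are (2, -1), swapping the loops would place the -1 component
// first, making the dependence vector lexicographically negative, so this
// function returns false for that permutation.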
1317static bool checkLoopInterchangeDependences(
1318 const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1319 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1320 // Invert permutation map.
1321 unsigned maxLoopDepth = loops.size();
1322 SmallVector<unsigned, 4> loopPermMapInv;
1323 loopPermMapInv.resize(N: maxLoopDepth);
1324 for (unsigned i = 0; i < maxLoopDepth; ++i)
1325 loopPermMapInv[loopPermMap[i]] = i;
1326
1327 // Check each dependence component against the permutation to see if the
1328 // desired loop interchange permutation would make the dependence vectors
1329 // lexicographically negative.
1330 // Example 1: [-1, 1][0, 0]
1331 // Example 2: [0, 0][-1, 1]
1332 for (const auto &depComps : depCompsVec) {
1333 assert(depComps.size() >= maxLoopDepth);
1334 // Check if the first non-zero dependence component is positive.
1335 // This iterates through loops in the desired order.
1336 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1337 unsigned permIndex = loopPermMapInv[j];
1338 assert(depComps[permIndex].lb);
1339 int64_t depCompLb = *depComps[permIndex].lb;
1340 if (depCompLb > 0)
1341 break;
1342 if (depCompLb < 0)
1343 return false;
1344 }
1345 }
1346 return true;
1347}
1348
1349/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1350/// nested sequence of loops in 'loops' would violate dependences.
1351bool mlir::affine::isValidLoopInterchangePermutation(
1352 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1353 assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1354 unsigned maxLoopDepth = loops.size();
1355 if (maxLoopDepth == 1)
1356 return true;
1357 // Gather dependence components for dependences between all ops in loop nest
1358 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1359 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1360 getDependenceComponents(forOp: loops[0], maxLoopDepth, depCompsVec: &depCompsVec);
1361 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1362}
1363
1364/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1365/// in it from outermost to innermost.
1366bool LLVM_ATTRIBUTE_UNUSED
1367mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1368 assert(!loops.empty() && "no loops provided");
1369
1370 // We already know that the block can't be empty.
1371 auto hasTwoElements = [](Block *block) {
1372 auto secondOpIt = std::next(x: block->begin());
1373 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1374 };
1375
1376 auto enclosingLoop = loops.front();
1377 for (auto loop : loops.drop_front()) {
1378 auto parentForOp = dyn_cast<AffineForOp>(Val: loop->getParentOp());
1379 // parentForOp's body should be just this loop and the terminator.
1380 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1381 return false;
1382 enclosingLoop = loop;
1383 }
1384 return true;
1385}
1386
1387// input[i] should move from position i -> permMap[i]. Returns the position in
1388// `input` that becomes the new outermost loop.
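// For example, a minimal sketch (bounds are arbitrary): with `input` holding
// the loops (%i, %j, %k) from outermost to innermost and permMap = {1, 2, 0},
// %k becomes the new outermost loop and the function returns 2:
//
// ```mlir
// affine.for %k = 0 to 8 {      // was innermost (input[2])
//   affine.for %i = 0 to 16 {   // was outermost (input[0])
//     affine.for %j = 0 to 32 { // was in the middle (input[1])
//       "test.use"(%i, %j, %k) : (index, index, index) -> ()
//     }
//   }
// }
// ```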
1389unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
1390 ArrayRef<unsigned> permMap) {
1391 assert(input.size() == permMap.size() && "invalid permutation map size");
1392 // Check whether the permutation spec is valid. This is a small vector - we'll
1393 // just sort and check if it's iota.
1394 SmallVector<unsigned, 4> checkPermMap(permMap);
1395 llvm::sort(C&: checkPermMap);
1396 if (llvm::any_of(Range: llvm::enumerate(First&: checkPermMap),
1397 P: [](const auto &en) { return en.value() != en.index(); }))
1398 assert(false && "invalid permutation map");
1399
1400 // Nothing to do.
1401 if (input.size() < 2)
1402 return 0;
1403
1404 assert(isPerfectlyNested(input) && "input not perfectly nested");
1405
1406 // Compute the inverse mapping, invPermMap: since input[i] goes to position
1407 // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1408 SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1409 for (unsigned i = 0, e = input.size(); i < e; ++i)
1410 invPermMap.push_back(Elt: {permMap[i], i});
1411 llvm::sort(C&: invPermMap);
1412
1413 // Move the innermost loop body to the loop that would be the innermost in the
1414 // permuted nest (only if the innermost loop is going to change).
1415 if (permMap.back() != input.size() - 1) {
1416 Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1417 Block *srcBody = ((AffineForOp)input.back()).getBody();
1418 destBody->getOperations().splice(where: destBody->begin(),
1419 L2&: srcBody->getOperations(), first: srcBody->begin(),
1420 last: std::prev(x: srcBody->end()));
1421 }
1422
1423 // We'll move each loop in `input` in the reverse order so that its body is
1424 // empty when we are moving it; this incurs zero copies and no erasing.
1425 for (int i = input.size() - 1; i >= 0; --i) {
1426 // If this has to become the outermost loop after permutation, add it to the
1427 // parent block of the original root.
1428 if (permMap[i] == 0) {
1429 // If the root remains the same, nothing to do.
1430 if (i == 0)
1431 continue;
1432 // Make input[i] the new outermost loop moving it into parentBlock.
1433 auto *parentBlock = input[0]->getBlock();
1434 parentBlock->getOperations().splice(where: Block::iterator(input[0]),
1435 L2&: input[i]->getBlock()->getOperations(),
1436 first: Block::iterator(input[i]));
1437 continue;
1438 }
1439
1440 // If the parent in the permuted order is the same as in the original,
1441 // nothing to do.
1442 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1443 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1444 continue;
1445
1446 // Move input[i] to its surrounding loop in the transformed nest.
1447 auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1448 destBody->getOperations().splice(where: destBody->begin(),
1449 L2&: input[i]->getBlock()->getOperations(),
1450 first: Block::iterator(input[i]));
1451 }
1452
1453 return invPermMap[0].second;
1454}
1455
1456// Sinks all sequential loops to the innermost levels (while preserving
1457// relative order among them) and moves all parallel loops to the
1458 // outermost levels (while again preserving relative order among them).
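// For example (a sketch), if the perfect nest rooted at `forOp` is
// (parallel, sequential, parallel) from outermost to innermost, the computed
// permutation is {0, 2, 1}: both parallel loops move outward (keeping their
// relative order) and the sequential loop is sunk innermost.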
1459AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1460 SmallVector<AffineForOp, 4> loops;
1461 getPerfectlyNestedLoops(nestedLoops&: loops, root: forOp);
1462 if (loops.size() < 2)
1463 return forOp;
1464
1465 // Gather dependence components for dependences between all ops in loop nest
1466 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1467 unsigned maxLoopDepth = loops.size();
1468 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1469 getDependenceComponents(forOp: loops[0], maxLoopDepth, depCompsVec: &depCompsVec);
1470
1471 // Mark loops as either parallel or sequential.
1472 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1473 for (auto &depComps : depCompsVec) {
1474 assert(depComps.size() >= maxLoopDepth);
1475 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1476 DependenceComponent &depComp = depComps[j];
1477 assert(depComp.lb.has_value() && depComp.ub.has_value());
1478 if (*depComp.lb != 0 || *depComp.ub != 0)
1479 isParallelLoop[j] = false;
1480 }
1481 }
1482
1483 unsigned numParallelLoops = llvm::count(Range&: isParallelLoop, Element: true);
1484
1485 // Compute permutation of loops that sinks sequential loops (and thus raises
1486 // parallel loops) while preserving relative order.
1487 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1488 unsigned nextSequentialLoop = numParallelLoops;
1489 unsigned nextParallelLoop = 0;
1490 for (unsigned i = 0; i < maxLoopDepth; ++i) {
1491 if (isParallelLoop[i]) {
1492 loopPermMap[i] = nextParallelLoop++;
1493 } else {
1494 loopPermMap[i] = nextSequentialLoop++;
1495 }
1496 }
1497
1498 // Check if permutation 'loopPermMap' would violate dependences.
1499 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1500 return forOp;
1501 // Perform loop interchange according to permutation 'loopPermMap'.
1502 unsigned loopNestRootIndex = permuteLoops(input: loops, permMap: loopPermMap);
1503 return loops[loopNestRootIndex];
1504}
1505
1506// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1507// lower (resp. upper) loop bound. When called for both the lower and upper
1508// bounds, the resulting IR resembles:
1509//
1510// ```mlir
1511 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1512// ...
1513// }
1514// ```
1515static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1516 SmallVector<Value, 4> *operands,
1517 int64_t offset = 0) {
1518 auto bounds = llvm::to_vector<4>(Range: map->getResults());
1519 bounds.push_back(Elt: b.getAffineDimExpr(position: map->getNumDims()) + offset);
1520 operands->insert(I: operands->begin() + map->getNumDims(), Elt: iv);
1521 *map = AffineMap::get(dimCount: map->getNumDims() + 1, symbolCount: map->getNumSymbols(), results: bounds,
1522 context: b.getContext());
1523 canonicalizeMapAndOperands(map, operands);
1524}
1525
1526// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1527// Stripmine-sink is a primitive building block for generalized tiling of
1528// imperfectly nested loops.
1529// This transformation is purely mechanical and does not check legality,
1530// profitability or even structural correctness. It is the user's
1531// responsibility to specify `targets` that are dominated by `forOp`.
1532// Returns the new AffineForOps, one per `targets`, nested immediately under
1533// each of the `targets`.
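// For illustration, a minimal sketch (bounds are arbitrary; shown before any
// canonicalization of the new loop's bound maps):
//
// ```mlir
// // Before: %i is `forOp` with step 1; the %t loop is the single target.
// affine.for %i = 0 to 128 {
//   affine.for %t = 0 to 32 {
//     "test.use"(%i, %t) : (index, index) -> ()
//   }
// }
// // After stripmineSink(forOp, /*factor=*/4, {target}): %i now steps by 4 and
// // a new point loop over [%i, %i + 4) is nested under the target.
// affine.for %i = 0 to 128 step 4 {
//   affine.for %t = 0 to 32 {
//     affine.for %ii = max affine_map<(d0) -> (0, d0)>(%i)
//         to min affine_map<(d0) -> (128, d0 + 4)>(%i) {
//       "test.use"(%ii, %t) : (index, index) -> ()
//     }
//   }
// }
// ```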
1534static SmallVector<AffineForOp, 8>
1535stripmineSink(AffineForOp forOp, uint64_t factor,
1536 ArrayRef<AffineForOp> targets) {
1537 auto originalStep = forOp.getStepAsInt();
1538 auto scaledStep = originalStep * factor;
1539 forOp.setStep(scaledStep);
1540
1541 OpBuilder b(forOp->getBlock(), std::next(x: Block::iterator(forOp)));
1542
1543 // Lower-bound map creation.
1544 auto lbMap = forOp.getLowerBoundMap();
1545 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1546 augmentMapAndBounds(b, iv: forOp.getInductionVar(), map: &lbMap, operands: &lbOperands);
1547
1548 // Upper-bound map creation.
1549 auto ubMap = forOp.getUpperBoundMap();
1550 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1551 augmentMapAndBounds(b, iv: forOp.getInductionVar(), map: &ubMap, operands: &ubOperands,
1552 /*offset=*/scaledStep);
1553
1554 auto iv = forOp.getInductionVar();
1555 SmallVector<AffineForOp, 8> innerLoops;
1556 for (auto t : targets) {
1557 // Insert newForOp before the terminator of `t`.
1558 auto b = OpBuilder::atBlockTerminator(block: t.getBody());
1559 auto newForOp = b.create<AffineForOp>(location: t.getLoc(), args&: lbOperands, args&: lbMap,
1560 args&: ubOperands, args&: ubMap, args&: originalStep);
1561 auto begin = t.getBody()->begin();
1562 // Skip terminator and `newForOp` which is just before the terminator.
1563 auto nOps = t.getBody()->getOperations().size() - 2;
1564 newForOp.getBody()->getOperations().splice(
1565 where: newForOp.getBody()->getOperations().begin(),
1566 L2&: t.getBody()->getOperations(), first: begin, last: std::next(x: begin, n: nOps));
1567 replaceAllUsesInRegionWith(orig: iv, replacement: newForOp.getInductionVar(),
1568 region&: newForOp.getRegion());
1569 innerLoops.push_back(Elt: newForOp);
1570 }
1571
1572 return innerLoops;
1573}
1574
1575// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1576 // Returns the new AffineForOp, nested immediately under `target`.
1577template <typename SizeType>
1578static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1579 AffineForOp target) {
1580 // TODO: Use cheap structural assertions that targets are nested under
1581 // forOp and that targets are not nested under each other when DominanceInfo
1582 // exposes the capability. It seems overkill to construct a whole function
1583 // dominance tree at this point.
1584 auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1585 assert(res.size() == 1 && "Expected 1 inner forOp");
1586 return res[0];
1587}
1588
1589SmallVector<SmallVector<AffineForOp, 8>, 8>
1590mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1591 ArrayRef<AffineForOp> targets) {
1592 SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1593 SmallVector<AffineForOp, 8> currentTargets(targets);
1594 for (auto it : llvm::zip(t&: forOps, u&: sizes)) {
1595 auto step = stripmineSink(forOp: std::get<0>(t&: it), factor: std::get<1>(t&: it), targets: currentTargets);
1596 res.push_back(Elt: step);
1597 currentTargets = step;
1598 }
1599 return res;
1600}
1601
1602SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1603 ArrayRef<uint64_t> sizes,
1604 AffineForOp target) {
1605 SmallVector<AffineForOp, 8> res;
1606 for (auto loops : tile(forOps, sizes, targets: ArrayRef<AffineForOp>(target)))
1607 res.push_back(Elt: llvm::getSingleElement(C&: loops));
1608 return res;
1609}
1610
1611LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1612 if (loops.size() < 2)
1613 return success();
1614
1615 AffineForOp innermost = loops.back();
1616 AffineForOp outermost = loops.front();
1617 AffineBound ub = outermost.getUpperBound();
1618 AffineMap origUbMap = ub.getMap();
1619 Location loc = outermost.getLoc();
1620 OpBuilder builder(outermost);
1621 for (AffineForOp loop : loops) {
1622 // We only work on normalized loops.
1623 if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1624 loop.getConstantLowerBound() != 0)
1625 return failure();
1626 }
1627 SmallVector<Value, 4> upperBoundSymbols;
1628 SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1629 ub.getOperands().end());
1630
1631 // 1. Store the upper bound of the outermost loop in a variable.
1632 Value prev;
1633 if (!llvm::hasSingleElement(C: origUbMap.getResults()))
1634 prev = builder.create<AffineMinOp>(location: loc, args&: origUbMap, args&: ubOperands);
1635 else
1636 prev = builder.create<AffineApplyOp>(location: loc, args&: origUbMap, args&: ubOperands);
1637 upperBoundSymbols.push_back(Elt: prev);
1638
1639 // 2. Emit code computing the upper bound of the coalesced loop as the product of
1640 // the number of iterations of all loops.
1641 for (AffineForOp loop : loops.drop_front()) {
1642 ub = loop.getUpperBound();
1643 origUbMap = ub.getMap();
1644 ubOperands = ub.getOperands();
1645 Value upperBound;
1646 // If upper bound map has more than one result, take their minimum.
1647 if (!llvm::hasSingleElement(C: origUbMap.getResults()))
1648 upperBound = builder.create<AffineMinOp>(location: loc, args&: origUbMap, args&: ubOperands);
1649 else
1650 upperBound = builder.create<AffineApplyOp>(location: loc, args&: origUbMap, args&: ubOperands);
1651 upperBoundSymbols.push_back(Elt: upperBound);
1652 SmallVector<Value, 4> operands;
1653 operands.push_back(Elt: prev);
1654 operands.push_back(Elt: upperBound);
1655 // Maintain running product of loop upper bounds.
1656 prev = builder.create<AffineApplyOp>(
1657 location: loc,
1658 args: AffineMap::get(/*dimCount=*/1,
1659 /*symbolCount=*/1,
1660 result: builder.getAffineDimExpr(position: 0) *
1661 builder.getAffineSymbolExpr(position: 0)),
1662 args&: operands);
1663 }
1664 // Set upper bound of the coalesced loop.
1665 AffineMap newUbMap = AffineMap::get(
1666 /*dimCount=*/0,
1667 /*symbolCount=*/1, results: builder.getAffineSymbolExpr(position: 0), context: builder.getContext());
1668 outermost.setUpperBound(operands: prev, map: newUbMap);
1669
1670 builder.setInsertionPointToStart(outermost.getBody());
1671
1672 // 3. Remap induction variables. For each original loop, the value of the
1673 // induction variable can be obtained by dividing the induction variable of
1674 // the linearized loop by the total number of iterations of the loops nested
1675 // in it modulo the number of iterations in this loop (remove the values
1676 // related to the outer loops):
1677 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1678 // Compute these iteratively from the innermost loop by creating a "running
1679 // quotient" of division by the range.
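// For example (a sketch), for three normalized loops with ranges 4, 5 and 6
// (outermost to innermost) and a coalesced IV %iv running over [0, 120):
//   iv_2 = %iv mod 6
//   iv_1 = (%iv floordiv 6) mod 5
//   iv_0 = (%iv floordiv 6) floordiv 5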
1680 Value previous = outermost.getInductionVar();
1681 for (unsigned idx = loops.size(); idx > 0; --idx) {
1682 if (idx != loops.size()) {
1683 SmallVector<Value, 4> operands;
1684 operands.push_back(Elt: previous);
1685 operands.push_back(Elt: upperBoundSymbols[idx]);
1686 previous = builder.create<AffineApplyOp>(
1687 location: loc,
1688 args: AffineMap::get(
1689 /*dimCount=*/1, /*symbolCount=*/1,
1690 result: builder.getAffineDimExpr(position: 0).floorDiv(
1691 other: builder.getAffineSymbolExpr(position: 0))),
1692 args&: operands);
1693 }
1694 // Modified value of the induction variables of the nested loops after
1695 // coalescing.
1696 Value inductionVariable;
1697 if (idx == 1) {
1698 inductionVariable = previous;
1699 } else {
1700 SmallVector<Value, 4> applyOperands;
1701 applyOperands.push_back(Elt: previous);
1702 applyOperands.push_back(Elt: upperBoundSymbols[idx - 1]);
1703 inductionVariable = builder.create<AffineApplyOp>(
1704 location: loc,
1705 args: AffineMap::get(
1706 /*dimCount=*/1, /*symbolCount=*/1,
1707 result: builder.getAffineDimExpr(position: 0) % builder.getAffineSymbolExpr(position: 0)),
1708 args&: applyOperands);
1709 }
1710 replaceAllUsesInRegionWith(orig: loops[idx - 1].getInductionVar(),
1711 replacement: inductionVariable, region&: loops.back().getRegion());
1712 }
1713
1714 // 4. Move the operations from the innermost loop's body to just above the
1715 // second-outermost loop, then delete the extra terminator and the second-outermost loop.
1716 AffineForOp secondOutermostLoop = loops[1];
1717 innermost.getBody()->back().erase();
1718 outermost.getBody()->getOperations().splice(
1719 where: Block::iterator(secondOutermostLoop.getOperation()),
1720 L2&: innermost.getBody()->getOperations());
1721 secondOutermostLoop.erase();
1722 return success();
1723}
1724
1725void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1726 ArrayRef<Value> processorId,
1727 ArrayRef<Value> numProcessors) {
1728 assert(processorId.size() == numProcessors.size());
1729 if (processorId.empty())
1730 return;
1731
1732 OpBuilder b(forOp);
1733 Location loc(forOp.getLoc());
1734 AffineExpr lhs, rhs;
1735 bindSymbols(ctx: forOp.getContext(), exprs&: lhs, exprs&: rhs);
1736 auto mulMap = AffineMap::get(dimCount: 0, symbolCount: 2, result: lhs * rhs);
1737 auto addMap = AffineMap::get(dimCount: 0, symbolCount: 2, result: lhs + rhs);
1738
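 // For example (a sketch), with processor ids (%p0, %p1) and counts (%n0, %n1),
 // the loop below computes linearIndex = %p0 * %n1 + %p1; the for op's lower
 // bound then becomes lb + linearIndex * step and its step becomes
 // step * %n0 * %n1.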
1739 Value linearIndex = processorId.front();
1740 for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1741 auto mulApplyOp = b.create<AffineApplyOp>(
1742 location: loc, args&: mulMap, args: ValueRange{linearIndex, numProcessors[i]});
1743 linearIndex = b.create<AffineApplyOp>(
1744 location: loc, args&: addMap, args: ValueRange{mulApplyOp, processorId[i]});
1745 }
1746
1747 auto mulApplyOp = b.create<AffineApplyOp>(
1748 location: loc, args&: mulMap, args: ValueRange{linearIndex, forOp.getStep()});
1749 Value lb = b.create<AffineApplyOp>(
1750 location: loc, args&: addMap, args: ValueRange{mulApplyOp, forOp.getLowerBound()});
1751 forOp.setLowerBound(lb);
1752
1753 Value step = forOp.getStep();
1754 for (auto numProcs : numProcessors)
1755 step = b.create<AffineApplyOp>(location: loc, args&: mulMap, args: ValueRange{numProcs, step});
1756 forOp.setStep(step);
1757}
1758
1759/// Given a memref region, determine the lowest depth at which transfers can be
1760/// placed for it, and return the corresponding block, start and end positions
1761/// in the block for placing incoming (read) and outgoing (write) copies
1762/// respectively. The lowest depth depends on whether the region being accessed
1763/// is hoistable with respect to one or more immediately surrounding loops.
1764static void
1765findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1766 Block::iterator &begin, Block::iterator &end,
1767 Block **copyPlacementBlock,
1768 Block::iterator *copyInPlacementStart,
1769 Block::iterator *copyOutPlacementStart) {
1770 const auto *cst = region.getConstraints();
1771 SmallVector<Value, 4> symbols;
1772 cst->getValues(start: cst->getNumDimVars(), end: cst->getNumDimAndSymbolVars(), values: &symbols);
1773
1774 SmallVector<Operation *, 4> enclosingAffineOps;
1775 getEnclosingAffineOps(op&: *block.begin(), ops: &enclosingAffineOps);
1776 // Walk up loop parents till we find an IV on which this region is
1777 // symbolic/variant, or an op we cannot hoist past.
1778 auto it = enclosingAffineOps.rbegin();
1779 AffineForOp lastInvariantFor;
1780 for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1781 Operation *enclosingOp = *it;
1782 // We can't hoist past the definition of the memref being copied.
1783 Value memref = region.memref;
1784 if (!memref.getParentRegion()->isAncestor(other: enclosingOp->getParentRegion())) {
1785 LLVM_DEBUG(
1786 llvm::dbgs()
1787 << "memref definition will end up not dominating hoist location\n");
1788 break;
1789 }
1790
1791 auto affineFor = dyn_cast<AffineForOp>(Val: enclosingOp);
1792 if (!affineFor)
1793 break;
1794 // TODO: also need to be checking this for region symbols that
1795 // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1796 if (llvm::is_contained(Range&: symbols, Element: affineFor.getInductionVar()))
1797 break;
1798 lastInvariantFor = affineFor;
1799 }
1800
1801 if (it != enclosingAffineOps.rbegin()) {
1802 *copyInPlacementStart = Block::iterator(lastInvariantFor);
1803 *copyOutPlacementStart = std::next(x: *copyInPlacementStart);
1804 *copyPlacementBlock = lastInvariantFor->getBlock();
1805 } else {
1806 *copyInPlacementStart = begin;
1807 *copyOutPlacementStart = end;
1808 *copyPlacementBlock = &block;
1809 }
1810}
1811
1812// Info comprising stride and number of elements transferred every stride.
1813struct StrideInfo {
1814 int64_t stride;
1815 int64_t numEltPerStride;
1816};
1817
1818/// Returns striding information for a copy/transfer of this region with
1819/// potentially multiple striding levels from outermost to innermost. For an
1820/// n-dimensional region, there can be at most n-1 levels of striding
1821/// successively nested.
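/// For example (a sketch), copying a 64x32 region out of a memref<128x256xf32>
/// yields a single stride level: contiguous runs of 32 elements placed 256
/// elements apart in the original memref, i.e.,
/// {stride = 256, numEltPerStride = 32}.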
1822// TODO: make this work with non-identity layout maps.
1823static void getMultiLevelStrides(const MemRefRegion &region,
1824 ArrayRef<int64_t> bufferShape,
1825 SmallVectorImpl<StrideInfo> *strideInfos) {
1826 if (bufferShape.size() <= 1)
1827 return;
1828
1829 int64_t numEltPerStride = 1;
1830 int64_t stride = 1;
1831 for (int d = bufferShape.size() - 1; d >= 1; d--) {
1832 int64_t dimSize = cast<MemRefType>(Val: region.memref.getType()).getDimSize(idx: d);
1833 stride *= dimSize;
1834 numEltPerStride *= bufferShape[d];
1835 // A stride is needed only if the region has a shorter extent than the
1836 // memref along the dimension *and* has an extent greater than one along the
1837 // next major dimension.
1838 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1839 strideInfos->push_back(Elt: {.stride: stride, .numEltPerStride: numEltPerStride});
1840 }
1841 }
1842}
1843
1844/// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1845/// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1846/// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1847/// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1848/// contain the expressions to be subtracted out from the respective copy loop
1849 /// iterators in order to index the fast buffer. If `isCopyOut' is true, generates
1850/// a copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1851/// copy nest is inserted.
1852//
1853/// The copy-in nest is generated as follows as an example for a 2-d region:
1854/// for x = ...
1855/// for y = ...
1856/// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1857///
1858static AffineForOp
1859generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1860 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1861 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1862 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1863 OpBuilder b) {
1864 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1865 return lbMap.getNumInputs() == lbOperands.size();
1866 }));
1867 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1868 return ubMap.getNumInputs() == ubOperands.size();
1869 }));
1870
1871 unsigned rank = cast<MemRefType>(Val: memref.getType()).getRank();
1872 // A copy nest can't be generated for 0-ranked memrefs.
1873 assert(rank != 0 && "non-zero rank memref expected");
1874 assert(lbMaps.size() == rank && "wrong number of lb maps");
1875 assert(ubMaps.size() == rank && "wrong number of ub maps");
1876
1877 SmallVector<Value, 4> memIndices;
1878 SmallVector<AffineExpr, 4> fastBufExprs;
1879 SmallVector<Value, 4> fastBufMapOperands;
1880 AffineForOp copyNestRoot;
1881 SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1882 for (unsigned d = 0; d < rank; ++d) {
1883 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMap: lbMaps[d],
1884 ubOperands, ubMap: ubMaps[d]);
1885 if (d == 0)
1886 copyNestRoot = forOp;
1887
1888 b = OpBuilder::atBlockTerminator(block: forOp.getBody());
1889
1890 auto fastBufOffsetMap =
1891 AffineMap::get(dimCount: lbOperands.size(), symbolCount: 0, result: fastBufOffsets[d]);
1892 auto offset = b.create<AffineApplyOp>(location: loc, args&: fastBufOffsetMap, args&: lbOperands);
1893
1894 // Construct the subscript for the fast memref being copied into/from:
1895 // x - offset_x.
1896 fastBufExprs.push_back(Elt: b.getAffineDimExpr(position: 2 * d + 1) -
1897 b.getAffineDimExpr(position: 2 * d));
1898 fastBufMapOperands.push_back(Elt: offset);
1899 fastBufMapOperands.push_back(Elt: forOp.getInductionVar());
1900 mayBeDeadApplys.push_back(Elt: offset);
1901
1902 // Subscript for the slow memref being copied.
1903 memIndices.push_back(Elt: forOp.getInductionVar());
1904 }
1905
1906 auto fastBufMap =
1907 AffineMap::get(dimCount: 2 * rank, /*symbolCount=*/0, results: fastBufExprs, context: b.getContext());
1908 fullyComposeAffineMapAndOperands(map: &fastBufMap, operands: &fastBufMapOperands);
1909 fastBufMap = simplifyAffineMap(map: fastBufMap);
1910 canonicalizeMapAndOperands(map: &fastBufMap, operands: &fastBufMapOperands);
1911
1912 // Drop any dead affine.applys.
1913 for (auto applyOp : mayBeDeadApplys)
1914 if (applyOp.use_empty())
1915 applyOp.erase();
1916
1917 if (!isCopyOut) {
1918 // Copy in.
1919 auto load = b.create<AffineLoadOp>(location: loc, args&: memref, args&: memIndices);
1920 b.create<AffineStoreOp>(location: loc, args&: load, args&: fastMemRef, args&: fastBufMap,
1921 args&: fastBufMapOperands);
1922 return copyNestRoot;
1923 }
1924
1925 // Copy out.
1926 auto load =
1927 b.create<AffineLoadOp>(location: loc, args&: fastMemRef, args&: fastBufMap, args&: fastBufMapOperands);
1928 b.create<AffineStoreOp>(location: loc, args&: load, args&: memref, args&: memIndices);
1929 return copyNestRoot;
1930}
1931
1932static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1933emitRemarkForBlock(Block &block) {
1934 return block.getParentOp()->emitRemark();
1935}
1936
1937/// Creates a buffer in the faster memory space for the specified memref region
1938/// (memref has to be non-zero ranked); generates a copy from the lower memory
1939/// space to this one, and replaces all loads/stores in the block range
1940/// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1941/// if copies could not be generated due to yet unimplemented cases.
1942/// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1943/// specify the insertion points where the incoming copies and outgoing copies,
1944/// respectively, should be inserted (the insertion happens right before the
1945/// insertion point). Since `begin` can itself be invalidated due to the memref
1946/// rewriting done from this method, the output argument `nBegin` is set to its
1947/// replacement (set to `begin` if no invalidation happens). Since outgoing
1948/// copies could have been inserted at `end`, the output argument `nEnd` is set
1949/// to the new end. `sizeInBytes` is set to the size of the fast buffer
1950/// allocated.
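/// For illustration, a rough sketch of the output for a read region covering
/// [0, 64) of a memref<256xf32> %A (the exact IR depends on the region, the
/// placement, and `copyOptions`; DMA generation emits affine.dma_start/wait
/// instead of the loop below):
///
/// ```mlir
/// %buf = memref.alloc() : memref<64xf32, 1>
/// affine.for %i = 0 to 64 {
///   %v = affine.load %A[%i] : memref<256xf32>
///   affine.store %v, %buf[%i] : memref<64xf32, 1>
/// }
/// // ... ops in [begin, end) now load from %buf instead of %A ...
/// memref.dealloc %buf : memref<64xf32, 1>
/// ```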
1951static LogicalResult generateCopy(
1952 const MemRefRegion &region, Block *block, Block::iterator begin,
1953 Block::iterator end, Block *copyPlacementBlock,
1954 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1955 const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1956 DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1957 Block::iterator *nBegin, Block::iterator *nEnd) {
1958 *nBegin = begin;
1959 *nEnd = end;
1960
1961 auto f = begin->getParentOfType<FunctionOpInterface>();
1962 OpBuilder topBuilder(f.getFunctionBody());
1963 Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(location: f.getLoc(), args: 0);
1964
1965 *sizeInBytes = 0;
1966
1967 if (begin == end)
1968 return success();
1969
1970 // Record the last op in the block for which we are performing copy
1971 // generation. We later do the memref replacement only in [begin, lastCopyOp]
1972 // so that the original memrefs used in the data movement code itself
1973 // don't get replaced.
1974 Operation *lastCopyOp = end->getPrevNode();
1975
1976 // Whether the copy-out point is at the end of the block where we are doing
1977 // explicit copying.
1978 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1979
1980 // Copies for read regions are going to be inserted at 'begin'.
1981 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1982 // Copies for write regions are going to be inserted at 'end'.
1983 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1984 OpBuilder &b = region.isWrite() ? epilogue : prologue;
1985
1986 // Builder to create constants at the top level.
1987 auto func =
1988 copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1989 OpBuilder top(func.getFunctionBody());
1990
1991 auto loc = region.loc;
1992 auto memref = region.memref;
1993 auto memRefType = cast<MemRefType>(Val: memref.getType());
1994
1995 if (!memRefType.getLayout().isIdentity()) {
1996 LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1997 return failure();
1998 }
1999
2000 // Indices to use for the copying.
2001 // Indices for the original memref being copied from/to.
2002 SmallVector<Value, 4> memIndices;
2003 // Indices for the faster buffer being copied into/from.
2004 SmallVector<Value, 4> bufIndices;
2005
2006 unsigned rank = memRefType.getRank();
2007 if (rank == 0) {
2008 LLVM_DEBUG(llvm::dbgs() << "Only non-zero ranked memrefs are supported\n");
2009 return failure();
2010 }
2011
2012 SmallVector<int64_t, 4> fastBufferShape;
2013
2014 // Compute the extents of the buffer.
2015 SmallVector<AffineMap, 2> lbs;
2016 lbs.reserve(N: rank);
2017 std::optional<int64_t> numElements =
2018 region.getConstantBoundingSizeAndShape(shape: &fastBufferShape, lbs: &lbs);
2019 if (!numElements) {
2020 LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2021 return failure();
2022 }
2023
2024 if (llvm::any_of(Range&: lbs, P: [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2025 // This can be supported in the future if needed.
2026 LLVM_DEBUG(llvm::dbgs()
2027 << "Max lower bound for memref region start not supported\n");
2028 return failure();
2029 }
2030
2031 if (*numElements == 0) {
2032 LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2033 return success();
2034 }
2035
2036 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2037 for (unsigned i = 0; i < rank; ++i) {
2038 region.getLowerAndUpperBound(pos: i, lbMap&: lbMaps[i], ubMap&: ubMaps[i]);
2039 if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2040 LLVM_DEBUG(llvm::dbgs()
2041 << "Missing lower or upper bound for region along dimension: "
2042 << i << '\n');
2043 return failure();
2044 }
2045 }
2046
2047 const FlatAffineValueConstraints *cst = region.getConstraints();
2048 // 'regionSymbols' holds values that this memory region is symbolic/parametric
2049 // on; these typically include loop IVs surrounding the level at which the
2050 // copy generation is being done or other valid symbols in MLIR.
2051 SmallVector<Value, 8> regionSymbols;
2052 cst->getValues(start: rank, end: cst->getNumVars(), values: &regionSymbols);
2053
2054 // Construct the access expression for the fast memory buffer. The access
2055 // expression for a particular dimension of the fast buffer is obtained by
2056 // subtracting out the lower bound on the original memref's data region
2057 // along the corresponding dimension.
2058
2059 // Index start offsets for faster memory buffer relative to the original.
2060 SmallVector<AffineExpr, 4> fastBufOffsets;
2061 fastBufOffsets.reserve(N: rank);
2062 for (unsigned d = 0; d < rank; d++) {
2063 assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2064 "incorrect bound size");
2065
2066 // Set copy start location for this dimension in the lower memory space
2067 // memref.
2068 if (lbs[d].isSingleConstant()) {
2069 auto indexVal = lbs[d].getSingleConstantResult();
2070 if (indexVal == 0) {
2071 memIndices.push_back(Elt: zeroIndex);
2072 } else {
2073 memIndices.push_back(
2074 Elt: top.create<arith::ConstantIndexOp>(location: loc, args&: indexVal).getResult());
2075 }
2076 } else {
2077 // The coordinate for the start location is just the lower bound along the
2078 // corresponding dimension on the memory region (stored in 'offset').
2079 // Remap all inputs of the map to dimensions uniformly since in the
2080 // generated IR we need valid affine symbols as opposed to "symbols" for
2081 // the purpose of the memref region.
2082 SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2083 for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2084 symReplacements[i] = top.getAffineDimExpr(position: i);
2085 lbs[d] = lbs[d].replaceDimsAndSymbols(
2086 /*dimReplacements=*/{}, symReplacements, numResultDims: lbs[d].getNumSymbols(),
2087 /*numResultSyms=*/0);
2088 memIndices.push_back(Elt: b.create<AffineApplyOp>(location: loc, args&: lbs[d], args&: regionSymbols));
2089 }
2090 // The fast buffer is copied into at location zero; addressing is relative.
2091 bufIndices.push_back(Elt: zeroIndex);
2092
2093 // Record the offsets since they are needed to remap the memory accesses of
2094 // the original memref further below.
2095 fastBufOffsets.push_back(Elt: lbs[d].getResult(idx: 0));
2096 }
2097
2098 // The faster memory space buffer.
2099 Value fastMemRef;
2100
2101 // Check if a buffer was already created.
2102 bool existingBuf = fastBufferMap.count(Val: memref) > 0;
2103 if (!existingBuf) {
2104 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2105 auto fastMemRefType =
2106 MemRefType::get(shape: fastBufferShape, elementType: memRefType.getElementType(),
2107 map: fastBufferLayout, memorySpaceInd: copyOptions.fastMemorySpace);
2108
2109 // Create the fast memory space buffer just before the 'affine.for'
2110 // operation.
2111 fastMemRef =
2112 prologue.create<memref::AllocOp>(location: loc, args&: fastMemRefType).getResult();
2113 // Record it.
2114 fastBufferMap[memref] = fastMemRef;
2115 // fastMemRefType is a constant shaped memref.
2116 auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(memRefType: fastMemRefType);
2117 // We don't account for things of unknown size.
2118 *sizeInBytes = maySizeInBytes.value_or(u: 0);
2119
2120 LLVM_DEBUG(emitRemarkForBlock(*block)
2121 << "Creating fast buffer of type " << fastMemRefType
2122 << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2123 << " KiB\n");
2124 } else {
2125 // Reuse the one already created.
2126 fastMemRef = fastBufferMap[memref];
2127 }
2128
2129 auto numElementsSSA = top.create<arith::ConstantIndexOp>(location: loc, args&: *numElements);
2130
2131 Value dmaStride;
2132 Value numEltPerDmaStride;
2133 if (copyOptions.generateDma) {
2134 SmallVector<StrideInfo, 4> dmaStrideInfos;
2135 getMultiLevelStrides(region, bufferShape: fastBufferShape, strideInfos: &dmaStrideInfos);
2136
2137 // TODO: use all stride levels once DmaStartOp is extended for
2138 // multi-level strides.
2139 if (dmaStrideInfos.size() > 1) {
2140 LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2141 return failure();
2142 }
2143
2144 if (!dmaStrideInfos.empty()) {
2145 dmaStride =
2146 top.create<arith::ConstantIndexOp>(location: loc, args&: dmaStrideInfos[0].stride);
2147 numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2148 location: loc, args&: dmaStrideInfos[0].numEltPerStride);
2149 }
2150 }
2151
2152 // Create fully composed affine maps for each memref.
2153 auto memAffineMap = b.getMultiDimIdentityMap(rank: memIndices.size());
2154 fullyComposeAffineMapAndOperands(map: &memAffineMap, operands: &memIndices);
2155 auto bufAffineMap = b.getMultiDimIdentityMap(rank: bufIndices.size());
2156 fullyComposeAffineMapAndOperands(map: &bufAffineMap, operands: &bufIndices);
2157
2158 if (!copyOptions.generateDma) {
2159 // Point-wise copy generation.
2160 auto copyNest =
2161 generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2162 /*lbOperands=*/regionSymbols, ubMaps,
2163 /*ubOperands=*/regionSymbols, fastBufOffsets,
2164 /*isCopyOut=*/region.isWrite(), b);
2165
2166 // Record this so that we can skip it from yet another copy.
2167 copyNests.insert(V: copyNest);
2168
2169 // Since new ops are being appended (for copy out's), adjust the end to
2170 // mark end of block range being processed if necessary.
2171 if (region.isWrite() && isCopyOutAtEndOfBlock)
2172 *nEnd = Block::iterator(copyNest.getOperation());
2173 } else {
2174 // DMA generation.
2175 // Create a tag (single element 1-d memref) for the DMA.
2176 auto tagMemRefType = MemRefType::get(shape: {1}, elementType: top.getIntegerType(width: 32), map: {},
2177 memorySpaceInd: copyOptions.tagMemorySpace);
2178 auto tagMemRef = prologue.create<memref::AllocOp>(location: loc, args&: tagMemRefType);
2179
2180 SmallVector<Value, 4> tagIndices({zeroIndex});
2181 auto tagAffineMap = b.getMultiDimIdentityMap(rank: tagIndices.size());
2182 fullyComposeAffineMapAndOperands(map: &tagAffineMap, operands: &tagIndices);
2183 if (!region.isWrite()) {
2184 // DMA non-blocking read from original buffer to fast buffer.
2185 b.create<AffineDmaStartOp>(location: loc, args&: memref, args&: memAffineMap, args&: memIndices,
2186 args&: fastMemRef, args&: bufAffineMap, args&: bufIndices,
2187 args&: tagMemRef, args&: tagAffineMap, args&: tagIndices,
2188 args&: numElementsSSA, args&: dmaStride, args&: numEltPerDmaStride);
2189 } else {
2190 // DMA non-blocking write from fast buffer to the original memref.
2191 auto op = b.create<AffineDmaStartOp>(
2192 location: loc, args&: fastMemRef, args&: bufAffineMap, args&: bufIndices, args&: memref, args&: memAffineMap,
2193 args&: memIndices, args&: tagMemRef, args&: tagAffineMap, args&: tagIndices, args&: numElementsSSA,
2194 args&: dmaStride, args&: numEltPerDmaStride);
2195 // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2196 // end to mark end of block range being processed.
2197 if (isCopyOutAtEndOfBlock)
2198 *nEnd = Block::iterator(op.getOperation());
2199 }
2200
2201 // Matching DMA wait to block on completion; tag always has a 0 index.
2202 b.create<AffineDmaWaitOp>(location: loc, args&: tagMemRef, args&: tagAffineMap, args&: zeroIndex,
2203 args&: numElementsSSA);
2204
2205 // Generate dealloc for the tag.
2206 auto tagDeallocOp = epilogue.create<memref::DeallocOp>(location: loc, args&: tagMemRef);
2207 if (*nEnd == end && isCopyOutAtEndOfBlock)
2208 // Since new ops are being appended (for outgoing DMAs), adjust the end to
2209 // mark end of range of the original.
2210 *nEnd = Block::iterator(tagDeallocOp.getOperation());
2211 }
2212
2213 // Generate dealloc for the buffer.
2214 if (!existingBuf) {
2215 auto bufDeallocOp = epilogue.create<memref::DeallocOp>(location: loc, args&: fastMemRef);
2216 // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2217 // the fast buffer (since it marks the new end insertion point).
2218 if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2219 *nEnd = Block::iterator(bufDeallocOp.getOperation());
2220 }
2221
2222 // Replace all uses of the old memref with the faster one while remapping
2223 // access indices (subtracting out lower bound offsets for each dimension).
2224 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2225 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2226 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2227 // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2228 // d2, d3 correspond to the original indices (%i, %j).
2229 SmallVector<AffineExpr, 4> remapExprs;
2230 remapExprs.reserve(N: rank);
2231 for (unsigned i = 0; i < rank; i++) {
2232 // The starting operands of indexRemap will be regionSymbols (the symbols on
2233 // which the memref region is parametric); then those corresponding to
2234 // the memref's original indices follow.
2235 auto dimExpr = b.getAffineDimExpr(position: regionSymbols.size() + i);
2236 remapExprs.push_back(Elt: dimExpr - fastBufOffsets[i]);
2237 }
2238 auto indexRemap = AffineMap::get(dimCount: regionSymbols.size() + rank, symbolCount: 0, results: remapExprs,
2239 context: b.getContext());
2240
2241 // Record the begin since it may be invalidated by memref replacement.
2242 Block::iterator prevOfBegin;
2243 bool isBeginAtStartOfBlock = (begin == block->begin());
2244 if (!isBeginAtStartOfBlock)
2245 prevOfBegin = std::prev(x: begin);
2246
2247 auto userFilterFn = [&](Operation *user) {
2248 auto *ancestorUser = block->findAncestorOpInBlock(op&: *user);
2249 return ancestorUser && !ancestorUser->isBeforeInBlock(other: &*begin) &&
2250 !lastCopyOp->isBeforeInBlock(other: ancestorUser);
2251 };
2252
2253 // *Only* those uses within the range [begin, end) of 'block' are replaced.
2254 (void)replaceAllMemRefUsesWith(oldMemRef: memref, newMemRef: fastMemRef,
2255 /*extraIndices=*/{}, indexRemap,
2256 /*extraOperands=*/regionSymbols,
2257 /*symbolOperands=*/{}, userFilterFn);
2258
2259 *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(x: prevOfBegin);
2260
2261 return success();
2262}
2263
2264/// Construct the memref region to just include the entire memref. Returns false
2265 /// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2266/// enclosing loop IVs of `op` (starting from the outermost) that the region
2267/// is parametric on.
2268static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2269 MemRefRegion *region) {
2270 unsigned rank;
2271 if (auto loadOp = dyn_cast<AffineLoadOp>(Val: op)) {
2272 rank = loadOp.getMemRefType().getRank();
2273 region->memref = loadOp.getMemRef();
2274 region->setWrite(false);
2275 } else if (auto storeOp = dyn_cast<AffineStoreOp>(Val: op)) {
2276 rank = storeOp.getMemRefType().getRank();
2277 region->memref = storeOp.getMemRef();
2278 region->setWrite(true);
2279 } else {
2280 assert(false && "expected load or store op");
2281 return false;
2282 }
2283 auto memRefType = cast<MemRefType>(Val: region->memref.getType());
2284 if (!memRefType.hasStaticShape())
2285 return false;
2286
2287 auto *regionCst = region->getConstraints();
2288
2289 // Just get the first numSymbols IVs, which the memref region is parametric
2290 // on.
2291 SmallVector<AffineForOp, 4> ivs;
2292 getAffineForIVs(op&: *op, loops: &ivs);
2293 ivs.resize(N: numParamLoopIVs);
2294 SmallVector<Value, 4> symbols;
2295 extractForInductionVars(forInsts: ivs, ivs: &symbols);
2296 *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2297 regionCst->setValues(start: rank, end: rank + numParamLoopIVs, values: symbols);
2298
2299 // Memref dim sizes provide the bounds.
2300 for (unsigned d = 0; d < rank; d++) {
2301 auto dimSize = memRefType.getDimSize(idx: d);
2302 assert(dimSize > 0 && "filtered dynamic shapes above");
2303 regionCst->addBound(type: BoundType::LB, pos: d, value: 0);
2304 regionCst->addBound(type: BoundType::UB, pos: d, value: dimSize - 1);
2305 }
2306 return true;
2307}
2308
2309LogicalResult
2310mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2311 const AffineCopyOptions &copyOptions,
2312 std::optional<Value> filterMemRef,
2313 DenseSet<Operation *> &copyNests) {
2314 if (begin == end)
2315 return success();
2316
2317 assert(begin->getBlock() == std::prev(end)->getBlock() &&
2318 "Inconsistent block begin/end args");
2319 assert(end != end->getBlock()->end() && "end can't be the block terminator");
2320
2321 Block *block = begin->getBlock();
2322
2323 // Copies will be generated for this depth, i.e., symbolic in all loops
2324 // surrounding this block range.
2325 unsigned copyDepth = getNestingDepth(op: &*begin);
2326
2327 LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2328 << "\n");
2329 LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2330 LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2331
2332 // List of memory regions to copy for. We need a map vector to have a
2333 // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2334 // since the allocs, for example, are identical except for the SSA id.
2335 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2336 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2337
2338 // Map from original memrefs to the fast buffers that their accesses are
2339 // replaced with.
2340 DenseMap<Value, Value> fastBufferMap;
2341
2342 // To check for errors when walking the block.
2343 bool error = false;
2344
2345 // Walk this range of operations to gather all memory regions.
2346 block->walk(begin, end, callback: [&](Operation *opInst) {
2347 Value memref;
2348 MemRefType memrefType;
2349 // Gather regions to allocate to buffers in faster memory space.
2350 if (auto loadOp = dyn_cast<AffineLoadOp>(Val: opInst)) {
2351 memref = loadOp.getMemRef();
2352 memrefType = loadOp.getMemRefType();
2353 } else if (auto storeOp = dyn_cast<AffineStoreOp>(Val: opInst)) {
2354 memref = storeOp.getMemRef();
2355 memrefType = storeOp.getMemRefType();
2356 }
2357 // Not an affine.load/store op.
2358 if (!memref)
2359 return;
2360
2361 if ((filterMemRef.has_value() && filterMemRef != memref) ||
2362 (isa_and_nonnull<IntegerAttr>(Val: memrefType.getMemorySpace()) &&
2363 memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2364 return;
2365
2366 if (!memref.getParentRegion()->isAncestor(other: block->getParent())) {
2367 LLVM_DEBUG(llvm::dbgs() << "memref definition is inside the depth at "
2368 "which copy-in/copy-out would happen\n");
2369 return;
2370 }
2371
2372 // Compute the MemRefRegion accessed.
2373 auto region = std::make_unique<MemRefRegion>(args: opInst->getLoc());
2374 if (failed(Result: region->compute(op: opInst, loopDepth: copyDepth, /*sliceState=*/nullptr,
2375 /*addMemRefDimBounds=*/false))) {
2376 LLVM_DEBUG(llvm::dbgs()
2377 << "Error obtaining memory region: semi-affine maps?\n");
2378 LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2379 if (!getFullMemRefAsRegion(op: opInst, numParamLoopIVs: copyDepth, region: region.get())) {
2380 LLVM_DEBUG(
2381 opInst->emitError("non-constant memref sizes not yet supported"));
2382 error = true;
2383 return;
2384 }
2385 }
2386
2387 // Each memref has a single buffer associated with it irrespective of how
2388 // many loads and stores happen on it.
2389 // TODO: in the future, when regions don't intersect and satisfy
2390 // other properties (based on load/store regions), we could consider
2391 // multiple buffers per memref.
2392
2393 // Add to the appropriate region if it's not already in it, or take a
2394 // bounding box union with the existing one if it's already in there.
2395 // Note that a memref may have both read and write regions - so update the
2396 // region in the other list if one exists (write in case of read and vice
2397 // versa) since there is a single bounding box for a memref across all reads
2398 // and writes that happen on it.
2399
2400 // Attempts to update; returns true if 'region' exists in targetRegions.
2401 auto updateRegion =
2402 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2403 &targetRegions) {
2404 const auto *const it = targetRegions.find(Key: region->memref);
2405 if (it == targetRegions.end())
2406 return false;
2407
2408 // Perform a union with the existing region.
2409 if (failed(Result: it->second->unionBoundingBox(other: *region))) {
2410 LLVM_DEBUG(llvm::dbgs()
2411 << "Memory region bounding box failed; "
2412 "over-approximating to the entire memref\n");
2413 // If the union fails, we will overapproximate.
2414 if (!getFullMemRefAsRegion(op: opInst, numParamLoopIVs: copyDepth, region: region.get())) {
2415 LLVM_DEBUG(opInst->emitError(
2416 "non-constant memref sizes not yet supported"));
2417 error = true;
2418 return true;
2419 }
2420 it->second->getConstraints()->clearAndCopyFrom(
2421 other: *region->getConstraints());
2422 } else {
2423 // Union was computed and stored in 'it->second': copy to 'region'.
2424 region->getConstraints()->clearAndCopyFrom(
2425 other: *it->second->getConstraints());
2426 }
2427 return true;
2428 };
2429
2430 bool existsInRead = updateRegion(readRegions);
2431 if (error)
2432 return;
2433 bool existsInWrite = updateRegion(writeRegions);
2434 if (error)
2435 return;
2436
2437 // Finally add it to the region list.
2438 if (region->isWrite() && !existsInWrite) {
2439 writeRegions[region->memref] = std::move(region);
2440 } else if (!region->isWrite() && !existsInRead) {
2441 readRegions[region->memref] = std::move(region);
2442 }
2443 });
2444
2445 if (error) {
2446 LLVM_DEBUG(begin->emitError(
2447 "copy generation failed for one or more memref's in this block\n"));
2448 return failure();
2449 }
2450
2451 uint64_t totalCopyBuffersSizeInBytes = 0;
2452 bool ret = true;
2453 auto processRegions =
2454 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2455 &regions) {
2456 for (const auto &regionEntry : regions) {
2457 // For each region, hoist copy in/out past all hoistable
2458 // 'affine.for's.
2459 Block::iterator copyInPlacementStart, copyOutPlacementStart;
2460 Block *copyPlacementBlock;
2461 findHighestBlockForPlacement(
2462 region: *regionEntry.second, block&: *block, begin, end, copyPlacementBlock: &copyPlacementBlock,
2463 copyInPlacementStart: &copyInPlacementStart, copyOutPlacementStart: &copyOutPlacementStart);
2464
2465 uint64_t sizeInBytes;
2466 Block::iterator nBegin, nEnd;
2467 LogicalResult iRet = generateCopy(
2468 region: *regionEntry.second, block, begin, end, copyPlacementBlock,
2469 copyInPlacementStart, copyOutPlacementStart, copyOptions,
2470 fastBufferMap, copyNests, sizeInBytes: &sizeInBytes, nBegin: &nBegin, nEnd: &nEnd);
2471 if (succeeded(Result: iRet)) {
2472 // begin/end could have been invalidated, and need update.
2473 begin = nBegin;
2474 end = nEnd;
2475 totalCopyBuffersSizeInBytes += sizeInBytes;
2476 }
2477 ret = ret & succeeded(Result: iRet);
2478 }
2479 };
2480 processRegions(readRegions);
2481 processRegions(writeRegions);
2482
2483 if (!ret) {
2484 LLVM_DEBUG(begin->emitError(
2485 "copy generation failed for one or more memref's in this block\n"));
2486 return failure();
2487 }
2488
2489 // For a range of operations, a note will be emitted at the caller.
2490 AffineForOp forOp;
2491 if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(Val: &*begin))) {
2492 LLVM_DEBUG(forOp.emitRemark()
2493 << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2494 << " KiB of copy buffers in fast memory space for this block");
2495 }
2496
2497 if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2498 block->getParentOp()->emitWarning(
2499 message: "total size of all copy buffers' for this block exceeds fast memory "
2500 "capacity");
2501 }
2502
2503 return success();
2504}
2505
2506// A convenience version of affineDataCopyGenerate for all ops in the body of
2507// an AffineForOp.
2508LogicalResult mlir::affine::affineDataCopyGenerate(
2509 AffineForOp forOp, const AffineCopyOptions &copyOptions,
2510 std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2511 return affineDataCopyGenerate(begin: forOp.getBody()->begin(),
2512 end: std::prev(x: forOp.getBody()->end()), copyOptions,
2513 filterMemRef, copyNests);
2514}
2515
2516LogicalResult mlir::affine::generateCopyForMemRegion(
2517 const MemRefRegion &memrefRegion, Operation *analyzedOp,
2518 const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2519 Block *block = analyzedOp->getBlock();
2520 auto begin = analyzedOp->getIterator();
2521 auto end = std::next(x: begin);
2522 DenseMap<Value, Value> fastBufferMap;
2523 DenseSet<Operation *> copyNests;
2524
2525 auto err = generateCopy(region: memrefRegion, block, begin, end, copyPlacementBlock: block, copyInPlacementStart: begin, copyOutPlacementStart: end,
2526 copyOptions, fastBufferMap, copyNests,
2527 sizeInBytes: &result.sizeInBytes, nBegin: &begin, nEnd: &end);
2528 if (failed(Result: err))
2529 return err;
2530
2531 const auto &en = fastBufferMap.find(Val: memrefRegion.memref);
2532 // In some cases (empty loops), no copy generation would have happened.
2533 if (en == fastBufferMap.end())
2534 return failure();
2535 result.alloc = en->second.getDefiningOp();
2536 assert(result.alloc && "fast buffer expected to be locally allocated");
2537 assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2538 result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2539 return success();
2540}
2541
2542/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2543static void
2544gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2545 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2546 // Add a new empty level to the output if it doesn't already exist.
2547 assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2548 if (currLoopDepth == depthToLoops.size())
2549 depthToLoops.emplace_back();
2550
2551 for (auto &op : *block) {
2552 if (auto forOp = dyn_cast<AffineForOp>(Val&: op)) {
2553 depthToLoops[currLoopDepth].push_back(Elt: forOp);
2554 gatherLoopsInBlock(block: forOp.getBody(), currLoopDepth: currLoopDepth + 1, depthToLoops);
2555 }
2556 }
2557}
2558
/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
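///
/// For example (illustrative), for a function body containing
///   affine.for %i ... { affine.for %j ... { ... } }
///   affine.for %k ... { ... }
/// depthToLoops[0] holds the loops on %i and %k, and depthToLoops[1] holds the
/// loop on %j.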
void mlir::affine::gatherLoops(
    func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
  for (auto &block : func)
    gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);

  // Remove last loop level from output since it's empty.
  if (!depthToLoops.empty()) {
    assert(depthToLoops.back().empty() && "Last loop level is not empty?");
    depthToLoops.pop_back();
  }
}

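// Illustrative effect (a sketch): if the composed upper bound map ends up as
// affine_map<(d0, d1) -> (d0 + d1, d0 + d1)>, removeDuplicateExprs reduces it
// to affine_map<(d0, d1) -> (d0 + d1)> before the affine.for op is created.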
AffineForOp mlir::affine::createCanonicalizedAffineForOp(
    OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
    ValueRange ubOperands, AffineMap ubMap, int64_t step) {
  SmallVector<Value, 4> lowerOperands(lbOperands);
  SmallVector<Value, 4> upperOperands(ubOperands);

  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
  lbMap = removeDuplicateExprs(lbMap);
  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
  canonicalizeMapAndOperands(&ubMap, &upperOperands);
  ubMap = removeDuplicateExprs(ubMap);

  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
                               step);
}

/// Creates an AffineIfOp that encodes the conditional to choose between
/// the constant trip count version and an unknown trip count version of this
/// nest of loops. This is used to separate partial and full tiles when `loops`
/// comprises the intra-tile loops. The affine.if op is inserted at the
/// insertion point of `b`.
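///
/// For instance (illustrative), for a point loop
///   affine.for %i = %ii to min affine_map<(d0)[s0] -> (d0 + 32, s0)>(%ii)[%N]
/// the guard may take a form like
///   affine.if affine_set<(d0)[s0] : (s0 - d0 - 32 >= 0)>(%ii)[%N]
/// i.e., the full tile's upper bound does not exceed any other upper bound.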
static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
                                            OpBuilder b) {
  if (loops.empty())
    return nullptr;

  auto *context = loops[0].getContext();

  FlatAffineValueConstraints cst;
  SmallVector<Operation *, 8> ops;
  llvm::append_range(ops, loops);
  (void)getIndexSet(ops, &cst);

  // Remove constraints that are independent of these loop IVs.
  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());

  // Construct the constraint set representing the guard for full tiles. The
  // lower bound (and upper bound) corresponding to the full tile should be
  // larger (and resp. smaller) than any other lower (or upper bound).
  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
  for (auto loop : loops) {
    (void)loop;
    // TODO: generalizing to a non-unit stride is straightforward.
    assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
    // Mark everything as symbols for the purpose of finding a constant diff
    // pair.
    cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
                               1);
    unsigned fullTileLbPos, fullTileUbPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &fullTileLbPos,
                                        &fullTileUbPos)) {
      LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
      return nullptr;
    }

    SmallVector<unsigned, 4> lbIndices, ubIndices;
    cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);

    auto fLb = cst.getInequality(fullTileLbPos);
    auto fUb = cst.getInequality(fullTileUbPos);
    fullTileLb.assign(fLb.begin(), fLb.end());
    fullTileUb.assign(fUb.begin(), fUb.end());

    // The full tile lower bound should be >= any other lower bound.
    for (auto lbIndex : lbIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

    // The full tile upper bound should be <= any other upper bound.
    for (auto ubIndex : ubIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(ubIndex, i) -= fullTileUb[i];

    cst.removeVar(0);
  }

  // The previous step leads to all zeros for the full tile lb and ub position
  // itself; remove those and any other duplicates / trivial redundancies.
  cst.removeTrivialRedundancy();

  // Turn everything into dims conservatively since we earlier turned all
  // trailing ids past the point loop IV into symbols. Some of these could be
  // outer loop IVs; we'll canonicalize anyway.
  cst.setDimSymbolSeparation(0);

  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
  // ifCondSet can be null if cst was empty -- this can happen if all loops
  // in the nest have constant trip counts.
  if (!ifCondSet)
    return nullptr;

  SmallVector<Value, 4> setOperands;
  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
                              /*withElseRegion=*/true);
}

/// Create the full tile loop nest (along with its body).
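///
/// For example (illustrative), for a point loop with bounds
/// (%ii, min(%ii + 32, %N)), the constant-difference <lb, ub> pair picked
/// below is (%ii, %ii + 32), so the corresponding full tile loop runs from
/// %ii to %ii + 32.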
static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
  fullTileLoops.reserve(inputNest.size());

  // For each loop in the original nest, identify a lower/upper bound pair such
  // that their difference is a constant.
  FlatAffineValueConstraints cst;
  for (auto loop : inputNest) {
    // TODO: straightforward to generalize to a non-unit stride.
    if (loop.getStepAsInt() != 1) {
      LLVM_DEBUG(llvm::dbgs()
                 << "[tile separation] non-unit stride not implemented\n");
      return failure();
    }
    SmallVector<Operation *, 1> loopOp{loop.getOperation()};
    (void)getIndexSet(loopOp, &cst);
    // Mark everything other than this loop IV as symbols to get a pair of
    // <lb, ub> with a constant difference.
    cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
    unsigned lbPos, ubPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &lbPos, &ubPos) ||
        lbPos == ubPos) {
      LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
                                 "equalities not yet handled\n");
      return failure();
    }

    // Set all variables as dimensions uniformly since some of those marked as
    // symbols above could be outer loop IVs (corresponding tile space IVs).
    cst.setDimSymbolSeparation(/*newSymbolCount=*/0);

    AffineValueMap lbVmap, ubVmap;
    cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
    cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
    AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
        b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
        ubVmap.getOperands(), ubVmap.getAffineMap());
    b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
    fullTileLoops.push_back(fullTileLoop);
  }

  // Add the body for the full tile loop nest.
  IRMapping operandMap;
  for (const auto &loopEn : llvm::enumerate(inputNest))
    operandMap.map(loopEn.value().getInductionVar(),
                   fullTileLoops[loopEn.index()].getInductionVar());
  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
  for (auto &op : inputNest.back().getBody()->without_terminator())
    b.clone(op, operandMap);
  return success();
}

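// Illustrative result (a sketch): for an intra-tile loop
//   affine.for %i = %ii to min affine_map<(d0)[s0] -> (d0 + 32, s0)>(%ii)[%N]
// the nest is rewritten roughly as
//   affine.if <full-tile guard>(%ii)[%N] {
//     affine.for %i = %ii to %ii + 32 { ... }           // full tile
//   } else {
//     affine.for %i = %ii to min(...)(%ii)[%N] { ... }  // partial tile
//   }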
LogicalResult
mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
                                SmallVectorImpl<AffineForOp> *fullTileNest) {
  if (inputNest.empty())
    return success();

  auto firstLoop = inputNest[0];

  // Each successive for op has to be nested in the previous one.
  auto prevLoop = firstLoop;
  for (auto loop : inputNest.drop_front(1)) {
    assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
    prevLoop = loop;
  }

  // Create the full tile loop nest.
  SmallVector<AffineForOp, 4> fullTileLoops;
  OpBuilder b(firstLoop);
  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
    if (!fullTileLoops.empty())
      fullTileLoops.front().erase();
    return failure();
  }

  // Create and insert the version select right before the root of the nest.
  b = OpBuilder(firstLoop);
  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
  if (!ifOp) {
    fullTileLoops.front().erase();
    LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
                               "separation condition\n");
    return failure();
  }

  // Move the full tile into the then block.
  Block *thenBlock = ifOp.getThenBlock();
  AffineForOp outermostFullTileLoop = fullTileLoops[0];
  thenBlock->getOperations().splice(
      std::prev(thenBlock->end()),
      outermostFullTileLoop->getBlock()->getOperations(),
      Block::iterator(outermostFullTileLoop));

  // Move the partial tile into the else block. The partial tile is the same as
  // the original loop nest.
  Block *elseBlock = ifOp.getElseBlock();
  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
                                    firstLoop->getBlock()->getOperations(),
                                    Block::iterator(firstLoop));

  if (fullTileNest)
    *fullTileNest = std::move(fullTileLoops);

  return success();
}

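// Coalesces maximal bands within the perfect nest rooted at `op` whose bounds
// are defined above the band, via coalesceLoops. For instance (illustrative),
// two perfectly nested loops with trip counts N and M become a single loop
// with trip count N * M whose induction variable is delinearized back into the
// two original induction variables.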
LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
  LogicalResult result(failure());
  SmallVector<AffineForOp> loops;
  getPerfectlyNestedLoops(loops, op);
  if (loops.size() <= 1)
    return success();

  // Look for a band of loops that can be coalesced, i.e. perfectly nested
  // loops with bounds defined above some loop.
  // 1. For each loop, find above which parent loop its operands are
  // defined.
  SmallVector<unsigned> operandsDefinedAbove(loops.size());
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    operandsDefinedAbove[i] = i;
    for (unsigned j = 0; j < i; ++j) {
      if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
        operandsDefinedAbove[i] = j;
        break;
      }
    }
  }

  // 2. Identify bands of loops such that the operands of all of them are
  // defined above the first loop in the band. Traverse the nest bottom-up
  // so that modifications don't invalidate the inner loops.
  for (unsigned end = loops.size(); end > 0; --end) {
    unsigned start = 0;
    for (; start < end - 1; ++start) {
      auto maxPos =
          *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                            std::next(operandsDefinedAbove.begin(), end));
      if (maxPos > start)
        continue;
      assert(maxPos == start &&
             "expected loop bounds to be known at the start of the band");
      auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
      if (succeeded(coalesceLoops(band)))
        result = success();
      break;
    }
    // If a band was found and transformed, keep looking at the loops above
    // the outermost transformed loop.
    if (start != end - 1)
      end = start + 1;
  }
  return result;
}

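// Counts, walking outward from the owner of `operand`, the enclosing
// LoopLikeOpInterface ops for which `operand` is loop-invariant. For example
// (illustrative), a value defined in the function entry block and used inside
// two nested affine.for ops yields a count of 2.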
int64_t mlir::affine::numEnclosingInvariantLoops(OpOperand &operand) {
  int64_t count = 0;
  Operation *currentOp = operand.getOwner();
  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
    if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
      break;
    currentOp = loopOp;
    count++;
  }
  return count;
}

