1//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements miscellaneous loop transformation routines.
10//
11//===----------------------------------------------------------------------===//
12
13#include "mlir/Dialect/Affine/LoopUtils.h"
14#include "mlir/Analysis/SliceAnalysis.h"
15#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
16#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
17#include "mlir/Dialect/Affine/Analysis/Utils.h"
18#include "mlir/Dialect/Affine/IR/AffineOps.h"
19#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
20#include "mlir/Dialect/Affine/Utils.h"
21#include "mlir/Dialect/Func/IR/FuncOps.h"
22#include "mlir/Dialect/MemRef/IR/MemRef.h"
23#include "mlir/Dialect/SCF/IR/SCF.h"
24#include "mlir/IR/IRMapping.h"
25#include "mlir/IR/IntegerSet.h"
26#include "mlir/Support/MathExtras.h"
27#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
28#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallPtrSet.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/raw_ostream.h"
32#include <optional>
33
34#define DEBUG_TYPE "loop-utils"
35
36using namespace mlir;
37using namespace affine;
38using namespace presburger;
39using llvm::SmallMapVector;
40
41namespace {
// Helper structure to pass around and return sets of loop parameters (lower
// bound, upper bound, step) without confusing their order.
44struct LoopParams {
45 Value lowerBound;
46 Value upperBound;
47 Value step;
48};
49} // namespace
50
/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be the upper bound of the
/// main part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands, or a null map when the trip count can't be expressed as an affine
/// expression.
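///
/// As a quick illustration (a hypothetical loop, not taken from any test):
/// unrolling `affine.for %i = 0 to 17` by a factor of 4 has trip count 17, so
/// the cleanup loop lower bound evaluates to lb + (17 - 17 mod 4) = 16; the
/// main unrolled loop then covers [0, 16) and the cleanup loop covers
/// [16, 17).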
56static void
57getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
58 AffineMap &cleanupLbMap,
59 SmallVectorImpl<Value> &cleanupLbOperands) {
60 AffineMap tripCountMap;
61 SmallVector<Value, 4> tripCountOperands;
62 getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
63 // Trip count can't be computed.
64 if (!tripCountMap) {
65 cleanupLbMap = AffineMap();
66 return;
67 }
68
69 OpBuilder b(forOp);
70 auto lbMap = forOp.getLowerBoundMap();
71 auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
72 forOp.getLowerBoundOperands());
73
74 // For each upper bound expr, get the range.
75 // Eg: affine.for %i = lb to min (ub1, ub2),
76 // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
77 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
78 // these affine.apply's make up the cleanup loop lower bound.
79 SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
80 SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
81 int64_t step = forOp.getStepAsInt();
82 for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
    auto tripCountExpr = tripCountMap.getResult(i);
    bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
    auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
                                  tripCountMap.getNumSymbols(), bumpExprs[i]);
    bumpValues[i] =
        b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
  }

  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
    newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);

  cleanupLbOperands.clear();
  cleanupLbOperands.push_back(lb);
  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
                                b.getContext());
  // Simplify the cleanupLbMap + cleanupLbOperands.
  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
104 // Remove any affine.apply's that became dead from the simplification above.
105 for (auto v : bumpValues)
106 if (v.use_empty())
107 v.getDefiningOp()->erase();
108
109 if (lb.use_empty())
110 lb.erase();
111}
112
113/// Helper to replace uses of loop carried values (iter_args) and loop
114/// yield values while promoting single iteration affine.for ops.
115static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
116 // Replace uses of iter arguments with iter operands (initial values).
117 auto iterOperands = forOp.getInits();
118 auto iterArgs = forOp.getRegionIterArgs();
119 for (auto e : llvm::zip(iterOperands, iterArgs))
120 std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
121
122 // Replace uses of loop results with the values yielded by the loop.
123 auto outerResults = forOp.getResults();
124 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
125 for (auto e : llvm::zip(outerResults, innerResults))
126 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
127}
128
/// Promotes the loop body of a forOp to its containing block if the forOp
/// is known to have a single iteration.
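///
/// For illustration (a hypothetical example; "foo" is a placeholder op):
///   affine.for %i = 0 to 1 {
///     "foo"(%i) : (index) -> ()
///   }
/// is promoted by replacing uses of %i with its single iteration value (the
/// constant 0 here) and splicing the body into the enclosing block.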
131LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
132 std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
133 if (!tripCount || *tripCount != 1)
134 return failure();
135
136 // TODO: extend this for arbitrary affine bounds.
137 if (forOp.getLowerBoundMap().getNumResults() != 1)
138 return failure();
139
  // Replace all uses of the IV with its single iteration value.
141 auto iv = forOp.getInductionVar();
142 auto *parentBlock = forOp->getBlock();
143 if (!iv.use_empty()) {
144 if (forOp.hasConstantLowerBound()) {
145 OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody());
146 auto constOp = topBuilder.create<arith::ConstantIndexOp>(
147 forOp.getLoc(), forOp.getConstantLowerBound());
148 iv.replaceAllUsesWith(constOp);
149 } else {
150 auto lbOperands = forOp.getLowerBoundOperands();
151 auto lbMap = forOp.getLowerBoundMap();
152 OpBuilder builder(forOp);
153 if (lbMap == builder.getDimIdentityMap()) {
154 // No need of generating an affine.apply.
155 iv.replaceAllUsesWith(lbOperands[0]);
156 } else {
157 auto affineApplyOp =
158 builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
159 iv.replaceAllUsesWith(affineApplyOp);
160 }
161 }
162 }
163
164 replaceIterArgsAndYieldResults(forOp);
165
166 // Move the loop body operations, except for its terminator, to the loop's
167 // containing block.
168 forOp.getBody()->back().erase();
169 parentBlock->getOperations().splice(Block::iterator(forOp),
170 forOp.getBody()->getOperations());
171 forOp.erase();
172 return success();
173}
174
175/// Generates an affine.for op with the specified lower and upper bounds
176/// while generating the right IV remappings to realize shifts for operations in
177/// its body. The operations that go into the loop body are specified in
178/// opGroupQueue starting from the specified offset, and in that order. The
179/// first element of the pair specifies the shift applied to that group of
180/// operations; the shift is multiplied by the loop step before being applied.
/// Returns a null AffineForOp if the generated loop simplifies to a single
/// iteration one.
182static AffineForOp generateShiftedLoop(
183 AffineMap lbMap, AffineMap ubMap,
184 const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
185 unsigned offset, AffineForOp srcForOp, OpBuilder b) {
186 auto lbOperands = srcForOp.getLowerBoundOperands();
187 auto ubOperands = srcForOp.getUpperBoundOperands();
188
189 assert(lbMap.getNumInputs() == lbOperands.size());
190 assert(ubMap.getNumInputs() == ubOperands.size());
191
192 auto loopChunk =
193 b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
194 ubMap, srcForOp.getStepAsInt());
195 auto loopChunkIV = loopChunk.getInductionVar();
196 auto srcIV = srcForOp.getInductionVar();
197
198 IRMapping operandMap;
199
  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
202 uint64_t shift = it.first;
203 auto ops = it.second;
    // All 'same shift' operations get added with their operands being
    // remapped to results of cloned operations, and their IV uses remapped.
    // Generate the remapping if the shift is not zero: remappedIV = newIV -
    // shift.
208 if (!srcIV.use_empty() && shift != 0) {
209 auto ivRemap = bodyBuilder.create<AffineApplyOp>(
210 srcForOp.getLoc(),
211 bodyBuilder.getSingleDimShiftAffineMap(
212 -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
213 loopChunkIV);
214 operandMap.map(srcIV, ivRemap);
215 } else {
216 operandMap.map(srcIV, loopChunkIV);
217 }
218 for (auto *op : ops)
219 bodyBuilder.clone(*op, operandMap);
220 };
221 if (succeeded(promoteIfSingleIteration(loopChunk)))
222 return AffineForOp();
223 return loopChunk;
224}
225
// The skewing of operations with respect to one another can be used for
// example to allow overlap of asynchronous operations (such as DMA
// communication) with computation, or just relative shifting of operations
// for better register reuse, locality or parallelism. As such, the shifts are
// typically expected to be at most of the order of the number of operations.
// This method should not be used as a substitute for loop distribution/fission.
// This method uses an algorithm that runs in time linear in the number of
// operations in the body of the for loop (using the 'sweep line' paradigm).
// This method asserts preservation of SSA dominance. A check for SSA dominance
// preservation as well as for memory-based dependence preservation rests with
// the users of this method.
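//
// As a sketch (hypothetical shifts, not taken from any test): for a body with
// two operations A and B, shifts [0, 1], and unit step, the transformation
// emits a prologue iteration executing only A, a steady-state loop executing
// B of iteration i - 1 together with A of iteration i, and an epilogue
// iteration executing the last remaining B.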
237LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
238 ArrayRef<uint64_t> shifts,
239 bool unrollPrologueEpilogue) {
240 assert(forOp.getBody()->getOperations().size() == shifts.size() &&
241 "too few/many shifts");
242 if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
243 return success();
244
245 // If the trip counts aren't constant, we would need versioning and
246 // conditional guards (or context information to prevent such versioning). The
247 // better way to pipeline for such loops is to first tile them and extract
248 // constant trip count "full tiles" before applying this.
249 auto mayBeConstTripCount = getConstantTripCount(forOp);
250 if (!mayBeConstTripCount) {
251 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
252 return success();
253 }
254 uint64_t tripCount = *mayBeConstTripCount;
255
256 assert(isOpwiseShiftValid(forOp, shifts) &&
257 "shifts will lead to an invalid transformation\n");
258
259 int64_t step = forOp.getStepAsInt();
260
261 unsigned numChildOps = shifts.size();
262
263 // Do a linear time (counting) sort for the shifts.
  uint64_t maxShift = *llvm::max_element(shifts);
265 if (maxShift >= numChildOps) {
266 // Large shifts are not the typical use case.
267 forOp.emitWarning("not shifting because shifts are unrealistically large");
268 return success();
269 }
270
271 // An array of operation groups sorted by shift amount; each group has all
272 // operations with the same shift in the order in which they appear in the
273 // body of the 'affine.for' op.
274 std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
275 unsigned pos = 0;
276 for (auto &op : forOp.getBody()->without_terminator()) {
277 auto shift = shifts[pos++];
278 sortedOpGroups[shift].push_back(&op);
279 }
280
281 // Unless the shifts have a specific pattern (which actually would be the
282 // common use case), prologue and epilogue are not meaningfully defined.
283 // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
284 // loop generated as the prologue and the last as epilogue and unroll these
285 // fully.
286 AffineForOp prologue, epilogue;
287
288 // Do a sweep over the sorted shifts while storing open groups in a
289 // vector, and generating loop portions as necessary during the sweep. A block
290 // of operations is paired with its shift.
291 std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
292
293 auto origLbMap = forOp.getLowerBoundMap();
294 uint64_t lbShift = 0;
295 OpBuilder b(forOp);
296 for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
297 // If nothing is shifted by d, continue.
298 if (sortedOpGroups[d].empty())
299 continue;
300 if (!opGroupQueue.empty()) {
301 assert(d > 0 &&
302 "Queue expected to be empty when the first block is found");
303 // The interval for which the loop needs to be generated here is:
304 // [lbShift, min(lbShift + tripCount, d)) and the body of the
305 // loop needs to have all operations in opQueue in that order.
306 AffineForOp res;
307 if (lbShift + tripCount * step < d * step) {
308 res = generateShiftedLoop(
309 b.getShiftedAffineMap(origLbMap, lbShift),
310 b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
311 opGroupQueue, /*offset=*/0, forOp, b);
312 // Entire loop for the queued op groups generated, empty it.
313 opGroupQueue.clear();
314 lbShift += tripCount * step;
315 } else {
316 res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
317 b.getShiftedAffineMap(origLbMap, d),
318 opGroupQueue, /*offset=*/0, forOp, b);
319 lbShift = d * step;
320 }
321
322 if (res) {
323 // Simplify/canonicalize the affine.for.
324 RewritePatternSet patterns(res.getContext());
325 AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
326 GreedyRewriteConfig config;
327 config.strictMode = GreedyRewriteStrictness::ExistingOps;
328 bool erased;
329 (void)applyOpPatternsAndFold(res.getOperation(), std::move(patterns),
330 config, /*changed=*/nullptr, &erased);
331 if (!erased && !prologue)
332 prologue = res;
333 if (!erased)
334 epilogue = res;
335 }
336 } else {
337 // Start of first interval.
338 lbShift = d * step;
339 }
340 // Augment the list of operations that get into the current open interval.
    opGroupQueue.emplace_back(d, sortedOpGroups[d]);
342 }
343
  // Those operation groups left in the queue now need to be processed (FIFO)
  // and their loops completed.
346 for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
347 uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
348 epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
349 b.getShiftedAffineMap(origLbMap, ubShift),
350 opGroupQueue, /*offset=*/i, forOp, b);
351 lbShift = ubShift;
352 if (!prologue)
353 prologue = epilogue;
354 }
355
356 // Erase the original for op.
357 forOp.erase();
358
359 if (unrollPrologueEpilogue && prologue)
360 (void)loopUnrollFull(prologue);
  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
362 (void)loopUnrollFull(epilogue);
363
364 return success();
365}
366
367/// Checks whether a loop nest is hyper-rectangular or not.
368static LogicalResult
369checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
370 FlatAffineValueConstraints cst;
371 SmallVector<Operation *, 8> ops(input.begin(), input.end());
372 // 0-d or 1-d is trivially hyper-rectangular.
373 if (input.size() <= 1)
374 return success();
  if (failed(getIndexSet(ops, &cst))) {
    LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
    return failure();
  }
  if (!cst.isHyperRectangular(0, input.size())) {
380 LLVM_DEBUG(llvm::dbgs()
381 << "Non-hyperrectangular nests not supported for tiling!\n");
382 return failure();
383 }
384 return success();
385}
386
387/// Check if the input nest is supported for tiling and whether tiling would be
388/// legal or not.
389template <typename t>
390static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
391 ArrayRef<t> tileSizes) {
392 assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
393
394 if (llvm::any_of(input,
395 [](AffineForOp op) { return op.getNumResults() > 0; })) {
396 LLVM_DEBUG(llvm::dbgs()
397 << "Cannot tile nest where a loop has yield values\n");
398 return failure();
399 }
400
401 // Check if the supplied `for` ops are all successively nested.
  if (!isPerfectlyNested(input)) {
    LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
    return failure();
  }

  // TODO: handle non hyper-rectangular spaces.
  if (failed(checkIfHyperRectangular(input)))
409 return failure();
410
411 return success();
412}
413
414/// Move the loop body of AffineForOp 'src' from 'src' into the specified
415/// location in destination's body, ignoring the terminator.
416static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
417 Block::iterator loc) {
418 auto &ops = src.getBody()->getOperations();
419 dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
420 std::prev(ops.end()));
421}
422
423/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
424/// body.
425static void moveLoopBody(AffineForOp src, AffineForOp dest) {
426 moveLoopBodyImpl(src, dest, dest.getBody()->begin());
427}
428
/// Constructs a tiled loop nest, without setting the loop bounds, and moves
/// the body of the original loop nest to the tiled loop nest.
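///
/// For a band of width 2 over loops (%i, %j), the constructed structure is (a
/// sketch; all bounds are placeholders until they are set later):
///   affine.for %i_tile    (tile-space loop for %i)
///     affine.for %j_tile  (tile-space loop for %j)
///       affine.for %i_intra    (intra-tile loop for %i)
///         affine.for %j_intra  (intra-tile loop for %j)
///           <original loop body>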
431static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
432 AffineForOp rootAffineForOp, unsigned width,
433 MutableArrayRef<AffineForOp> tiledLoops) {
434 Location loc = rootAffineForOp.getLoc();
435
  // The outermost loop among those constructed so far, as we add more.
437 Operation *topLoop = rootAffineForOp.getOperation();
438 AffineForOp innermostPointLoop;
439
440 // Add intra-tile (or point) loops.
441 for (unsigned i = 0; i < width; i++) {
442 OpBuilder b(topLoop);
443 // Loop bounds will be set later.
444 AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
445 pointLoop.getBody()->getOperations().splice(
446 pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
447 topLoop);
448 tiledLoops[2 * width - 1 - i] = pointLoop;
449 topLoop = pointLoop.getOperation();
450 if (i == 0)
451 innermostPointLoop = pointLoop;
452 }
453
454 // Add tile space loops;
455 for (unsigned i = width; i < 2 * width; i++) {
456 OpBuilder b(topLoop);
457 // Loop bounds will be set later.
458 AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
459 tileSpaceLoop.getBody()->getOperations().splice(
460 tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
461 topLoop);
462 tiledLoops[2 * width - i - 1] = tileSpaceLoop;
463 topLoop = tileSpaceLoop.getOperation();
464 }
465
466 // Move the loop body of the original nest to the new one.
467 moveLoopBody(origLoops.back(), innermostPointLoop);
468}
469
470/// Set lower and upper bounds of intra-tile loops for parametric tiling.
471// TODO: Handle non-constant lower bounds.
472static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
473 AffineForOp newInterTileLoop,
474 AffineForOp newIntraTileLoop,
475 Value tileSize) {
476 // The lower bound for the intra-tile loop is represented by an affine map
477 // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
478 // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
479 // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
480 // of the corresponding inter-tile loop, %t0 is the corresponding tiling
481 // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
482 // corresponding inter-tile loop.
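  //
  // As a concrete instance of the formula above (hypothetical values): with
  // %origlb = 0 and an original step of 1, the bounds simplify to
  //   lb: (%i, %t0) -> (%i * %t0)
  //   ub: (%i, %t0) -> (%i * %t0 + %t0)
  // i.e. each intra-tile loop covers one tile of size %t0 starting at
  // %i * %t0.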
483
484 assert(origLoop.hasConstantLowerBound() &&
485 "expected input loops to have constant lower bound.");
486
487 // Get lower bound of original loop as an affine expression.
488 AffineExpr origLowerBoundExpr;
489 origLowerBoundExpr =
490 b.getAffineConstantExpr(constant: origLoop.getConstantLowerBound());
491
492 // Add dim operands from original lower/upper bound.
493 SmallVector<Value, 4> lbOperands, ubOperands;
494 AffineBound lb = origLoop.getLowerBound();
495 AffineBound ub = origLoop.getUpperBound();
496 lbOperands.reserve(N: lb.getNumOperands() + 2);
497 ubOperands.reserve(N: ub.getNumOperands() + 2);
498 AffineMap origLbMap = lb.getMap();
499 AffineMap origUbMap = ub.getMap();
500 for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
501 lbOperands.push_back(Elt: lb.getOperand(idx: j));
502 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
503 ubOperands.push_back(Elt: ub.getOperand(idx: j));
504
505 // Add a new dim operand in lb/ubOperands corresponding to the origLoop
506 // IV.
507 lbOperands.push_back(Elt: newInterTileLoop.getInductionVar());
508 ubOperands.push_back(Elt: newInterTileLoop.getInductionVar());
509
  // Get loop IV as an affine expression for lower/upper bound. Size of
  // lb/ubOperands is guaranteed to be at least one.
512 AffineExpr lbLoopIvExpr = b.getAffineDimExpr(position: lbOperands.size() - 1);
513 AffineExpr ubLoopIvExpr = b.getAffineDimExpr(position: ubOperands.size() - 1);
514
515 // Add symbol operands from original lower/upper bound.
516 for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
517 lbOperands.push_back(Elt: lb.getOperand(idx: origLbMap.getNumDims() + j));
518 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
519 ubOperands.push_back(Elt: ub.getOperand(idx: origUbMap.getNumDims() + j));
520
521 // Add a new symbol operand which is the tile size for this loop.
522 lbOperands.push_back(Elt: tileSize);
523 ubOperands.push_back(Elt: tileSize);
524
525 SmallVector<AffineExpr, 4> lbBoundExprs;
526 SmallVector<AffineExpr, 4> ubBoundExprs;
527 lbBoundExprs.reserve(N: origLbMap.getNumResults());
528 ubBoundExprs.reserve(N: origUbMap.getNumResults());
529
530 // Get tiling parameter as an affine expression for lb/ub.
531 AffineExpr lbTileParameter = b.getAffineSymbolExpr(position: origLbMap.getNumSymbols());
532 AffineExpr ubTileParameter = b.getAffineSymbolExpr(position: origUbMap.getNumSymbols());
533
534 // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
535 lbBoundExprs.push_back(
536 Elt: ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
537 origLowerBoundExpr);
538
539 // Get the origLoopStep as an affine expression.
540 AffineExpr origLoopStep = b.getAffineConstantExpr(constant: origLoop.getStepAsInt());
541
542 // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
543 // (tilingParameter * origLoopStep) + origlb.
544 ubBoundExprs.push_back(
545 Elt: ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
546 (ubTileParameter * origLoopStep) + origLowerBoundExpr);
547
548 ubBoundExprs.append(in_start: origUbMap.getResults().begin(),
549 in_end: origUbMap.getResults().end());
550
551 AffineMap lbMap =
552 AffineMap::get(dimCount: origLbMap.getNumDims() + 1, symbolCount: origLbMap.getNumSymbols() + 1,
553 results: lbBoundExprs, context: b.getContext());
554 newIntraTileLoop.setLowerBound(lbOperands, lbMap);
555
556 AffineMap ubMap =
557 AffineMap::get(dimCount: origUbMap.getNumDims() + 1, symbolCount: origUbMap.getNumSymbols() + 1,
558 results: ubBoundExprs, context: b.getContext());
559 newIntraTileLoop.setUpperBound(ubOperands, ubMap);
560
561 // Original loop step must be preserved.
562 newIntraTileLoop.setStep(origLoop.getStepAsInt());
563}
564
565/// Set lower and upper bounds of inter-tile loops for parametric tiling.
566// TODO: Handle non-constant lower bounds.
567static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
568 AffineForOp newLoop, Value tileSize) {
569 OperandRange newLbOperands = origLoop.getLowerBoundOperands();
570
571 // The lower bounds for inter-tile loops are same as the corresponding lower
572 // bounds of original loops.
573 newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
574
  // The new upper bound map for inter-tile loops, assuming constant lower
  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
  // originalLowerBound), tiling parameter), where the tiling parameter is the
  // respective tile size for that loop. For example, if the original ubMap was
  // ()->(1024) with a lower bound of lb, the new map will be
  // ()[s0] -> (lb + ceildiv(1024 - lb, s0)), where s0 is the tiling parameter.
  // Therefore a new symbol operand is inserted in the map and the result
  // expression is overwritten.
583
584 assert(origLoop.hasConstantLowerBound() &&
585 "expected input loops to have constant lower bound.");
586
587 // Get lower bound of original loop as an affine expression.
588 AffineExpr origLowerBoundExpr;
589 origLowerBoundExpr =
590 b.getAffineConstantExpr(constant: origLoop.getConstantLowerBound());
591
592 // Add dim operands from original upper bound.
593 SmallVector<Value, 4> ubOperands;
594 AffineBound ub = origLoop.getUpperBound();
595 ubOperands.reserve(N: ub.getNumOperands() + 1);
596 AffineMap origUbMap = ub.getMap();
597 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
598 ubOperands.push_back(Elt: ub.getOperand(idx: j));
599
600 // Add symbol operands from original upper bound.
601 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
602 ubOperands.push_back(Elt: ub.getOperand(idx: origUbMap.getNumDims() + j));
603
604 // Add a new symbol operand which is the tile size for this loop.
605 ubOperands.push_back(Elt: tileSize);
606
607 // Get tiling parameter as an affine expression.
608 AffineExpr tileParameter = b.getAffineSymbolExpr(position: origUbMap.getNumSymbols());
609
610 SmallVector<AffineExpr, 4> boundExprs;
611 boundExprs.reserve(N: origUbMap.getNumResults());
612 int64_t origUpperBound;
613 AffineExpr origUpperBoundExpr;
614
615 // If upper bound for the original loop is constant, then the constant can
616 // be obtained as an affine expression straight away.
617 if (origLoop.hasConstantUpperBound()) {
618 origUpperBound = origLoop.getConstantUpperBound();
619
    // Get original constant upper bound as an affine expression.
    origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);

    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
  } else {
    // If the upper bound of the original loop is not constant, two cases are
    // possible, although their handling is the same:
    // 1) ubMap has a single result expression, e.g.
    //    affine.for %i = 5 to %ub
    //    A symbol operand is added to represent the tiling parameter. The new
    //    loop bound becomes ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5), where
    //    's0' is the original upper bound and 's1' is the tiling parameter.
    // 2) ubMap has more than one result expression, e.g.
    //    #map0 = affine_map<()[s0, s1] -> (s0, s1)>
    //    affine.for %i = 5 to min #map0()[%s0, %s1]
    //    A symbol operand is added to represent the tiling parameter. The new
    //    loop bounds become ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
    //    (s1 - 5) ceildiv s2 + 5), where 's2' is the tiling parameter.
644
645 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
646 // originalLowerBound), tilingParameter).
647 for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
648 boundExprs.push_back(
649 origLowerBoundExpr +
650 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
651 }
652
  AffineMap ubMap =
      AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
                     boundExprs, b.getContext());
656 newLoop.setUpperBound(ubOperands, ubMap);
657
658 // Original loop step must be preserved.
659 newLoop.setStep(origLoop.getStepAsInt());
660}
661
662/// Constructs and sets new loop bounds after tiling for the case of
663/// hyper-rectangular index sets, where the bounds of one dimension do not
664/// depend on other dimensions and tiling parameters are captured from SSA
665/// values. Bounds of each dimension can thus be treated independently,
666/// and deriving the new bounds is much simpler and faster than for the case of
667/// tiling arbitrary polyhedral shapes.
668static void constructParametricallyTiledIndexSetHyperRect(
669 MutableArrayRef<AffineForOp> origLoops,
670 MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
671 assert(!origLoops.empty() && "expected atleast one loop in band");
672 assert(origLoops.size() == tileSizes.size() &&
673 "expected tiling parameter for each loop in band.");
674
675 OpBuilder b(origLoops[0].getOperation());
676 unsigned width = origLoops.size();
677
678 // Set bounds for tile space loops.
679 for (unsigned i = 0; i < width; ++i) {
680 setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
681 }
682
683 // Set bounds for intra-tile loops.
684 for (unsigned i = 0; i < width; ++i) {
685 setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
686 newLoops[i + width], tileSizes[i]);
687 }
688}
689
690/// Constructs and sets new loop bounds after tiling for the case of
691/// hyper-rectangular index sets, where the bounds of one dimension do not
692/// depend on other dimensions. Bounds of each dimension can thus be treated
693/// independently, and deriving the new bounds is much simpler and faster
694/// than for the case of tiling arbitrary polyhedral shapes.
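///
/// For example (a sketch, not taken from any test): tiling
///   affine.for %i = 0 to 256 step 1
/// with a tile size of 32 produces a tile-space loop
///   affine.for %i = 0 to 256 step 32
/// and an intra-tile loop running from %i to %i + 32 with step 1; a min with
/// the original upper bound is added only when the trip count is not known to
/// be a multiple of the tile size.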
695static void
696constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
697 MutableArrayRef<AffineForOp> newLoops,
698 ArrayRef<unsigned> tileSizes) {
699 assert(!origLoops.empty());
700 assert(origLoops.size() == tileSizes.size());
701
702 OpBuilder b(origLoops[0].getOperation());
703 unsigned width = origLoops.size();
704
705 // Bounds for tile space loops.
706 for (unsigned i = 0; i < width; i++) {
707 OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
708 OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
709 newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
710 newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
711 // If the step size of original loop is x and tileSize is y then after
712 // tiling the tile space loops' step size becomes x*y.
713 newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
714 }
715 // Bounds for intra-tile loops.
716 for (unsigned i = 0; i < width; i++) {
717 int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
718 std::optional<uint64_t> mayBeConstantCount =
719 getConstantTripCount(origLoops[i]);
720 // The lower bound is just the tile-space loop.
721 AffineMap lbMap = b.getDimIdentityMap();
722 newLoops[width + i].setLowerBound(
723 /*operands=*/newLoops[i].getInductionVar(), lbMap);
724 // The step sizes of intra-tile loops is just the original loops' step size.
725 newLoops[width + i].setStep(origLoops[i].getStepAsInt());
726
727 // Set the upper bound.
728 if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
729 // Trip count is less than the tile size: upper bound is lower bound +
730 // trip count * stepSize.
731 AffineMap ubMap = b.getSingleDimShiftAffineMap(
732 shift: *mayBeConstantCount * origLoops[i].getStepAsInt());
733 newLoops[width + i].setUpperBound(
734 /*operands=*/newLoops[i].getInductionVar(), ubMap);
735 } else if (largestDiv % tileSizes[i] != 0) {
736 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
737 // Construct the upper bound map; the operands are the original operands
738 // with 'i' (tile-space loop) appended to it. The new upper bound map is
739 // the original one with an additional expression i + tileSize * stepSize
740 // appended.
741
742 // Add dim operands from original upper bound.
743 SmallVector<Value, 4> ubOperands;
744 AffineBound ub = origLoops[i].getUpperBound();
745 ubOperands.reserve(N: ub.getNumOperands() + 1);
746 AffineMap origUbMap = ub.getMap();
747 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
748 ubOperands.push_back(Elt: ub.getOperand(idx: j));
749
750 // Add dim operand for new loop upper bound.
751 ubOperands.push_back(Elt: newLoops[i].getInductionVar());
752
753 // Add symbol operands from original upper bound.
754 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
755 ubOperands.push_back(Elt: ub.getOperand(idx: origUbMap.getNumDims() + j));
756
757 SmallVector<AffineExpr, 4> boundExprs;
758 boundExprs.reserve(N: 1 + origUbMap.getNumResults());
759 AffineExpr dim = b.getAffineDimExpr(position: origUbMap.getNumDims());
760 // The new upper bound map is the original one with an additional
761 // expression i + tileSize * stepSize (of original loop) appended.
762 boundExprs.push_back(Elt: dim + tileSizes[i] * origLoops[i].getStepAsInt());
763 boundExprs.append(in_start: origUbMap.getResults().begin(),
764 in_end: origUbMap.getResults().end());
765 AffineMap ubMap =
766 AffineMap::get(dimCount: origUbMap.getNumDims() + 1, symbolCount: origUbMap.getNumSymbols(),
767 results: boundExprs, context: b.getContext());
768 newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
769 } else {
770 // No need of the min expression.
771 AffineExpr dim = b.getAffineDimExpr(position: 0);
772 AffineMap ubMap = AffineMap::get(
773 1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
774 newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
775 }
776 }
777}
778
779LogicalResult
780mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
781 ArrayRef<unsigned> tileSizes,
782 SmallVectorImpl<AffineForOp> *tiledNest) {
783 if (input.empty())
784 return success();
785
  if (failed(performPreTilingChecks(input, tileSizes)))
787 return failure();
788
789 MutableArrayRef<AffineForOp> origLoops = input;
790 AffineForOp rootAffineForOp = origLoops[0];
791
792 // Note that width is at least one since the band isn't empty.
793 unsigned width = input.size();
794 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
795
796 // Construct a tiled loop nest without setting their bounds. Bounds are
797 // set later.
798 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
799
800 SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);
802
803 // Set loop bounds for the tiled loop nest.
804 constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
805
806 // Replace original IVs with intra-tile loop IVs.
807 for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
809
810 // Erase the old loop nest.
811 rootAffineForOp.erase();
812
813 if (tiledNest)
814 *tiledNest = std::move(tiledLoops);
815
816 return success();
817}
818
819/// Tiles the specified band of perfectly nested loops creating tile-space
820/// loops and intra-tile loops, using SSA values as tiling parameters. A band
821/// is a contiguous set of loops.
822LogicalResult mlir::affine::tilePerfectlyNestedParametric(
823 MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
824 SmallVectorImpl<AffineForOp> *tiledNest) {
825 if (input.empty())
826 return success();
827
828 if (failed(result: performPreTilingChecks(input, tileSizes)))
829 return failure();
830
831 MutableArrayRef<AffineForOp> origLoops = input;
832 AffineForOp rootAffineForOp = origLoops[0];
833 unsigned width = input.size();
834 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
835
836 // Construct a tiled loop nest without setting their bounds. Bounds are
837 // set later.
838 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
839
840 SmallVector<Value, 8> origLoopIVs;
841 extractForInductionVars(forInsts: input, ivs: &origLoopIVs);
842
843 // Set loop bounds for the tiled loop nest.
844 constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
845 tileSizes);
846
847 // Replace original IVs with intra-tile loop IVs.
848 for (unsigned i = 0; i < width; i++)
849 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
850
851 // Erase the old loop nest.
852 rootAffineForOp.erase();
853
854 if (tiledNest)
855 *tiledNest = std::move(tiledLoops);
856
857 return success();
858}
859
/// Collects the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested iff the first op in its body is
/// another AffineForOp and the second op is the terminator.
864void mlir::affine::getPerfectlyNestedLoops(
865 SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
866 for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
867 nestedLoops.push_back(root);
868 Block &body = root.getRegion().front();
    if (body.begin() != std::prev(body.end(), 2))
870 return;
871
872 root = dyn_cast<AffineForOp>(&body.front());
873 if (!root)
874 return;
875 }
876}
877
878/// Identify valid and profitable bands of loops to tile. This is currently just
879/// a temporary placeholder to test the mechanics of tiled code generation.
880/// Returns all maximal outermost perfect loop nests to tile.
881void mlir::affine::getTileableBands(
882 func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
883 // Get maximal perfect nest of 'affine.for' insts starting from root
884 // (inclusive).
885 for (AffineForOp forOp : f.getOps<AffineForOp>()) {
886 SmallVector<AffineForOp, 6> band;
887 getPerfectlyNestedLoops(band, forOp);
888 bands->push_back(band);
889 }
890}
891
892/// Unrolls this loop completely.
893LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
894 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
895 if (mayBeConstantTripCount.has_value()) {
896 uint64_t tripCount = *mayBeConstantTripCount;
897 if (tripCount == 0)
898 return success();
899 if (tripCount == 1)
900 return promoteIfSingleIteration(forOp);
901 return loopUnrollByFactor(forOp, tripCount);
902 }
903 return failure();
904}
905
/// Unrolls this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
908LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
909 uint64_t unrollFactor) {
910 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
911 if (mayBeConstantTripCount.has_value() &&
912 *mayBeConstantTripCount < unrollFactor)
913 return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
914 return loopUnrollByFactor(forOp, unrollFactor);
915}
916
/// Generates unrolled copies of the loop body in 'loopBodyBlock', whose
/// associated induction variable is 'forOpIV', by 'unrollFactor', calling
/// 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If specified,
/// annotates the ops in each unrolled iteration using 'annotateFn'.
921static void generateUnrolledLoop(
922 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
923 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
924 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
925 ValueRange iterArgs, ValueRange yieldedValues) {
926 // Builder to insert unrolled bodies just before the terminator of the body of
927 // 'forOp'.
928 auto builder = OpBuilder::atBlockTerminator(block: loopBodyBlock);
929
930 if (!annotateFn)
931 annotateFn = [](unsigned, Operation *, OpBuilder) {};
932
933 // Keep a pointer to the last non-terminator operation in the original block
934 // so that we know what to clone (since we are doing this in-place).
935 Block::iterator srcBlockEnd = std::prev(x: loopBodyBlock->end(), n: 2);
936
937 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
938 SmallVector<Value, 4> lastYielded(yieldedValues);
939
940 for (unsigned i = 1; i < unrollFactor; i++) {
941 IRMapping operandMap;
942
943 // Prepare operand map.
944 operandMap.map(from&: iterArgs, to&: lastYielded);
945
946 // If the induction variable is used, create a remapping to the value for
947 // this unrolled instance.
948 if (!forOpIV.use_empty()) {
949 Value ivUnroll = ivRemapFn(i, forOpIV, builder);
950 operandMap.map(from: forOpIV, to: ivUnroll);
951 }
952
953 // Clone the original body of 'forOp'.
954 for (auto it = loopBodyBlock->begin(); it != std::next(x: srcBlockEnd); it++) {
955 Operation *clonedOp = builder.clone(op&: *it, mapper&: operandMap);
956 annotateFn(i, clonedOp, builder);
957 }
958
959 // Update yielded values. If the yielded value is defined outside the
960 // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
961 // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
962 // value with the clone corresponding to the yielded value.
963 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
964 Operation *defOp = yieldedValues[i].getDefiningOp();
965 if (defOp && defOp->getBlock() == loopBodyBlock)
966 lastYielded[i] = operandMap.lookup(from: yieldedValues[i]);
967 }
968 }
969
970 // Make sure we annotate the Ops in the original body. We do this last so that
971 // any annotations are not copied into the cloned Ops above.
972 for (auto it = loopBodyBlock->begin(); it != std::next(x: srcBlockEnd); it++)
973 annotateFn(0, &*it, builder);
974
975 // Update operands of the yield statement.
976 loopBodyBlock->getTerminator()->setOperands(lastYielded);
977}
978
979/// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
980/// count is not a multiple of `unrollFactor`.
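///
/// For illustration (a hypothetical loop): unrolling
///   affine.for %i = 0 to 10
/// by a factor of 4 restricts the main loop to [0, 8), which is subsequently
/// unrolled by the caller, while the cleanup loop cloned here covers the
/// remaining iterations [8, 10).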
981static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
982 uint64_t unrollFactor) {
983 // Insert the cleanup loop right after 'forOp'.
  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
985 auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
986
987 // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
988 // and produce results for the original users of `forOp` results.
989 auto results = forOp.getResults();
990 auto cleanupResults = cleanupForOp.getResults();
991 auto cleanupIterOperands = cleanupForOp.getInits();
992
993 for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
994 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
995 cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
996 }
997
998 AffineMap cleanupMap;
999 SmallVector<Value, 4> cleanupOperands;
1000 getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
1001 if (!cleanupMap)
1002 return failure();
1003
1004 cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
1005 // Promote the loop body up if this has turned into a single iteration loop.
1006 (void)promoteIfSingleIteration(cleanupForOp);
1007
1008 // Adjust upper bound of the original loop; this is the same as the lower
1009 // bound of the cleanup loop.
1010 forOp.setUpperBound(cleanupOperands, cleanupMap);
1011 return success();
1012}
1013
1014/// Unrolls this loop by the specified factor. Returns success if the loop
1015/// is successfully unrolled.
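///
/// For illustration (a hypothetical example; "foo" is a placeholder op):
/// unrolling
///   affine.for %i = 0 to 32 {
///     "foo"(%i) : (index) -> ()
///   }
/// by a factor of 4 scales the step of %i to 4 and clones the body three more
/// times inside the loop, remapping the induction variable of copy k to
/// %i + k for k = 1, 2, 3.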
1016LogicalResult mlir::affine::loopUnrollByFactor(
1017 AffineForOp forOp, uint64_t unrollFactor,
1018 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1019 bool cleanUpUnroll) {
1020 assert(unrollFactor > 0 && "unroll factor should be positive");
1021
1022 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1023 if (unrollFactor == 1) {
1024 if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1025 failed(promoteIfSingleIteration(forOp)))
1026 return failure();
1027 return success();
1028 }
1029
1030 // Nothing in the loop body other than the terminator.
1031 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1032 return success();
1033
1034 // If the trip count is lower than the unroll factor, no unrolled body.
1035 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1036 if (cleanUpUnroll) {
1037 // Unroll the cleanup loop if cleanUpUnroll is specified.
1038 return loopUnrollFull(forOp);
1039 }
1040
1041 return failure();
1042 }
1043
1044 // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1045 if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1046 // Loops where the lower bound is a max expression or the upper bound is
1047 // a min expression and the trip count doesn't divide the unroll factor
1048 // can't be unrolled since the lower bound of the cleanup loop in such cases
1049 // cannot be expressed as an affine function or a max over affine functions.
1050 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1051 forOp.getUpperBoundMap().getNumResults() != 1)
1052 return failure();
1053 if (cleanUpUnroll)
1054 // Force unroll including cleanup loop
1055 return loopUnrollFull(forOp);
1056 if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1057 assert(false && "cleanup loop lower bound map for single result lower "
1058 "and upper bound maps can always be determined");
1059 }
1060
1061 ValueRange iterArgs(forOp.getRegionIterArgs());
1062 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1063
1064 // Scale the step of loop being unrolled by unroll factor.
1065 int64_t step = forOp.getStepAsInt();
1066 forOp.setStep(step * unrollFactor);
1067 generateUnrolledLoop(
1068 forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1069 [&](unsigned i, Value iv, OpBuilder b) {
1070 // iv' = iv + i * step
        auto d0 = b.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1073 return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1074 },
1075 /*annotateFn=*/annotateFn,
1076 /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1077
1078 // Promote the loop body up if this has turned into a single iteration loop.
1079 (void)promoteIfSingleIteration(forOp);
1080 return success();
1081}
1082
1083LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1084 uint64_t unrollJamFactor) {
1085 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1086 if (mayBeConstantTripCount.has_value() &&
1087 *mayBeConstantTripCount < unrollJamFactor)
1088 return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1089 return loopUnrollJamByFactor(forOp, unrollJamFactor);
1090}
1091
1092/// Check if all control operands of all loops are defined outside of `forOp`
1093/// and return false if not.
1094static bool areInnerBoundsInvariant(AffineForOp forOp) {
1095 auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1096 for (auto controlOperand : aForOp.getControlOperands()) {
1097 if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1098 return WalkResult::interrupt();
1099 }
1100 return WalkResult::advance();
1101 });
1102 return !walkResult.wasInterrupted();
1103}
1104
// Gathers all maximal sub-blocks of operations that do not themselves
// include a for op (an operation could have a descendant for op though
// in its tree). Ignores the block terminators.
1108struct JamBlockGatherer {
1109 // Store iterators to the first and last op of each sub-block found.
1110 std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks;
1111
1112 // This is a linear time walk.
1113 void walk(Operation *op) {
1114 for (auto &region : op->getRegions())
1115 for (auto &block : region)
1116 walk(block);
1117 }
1118
  void walk(Block &block) {
    for (auto it = block.begin(), e = std::prev(block.end()); it != e;) {
      auto subBlockStart = it;
      while (it != e && !isa<AffineForOp>(&*it))
        ++it;
      if (it != subBlockStart)
        subBlocks.emplace_back(subBlockStart, std::prev(it));
      // Process all for ops that appear next.
      while (it != e && isa<AffineForOp>(&*it))
        walk(&*it++);
    }
  }
1131};
1132
1133/// Unrolls and jams this loop by the specified factor.
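///
/// For illustration (a hypothetical nest): unroll-and-jam of
///   affine.for %i = 0 to 128 {
///     affine.for %j = 0 to 128 { ... }
///   }
/// by a factor of 2 scales the step of %i to 2 and clones the sub-blocks of
/// %i's body (here the %j loop) once more in place, with the clone using
/// %i + 1 wherever the original used %i, instead of creating a second %i
/// loop.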
1134LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1135 uint64_t unrollJamFactor) {
1136 assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1137
1138 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1139 if (unrollJamFactor == 1) {
1140 if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1141 failed(promoteIfSingleIteration(forOp)))
1142 return failure();
1143 return success();
1144 }
1145
1146 // Nothing in the loop body other than the terminator.
1147 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1148 return success();
1149
1150 // If the trip count is lower than the unroll jam factor, no unroll jam.
1151 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1152 LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1153 return failure();
1154 }
1155
1156 // If any control operand of any inner loop of `forOp` is defined within
1157 // `forOp`, no unroll jam.
1158 if (!areInnerBoundsInvariant(forOp))
1159 return failure();
1160
1161 // Gather all sub-blocks to jam upon the loop being unrolled.
1162 JamBlockGatherer jbg;
1163 jbg.walk(forOp);
1164 auto &subBlocks = jbg.subBlocks;
1165
1166 // Collect loops with iter_args.
1167 SmallVector<AffineForOp, 4> loopsWithIterArgs;
1168 forOp.walk([&](AffineForOp aForOp) {
1169 if (aForOp.getNumIterOperands() > 0)
1170 loopsWithIterArgs.push_back(aForOp);
1171 });
1172
1173 // Get supported reductions to be used for creating reduction ops at the end.
1174 SmallVector<LoopReduction> reductions;
1175 if (forOp.getNumIterOperands() > 0)
1176 getSupportedReductions(forOp, reductions);
1177
1178 // Generate the cleanup loop if trip count isn't a multiple of
1179 // unrollJamFactor.
1180 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1181 // Loops where the lower bound is a max expression or the upper bound is
1182 // a min expression and the trip count doesn't divide the unroll factor
1183 // can't be unrolled since the lower bound of the cleanup loop in such cases
1184 // cannot be expressed as an affine function or a max over affine functions.
1185 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1186 forOp.getUpperBoundMap().getNumResults() != 1)
1187 return failure();
1188 if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1189 assert(false && "cleanup loop lower bound map for single result lower "
1190 "and upper bound maps can always be determined");
1191 }
1192
1193 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1194 // iteration. There are (`unrollJamFactor` - 1) iterations.
1195 SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1196
1197 // For any loop with iter_args, replace it with a new loop that has
1198 // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1199 // operands.
1200 SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1201 IRRewriter rewriter(forOp.getContext());
1202 for (AffineForOp oldForOp : loopsWithIterArgs) {
1203 SmallVector<Value> dupIterOperands, dupYieldOperands;
1204 ValueRange oldIterOperands = oldForOp.getInits();
1205 ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1206 ValueRange oldYieldOperands =
1207 cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1208 // Get additional iterOperands, iterArgs, and yield operands. We will
1209 // fix iterOperands and yield operands after cloning of sub-blocks.
1210 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1211 dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1212 dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1213 }
1214 // Create a new loop with additional iterOperands, iter_args and yield
1215 // operands. This new loop will take the loop body of the original loop.
1216 bool forOpReplaced = oldForOp == forOp;
1217 AffineForOp newForOp =
1218 cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1219 rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1220 [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1221 return dupYieldOperands;
1222 }));
1223 newLoopsWithIterArgs.push_back(newForOp);
1224 // `forOp` has been replaced with a new loop.
1225 if (forOpReplaced)
1226 forOp = newForOp;
1227 // Update `operandMaps` for `newForOp` iterArgs and results.
1228 ValueRange newIterArgs = newForOp.getRegionIterArgs();
1229 unsigned oldNumIterArgs = oldIterArgs.size();
1230 ValueRange newResults = newForOp.getResults();
1231 unsigned oldNumResults = newResults.size() / unrollJamFactor;
1232 assert(oldNumIterArgs == oldNumResults &&
1233 "oldNumIterArgs must be the same as oldNumResults");
1234 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1235 for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1236 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1237 // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1238 // to those in the `i`th new set.
1239 operandMaps[i - 1].map(newIterArgs[j],
1240 newIterArgs[i * oldNumIterArgs + j]);
1241 operandMaps[i - 1].map(newResults[j],
1242 newResults[i * oldNumResults + j]);
1243 }
1244 }
1245 }
1246
1247 // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1248 int64_t step = forOp.getStepAsInt();
1249 forOp.setStep(step * unrollJamFactor);
1250
1251 auto forOpIV = forOp.getInductionVar();
1252 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1253 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1254 for (auto &subBlock : subBlocks) {
1255 // Builder to insert unroll-jammed bodies. Insert right at the end of
1256 // sub-block.
1257 OpBuilder builder(subBlock.first->getBlock(), std::next(x: subBlock.second));
1258
1259 // If the induction variable is used, create a remapping to the value for
1260 // this unrolled instance.
1261 if (!forOpIV.use_empty()) {
1262 // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1263 auto d0 = builder.getAffineDimExpr(position: 0);
1264 auto bumpMap = AffineMap::get(dimCount: 1, symbolCount: 0, result: d0 + i * step);
1265 auto ivUnroll =
1266 builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1267 operandMaps[i - 1].map(forOpIV, ivUnroll);
1268 }
1269 // Clone the sub-block being unroll-jammed.
1270 for (auto it = subBlock.first; it != std::next(x: subBlock.second); ++it)
1271 builder.clone(op&: *it, mapper&: operandMaps[i - 1]);
1272 }
1273 // Fix iterOperands and yield op operands of newly created loops.
1274 for (auto newForOp : newLoopsWithIterArgs) {
1275 unsigned oldNumIterOperands =
1276 newForOp.getNumIterOperands() / unrollJamFactor;
1277 unsigned numControlOperands = newForOp.getNumControlOperands();
1278 auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1279 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1280 assert(oldNumIterOperands == oldNumYieldOperands &&
1281 "oldNumIterOperands must be the same as oldNumYieldOperands");
1282 for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1283 // The `i`th duplication of an old iterOperand or yield op operand
1284 // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1285 // if such mapped value exists.
1286 newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1287 operandMaps[i - 1].lookupOrDefault(
1288 newForOp.getOperand(numControlOperands + j)));
1289 yieldOp.setOperand(
1290 i * oldNumYieldOperands + j,
1291 operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1292 }
1293 }
1294 }
1295 if (forOp.getNumResults() > 0) {
1296 // Create reduction ops to combine every `unrollJamFactor` related results
1297 // into one value. For example, for %0:2 = affine.for ... and addf, we add
1298 // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1299 // %1.
1300 rewriter.setInsertionPointAfter(forOp);
1301 auto loc = forOp.getLoc();
1302 unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1303 for (LoopReduction &reduction : reductions) {
1304 unsigned pos = reduction.iterArgPosition;
1305 Value lhs = forOp.getResult(pos);
1306 Value rhs;
1307 SmallPtrSet<Operation *, 4> newOps;
1308 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1309 rhs = forOp.getResult(i * oldNumResults + pos);
1310 // Create ops based on reduction type.
1311 lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1312 if (!lhs)
1313 return failure();
1314 Operation *op = lhs.getDefiningOp();
1315 assert(op && "Reduction op should have been created");
        newOps.insert(op);
1317 }
1318 // Replace all uses except those in newly created reduction ops.
1319 forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1320 }
1321 }
1322
1323 // Promote the loop body up if this has turned into a single iteration loop.
1324 (void)promoteIfSingleIteration(forOp);
1325 return success();
1326}
1327
1328/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1329/// nested within 'forOpA' as the only non-terminator operation in its block.
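/// As an illustrative sketch (not actual test output), for
///
/// ```mlir
/// affine.for %i = 0 to %N {
///   affine.for %j = 0 to %M {
///     "use"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// the interchange moves the %j loop to the outside and nests the %i loop
/// inside it; the body is spliced, not cloned, so no operations are duplicated.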
1330void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1331 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1332 auto &forOpABody = forOpA.getBody()->getOperations();
1333 auto &forOpBBody = forOpB.getBody()->getOperations();
1334
  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
  //    before forOpA (in forOpA's parent's block); this should leave forOpA's
  //    body containing only the terminator.
1338 forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1339 forOpABody, forOpABody.begin(),
1340 std::prev(forOpABody.end()));
1341 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1342 // body (this leaves forOpB's body containing only the terminator).
1343 forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1344 std::prev(forOpBBody.end()));
1345 // 3) Splice forOpA into the beginning of forOpB's body.
1346 forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1347 Block::iterator(forOpA));
1348}
1349
1350// Checks each dependence component against the permutation to see if the
1351// desired loop interchange would violate dependences by making the
1352// dependence component lexicographically negative.
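// As a hypothetical example, a dependence with components [2, -1] on a 2-d
// nest is fine in the original order since the leading component is positive,
// but swapping the two loops would make the leading checked component -1,
// i.e., lexicographically negative, so that permutation is rejected.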
1353static bool checkLoopInterchangeDependences(
1354 const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1355 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1356 // Invert permutation map.
1357 unsigned maxLoopDepth = loops.size();
1358 SmallVector<unsigned, 4> loopPermMapInv;
  loopPermMapInv.resize(maxLoopDepth);
1360 for (unsigned i = 0; i < maxLoopDepth; ++i)
1361 loopPermMapInv[loopPermMap[i]] = i;
1362
1363 // Check each dependence component against the permutation to see if the
1364 // desired loop interchange permutation would make the dependence vectors
1365 // lexicographically negative.
1366 // Example 1: [-1, 1][0, 0]
1367 // Example 2: [0, 0][-1, 1]
1368 for (const auto &depComps : depCompsVec) {
1369 assert(depComps.size() >= maxLoopDepth);
1370 // Check if the first non-zero dependence component is positive.
1371 // This iterates through loops in the desired order.
1372 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1373 unsigned permIndex = loopPermMapInv[j];
1374 assert(depComps[permIndex].lb);
1375 int64_t depCompLb = *depComps[permIndex].lb;
1376 if (depCompLb > 0)
1377 break;
1378 if (depCompLb < 0)
1379 return false;
1380 }
1381 }
1382 return true;
1383}
1384
1385/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1386/// nested sequence of loops in 'loops' would violate dependences.
1387bool mlir::affine::isValidLoopInterchangePermutation(
1388 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1389 // Gather dependence components for dependences between all ops in loop nest
1390 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1391 assert(loopPermMap.size() == loops.size());
1392 unsigned maxLoopDepth = loops.size();
1393 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1394 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1395 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1396}
1397
1398/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1399/// in it from outermost to innermost.
1400bool LLVM_ATTRIBUTE_UNUSED
1401mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1402 assert(!loops.empty() && "no loops provided");
1403
1404 // We already know that the block can't be empty.
1405 auto hasTwoElements = [](Block *block) {
    auto secondOpIt = std::next(block->begin());
1407 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1408 };
1409
1410 auto enclosingLoop = loops.front();
1411 for (auto loop : loops.drop_front()) {
1412 auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1413 // parentForOp's body should be just this loop and the terminator.
1414 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1415 return false;
1416 enclosingLoop = loop;
1417 }
1418 return true;
1419}
1420
1421// input[i] should move from position i -> permMap[i]. Returns the position in
1422// `input` that becomes the new outermost loop.
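// For example (an illustrative sketch), for a 3-d nest (%i, %j, %k) and
// permMap = {2, 0, 1}, %i moves to the innermost position, %j becomes the
// outermost loop and %k the middle one, so the permuted nest is (%j, %k, %i)
// and the function returns 1, the position of %j in `input`.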
1423unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input,
1424 ArrayRef<unsigned> permMap) {
1425 assert(input.size() == permMap.size() && "invalid permutation map size");
1426 // Check whether the permutation spec is valid. This is a small vector - we'll
1427 // just sort and check if it's iota.
1428 SmallVector<unsigned, 4> checkPermMap(permMap.begin(), permMap.end());
  llvm::sort(checkPermMap);
  if (llvm::any_of(llvm::enumerate(checkPermMap),
                   [](const auto &en) { return en.value() != en.index(); }))
1432 assert(false && "invalid permutation map");
1433
1434 // Nothing to do.
1435 if (input.size() < 2)
1436 return 0;
1437
1438 assert(isPerfectlyNested(input) && "input not perfectly nested");
1439
1440 // Compute the inverse mapping, invPermMap: since input[i] goes to position
1441 // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1442 SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1443 for (unsigned i = 0, e = input.size(); i < e; ++i)
    invPermMap.push_back({permMap[i], i});
  llvm::sort(invPermMap);
1446
1447 // Move the innermost loop body to the loop that would be the innermost in the
1448 // permuted nest (only if the innermost loop is going to change).
1449 if (permMap.back() != input.size() - 1) {
1450 auto *destBody = input[invPermMap.back().second].getBody();
1451 auto *srcBody = input.back().getBody();
1452 destBody->getOperations().splice(destBody->begin(),
1453 srcBody->getOperations(), srcBody->begin(),
1454 std::prev(srcBody->end()));
1455 }
1456
1457 // We'll move each loop in `input` in the reverse order so that its body is
1458 // empty when we are moving it; this incurs zero copies and no erasing.
1459 for (int i = input.size() - 1; i >= 0; --i) {
1460 // If this has to become the outermost loop after permutation, add it to the
1461 // parent block of the original root.
1462 if (permMap[i] == 0) {
1463 // If the root remains the same, nothing to do.
1464 if (i == 0)
1465 continue;
1466 // Make input[i] the new outermost loop moving it into parentBlock.
1467 auto *parentBlock = input[0]->getBlock();
1468 parentBlock->getOperations().splice(Block::iterator(input[0]),
1469 input[i]->getBlock()->getOperations(),
1470 Block::iterator(input[i]));
1471 continue;
1472 }
1473
1474 // If the parent in the permuted order is the same as in the original,
1475 // nothing to do.
1476 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1477 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1478 continue;
1479
1480 // Move input[i] to its surrounding loop in the transformed nest.
1481 auto *destBody = input[parentPosInInput].getBody();
1482 destBody->getOperations().splice(destBody->begin(),
1483 input[i]->getBlock()->getOperations(),
1484 Block::iterator(input[i]));
1485 }
1486
1487 return invPermMap[0].second;
1488}
1489
1490// Sinks all sequential loops to the innermost levels (while preserving
1491// relative order among them) and moves all parallel loops to the
1492// outermost (while again preserving relative order among them).
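// As a sketch of the intended effect: for a 3-d nest (%i, %j, %k) where only
// %j carries a dependence, the permutation computed below sends %i and %k to
// the two outer positions and %j to the innermost one, provided the
// dependence check allows it; otherwise the nest is returned unchanged.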
1493AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1494 SmallVector<AffineForOp, 4> loops;
1495 getPerfectlyNestedLoops(loops, forOp);
1496 if (loops.size() < 2)
1497 return forOp;
1498
1499 // Gather dependence components for dependences between all ops in loop nest
1500 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1501 unsigned maxLoopDepth = loops.size();
1502 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1503 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1504
1505 // Mark loops as either parallel or sequential.
1506 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1507 for (auto &depComps : depCompsVec) {
1508 assert(depComps.size() >= maxLoopDepth);
1509 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1510 DependenceComponent &depComp = depComps[j];
1511 assert(depComp.lb.has_value() && depComp.ub.has_value());
1512 if (*depComp.lb != 0 || *depComp.ub != 0)
1513 isParallelLoop[j] = false;
1514 }
1515 }
1516
  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1518
1519 // Compute permutation of loops that sinks sequential loops (and thus raises
1520 // parallel loops) while preserving relative order.
1521 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1522 unsigned nextSequentialLoop = numParallelLoops;
1523 unsigned nextParallelLoop = 0;
1524 for (unsigned i = 0; i < maxLoopDepth; ++i) {
1525 if (isParallelLoop[i]) {
1526 loopPermMap[i] = nextParallelLoop++;
1527 } else {
1528 loopPermMap[i] = nextSequentialLoop++;
1529 }
1530 }
1531
1532 // Check if permutation 'loopPermMap' would violate dependences.
1533 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1534 return forOp;
1535 // Perform loop interchange according to permutation 'loopPermMap'.
1536 unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1537 return loops[loopNestRootIndex];
1538}
1539
1540// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1541// lower (resp. upper) loop bound. When called for both the lower and upper
1542// bounds, the resulting IR resembles:
1543//
1544// ```mlir
// affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1546// ...
1547// }
1548// ```
1549static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1550 SmallVector<Value, 4> *operands,
1551 int64_t offset = 0) {
  auto bounds = llvm::to_vector<4>(map->getResults());
  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
  operands->insert(operands->begin() + map->getNumDims(), iv);
  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
                        b.getContext());
1557 canonicalizeMapAndOperands(map, operands);
1558}
1559
1560// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1561// Stripmine-sink is a primitive building block for generalized tiling of
1562// imperfectly nested loops.
1563// This transformation is purely mechanical and does not check legality,
1564// profitability or even structural correctness. It is the user's
1565// responsibility to specify `targets` that are dominated by `forOp`.
1566// Returns the new AffineForOps, one per `targets`, nested immediately under
1567// each of the `targets`.
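// As a rough sketch (not actual test output), stripmining
//
// ```mlir
// affine.for %i = 0 to 128 {
//   affine.for %j = 0 to 64 { ... }  // a target dominated by %i
// }
// ```
//
// by a factor of 4 scales the step of %i to 4 and wraps each target's body in
// a new loop %ii running from max(0, %i) to min(128, %i + 4) with the original
// step, with uses of %i inside that body remapped to %ii.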
1568static SmallVector<AffineForOp, 8>
1569stripmineSink(AffineForOp forOp, uint64_t factor,
1570 ArrayRef<AffineForOp> targets) {
1571 auto originalStep = forOp.getStepAsInt();
1572 auto scaledStep = originalStep * factor;
1573 forOp.setStep(scaledStep);
1574
  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1576
1577 // Lower-bound map creation.
1578 auto lbMap = forOp.getLowerBoundMap();
1579 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1580 augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1581
1582 // Upper-bound map creation.
1583 auto ubMap = forOp.getUpperBoundMap();
1584 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1585 augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1586 /*offset=*/scaledStep);
1587
1588 auto iv = forOp.getInductionVar();
1589 SmallVector<AffineForOp, 8> innerLoops;
1590 for (auto t : targets) {
1591 // Insert newForOp before the terminator of `t`.
1592 auto b = OpBuilder::atBlockTerminator(t.getBody());
1593 auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1594 ubOperands, ubMap, originalStep);
1595 auto begin = t.getBody()->begin();
1596 // Skip terminator and `newForOp` which is just before the terminator.
1597 auto nOps = t.getBody()->getOperations().size() - 2;
1598 newForOp.getBody()->getOperations().splice(
1599 newForOp.getBody()->getOperations().begin(),
1600 t.getBody()->getOperations(), begin, std::next(begin, nOps));
1601 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1602 newForOp.getRegion());
1603 innerLoops.push_back(newForOp);
1604 }
1605
1606 return innerLoops;
1607}
1608
// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
// Returns the new AffineForOp, nested immediately under `target`.
1611template <typename SizeType>
1612static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1613 AffineForOp target) {
1614 // TODO: Use cheap structural assertions that targets are nested under
1615 // forOp and that targets are not nested under each other when DominanceInfo
1616 // exposes the capability. It seems overkill to construct a whole function
1617 // dominance tree at this point.
1618 auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1619 assert(res.size() == 1 && "Expected 1 inner forOp");
1620 return res[0];
1621}
1622
1623SmallVector<SmallVector<AffineForOp, 8>, 8>
1624mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1625 ArrayRef<AffineForOp> targets) {
1626 SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1627 SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end());
  for (auto it : llvm::zip(forOps, sizes)) {
    auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1630 res.push_back(step);
1631 currentTargets = step;
1632 }
1633 return res;
1634}
1635
1636SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1637 ArrayRef<uint64_t> sizes,
1638 AffineForOp target) {
1639 SmallVector<AffineForOp, 8> res;
1640 for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) {
1641 assert(loops.size() == 1);
1642 res.push_back(loops[0]);
1643 }
1644 return res;
1645}
1646
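/// As an illustrative sketch, coalescing a normalized 2-d nest such as
///
/// ```mlir
/// affine.for %i = 0 to %N {
///   affine.for %j = 0 to %M { ... }
/// }
/// ```
///
/// yields a single loop of %N * %M iterations whose induction variable %iv
/// recovers the original IVs as %i = floordiv(%iv, %M) and %j = %iv mod %M.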
1647LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1648 if (loops.size() < 2)
1649 return success();
1650
1651 AffineForOp innermost = loops.back();
1652 AffineForOp outermost = loops.front();
1653 AffineBound ub = outermost.getUpperBound();
1654 AffineMap origUbMap = ub.getMap();
1655 Location loc = outermost.getLoc();
1656 OpBuilder builder(outermost);
1657 for (AffineForOp loop : loops) {
1658 // We only work on normalized loops.
1659 if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1660 loop.getConstantLowerBound() != 0)
1661 return failure();
1662 }
1663 SmallVector<Value, 4> upperBoundSymbols;
1664 SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1665 ub.getOperands().end());
1666
1667 // 1. Store the upper bound of the outermost loop in a variable.
1668 Value prev;
  if (!llvm::hasSingleElement(origUbMap.getResults()))
    prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
  else
    prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
  upperBoundSymbols.push_back(prev);
1674
  // 2. Emit code computing the upper bound of the coalesced loop as the
  // product of the number of iterations of all loops.
1677 for (AffineForOp loop : loops.drop_front()) {
1678 ub = loop.getUpperBound();
1679 origUbMap = ub.getMap();
1680 ubOperands = ub.getOperands();
1681 Value upperBound;
1682 // If upper bound map has more than one result, take their minimum.
1683 if (!llvm::hasSingleElement(origUbMap.getResults()))
1684 upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1685 else
1686 upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1687 upperBoundSymbols.push_back(upperBound);
1688 SmallVector<Value, 4> operands;
1689 operands.push_back(prev);
1690 operands.push_back(upperBound);
1691 // Maintain running product of loop upper bounds.
1692 prev = builder.create<AffineApplyOp>(
1693 loc,
1694 AffineMap::get(/*dimCount=*/1,
1695 /*symbolCount=*/1,
1696 builder.getAffineDimExpr(0) *
1697 builder.getAffineSymbolExpr(0)),
1698 operands);
1699 }
1700 // Set upper bound of the coalesced loop.
1701 AffineMap newUbMap = AffineMap::get(
1702 /*dimCount=*/0,
      /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1704 outermost.setUpperBound(prev, newUbMap);
1705
1706 builder.setInsertionPointToStart(outermost.getBody());
1707
1708 // 3. Remap induction variables. For each original loop, the value of the
1709 // induction variable can be obtained by dividing the induction variable of
1710 // the linearized loop by the total number of iterations of the loops nested
1711 // in it modulo the number of iterations in this loop (remove the values
1712 // related to the outer loops):
1713 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1714 // Compute these iteratively from the innermost loop by creating a "running
1715 // quotient" of division by the range.
1716 Value previous = outermost.getInductionVar();
1717 for (unsigned idx = loops.size(); idx > 0; --idx) {
1718 if (idx != loops.size()) {
1719 SmallVector<Value, 4> operands;
      operands.push_back(previous);
      operands.push_back(upperBoundSymbols[idx]);
      previous = builder.create<AffineApplyOp>(
          loc,
          AffineMap::get(
              /*dimCount=*/1, /*symbolCount=*/1,
              builder.getAffineDimExpr(0).floorDiv(
                  builder.getAffineSymbolExpr(0))),
          operands);
1729 }
1730 // Modified value of the induction variables of the nested loops after
1731 // coalescing.
1732 Value inductionVariable;
1733 if (idx == 1) {
1734 inductionVariable = previous;
1735 } else {
1736 SmallVector<Value, 4> applyOperands;
      applyOperands.push_back(previous);
      applyOperands.push_back(upperBoundSymbols[idx - 1]);
      inductionVariable = builder.create<AffineApplyOp>(
          loc,
          AffineMap::get(
              /*dimCount=*/1, /*symbolCount=*/1,
              builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
          applyOperands);
1745 }
1746 replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1747 inductionVariable, loops.back().getRegion());
1748 }
1749
1750 // 4. Move the operations from the innermost just above the second-outermost
1751 // loop, delete the extra terminator and the second-outermost loop.
1752 AffineForOp secondOutermostLoop = loops[1];
1753 innermost.getBody()->back().erase();
1754 outermost.getBody()->getOperations().splice(
1755 Block::iterator(secondOutermostLoop.getOperation()),
1756 innermost.getBody()->getOperations());
1757 secondOutermostLoop.erase();
1758 return success();
1759}
1760
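/// A sketch of the effect of the mapping below for a single processor
/// dimension: the loop bounds are rewritten so that
///   lb' = lb + procId * step  and  step' = step * numProcs,
/// i.e., each processor executes an interleaved subset of the original
/// iterations; multiple processor dimensions are first linearized into one id.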
1761void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1762 ArrayRef<Value> processorId,
1763 ArrayRef<Value> numProcessors) {
1764 assert(processorId.size() == numProcessors.size());
1765 if (processorId.empty())
1766 return;
1767
1768 OpBuilder b(forOp);
1769 Location loc(forOp.getLoc());
1770 AffineExpr lhs, rhs;
1771 bindSymbols(forOp.getContext(), lhs, rhs);
  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1774
1775 Value linearIndex = processorId.front();
1776 for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1777 auto mulApplyOp = b.create<AffineApplyOp>(
1778 loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1779 linearIndex = b.create<AffineApplyOp>(
1780 loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1781 }
1782
1783 auto mulApplyOp = b.create<AffineApplyOp>(
1784 loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1785 Value lb = b.create<AffineApplyOp>(
1786 loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1787 forOp.setLowerBound(lb);
1788
1789 Value step = forOp.getStep();
1790 for (auto numProcs : numProcessors)
1791 step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1792 forOp.setStep(step);
1793}
1794
1795/// Given a memref region, determine the lowest depth at which transfers can be
1796/// placed for it, and return the corresponding block, start and end positions
1797/// in the block for placing incoming (read) and outgoing (write) copies
1798/// respectively. The lowest depth depends on whether the region being accessed
1799/// is hoistable with respect to one or more immediately surrounding loops.
1800static void
1801findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1802 Block::iterator &begin, Block::iterator &end,
1803 Block **copyPlacementBlock,
1804 Block::iterator *copyInPlacementStart,
1805 Block::iterator *copyOutPlacementStart) {
1806 const auto *cst = region.getConstraints();
1807 SmallVector<Value, 4> symbols;
  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1809
1810 SmallVector<AffineForOp, 4> enclosingFors;
1811 getAffineForIVs(*block.begin(), &enclosingFors);
1812 // Walk up loop parents till we find an IV on which this region is
1813 // symbolic/variant.
1814 auto it = enclosingFors.rbegin();
1815 for (auto e = enclosingFors.rend(); it != e; ++it) {
1816 // TODO: also need to be checking this for regions symbols that
1817 // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1818 if (llvm::is_contained(symbols, it->getInductionVar()))
1819 break;
1820 }
1821
1822 if (it != enclosingFors.rbegin()) {
1823 auto lastInvariantIV = *std::prev(it);
1824 *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
    *copyOutPlacementStart = std::next(*copyInPlacementStart);
1826 *copyPlacementBlock = lastInvariantIV->getBlock();
1827 } else {
1828 *copyInPlacementStart = begin;
1829 *copyOutPlacementStart = end;
1830 *copyPlacementBlock = &block;
1831 }
1832}
1833
1834// Info comprising stride and number of elements transferred every stride.
1835struct StrideInfo {
1836 int64_t stride;
1837 int64_t numEltPerStride;
1838};
1839
1840/// Returns striding information for a copy/transfer of this region with
1841/// potentially multiple striding levels from outermost to innermost. For an
1842/// n-dimensional region, there can be at most n-1 levels of striding
1843/// successively nested.
1844// TODO: make this work with non-identity layout maps.
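// As a worked example (not taken from a test): copying a 64x32 window out of a
// memref<64x128xf32> yields one stride level {stride = 128, numEltPerStride =
// 32}, i.e., 32 contiguous elements are transferred for every jump of 128
// elements along the outer dimension.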
1845static void getMultiLevelStrides(const MemRefRegion &region,
1846 ArrayRef<int64_t> bufferShape,
1847 SmallVectorImpl<StrideInfo> *strideInfos) {
1848 if (bufferShape.size() <= 1)
1849 return;
1850
1851 int64_t numEltPerStride = 1;
1852 int64_t stride = 1;
1853 for (int d = bufferShape.size() - 1; d >= 1; d--) {
1854 int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1855 stride *= dimSize;
1856 numEltPerStride *= bufferShape[d];
1857 // A stride is needed only if the region has a shorter extent than the
1858 // memref along the dimension *and* has an extent greater than one along the
1859 // next major dimension.
1860 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
      strideInfos->push_back({stride, numEltPerStride});
1862 }
1863 }
1864}
1865
1866/// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
1867/// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
1868/// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
1869/// bound information for the copy loop nest. `fastBufOffsets` contain the
1870/// expressions to be subtracted out from the respective copy loop iterators in
/// order to index the fast buffer. If `isCopyOut` is true, generates a
/// copy-out; otherwise a copy-in. Builder `b` should be set to the point the
/// copy nest is inserted.
///
1875/// The copy-in nest is generated as follows as an example for a 2-d region:
1876/// for x = ...
1877/// for y = ...
1878/// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1879///
1880static AffineForOp
1881generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1882 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1883 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1884 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1885 OpBuilder b) {
1886 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1887 return lbMap.getNumInputs() == lbOperands.size();
1888 }));
1889 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1890 return ubMap.getNumInputs() == ubOperands.size();
1891 }));
1892
1893 unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1894 assert(lbMaps.size() == rank && "wrong number of lb maps");
1895 assert(ubMaps.size() == rank && "wrong number of ub maps");
1896
1897 SmallVector<Value, 4> memIndices;
1898 SmallVector<AffineExpr, 4> fastBufExprs;
1899 SmallVector<Value, 4> fastBufMapOperands;
1900 AffineForOp copyNestRoot;
1901 SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1902 for (unsigned d = 0; d < rank; ++d) {
1903 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1904 ubOperands, ubMaps[d]);
1905 if (d == 0)
1906 copyNestRoot = forOp;
1907
    b = OpBuilder::atBlockTerminator(forOp.getBody());

    auto fastBufOffsetMap =
        AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1912 auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1913
1914 // Construct the subscript for the fast memref being copied into/from:
1915 // x - offset_x.
    fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
                           b.getAffineDimExpr(2 * d));
    fastBufMapOperands.push_back(offset);
    fastBufMapOperands.push_back(forOp.getInductionVar());
    mayBeDeadApplys.push_back(offset);

    // Subscript for the slow memref being copied.
    memIndices.push_back(forOp.getInductionVar());
1924 }
1925
  auto fastBufMap =
      AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1928 fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1929 fastBufMap = simplifyAffineMap(fastBufMap);
1930 canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1931
1932 // Drop any dead affine.applys.
1933 for (auto applyOp : mayBeDeadApplys)
1934 if (applyOp.use_empty())
1935 applyOp.erase();
1936
1937 if (!isCopyOut) {
1938 // Copy in.
1939 auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1940 b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1941 fastBufMapOperands);
1942 return copyNestRoot;
1943 }
1944
1945 // Copy out.
1946 auto load =
1947 b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1948 b.create<AffineStoreOp>(loc, load, memref, memIndices);
1949 return copyNestRoot;
1950}
1951
1952static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1953emitRemarkForBlock(Block &block) {
1954 return block.getParentOp()->emitRemark();
1955}
1956
1957/// Creates a buffer in the faster memory space for the specified memref region;
1958/// generates a copy from the lower memory space to this one, and replaces all
1959/// loads/stores in the block range [`begin', `end') of `block' to load/store
1960/// from that buffer. Returns failure if copies could not be generated due to
1961/// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
1962/// in copyPlacementBlock specify the insertion points where the incoming copies
1963/// and outgoing copies, respectively, should be inserted (the insertion happens
1964/// right before the insertion point). Since `begin` can itself be invalidated
1965/// due to the memref rewriting done from this method, the output argument
1966/// `nBegin` is set to its replacement (set to `begin` if no invalidation
1967/// happens). Since outgoing copies could have been inserted at `end`, the
1968/// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
1969/// size of the fast buffer allocated.
1970static LogicalResult generateCopy(
1971 const MemRefRegion &region, Block *block, Block::iterator begin,
1972 Block::iterator end, Block *copyPlacementBlock,
1973 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1974 const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1975 DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1976 Block::iterator *nBegin, Block::iterator *nEnd) {
1977 *nBegin = begin;
1978 *nEnd = end;
1979
1980 func::FuncOp f = begin->getParentOfType<func::FuncOp>();
1981 OpBuilder topBuilder(f.getBody());
1982 Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1983
1984 *sizeInBytes = 0;
1985
1986 if (begin == end)
1987 return success();
1988
  // Whether the copy-out point is at the end of the block where we are doing
  // explicit copying.
1991 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1992
1993 // Copies for read regions are going to be inserted at 'begin'.
1994 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1995 // Copies for write regions are going to be inserted at 'end'.
1996 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1997 OpBuilder &b = region.isWrite() ? epilogue : prologue;
1998
1999 // Builder to create constants at the top level.
2000 auto func = copyPlacementBlock->getParent()->getParentOfType<func::FuncOp>();
2001 OpBuilder top(func.getBody());
2002
2003 auto loc = region.loc;
2004 auto memref = region.memref;
2005 auto memRefType = cast<MemRefType>(memref.getType());
2006
2007 if (!memRefType.getLayout().isIdentity()) {
2008 LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
2009 return failure();
2010 }
2011
2012 // Indices to use for the copying.
2013 // Indices for the original memref being copied from/to.
2014 SmallVector<Value, 4> memIndices;
2015 // Indices for the faster buffer being copied into/from.
2016 SmallVector<Value, 4> bufIndices;
2017
2018 unsigned rank = memRefType.getRank();
2019 SmallVector<int64_t, 4> fastBufferShape;
2020
2021 // Compute the extents of the buffer.
2022 std::vector<SmallVector<int64_t, 4>> lbs;
2023 SmallVector<int64_t, 8> lbDivisors;
  lbs.reserve(rank);
  std::optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
      &fastBufferShape, &lbs, &lbDivisors);
2027 if (!numElements) {
2028 LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2029 return failure();
2030 }
2031
2032 if (*numElements == 0) {
2033 LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2034 return success();
2035 }
2036
2037 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2038 for (unsigned i = 0; i < rank; ++i)
    region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2040
2041 const FlatAffineValueConstraints *cst = region.getConstraints();
2042 // 'regionSymbols' hold values that this memory region is symbolic/parametric
2043 // on; these typically include loop IVs surrounding the level at which the
2044 // copy generation is being done or other valid symbols in MLIR.
2045 SmallVector<Value, 8> regionSymbols;
  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2047
2048 // Construct the index expressions for the fast memory buffer. The index
2049 // expression for a particular dimension of the fast buffer is obtained by
2050 // subtracting out the lower bound on the original memref's data region
2051 // along the corresponding dimension.
2052
2053 // Index start offsets for faster memory buffer relative to the original.
2054 SmallVector<AffineExpr, 4> fastBufOffsets;
  fastBufOffsets.reserve(rank);
  for (unsigned d = 0; d < rank; d++) {
    assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");

    AffineExpr offset = top.getAffineConstantExpr(0);
    for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
      offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
    assert(lbDivisors[d] > 0);
    offset =
        (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2065
2066 // Set copy start location for this dimension in the lower memory space
2067 // memref.
    if (auto caf = dyn_cast<AffineConstantExpr>(offset)) {
      auto indexVal = caf.getValue();
      if (indexVal == 0) {
        memIndices.push_back(zeroIndex);
      } else {
        memIndices.push_back(
            top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
      }
    } else {
      // The coordinate for the start location is just the lower bound along
      // the corresponding dimension on the memory region (stored in 'offset').
      auto map = AffineMap::get(
          cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset);
      memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
    }
    // The fast buffer is copied into at location zero; addressing is relative.
    bufIndices.push_back(zeroIndex);

    // Record the offsets since they are needed to remap the memory accesses of
    // the original memref further below.
    fastBufOffsets.push_back(offset);
2089 }
2090
2091 // The faster memory space buffer.
2092 Value fastMemRef;
2093
2094 // Check if a buffer was already created.
  bool existingBuf = fastBufferMap.count(memref) > 0;
2096 if (!existingBuf) {
2097 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2098 auto fastMemRefType =
2099 MemRefType::get(fastBufferShape, memRefType.getElementType(),
2100 fastBufferLayout, copyOptions.fastMemorySpace);
2101
2102 // Create the fast memory space buffer just before the 'affine.for'
2103 // operation.
2104 fastMemRef =
2105 prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2106 // Record it.
2107 fastBufferMap[memref] = fastMemRef;
2108 // fastMemRefType is a constant shaped memref.
2109 auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2110 // We don't account for things of unknown size.
2111 *sizeInBytes = maySizeInBytes ? *maySizeInBytes : 0;
2112
2113 LLVM_DEBUG(emitRemarkForBlock(*block)
2114 << "Creating fast buffer of type " << fastMemRefType
2115 << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2116 << " KiB\n");
2117 } else {
2118 // Reuse the one already created.
2119 fastMemRef = fastBufferMap[memref];
2120 }
2121
  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2123
2124 Value dmaStride = nullptr;
2125 Value numEltPerDmaStride = nullptr;
2126 if (copyOptions.generateDma) {
2127 SmallVector<StrideInfo, 4> dmaStrideInfos;
    getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2129
2130 // TODO: use all stride levels once DmaStartOp is extended for
2131 // multi-level strides.
2132 if (dmaStrideInfos.size() > 1) {
2133 LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2134 return failure();
2135 }
2136
2137 if (!dmaStrideInfos.empty()) {
      dmaStride =
          top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
      numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
          loc, dmaStrideInfos[0].numEltPerStride);
2142 }
2143 }
2144
2145 // Record the last operation where we want the memref replacement to end. We
2146 // later do the memref replacement only in [begin, postDomFilter] so
2147 // that the original memref's used in the data movement code themselves don't
2148 // get replaced.
  auto postDomFilter = std::prev(end);

  // Create fully composed affine maps for each memref.
  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2156
2157 if (!copyOptions.generateDma) {
2158 // Point-wise copy generation.
2159 auto copyNest =
2160 generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2161 /*lbOperands=*/regionSymbols, ubMaps,
2162 /*ubOperands=*/regionSymbols, fastBufOffsets,
2163 /*isCopyOut=*/region.isWrite(), b);
2164
2165 // Record this so that we can skip it from yet another copy.
2166 copyNests.insert(copyNest);
2167
2168 // Since new ops are being appended (for copy out's), adjust the end to
2169 // mark end of block range being processed if necessary.
2170 if (region.isWrite() && isCopyOutAtEndOfBlock)
2171 *nEnd = Block::iterator(copyNest.getOperation());
2172 } else {
2173 // DMA generation.
2174 // Create a tag (single element 1-d memref) for the DMA.
2175 auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2176 copyOptions.tagMemorySpace);
2177 auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2178
2179 SmallVector<Value, 4> tagIndices({zeroIndex});
    auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2181 fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2182 if (!region.isWrite()) {
2183 // DMA non-blocking read from original buffer to fast buffer.
2184 b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2185 fastMemRef, bufAffineMap, bufIndices,
2186 tagMemRef, tagAffineMap, tagIndices,
2187 numElementsSSA, dmaStride, numEltPerDmaStride);
2188 } else {
2189 // DMA non-blocking write from fast buffer to the original memref.
2190 auto op = b.create<AffineDmaStartOp>(
2191 loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2192 memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2193 dmaStride, numEltPerDmaStride);
2194 // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2195 // end to mark end of block range being processed.
2196 if (isCopyOutAtEndOfBlock)
2197 *nEnd = Block::iterator(op.getOperation());
2198 }
2199
2200 // Matching DMA wait to block on completion; tag always has a 0 index.
2201 b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2202 numElementsSSA);
2203
2204 // Generate dealloc for the tag.
2205 auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2206 if (*nEnd == end && isCopyOutAtEndOfBlock)
2207 // Since new ops are being appended (for outgoing DMAs), adjust the end to
2208 // mark end of range of the original.
2209 *nEnd = Block::iterator(tagDeallocOp.getOperation());
2210 }
2211
2212 // Generate dealloc for the buffer.
2213 if (!existingBuf) {
2214 auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2215 // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2216 // the fast buffer (since it marks the new end insertion point).
2217 if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2218 *nEnd = Block::iterator(bufDeallocOp.getOperation());
2219 }
2220
2221 // Replace all uses of the old memref with the faster one while remapping
2222 // access indices (subtracting out lower bound offsets for each dimension).
2223 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2224 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2225 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2226 // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2227 // d2, d3 correspond to the original indices (%i, %j).
2228 SmallVector<AffineExpr, 4> remapExprs;
  remapExprs.reserve(rank);
  for (unsigned i = 0; i < rank; i++) {
    // The starting operands of indexRemap will be regionSymbols (the symbols on
    // which the memref region is parametric); then those corresponding to
    // the memref's original indices follow.
    auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
    remapExprs.push_back(dimExpr - fastBufOffsets[i]);
  }
  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
                                   b.getContext());
2239
2240 // Record the begin since it may be invalidated by memref replacement.
2241 Block::iterator prevOfBegin;
2242 bool isBeginAtStartOfBlock = (begin == block->begin());
2243 if (!isBeginAtStartOfBlock)
    prevOfBegin = std::prev(begin);
2245
2246 // *Only* those uses within the range [begin, end) of 'block' are replaced.
2247 (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2248 /*extraIndices=*/{}, indexRemap,
2249 /*extraOperands=*/regionSymbols,
2250 /*symbolOperands=*/{},
2251 /*domOpFilter=*/&*begin,
2252 /*postDomOpFilter=*/&*postDomFilter);
2253
  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2255
2256 return success();
2257}
2258
/// Constructs the memref region to just include the entire memref. Returns
/// false for dynamically shaped memrefs for now. `numParamLoopIVs` is the
/// number of enclosing loop IVs of `op` (starting from the outermost) that the
/// region is parametric on.
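/// E.g., for a load from a memref<128x256xf32>, the constructed region is
/// 0 <= d0 <= 127, 0 <= d1 <= 255 (a sketch of the constraint system built
/// below), with the chosen loop IVs appearing only as symbols.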
2263static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2264 MemRefRegion *region) {
2265 unsigned rank;
2266 if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2267 rank = loadOp.getMemRefType().getRank();
2268 region->memref = loadOp.getMemRef();
2269 region->setWrite(false);
2270 } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2271 rank = storeOp.getMemRefType().getRank();
2272 region->memref = storeOp.getMemRef();
2273 region->setWrite(true);
2274 } else {
2275 assert(false && "expected load or store op");
2276 return false;
2277 }
2278 auto memRefType = cast<MemRefType>(region->memref.getType());
2279 if (!memRefType.hasStaticShape())
2280 return false;
2281
2282 auto *regionCst = region->getConstraints();
2283
2284 // Just get the first numSymbols IVs, which the memref region is parametric
2285 // on.
2286 SmallVector<AffineForOp, 4> ivs;
2287 getAffineForIVs(*op, &ivs);
2288 ivs.resize(numParamLoopIVs);
2289 SmallVector<Value, 4> symbols;
2290 extractForInductionVars(ivs, &symbols);
2291 *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2293
2294 // Memref dim sizes provide the bounds.
2295 for (unsigned d = 0; d < rank; d++) {
2296 auto dimSize = memRefType.getDimSize(d);
2297 assert(dimSize > 0 && "filtered dynamic shapes above");
    regionCst->addBound(BoundType::LB, d, 0);
2299 regionCst->addBound(BoundType::UB, d, dimSize - 1);
2300 }
2301 return true;
2302}
2303
2304LogicalResult
2305mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2306 const AffineCopyOptions &copyOptions,
2307 std::optional<Value> filterMemRef,
2308 DenseSet<Operation *> &copyNests) {
2309 if (begin == end)
2310 return success();
2311
2312 assert(begin->getBlock() == std::prev(end)->getBlock() &&
2313 "Inconsistent block begin/end args");
2314 assert(end != end->getBlock()->end() && "end can't be the block terminator");
2315
2316 Block *block = begin->getBlock();
2317
  // Copies will be generated for this depth, i.e., symbolic in all loops
  // surrounding this block range.
  unsigned copyDepth = getNestingDepth(&*begin);
2321
2322 LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2323 << "\n");
2324 LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2325 LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2326
  // List of memory regions to copy for. We need a map vector to have a
  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
  // since the allocs, for example, are identical except for the SSA id.
2330 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2331 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2332
2333 // Map from original memref's to the fast buffers that their accesses are
2334 // replaced with.
2335 DenseMap<Value, Value> fastBufferMap;
2336
2337 // To check for errors when walking the block.
2338 bool error = false;
2339
2340 // Walk this range of operations to gather all memory regions.
  block->walk(begin, end, [&](Operation *opInst) {
2342 // Gather regions to allocate to buffers in faster memory space.
2343 if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2344 if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) ||
2345 (loadOp.getMemRefType().getMemorySpaceAsInt() !=
2346 copyOptions.slowMemorySpace))
2347 return;
2348 } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2349 if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) ||
2350 storeOp.getMemRefType().getMemorySpaceAsInt() !=
2351 copyOptions.slowMemorySpace)
2352 return;
2353 } else {
2354 // Neither load nor a store op.
2355 return;
2356 }
2357
2358 // Compute the MemRefRegion accessed.
    auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
    if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
                               /*addMemRefDimBounds=*/false))) {
      LLVM_DEBUG(llvm::dbgs()
                 << "Error obtaining memory region: semi-affine maps?\n");
      LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
      if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2366 LLVM_DEBUG(
2367 opInst->emitError("non-constant memref sizes not yet supported"));
2368 error = true;
2369 return;
2370 }
2371 }
2372
2373 // Each memref has a single buffer associated with it irrespective of how
2374 // many load's and store's happen on it.
2375 // TODO: in the future, when regions don't intersect and satisfy
2376 // other properties (based on load/store regions), we could consider
2377 // multiple buffers per memref.
2378
2379 // Add to the appropriate region if it's not already in it, or take a
2380 // bounding box union with the existing one if it's already in there.
2381 // Note that a memref may have both read and write regions - so update the
2382 // region in the other list if one exists (write in case of read and vice
2383 // versa) since there is a single bounding box for a memref across all reads
2384 // and writes that happen on it.
2385
2386 // Attempts to update; returns true if 'region' exists in targetRegions.
2387 auto updateRegion =
2388 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2389 &targetRegions) {
          const auto *const it = targetRegions.find(region->memref);
          if (it == targetRegions.end())
            return false;

          // Perform a union with the existing region.
          if (failed(it->second->unionBoundingBox(*region))) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Memory region bounding box failed; "
                          "over-approximating to the entire memref\n");
            // If the union fails, we will overapproximate.
            if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
              LLVM_DEBUG(opInst->emitError(
                  "non-constant memref sizes not yet supported"));
              error = true;
              return true;
            }
            it->second->getConstraints()->clearAndCopyFrom(
                *region->getConstraints());
          } else {
            // Union was computed and stored in 'it->second': copy to 'region'.
            region->getConstraints()->clearAndCopyFrom(
                *it->second->getConstraints());
2412 }
2413 return true;
2414 };
2415
2416 bool existsInRead = updateRegion(readRegions);
2417 if (error)
2418 return;
2419 bool existsInWrite = updateRegion(writeRegions);
2420 if (error)
2421 return;
2422
2423 // Finally add it to the region list.
2424 if (region->isWrite() && !existsInWrite) {
2425 writeRegions[region->memref] = std::move(region);
2426 } else if (!region->isWrite() && !existsInRead) {
2427 readRegions[region->memref] = std::move(region);
2428 }
2429 });
2430
2431 if (error) {
2432 LLVM_DEBUG(begin->emitError(
2433 "copy generation failed for one or more memref's in this block\n"));
2434 return failure();
2435 }
2436
2437 uint64_t totalCopyBuffersSizeInBytes = 0;
2438 bool ret = true;
2439 auto processRegions =
2440 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2441 &regions) {
2442 for (const auto &regionEntry : regions) {
2443 // For each region, hoist copy in/out past all hoistable
2444 // 'affine.for's.
2445 Block::iterator copyInPlacementStart, copyOutPlacementStart;
2446 Block *copyPlacementBlock;
2447 findHighestBlockForPlacement(
              *regionEntry.second, *block, begin, end, &copyPlacementBlock,
              &copyInPlacementStart, &copyOutPlacementStart);

          uint64_t sizeInBytes;
          Block::iterator nBegin, nEnd;
          LogicalResult iRet = generateCopy(
              *regionEntry.second, block, begin, end, copyPlacementBlock,
              copyInPlacementStart, copyOutPlacementStart, copyOptions,
              fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
          if (succeeded(iRet)) {
            // begin/end could have been invalidated, and need update.
            begin = nBegin;
            end = nEnd;
            totalCopyBuffersSizeInBytes += sizeInBytes;
          }
          ret = ret & succeeded(iRet);
2464 }
2465 };
2466 processRegions(readRegions);
2467 processRegions(writeRegions);
2468
2469 if (!ret) {
2470 LLVM_DEBUG(begin->emitError(
2471 "copy generation failed for one or more memref's in this block\n"));
2472 return failure();
2473 }
2474
2475 // For a range of operations, a note will be emitted at the caller.
2476 AffineForOp forOp;
2477 if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2478 LLVM_DEBUG(forOp.emitRemark()
2479 << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2480 << " KiB of copy buffers in fast memory space for this block");
2481 }
2482
2483 if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
    block->getParentOp()->emitWarning(
        "total size of all copy buffers for this block exceeds fast memory "
        "capacity");
2487 }
2488
2489 return success();
2490}
2491
2492// A convenience version of affineDataCopyGenerate for all ops in the body of
2493// an AffineForOp.
2494LogicalResult mlir::affine::affineDataCopyGenerate(
2495 AffineForOp forOp, const AffineCopyOptions &copyOptions,
2496 std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2497 return affineDataCopyGenerate(forOp.getBody()->begin(),
2498 std::prev(forOp.getBody()->end()), copyOptions,
2499 filterMemRef, copyNests);
2500}
2501
2502LogicalResult mlir::affine::generateCopyForMemRegion(
2503 const MemRefRegion &memrefRegion, Operation *analyzedOp,
2504 const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2505 Block *block = analyzedOp->getBlock();
2506 auto begin = analyzedOp->getIterator();
2507 auto end = std::next(begin);
2508 DenseMap<Value, Value> fastBufferMap;
2509 DenseSet<Operation *> copyNests;
2510
2511 auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2512 copyOptions, fastBufferMap, copyNests,
2513 &result.sizeInBytes, &begin, &end);
2514 if (failed(err))
2515 return err;
2516
  const auto &en = fastBufferMap.find(memrefRegion.memref);
2518 // In some cases (empty loops), no copy generation would have happened.
2519 if (en == fastBufferMap.end())
2520 return failure();
2521 result.alloc = en->second.getDefiningOp();
2522 assert(result.alloc && "fast buffer expected to be locally allocated");
2523 assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2524 result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2525 return success();
2526}
2527
2528/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2529static void
2530gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2531 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
  // Add a new empty level to output if it doesn't already exist.
2533 assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2534 if (currLoopDepth == depthToLoops.size())
2535 depthToLoops.emplace_back();
2536
2537 for (auto &op : *block) {
2538 if (auto forOp = dyn_cast<AffineForOp>(op)) {
2539 depthToLoops[currLoopDepth].push_back(forOp);
2540 gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2541 }
2542 }
2543}
2544
2545/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
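/// For instance (a sketch), for a function containing a nest %i { %j { ... } }
/// and a separate top-level loop %k, this returns depthToLoops[0] = {%i, %k}
/// and depthToLoops[1] = {%j}.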
2546void mlir::affine::gatherLoops(
2547 func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2548 for (auto &block : func)
2549 gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2550
2551 // Remove last loop level from output since it's empty.
2552 if (!depthToLoops.empty()) {
2553 assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2554 depthToLoops.pop_back();
2555 }
2556}
2557
2558AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2559 OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2560 ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2561 SmallVector<Value, 4> lowerOperands(lbOperands);
2562 SmallVector<Value, 4> upperOperands(ubOperands);
2563
  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
  lbMap = removeDuplicateExprs(lbMap);
  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
  canonicalizeMapAndOperands(&ubMap, &upperOperands);
  ubMap = removeDuplicateExprs(ubMap);
2570
2571 return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2572 step);
2573}
2574
2575/// Creates an AffineIfOp that encodes the conditional to choose between
2576/// the constant trip count version and an unknown trip count version of this
2577/// nest of loops. This is used to separate partial and full tiles if `loops`
2578/// has the intra-tile loops. The affine.if op is inserted at the builder
2579/// insertion point of `b`.
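/// The generated op has the rough form below (a sketch; the integer set is
/// derived from the loop bound constraints, and the caller is expected to fill
/// in the two regions):
///
/// ```mlir
/// affine.if #full_tile_guard(...) {
///   // constant trip count (full tile) version
/// } else {
///   // partial tile version
/// }
/// ```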
2580static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2581 OpBuilder b) {
2582 if (loops.empty())
2583 return nullptr;
2584
2585 auto *context = loops[0].getContext();
2586
2587 FlatAffineValueConstraints cst;
2588 SmallVector<Operation *, 8> ops;
  llvm::append_range(ops, loops);
  (void)getIndexSet(ops, &cst);
2591
2592 // Remove constraints that are independent of these loop IVs.
2593 cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2594
2595 // Construct the constraint set representing the guard for full tiles. The
2596 // lower bound (and upper bound) corresponding to the full tile should be
2597 // larger (and resp. smaller) than any other lower (or upper bound).
2598 SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2599 for (auto loop : loops) {
2600 (void)loop;
2601 // TODO: Non-unit stride is not an issue to generalize to.
2602 assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2603 // Mark everything symbols for the purpose of finding a constant diff pair.
2604 cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2605 1);
2606 unsigned fullTileLbPos, fullTileUbPos;
2607 if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2608 /*boundFloorDivisor=*/nullptr,
2609 /*ub=*/nullptr, &fullTileLbPos,
2610 &fullTileUbPos)) {
2611 LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2612 return nullptr;
2613 }
2614
2615 SmallVector<unsigned, 4> lbIndices, ubIndices;
2616 cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2617
2618 auto fLb = cst.getInequality(fullTileLbPos);
2619 auto fUb = cst.getInequality(fullTileUbPos);
2620 fullTileLb.assign(fLb.begin(), fLb.end());
2621 fullTileUb.assign(fUb.begin(), fUb.end());
2622
2623 // Full tile lower bound should be >= than any other lower bound.
2624 for (auto lbIndex : lbIndices)
2625 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2626 cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2627
2628 // Full tile upper bound should be <= any other upper bound.
2629 for (auto ubIndex : ubIndices)
2630 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2631 cst.atIneq(ubIndex, i) -= fullTileUb[i];
2632
2633 cst.removeVar(0);
2634 }
2635
2636 // The previous step leads to all zeros for the full tile lb and ub position
2637 // itself; remove those and any other duplicates / trivial redundancies.
2638 cst.removeTrivialRedundancy();
2639
2640 // Turn everything into dims conservatively since we earlier turned all
2641 // trailing ids past point loop IV into symbols. Some of these could be outer
2642 // loop IVs; we'll canonicalize anyway.
2643 cst.setDimSymbolSeparation(0);
2644
2645 IntegerSet ifCondSet = cst.getAsIntegerSet(context: context);
2646 // ifCondSet can be null if cst was empty -- this can happen if all loops
2647 // in the nest have constant trip counts.
2648 if (!ifCondSet)
2649 return nullptr;
2650
2651 SmallVector<Value, 4> setOperands;
2652 cst.getValues(start: 0, end: cst.getNumDimAndSymbolVars(), values: &setOperands);
2653 canonicalizeSetAndOperands(set: &ifCondSet, operands: &setOperands);
2654 return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2655 /*withElseRegion=*/true);
2656}
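
// Informal reading of the generated guard (a sketch of the semantics, not
// normative; the exact integer set depends on the input bounds): for each
// point loop, the condition holds exactly when the constant-difference
// (full-tile) lower bound is >= every other lower bound and the
// constant-difference upper bound is <= every other upper bound, i.e., the
// point loop runs for its full constant trip count.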

/// Create the full tile loop nest (along with its body).
static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
  fullTileLoops.reserve(inputNest.size());

  // For each loop in the original nest, identify a lower/upper bound pair such
  // that their difference is a constant.
  FlatAffineValueConstraints cst;
  for (auto loop : inputNest) {
    // TODO: straightforward to generalize to a non-unit stride.
    if (loop.getStepAsInt() != 1) {
      LLVM_DEBUG(llvm::dbgs()
                 << "[tile separation] non-unit stride not implemented\n");
      return failure();
    }
    SmallVector<Operation *, 1> loopOp{loop.getOperation()};
    (void)getIndexSet(loopOp, &cst);
    // Mark everything other than this loop's IV as a symbol to obtain a
    // <lb, ub> pair with a constant difference.
    cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
    unsigned lbPos, ubPos;
    if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
                                       /*boundFloorDivisor=*/nullptr,
                                       /*ub=*/nullptr, &lbPos, &ubPos) ||
        lbPos == ubPos) {
      LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
                                 "equalities not yet handled\n");
      return failure();
    }

    // Set all variables as dimensions uniformly since some of those marked as
    // symbols above could be outer loop IVs (corresponding tile space IVs).
    cst.setDimSymbolSeparation(/*newSymbolCount=*/0);

    AffineValueMap lbVmap, ubVmap;
    cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
    cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
    AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
        b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
        ubVmap.getOperands(), ubVmap.getAffineMap());
    b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
    fullTileLoops.push_back(fullTileLoop);
  }

  // Add the body for the full tile loop nest.
  IRMapping operandMap;
  for (const auto &loopEn : llvm::enumerate(inputNest))
    operandMap.map(loopEn.value().getInductionVar(),
                   fullTileLoops[loopEn.index()].getInductionVar());
  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
  for (auto &op : inputNest.back().getBody()->without_terminator())
    b.clone(op, operandMap);
  return success();
}
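
// Informal sketch (illustrative only, not taken from the upstream file): for a
// point loop such as
//   affine.for %i = max #lbs(%ii) to min #ubs(%ii)
// the constant-difference bound pair found above yields a full-tile loop that,
// in the typical tiled nest, looks like
//   affine.for %i = #lb(%ii) to #lb(%ii) + T
// for a constant tile size T, into which the original body is cloned with the
// old induction variables remapped to the new ones.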

LogicalResult
mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
                                SmallVectorImpl<AffineForOp> *fullTileNest) {
  if (inputNest.empty())
    return success();

  auto firstLoop = inputNest[0];

  // Each successive for op has to be nested in the previous one.
  auto prevLoop = firstLoop;
  for (auto loop : inputNest.drop_front(1)) {
    assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
    prevLoop = loop;
  }

  // Create the full tile loop nest.
  SmallVector<AffineForOp, 4> fullTileLoops;
  OpBuilder b(firstLoop);
  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
    if (!fullTileLoops.empty())
      fullTileLoops.front().erase();
    return failure();
  }

  // Create and insert the version select right before the root of the nest.
  b = OpBuilder(firstLoop);
  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
  if (!ifOp) {
    fullTileLoops.front().erase();
    LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
                               "separation condition\n");
    return failure();
  }

  // Move the full tile into the then block.
  Block *thenBlock = ifOp.getThenBlock();
  AffineForOp outermostFullTileLoop = fullTileLoops[0];
  thenBlock->getOperations().splice(
      std::prev(thenBlock->end()),
      outermostFullTileLoop->getBlock()->getOperations(),
      Block::iterator(outermostFullTileLoop));

  // Move the partial tile into the else block. The partial tile is the same as
  // the original loop nest.
  Block *elseBlock = ifOp.getElseBlock();
  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
                                    firstLoop->getBlock()->getOperations(),
                                    Block::iterator(firstLoop));

  if (fullTileNest)
    *fullTileNest = std::move(fullTileLoops);

  return success();
}
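
// Resulting structure (illustrative only; the exact integer set and bounds
// depend on the input nest):
//
//   affine.if #full_tile_guard(...) {
//     // Full-tile nest with constant trip counts.
//     affine.for ... { affine.for ... { <cloned body> } }
//   } else {
//     // Original (partial-tile) nest, unchanged.
//     affine.for ... { affine.for ... { <original body> } }
//   }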

LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
  LogicalResult result(failure());
  SmallVector<AffineForOp> loops;
  getPerfectlyNestedLoops(loops, op);
  if (loops.size() <= 1)
    return success();

  // Look for a band of loops that can be coalesced, i.e. perfectly nested
  // loops with bounds defined above some loop.
  // 1. For each loop, find above which parent loop its operands are
  // defined.
  SmallVector<unsigned> operandsDefinedAbove(loops.size());
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    operandsDefinedAbove[i] = i;
    for (unsigned j = 0; j < i; ++j) {
      if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
        operandsDefinedAbove[i] = j;
        break;
      }
    }
  }

  // 2. Identify bands of loops such that the operands of all of them are
  // defined above the first loop in the band. Traverse the nest bottom-up
  // so that modifications don't invalidate the inner loops.
  for (unsigned end = loops.size(); end > 0; --end) {
    unsigned start = 0;
    for (; start < end - 1; ++start) {
      auto maxPos =
          *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                            std::next(operandsDefinedAbove.begin(), end));
      if (maxPos > start)
        continue;
      assert(maxPos == start &&
             "expected loop bounds to be known at the start of the band");
      auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
      if (succeeded(coalesceLoops(band)))
        result = success();
      break;
    }
    // If a band was found and transformed, keep looking at the loops above
    // the outermost transformed loop.
    if (start != end - 1)
      end = start + 1;
  }
  return result;
}
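
// Illustrative usage (a sketch; the walk below is hypothetical and not part of
// this file):
//
//   funcOp.walk([&](AffineForOp forOp) {
//     // Only attempt coalescing starting from the outermost loop of a nest.
//     if (!isa<AffineForOp>(forOp->getParentOp()))
//       (void)coalescePerfectlyNestedAffineLoops(forOp);
//   });
//
// Coalescing rewrites a perfectly nested band whose bounds are defined above
// the band into a single loop (roughly, iterating over the product of the
// band's trip counts).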