1//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements miscellaneous loop transformation routines.
10//
11//===----------------------------------------------------------------------===//
12
13#include "mlir/Dialect/Affine/LoopUtils.h"
14#include "mlir/Analysis/SliceAnalysis.h"
15#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
16#include "mlir/Dialect/Affine/Analysis/Utils.h"
17#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
18#include "mlir/Dialect/Affine/Utils.h"
19#include "mlir/Dialect/Func/IR/FuncOps.h"
20#include "mlir/Dialect/MemRef/IR/MemRef.h"
21#include "mlir/Dialect/SCF/IR/SCF.h"
22#include "mlir/IR/IRMapping.h"
23#include "mlir/IR/IntegerSet.h"
24#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
25#include "llvm/ADT/MapVector.h"
26#include "llvm/ADT/SmallPtrSet.h"
27#include "llvm/Support/Debug.h"
28#include "llvm/Support/raw_ostream.h"
29#include <optional>
30
31#define DEBUG_TYPE "loop-utils"
32
33using namespace mlir;
34using namespace affine;
35using namespace presburger;
36using llvm::SmallMapVector;
37
/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be the upper bound of the
/// main part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands, or a null map when the trip count can't be expressed as an affine
/// expression.
43static void
44getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
45 AffineMap &cleanupLbMap,
46 SmallVectorImpl<Value> &cleanupLbOperands) {
47 AffineMap tripCountMap;
48 SmallVector<Value, 4> tripCountOperands;
49 getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
50 // Trip count can't be computed.
51 if (!tripCountMap) {
52 cleanupLbMap = AffineMap();
53 return;
54 }
55
56 OpBuilder b(forOp);
57 auto lbMap = forOp.getLowerBoundMap();
58 auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
59 forOp.getLowerBoundOperands());
60
61 // For each upper bound expr, get the range.
62 // Eg: affine.for %i = lb to min (ub1, ub2),
63 // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
64 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
65 // these affine.apply's make up the cleanup loop lower bound.
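  // For example, for 'affine.for %i = 0 to 13 step 2' (trip count 7) unrolled
  // by a factor of 4, tr - tr % 4 = 4, so the cleanup lower bound becomes
  // lb + 4 * step = 8: the main unrolled loop covers [0, 8) and the cleanup
  // loop handles the remaining three iterations.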
66 SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
67 SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
68 int64_t step = forOp.getStepAsInt();
69 for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
    auto tripCountExpr = tripCountMap.getResult(i);
    bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
    auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
                                  tripCountMap.getNumSymbols(), bumpExprs[i]);
    bumpValues[i] =
        b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
76 }
77
78 SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
79 for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
    newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
81
82 cleanupLbOperands.clear();
  cleanupLbOperands.push_back(lb);
  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
                                b.getContext());
  // Simplify the cleanupLbMap + cleanupLbOperands.
  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
91 // Remove any affine.apply's that became dead from the simplification above.
92 for (auto v : bumpValues)
93 if (v.use_empty())
94 v.getDefiningOp()->erase();
95
96 if (lb.use_empty())
97 lb.erase();
98}
99
100/// Helper to replace uses of loop carried values (iter_args) and loop
101/// yield values while promoting single iteration affine.for ops.
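/// For example, for
///   %r = affine.for ... iter_args(%acc = %init) { ...; affine.yield %v }
/// uses of %acc are replaced by %init and uses of %r by %v; this is only valid
/// because the loop is known to execute exactly once.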
102static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
103 // Replace uses of iter arguments with iter operands (initial values).
104 auto iterOperands = forOp.getInits();
105 auto iterArgs = forOp.getRegionIterArgs();
106 for (auto e : llvm::zip(iterOperands, iterArgs))
107 std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
108
109 // Replace uses of loop results with the values yielded by the loop.
110 auto outerResults = forOp.getResults();
111 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
112 for (auto e : llvm::zip(outerResults, innerResults))
113 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
114}
115
116/// Promotes the loop body of a forOp to its containing block if the forOp
117/// was known to have a single iteration.
118LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
119 std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
120 if (!tripCount || *tripCount != 1)
121 return failure();
122
123 // TODO: extend this for arbitrary affine bounds.
124 if (forOp.getLowerBoundMap().getNumResults() != 1)
125 return failure();
126
  // Replace all uses of the IV with its single iteration value.
128 auto iv = forOp.getInductionVar();
129 auto *parentBlock = forOp->getBlock();
130 if (!iv.use_empty()) {
131 if (forOp.hasConstantLowerBound()) {
132 auto func = forOp->getParentOfType<FunctionOpInterface>();
133 OpBuilder builder(forOp->getContext());
134 if (func)
135 builder.setInsertionPointToStart(&func.getFunctionBody().front());
136 else
137 builder.setInsertionPoint(forOp);
138 auto constOp = builder.create<arith::ConstantIndexOp>(
139 forOp.getLoc(), forOp.getConstantLowerBound());
140 iv.replaceAllUsesWith(constOp);
141 } else {
142 auto lbOperands = forOp.getLowerBoundOperands();
143 auto lbMap = forOp.getLowerBoundMap();
144 OpBuilder builder(forOp);
145 if (lbMap == builder.getDimIdentityMap()) {
146 // No need of generating an affine.apply.
147 iv.replaceAllUsesWith(lbOperands[0]);
148 } else {
149 auto affineApplyOp =
150 builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
151 iv.replaceAllUsesWith(affineApplyOp);
152 }
153 }
154 }
155
156 replaceIterArgsAndYieldResults(forOp);
157
158 // Move the loop body operations, except for its terminator, to the loop's
159 // containing block.
160 forOp.getBody()->back().erase();
161 parentBlock->getOperations().splice(Block::iterator(forOp),
162 forOp.getBody()->getOperations());
163 forOp.erase();
164 return success();
165}
166
167/// Generates an affine.for op with the specified lower and upper bounds
168/// while generating the right IV remappings to realize shifts for operations in
169/// its body. The operations that go into the loop body are specified in
170/// opGroupQueue starting from the specified offset, and in that order. The
171/// first element of the pair specifies the shift applied to that group of
172/// operations; the shift is multiplied by the loop step before being applied.
173/// Returns nullptr if the generated loop simplifies to a single iteration one.
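/// For example, with a source loop step of 2 and a group shift of 3, the IV
/// seen by that group of operations is remapped to newIV - 6 via an
/// affine.apply.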
174static AffineForOp generateShiftedLoop(
175 AffineMap lbMap, AffineMap ubMap,
176 const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
177 unsigned offset, AffineForOp srcForOp, OpBuilder b) {
178 auto lbOperands = srcForOp.getLowerBoundOperands();
179 auto ubOperands = srcForOp.getUpperBoundOperands();
180
181 assert(lbMap.getNumInputs() == lbOperands.size());
182 assert(ubMap.getNumInputs() == ubOperands.size());
183
184 auto loopChunk =
185 b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands,
186 ubMap, srcForOp.getStepAsInt());
187 auto loopChunkIV = loopChunk.getInductionVar();
188 auto srcIV = srcForOp.getInductionVar();
189
190 IRMapping operandMap;
191
  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
    uint64_t shift = it.first;
    auto ops = it.second;
    // All operations with the same shift are added with their operands
    // remapped to the results of cloned operations, and with their IV uses
    // remapped. Generate the remapping if the shift is not zero:
    // remappedIV = newIV - shift * step.
200 if (!srcIV.use_empty() && shift != 0) {
201 auto ivRemap = bodyBuilder.create<AffineApplyOp>(
202 srcForOp.getLoc(),
203 bodyBuilder.getSingleDimShiftAffineMap(
204 -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
205 loopChunkIV);
206 operandMap.map(srcIV, ivRemap);
207 } else {
208 operandMap.map(srcIV, loopChunkIV);
209 }
210 for (auto *op : ops)
211 bodyBuilder.clone(*op, operandMap);
  }
213 if (succeeded(promoteIfSingleIteration(loopChunk)))
214 return AffineForOp();
215 return loopChunk;
216}
217
// The skewing of operations with respect to one another can be used, for
// example, to allow overlap of asynchronous operations (such as DMA
// communication) with computation, or just relative shifting of operations
// for better register reuse, locality, or parallelism. As such, the shifts are
// typically expected to be at most of the order of the number of operations.
// This method should not be used as a substitute for loop distribution/fission.
// It uses an algorithm that runs in time linear in the number of operations in
// the body of the for loop (using the 'sweep line' paradigm). This method
// asserts preservation of SSA dominance; the responsibility for checking that,
// as well as for memory-based dependence preservation, rests with the users of
// this method.
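//
// For example, with a two-operation body and shifts = [0, 1], the second
// operation is delayed by one iteration: a prologue covers the first iteration
// with only the first operation, a steady-state loop runs both (the delayed
// operation reading IV - step), and an epilogue covers the last iteration with
// only the second operation.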
229LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
230 ArrayRef<uint64_t> shifts,
231 bool unrollPrologueEpilogue) {
232 assert(forOp.getBody()->getOperations().size() == shifts.size() &&
233 "too few/many shifts");
234 if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
235 return success();
236
237 // If the trip counts aren't constant, we would need versioning and
238 // conditional guards (or context information to prevent such versioning). The
239 // better way to pipeline for such loops is to first tile them and extract
240 // constant trip count "full tiles" before applying this.
241 auto mayBeConstTripCount = getConstantTripCount(forOp);
242 if (!mayBeConstTripCount) {
243 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
244 return success();
245 }
246 uint64_t tripCount = *mayBeConstTripCount;
247
248 assert(isOpwiseShiftValid(forOp, shifts) &&
249 "shifts will lead to an invalid transformation\n");
250
251 int64_t step = forOp.getStepAsInt();
252
253 unsigned numChildOps = shifts.size();
254
255 // Do a linear time (counting) sort for the shifts.
  uint64_t maxShift = *llvm::max_element(shifts);
257 if (maxShift >= numChildOps) {
258 // Large shifts are not the typical use case.
259 forOp.emitWarning("not shifting because shifts are unrealistically large");
260 return success();
261 }
262
263 // An array of operation groups sorted by shift amount; each group has all
264 // operations with the same shift in the order in which they appear in the
265 // body of the 'affine.for' op.
266 std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
267 unsigned pos = 0;
268 for (auto &op : forOp.getBody()->without_terminator()) {
269 auto shift = shifts[pos++];
270 sortedOpGroups[shift].push_back(&op);
271 }
272
273 // Unless the shifts have a specific pattern (which actually would be the
274 // common use case), prologue and epilogue are not meaningfully defined.
275 // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
276 // loop generated as the prologue and the last as epilogue and unroll these
277 // fully.
278 AffineForOp prologue, epilogue;
279
280 // Do a sweep over the sorted shifts while storing open groups in a
281 // vector, and generating loop portions as necessary during the sweep. A block
282 // of operations is paired with its shift.
283 std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
284
285 auto origLbMap = forOp.getLowerBoundMap();
286 uint64_t lbShift = 0;
287 OpBuilder b(forOp);
288 for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
289 // If nothing is shifted by d, continue.
290 if (sortedOpGroups[d].empty())
291 continue;
292 if (!opGroupQueue.empty()) {
293 assert(d > 0 &&
294 "Queue expected to be empty when the first block is found");
295 // The interval for which the loop needs to be generated here is:
296 // [lbShift, min(lbShift + tripCount, d)) and the body of the
297 // loop needs to have all operations in opQueue in that order.
298 AffineForOp res;
299 if (lbShift + tripCount * step < d * step) {
300 res = generateShiftedLoop(
301 b.getShiftedAffineMap(origLbMap, lbShift),
302 b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
303 opGroupQueue, /*offset=*/0, forOp, b);
304 // Entire loop for the queued op groups generated, empty it.
305 opGroupQueue.clear();
306 lbShift += tripCount * step;
307 } else {
308 res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
309 b.getShiftedAffineMap(origLbMap, d),
310 opGroupQueue, /*offset=*/0, forOp, b);
311 lbShift = d * step;
312 }
313
314 if (res) {
315 // Simplify/canonicalize the affine.for.
316 RewritePatternSet patterns(res.getContext());
317 AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
318 bool erased;
319 (void)applyOpPatternsGreedily(
320 res.getOperation(), std::move(patterns),
321 GreedyRewriteConfig().setStrictness(
322 GreedyRewriteStrictness::ExistingAndNewOps),
323 /*changed=*/nullptr, &erased);
324 if (!erased && !prologue)
325 prologue = res;
326 if (!erased)
327 epilogue = res;
328 }
329 } else {
330 // Start of first interval.
331 lbShift = d * step;
332 }
333 // Augment the list of operations that get into the current open interval.
    opGroupQueue.emplace_back(d, sortedOpGroups[d]);
335 }
336
337 // Those operations groups left in the queue now need to be processed (FIFO)
338 // and their loops completed.
339 for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
340 uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
341 epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
342 b.getShiftedAffineMap(origLbMap, ubShift),
343 opGroupQueue, /*offset=*/i, forOp, b);
344 lbShift = ubShift;
345 if (!prologue)
346 prologue = epilogue;
347 }
348
349 // Erase the original for op.
350 forOp.erase();
351
  if (unrollPrologueEpilogue && prologue)
    (void)loopUnrollFull(prologue);
  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
    (void)loopUnrollFull(epilogue);
356
357 return success();
358}
359
360/// Checks whether a loop nest is hyper-rectangular or not.
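/// For example, a triangular nest such as
///   affine.for %i = 0 to 10 { affine.for %j = 0 to %i { ... } }
/// is not hyper-rectangular since the inner extent depends on the outer IV.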
361static LogicalResult
362checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
363 FlatAffineValueConstraints cst;
364 SmallVector<Operation *, 8> ops(input.begin(), input.end());
365 // 0-d or 1-d is trivially hyper-rectangular.
366 if (input.size() <= 1)
367 return success();
  if (failed(getIndexSet(ops, &cst))) {
    LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
    return failure();
  }
  if (!cst.isHyperRectangular(0, input.size())) {
373 LLVM_DEBUG(llvm::dbgs()
374 << "Non-hyperrectangular nests not supported for tiling!\n");
375 return failure();
376 }
377 return success();
378}
379
380/// Check if the input nest is supported for tiling and whether tiling would be
381/// legal or not.
382template <typename t>
383static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384 ArrayRef<t> tileSizes) {
385 assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386
387 if (llvm::any_of(input,
388 [](AffineForOp op) { return op.getNumResults() > 0; })) {
389 LLVM_DEBUG(llvm::dbgs()
390 << "Cannot tile nest where a loop has yield values\n");
391 return failure();
392 }
393
394 // Check if the supplied `for` ops are all successively nested.
  if (!isPerfectlyNested(input)) {
396 LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
397 return failure();
398 }
399
400 // TODO: handle non hyper-rectangular spaces.
  if (failed(checkIfHyperRectangular(input)))
402 return failure();
403
404 return success();
405}
406
407/// Move the loop body of AffineForOp 'src' from 'src' into the specified
408/// location in destination's body, ignoring the terminator.
409static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
410 Block::iterator loc) {
411 auto &ops = src.getBody()->getOperations();
412 dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
413 std::prev(ops.end()));
414}
415
416/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
417/// body.
418static void moveLoopBody(AffineForOp src, AffineForOp dest) {
419 moveLoopBodyImpl(src, dest, dest.getBody()->begin());
420}
421
/// Constructs a tiled loop nest, without setting the loop bounds, and moves the
/// body of the original loop nest to the tiled loop nest.
424static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
425 AffineForOp rootAffineForOp, unsigned width,
426 MutableArrayRef<AffineForOp> tiledLoops) {
427 Location loc = rootAffineForOp.getLoc();
428
  // Tracks the outermost loop as we successively add new loops around it.
430 Operation *topLoop = rootAffineForOp.getOperation();
431 AffineForOp innermostPointLoop;
432
433 // Add intra-tile (or point) loops.
434 for (unsigned i = 0; i < width; i++) {
435 OpBuilder b(topLoop);
436 // Loop bounds will be set later.
437 AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
438 pointLoop.getBody()->getOperations().splice(
439 pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
440 topLoop);
441 tiledLoops[2 * width - 1 - i] = pointLoop;
442 topLoop = pointLoop.getOperation();
443 if (i == 0)
444 innermostPointLoop = pointLoop;
445 }
446
  // Add tile space loops.
448 for (unsigned i = width; i < 2 * width; i++) {
449 OpBuilder b(topLoop);
450 // Loop bounds will be set later.
451 AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
452 tileSpaceLoop.getBody()->getOperations().splice(
453 tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
454 topLoop);
455 tiledLoops[2 * width - i - 1] = tileSpaceLoop;
456 topLoop = tileSpaceLoop.getOperation();
457 }
458
459 // Move the loop body of the original nest to the new one.
460 moveLoopBody(origLoops.back(), innermostPointLoop);
461}
462
463/// Set lower and upper bounds of intra-tile loops for parametric tiling.
464// TODO: Handle non-constant lower bounds.
465static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
466 AffineForOp newInterTileLoop,
467 AffineForOp newIntraTileLoop,
468 Value tileSize) {
469 // The lower bound for the intra-tile loop is represented by an affine map
470 // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
471 // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
472 // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
473 // of the corresponding inter-tile loop, %t0 is the corresponding tiling
474 // parameter, %origlb is lower bound and %origLoopStep is the loop step of the
475 // corresponding inter-tile loop.
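  //
  // For example, with %origlb = 0 and an original step of 1, the intra-tile
  // loop runs from %i * %t0 to %i * %t0 + %t0.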
476
477 assert(origLoop.hasConstantLowerBound() &&
478 "expected input loops to have constant lower bound.");
479
480 // Get lower bound of original loop as an affine expression.
481 AffineExpr origLowerBoundExpr;
  origLowerBoundExpr =
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());
484
485 // Add dim operands from original lower/upper bound.
486 SmallVector<Value, 4> lbOperands, ubOperands;
487 AffineBound lb = origLoop.getLowerBound();
488 AffineBound ub = origLoop.getUpperBound();
  lbOperands.reserve(lb.getNumOperands() + 2);
  ubOperands.reserve(ub.getNumOperands() + 2);
  AffineMap origLbMap = lb.getMap();
  AffineMap origUbMap = ub.getMap();
  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
    lbOperands.push_back(lb.getOperand(j));
  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(j));

  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
  // IV.
  lbOperands.push_back(newInterTileLoop.getInductionVar());
  ubOperands.push_back(newInterTileLoop.getInductionVar());

  // Get the loop IV as an affine expression for the lower/upper bound. The
  // size of lb/ubOperands is guaranteed to be at least one.
  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
507
508 // Add symbol operands from original lower/upper bound.
509 for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
    lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

  // Add a new symbol operand which is the tile size for this loop.
  lbOperands.push_back(tileSize);
  ubOperands.push_back(tileSize);

  SmallVector<AffineExpr, 4> lbBoundExprs;
  SmallVector<AffineExpr, 4> ubBoundExprs;
  lbBoundExprs.reserve(origLbMap.getNumResults());
  ubBoundExprs.reserve(origUbMap.getNumResults());

  // Get the tiling parameter as an affine expression for the lb/ub.
  AffineExpr lbTileParameter =
      b.getAffineSymbolExpr(origLbMap.getNumSymbols());
  AffineExpr ubTileParameter =
      b.getAffineSymbolExpr(origUbMap.getNumSymbols());

  // Insert lb as ((inter-tile loop IV - origlb) * tilingParameter) + origlb.
  lbBoundExprs.push_back(
      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
      origLowerBoundExpr);

  // Get the origLoopStep as an affine expression.
  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());

  // Insert ub as ((inter-tile loop IV - origlb) * tilingParameter) +
  // (tilingParameter * origLoopStep) + origlb.
  ubBoundExprs.push_back(
      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
      (ubTileParameter * origLoopStep) + origLowerBoundExpr);

  ubBoundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());

  AffineMap lbMap =
      AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
                     lbBoundExprs, b.getContext());
  newIntraTileLoop.setLowerBound(lbOperands, lbMap);

  AffineMap ubMap =
      AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
                     ubBoundExprs, b.getContext());
552 newIntraTileLoop.setUpperBound(ubOperands, ubMap);
553
554 // Original loop step must be preserved.
555 newIntraTileLoop.setStep(origLoop.getStepAsInt());
556}
557
558/// Set lower and upper bounds of inter-tile loops for parametric tiling.
559// TODO: Handle non-constant lower bounds.
560static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
561 AffineForOp newLoop, Value tileSize) {
562 OperandRange newLbOperands = origLoop.getLowerBoundOperands();
563
564 // The lower bounds for inter-tile loops are same as the corresponding lower
565 // bounds of original loops.
566 newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
567
  // The new upper bound map for inter-tile loops, assuming constant lower
  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
  // originalLowerBound), tiling parameter), where the tiling parameter is the
  // respective tile size for that loop. For example, if the original ubmap was
  // ()->(1024), the new map will be
  // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
  // Therefore a new symbol operand is inserted in the map and the result
  // expression is overwritten.
576
577 assert(origLoop.hasConstantLowerBound() &&
578 "expected input loops to have constant lower bound.");
579
580 // Get lower bound of original loop as an affine expression.
581 AffineExpr origLowerBoundExpr;
  origLowerBoundExpr =
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());
584
585 // Add dim operands from original upper bound.
586 SmallVector<Value, 4> ubOperands;
587 AffineBound ub = origLoop.getUpperBound();
  ubOperands.reserve(ub.getNumOperands() + 1);
  AffineMap origUbMap = ub.getMap();
  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(j));

  // Add symbol operands from the original upper bound.
  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

  // Add a new symbol operand which is the tile size for this loop.
  ubOperands.push_back(tileSize);
599
600 // Get tiling parameter as an affine expression.
  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());

  SmallVector<AffineExpr, 4> boundExprs;
  boundExprs.reserve(origUbMap.getNumResults());
605 int64_t origUpperBound;
606 AffineExpr origUpperBoundExpr;
607
608 // If upper bound for the original loop is constant, then the constant can
609 // be obtained as an affine expression straight away.
610 if (origLoop.hasConstantUpperBound()) {
611 origUpperBound = origLoop.getConstantUpperBound();
612
613 // Get original constant upper bound as an affine expression.
    origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);

    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
  } else {
    // If the upper bound of the original loop is not constant, then two cases
    // are possible, although their handling is the same: 1.) The result of
624 // ubmap has only one result expression. For e.g.
625 // affine.for %i = 5 to %ub
626 //
627 // A symbol operand is added which represents the tiling parameter. The
628 // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
629 // where 's0' is the original upper bound and 's1' is the tiling
630 // parameter. 2.) When ubMap has more than one result expression. For e.g.
631 // #map0 = affine_map<()[s0, s1] -> (s0, s1)
632 // affine.for %i = 5 to min #map0()[%s0, %s1]
633 //
634 // A symbol operand is added which represents the tiling parameter. The
635 // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
636 // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
637
638 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
639 // originalLowerBound), tilingParameter).
640 for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
641 boundExprs.push_back(
642 origLowerBoundExpr +
643 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
644 }
645
  AffineMap ubMap =
      AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
                     boundExprs, b.getContext());
649 newLoop.setUpperBound(ubOperands, ubMap);
650
651 // Original loop step must be preserved.
652 newLoop.setStep(origLoop.getStepAsInt());
653}
654
655/// Constructs and sets new loop bounds after tiling for the case of
656/// hyper-rectangular index sets, where the bounds of one dimension do not
657/// depend on other dimensions and tiling parameters are captured from SSA
658/// values. Bounds of each dimension can thus be treated independently,
659/// and deriving the new bounds is much simpler and faster than for the case of
660/// tiling arbitrary polyhedral shapes.
661static void constructParametricallyTiledIndexSetHyperRect(
662 MutableArrayRef<AffineForOp> origLoops,
663 MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
  assert(!origLoops.empty() && "expected at least one loop in band");
665 assert(origLoops.size() == tileSizes.size() &&
666 "expected tiling parameter for each loop in band.");
667
668 OpBuilder b(origLoops[0].getOperation());
669 unsigned width = origLoops.size();
670
671 // Set bounds for tile space loops.
672 for (unsigned i = 0; i < width; ++i) {
673 setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
674 }
675
676 // Set bounds for intra-tile loops.
677 for (unsigned i = 0; i < width; ++i) {
678 setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
679 newLoops[i + width], tileSizes[i]);
680 }
681}
682
683/// Constructs and sets new loop bounds after tiling for the case of
684/// hyper-rectangular index sets, where the bounds of one dimension do not
685/// depend on other dimensions. Bounds of each dimension can thus be treated
686/// independently, and deriving the new bounds is much simpler and faster
687/// than for the case of tiling arbitrary polyhedral shapes.
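/// For example, tiling 'affine.for %i = 0 to %N' by 32 produces a tile-space
/// loop with the original bounds and step 32, and an intra-tile loop running
/// from the tile-space IV %i to min(%i + 32, %N) with the original step.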
688static void
689constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
690 MutableArrayRef<AffineForOp> newLoops,
691 ArrayRef<unsigned> tileSizes) {
692 assert(!origLoops.empty());
693 assert(origLoops.size() == tileSizes.size());
694
695 OpBuilder b(origLoops[0].getOperation());
696 unsigned width = origLoops.size();
697
698 // Bounds for tile space loops.
699 for (unsigned i = 0; i < width; i++) {
700 OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
701 OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
702 newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
703 newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
704 // If the step size of original loop is x and tileSize is y then after
705 // tiling the tile space loops' step size becomes x*y.
706 newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
707 }
708 // Bounds for intra-tile loops.
709 for (unsigned i = 0; i < width; i++) {
710 int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
711 std::optional<uint64_t> mayBeConstantCount =
712 getConstantTripCount(origLoops[i]);
713 // The lower bound is just the tile-space loop.
714 AffineMap lbMap = b.getDimIdentityMap();
715 newLoops[width + i].setLowerBound(
716 /*operands=*/newLoops[i].getInductionVar(), lbMap);
    // The step size of intra-tile loops is just the original loops' step size.
718 newLoops[width + i].setStep(origLoops[i].getStepAsInt());
719
720 // Set the upper bound.
721 if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
722 // Trip count is less than the tile size: upper bound is lower bound +
723 // trip count * stepSize.
      AffineMap ubMap = b.getSingleDimShiftAffineMap(
          *mayBeConstantCount * origLoops[i].getStepAsInt());
726 newLoops[width + i].setUpperBound(
727 /*operands=*/newLoops[i].getInductionVar(), ubMap);
728 } else if (largestDiv % tileSizes[i] != 0) {
729 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
730 // Construct the upper bound map; the operands are the original operands
731 // with 'i' (tile-space loop) appended to it. The new upper bound map is
732 // the original one with an additional expression i + tileSize * stepSize
733 // appended.
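      // For example, if the original upper bound map was ()[s0] -> (s0) and
      // tileSize * stepSize is 32, the new map is (d0)[s0] -> (d0 + 32, s0),
      // with the tile-space IV appended as the trailing dim operand.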
734
735 // Add dim operands from original upper bound.
736 SmallVector<Value, 4> ubOperands;
737 AffineBound ub = origLoops[i].getUpperBound();
      ubOperands.reserve(ub.getNumOperands() + 1);
      AffineMap origUbMap = ub.getMap();
      for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(j));

      // Add a dim operand for the new loop upper bound.
      ubOperands.push_back(newLoops[i].getInductionVar());

      // Add symbol operands from the original upper bound.
      for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

      SmallVector<AffineExpr, 4> boundExprs;
      boundExprs.reserve(1 + origUbMap.getNumResults());
      AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
      // The new upper bound map is the original one with an additional
      // expression i + tileSize * stepSize (of the original loop) appended.
      boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
      boundExprs.append(origUbMap.getResults().begin(),
                        origUbMap.getResults().end());
      AffineMap ubMap =
          AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
                         boundExprs, b.getContext());
      newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
    } else {
      // No need of the min expression.
      AffineExpr dim = b.getAffineDimExpr(0);
765 AffineMap ubMap = AffineMap::get(
766 1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt());
767 newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
768 }
769 }
770}
771
772LogicalResult
773mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
774 ArrayRef<unsigned> tileSizes,
775 SmallVectorImpl<AffineForOp> *tiledNest) {
776 if (input.empty())
777 return success();
778
  if (failed(performPreTilingChecks(input, tileSizes)))
780 return failure();
781
782 MutableArrayRef<AffineForOp> origLoops = input;
783 AffineForOp rootAffineForOp = origLoops[0];
784
785 // Note that width is at least one since the band isn't empty.
786 unsigned width = input.size();
787 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
788
789 // Construct a tiled loop nest without setting their bounds. Bounds are
790 // set later.
791 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
792
793 SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);
795
796 // Set loop bounds for the tiled loop nest.
797 constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
798
799 // Replace original IVs with intra-tile loop IVs.
800 for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
802
803 // Erase the old loop nest.
804 rootAffineForOp.erase();
805
806 if (tiledNest)
807 *tiledNest = std::move(tiledLoops);
808
809 return success();
810}
811
812/// Tiles the specified band of perfectly nested loops creating tile-space
813/// loops and intra-tile loops, using SSA values as tiling parameters. A band
814/// is a contiguous set of loops.
815LogicalResult mlir::affine::tilePerfectlyNestedParametric(
816 MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
817 SmallVectorImpl<AffineForOp> *tiledNest) {
818 if (input.empty())
819 return success();
820
  if (failed(performPreTilingChecks(input, tileSizes)))
822 return failure();
823
824 MutableArrayRef<AffineForOp> origLoops = input;
825 AffineForOp rootAffineForOp = origLoops[0];
826 unsigned width = input.size();
827 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
828
829 // Construct a tiled loop nest without setting their bounds. Bounds are
830 // set later.
831 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
832
833 SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);
835
836 // Set loop bounds for the tiled loop nest.
837 constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
838 tileSizes);
839
840 // Replace original IVs with intra-tile loop IVs.
841 for (unsigned i = 0; i < width; i++)
842 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
843
844 // Erase the old loop nest.
845 rootAffineForOp.erase();
846
847 if (tiledNest)
848 *tiledNest = std::move(tiledLoops);
849
850 return success();
851}
852
/// Collects the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested in this sequence iff the first op in
/// its body is another AffineForOp and the second op is the terminator.
857void mlir::affine::getPerfectlyNestedLoops(
858 SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
859 for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
860 nestedLoops.push_back(root);
861 Block &body = root.getRegion().front();
    if (body.begin() != std::prev(body.end(), 2))
863 return;
864
865 root = dyn_cast<AffineForOp>(&body.front());
866 if (!root)
867 return;
868 }
869}
870
871/// Identify valid and profitable bands of loops to tile. This is currently just
872/// a temporary placeholder to test the mechanics of tiled code generation.
873/// Returns all maximal outermost perfect loop nests to tile.
874void mlir::affine::getTileableBands(
875 func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
876 // Get maximal perfect nest of 'affine.for' insts starting from root
877 // (inclusive).
878 for (AffineForOp forOp : f.getOps<AffineForOp>()) {
879 SmallVector<AffineForOp, 6> band;
880 getPerfectlyNestedLoops(band, forOp);
881 bands->push_back(band);
882 }
883}
884
885/// Unrolls this loop completely.
886LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
887 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
888 if (mayBeConstantTripCount.has_value()) {
889 uint64_t tripCount = *mayBeConstantTripCount;
890 if (tripCount == 0)
891 return success();
892 if (tripCount == 1)
893 return promoteIfSingleIteration(forOp);
894 return loopUnrollByFactor(forOp, tripCount);
895 }
896 return failure();
897}
898
899/// Unrolls this loop by the specified factor or by the trip count (if constant)
900/// whichever is lower.
901LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
902 uint64_t unrollFactor) {
903 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
904 if (mayBeConstantTripCount.has_value() &&
905 *mayBeConstantTripCount < unrollFactor)
906 return loopUnrollByFactor(forOp, *mayBeConstantTripCount);
907 return loopUnrollByFactor(forOp, unrollFactor);
908}
909
/// Generates unrolled copies of the loop body in 'loopBodyBlock', whose
/// induction variable is 'forOpIV', by 'unrollFactor', calling 'ivRemapFn' to
/// remap 'forOpIV' for each unrolled body. If specified, annotates the ops in
/// each unrolled iteration using 'annotateFn'.
914static void generateUnrolledLoop(
915 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
916 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
917 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
918 ValueRange iterArgs, ValueRange yieldedValues) {
919 // Builder to insert unrolled bodies just before the terminator of the body of
920 // 'forOp'.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
922
923 constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
924 if (!annotateFn)
925 annotateFn = defaultAnnotateFn;
926
927 // Keep a pointer to the last non-terminator operation in the original block
928 // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
930
931 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
932 SmallVector<Value, 4> lastYielded(yieldedValues);
933
934 for (unsigned i = 1; i < unrollFactor; i++) {
935 IRMapping operandMap;
936
937 // Prepare operand map.
    operandMap.map(iterArgs, lastYielded);
939
940 // If the induction variable is used, create a remapping to the value for
941 // this unrolled instance.
942 if (!forOpIV.use_empty()) {
943 Value ivUnroll = ivRemapFn(i, forOpIV, builder);
      operandMap.map(forOpIV, ivUnroll);
945 }
946
947 // Clone the original body of 'forOp'.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
950 annotateFn(i, clonedOp, builder);
951 }
952
953 // Update yielded values. If the yielded value is defined outside the
954 // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
955 // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
956 // value with the clone corresponding to the yielded value.
957 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
958 Operation *defOp = yieldedValues[i].getDefiningOp();
959 if (defOp && defOp->getBlock() == loopBodyBlock)
        lastYielded[i] = operandMap.lookup(yieldedValues[i]);
961 }
962 }
963
964 // Make sure we annotate the Ops in the original body. We do this last so that
965 // any annotations are not copied into the cloned Ops above.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
967 annotateFn(0, &*it, builder);
968
969 // Update operands of the yield statement.
970 loopBodyBlock->getTerminator()->setOperands(lastYielded);
971}
972
973/// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
974/// count is not a multiple of `unrollFactor`.
975static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
976 uint64_t unrollFactor) {
977 // Insert the cleanup loop right after 'forOp'.
  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
979 auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
980
981 // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
982 // and produce results for the original users of `forOp` results.
983 auto results = forOp.getResults();
984 auto cleanupResults = cleanupForOp.getResults();
985 auto cleanupIterOperands = cleanupForOp.getInits();
986
987 for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
988 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
989 cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
990 }
991
992 AffineMap cleanupMap;
993 SmallVector<Value, 4> cleanupOperands;
994 getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
995 if (!cleanupMap)
996 return failure();
997
998 cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
999 // Promote the loop body up if this has turned into a single iteration loop.
1000 (void)promoteIfSingleIteration(cleanupForOp);
1001
1002 // Adjust upper bound of the original loop; this is the same as the lower
1003 // bound of the cleanup loop.
1004 forOp.setUpperBound(cleanupOperands, cleanupMap);
1005 return success();
1006}
1007
1008/// Unrolls this loop by the specified factor. Returns success if the loop
1009/// is successfully unrolled.
1010LogicalResult mlir::affine::loopUnrollByFactor(
1011 AffineForOp forOp, uint64_t unrollFactor,
1012 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1013 bool cleanUpUnroll) {
1014 assert(unrollFactor > 0 && "unroll factor should be positive");
1015
1016 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1017 if (unrollFactor == 1) {
1018 if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1019 failed(promoteIfSingleIteration(forOp)))
1020 return failure();
1021 return success();
1022 }
1023
1024 // Nothing in the loop body other than the terminator.
1025 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1026 return success();
1027
1028 // If the trip count is lower than the unroll factor, no unrolled body.
1029 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1030 if (cleanUpUnroll) {
1031 // Unroll the cleanup loop if cleanUpUnroll is specified.
1032 return loopUnrollFull(forOp);
1033 }
1034
1035 return failure();
1036 }
1037
1038 // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1039 if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1040 // Loops where the lower bound is a max expression or the upper bound is
1041 // a min expression and the trip count doesn't divide the unroll factor
1042 // can't be unrolled since the lower bound of the cleanup loop in such cases
1043 // cannot be expressed as an affine function or a max over affine functions.
1044 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1045 forOp.getUpperBoundMap().getNumResults() != 1)
1046 return failure();
1047 if (cleanUpUnroll)
1048 // Force unroll including cleanup loop
1049 return loopUnrollFull(forOp);
1050 if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1051 assert(false && "cleanup loop lower bound map for single result lower "
1052 "and upper bound maps can always be determined");
1053 }
1054
1055 ValueRange iterArgs(forOp.getRegionIterArgs());
1056 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1057
1058 // Scale the step of loop being unrolled by unroll factor.
1059 int64_t step = forOp.getStepAsInt();
1060 forOp.setStep(step * unrollFactor);
1061 generateUnrolledLoop(
1062 forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1063 [&](unsigned i, Value iv, OpBuilder b) {
1064 // iv' = iv + i * step
        auto d0 = b.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1067 return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1068 },
1069 /*annotateFn=*/annotateFn,
1070 /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1071
1072 // Promote the loop body up if this has turned into a single iteration loop.
1073 (void)promoteIfSingleIteration(forOp);
1074 return success();
1075}
1076
1077LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1078 uint64_t unrollJamFactor) {
1079 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1080 if (mayBeConstantTripCount.has_value() &&
1081 *mayBeConstantTripCount < unrollJamFactor)
1082 return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount);
1083 return loopUnrollJamByFactor(forOp, unrollJamFactor);
1084}
1085
1086/// Check if all control operands of all loops are defined outside of `forOp`
1087/// and return false if not.
1088static bool areInnerBoundsInvariant(AffineForOp forOp) {
1089 auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1090 for (auto controlOperand : aForOp.getControlOperands()) {
1091 if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1092 return WalkResult::interrupt();
1093 }
1094 return WalkResult::advance();
1095 });
1096 return !walkResult.wasInterrupted();
1097}
1098
1099/// Unrolls and jams this loop by the specified factor.
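/// With an unroll-jam factor of 2, the straight-line portions of the body (and
/// of any inner loop bodies) are cloned once with the outer IV remapped to
/// iv + step, so the copies share the existing inner loops instead of creating
/// a second copy of each inner loop as plain unrolling would.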
1100LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1101 uint64_t unrollJamFactor) {
1102 assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1103
1104 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1105 if (unrollJamFactor == 1) {
1106 if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1107 failed(promoteIfSingleIteration(forOp)))
1108 return failure();
1109 return success();
1110 }
1111
1112 // Nothing in the loop body other than the terminator.
1113 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1114 return success();
1115
1116 // If the trip count is lower than the unroll jam factor, no unroll jam.
1117 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1118 LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1119 return failure();
1120 }
1121
1122 // If any control operand of any inner loop of `forOp` is defined within
1123 // `forOp`, no unroll jam.
1124 if (!areInnerBoundsInvariant(forOp))
1125 return failure();
1126
1127 // Gather all sub-blocks to jam upon the loop being unrolled.
1128 JamBlockGatherer<AffineForOp> jbg;
1129 jbg.walk(forOp);
1130 auto &subBlocks = jbg.subBlocks;
1131
1132 // Collect loops with iter_args.
1133 SmallVector<AffineForOp, 4> loopsWithIterArgs;
1134 forOp.walk([&](AffineForOp aForOp) {
1135 if (aForOp.getNumIterOperands() > 0)
1136 loopsWithIterArgs.push_back(aForOp);
1137 });
1138
1139 // Get supported reductions to be used for creating reduction ops at the end.
1140 SmallVector<LoopReduction> reductions;
1141 if (forOp.getNumIterOperands() > 0)
1142 getSupportedReductions(forOp, reductions);
1143
1144 // Generate the cleanup loop if trip count isn't a multiple of
1145 // unrollJamFactor.
1146 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1147 // Loops where the lower bound is a max expression or the upper bound is
1148 // a min expression and the trip count doesn't divide the unroll factor
1149 // can't be unrolled since the lower bound of the cleanup loop in such cases
1150 // cannot be expressed as an affine function or a max over affine functions.
1151 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1152 forOp.getUpperBoundMap().getNumResults() != 1)
1153 return failure();
1154 if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1155 assert(false && "cleanup loop lower bound map for single result lower "
1156 "and upper bound maps can always be determined");
1157 }
1158
1159 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1160 // iteration. There are (`unrollJamFactor` - 1) iterations.
1161 SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1162
1163 // For any loop with iter_args, replace it with a new loop that has
1164 // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1165 // operands.
1166 SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1167 IRRewriter rewriter(forOp.getContext());
1168 for (AffineForOp oldForOp : loopsWithIterArgs) {
1169 SmallVector<Value> dupIterOperands, dupYieldOperands;
1170 ValueRange oldIterOperands = oldForOp.getInits();
1171 ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1172 ValueRange oldYieldOperands =
1173 cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1174 // Get additional iterOperands, iterArgs, and yield operands. We will
1175 // fix iterOperands and yield operands after cloning of sub-blocks.
1176 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1177 dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1178 dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1179 }
1180 // Create a new loop with additional iterOperands, iter_args and yield
1181 // operands. This new loop will take the loop body of the original loop.
1182 bool forOpReplaced = oldForOp == forOp;
1183 AffineForOp newForOp =
1184 cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields(
1185 rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1186 [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1187 return dupYieldOperands;
1188 }));
1189 newLoopsWithIterArgs.push_back(newForOp);
1190 // `forOp` has been replaced with a new loop.
1191 if (forOpReplaced)
1192 forOp = newForOp;
1193 // Update `operandMaps` for `newForOp` iterArgs and results.
1194 ValueRange newIterArgs = newForOp.getRegionIterArgs();
1195 unsigned oldNumIterArgs = oldIterArgs.size();
1196 ValueRange newResults = newForOp.getResults();
1197 unsigned oldNumResults = newResults.size() / unrollJamFactor;
1198 assert(oldNumIterArgs == oldNumResults &&
1199 "oldNumIterArgs must be the same as oldNumResults");
1200 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1201 for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1202 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1203 // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1204 // to those in the `i`th new set.
1205 operandMaps[i - 1].map(newIterArgs[j],
1206 newIterArgs[i * oldNumIterArgs + j]);
1207 operandMaps[i - 1].map(newResults[j],
1208 newResults[i * oldNumResults + j]);
1209 }
1210 }
1211 }
1212
1213 // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1214 int64_t step = forOp.getStepAsInt();
1215 forOp.setStep(step * unrollJamFactor);
1216
1217 auto forOpIV = forOp.getInductionVar();
1218 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1219 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1220 for (auto &subBlock : subBlocks) {
1221 // Builder to insert unroll-jammed bodies. Insert right at the end of
1222 // sub-block.
      OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1224
1225 // If the induction variable is used, create a remapping to the value for
1226 // this unrolled instance.
1227 if (!forOpIV.use_empty()) {
1228 // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
        auto d0 = builder.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1231 auto ivUnroll =
1232 builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1233 operandMaps[i - 1].map(forOpIV, ivUnroll);
1234 }
1235 // Clone the sub-block being unroll-jammed.
      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
        builder.clone(*it, operandMaps[i - 1]);
1238 }
1239 // Fix iterOperands and yield op operands of newly created loops.
1240 for (auto newForOp : newLoopsWithIterArgs) {
1241 unsigned oldNumIterOperands =
1242 newForOp.getNumIterOperands() / unrollJamFactor;
1243 unsigned numControlOperands = newForOp.getNumControlOperands();
1244 auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1245 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1246 assert(oldNumIterOperands == oldNumYieldOperands &&
1247 "oldNumIterOperands must be the same as oldNumYieldOperands");
1248 for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1249 // The `i`th duplication of an old iterOperand or yield op operand
1250 // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1251 // if such mapped value exists.
1252 newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1253 operandMaps[i - 1].lookupOrDefault(
1254 newForOp.getOperand(numControlOperands + j)));
1255 yieldOp.setOperand(
1256 i * oldNumYieldOperands + j,
1257 operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1258 }
1259 }
1260 }
1261 if (forOp.getNumResults() > 0) {
1262 // Create reduction ops to combine every `unrollJamFactor` related results
1263 // into one value. For example, for %0:2 = affine.for ... and addf, we add
1264 // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1265 // %1.
1266 rewriter.setInsertionPointAfter(forOp);
1267 auto loc = forOp.getLoc();
1268 unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1269 for (LoopReduction &reduction : reductions) {
1270 unsigned pos = reduction.iterArgPosition;
1271 Value lhs = forOp.getResult(pos);
1272 Value rhs;
1273 SmallPtrSet<Operation *, 4> newOps;
1274 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1275 rhs = forOp.getResult(i * oldNumResults + pos);
1276 // Create ops based on reduction type.
1277 lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs);
1278 if (!lhs)
1279 return failure();
1280 Operation *op = lhs.getDefiningOp();
1281 assert(op && "Reduction op should have been created");
        newOps.insert(op);
1283 }
1284 // Replace all uses except those in newly created reduction ops.
1285 forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1286 }
1287 }
1288
1289 // Promote the loop body up if this has turned into a single iteration loop.
1290 (void)promoteIfSingleIteration(forOp);
1291 return success();
1292}
1293
1294/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1295/// nested within 'forOpA' as the only non-terminator operation in its block.
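/// For example, interchanging
///   affine.for %i ... { affine.for %j ... { S(%i, %j) } }
/// yields
///   affine.for %j ... { affine.for %i ... { S(%i, %j) } }
/// leaving the bounds attached to each loop untouched.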
1296void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1297 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1298 auto &forOpABody = forOpA.getBody()->getOperations();
1299 auto &forOpBBody = forOpB.getBody()->getOperations();
1300
  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
  // before forOpA (in forOpA's parent's block); this leaves forOpA's body
  // containing only the terminator.
1304 forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1305 forOpABody, forOpABody.begin(),
1306 std::prev(forOpABody.end()));
1307 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1308 // body (this leaves forOpB's body containing only the terminator).
1309 forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1310 std::prev(forOpBBody.end()));
1311 // 3) Splice forOpA into the beginning of forOpB's body.
1312 forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1313 Block::iterator(forOpA));
1314}
1315
1316// Checks each dependence component against the permutation to see if the
1317// desired loop interchange would violate dependences by making the
1318// dependence component lexicographically negative.
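// For example (a hypothetical dependence, not derived from real IR): with a
// single dependence whose components are [2, -1] for loops (%i, %j), the
// permutation that makes %j outermost reorders the vector to [-1, 2]; its
// first non-zero entry is negative, so that interchange would be rejected,
// while the original order [2, -1] is accepted.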
1319static bool checkLoopInterchangeDependences(
1320 const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1321 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1322 // Invert permutation map.
1323 unsigned maxLoopDepth = loops.size();
1324 SmallVector<unsigned, 4> loopPermMapInv;
1325 loopPermMapInv.resize(N: maxLoopDepth);
1326 for (unsigned i = 0; i < maxLoopDepth; ++i)
1327 loopPermMapInv[loopPermMap[i]] = i;
1328
1329 // Check each dependence component against the permutation to see if the
1330 // desired loop interchange permutation would make the dependence vectors
1331 // lexicographically negative.
1332 // Example 1: [-1, 1][0, 0]
1333 // Example 2: [0, 0][-1, 1]
1334 for (const auto &depComps : depCompsVec) {
1335 assert(depComps.size() >= maxLoopDepth);
1336 // Check if the first non-zero dependence component is positive.
1337 // This iterates through loops in the desired order.
1338 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1339 unsigned permIndex = loopPermMapInv[j];
1340 assert(depComps[permIndex].lb);
1341 int64_t depCompLb = *depComps[permIndex].lb;
1342 if (depCompLb > 0)
1343 break;
1344 if (depCompLb < 0)
1345 return false;
1346 }
1347 }
1348 return true;
1349}
1350
1351/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1352/// nested sequence of loops in 'loops' would violate dependences.
1353bool mlir::affine::isValidLoopInterchangePermutation(
1354 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1355 assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1356 unsigned maxLoopDepth = loops.size();
1357 if (maxLoopDepth == 1)
1358 return true;
1359 // Gather dependence components for dependences between all ops in loop nest
1360 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1361 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1362 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1363 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1364}
1365
1366/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1367/// in it from outermost to innermost.
1368bool LLVM_ATTRIBUTE_UNUSED
1369mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1370 assert(!loops.empty() && "no loops provided");
1371
1372 // We already know that the block can't be empty.
1373 auto hasTwoElements = [](Block *block) {
1374 auto secondOpIt = std::next(x: block->begin());
1375 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1376 };
1377
1378 auto enclosingLoop = loops.front();
1379 for (auto loop : loops.drop_front()) {
1380 auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1381 // parentForOp's body should be just this loop and the terminator.
1382 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1383 return false;
1384 enclosingLoop = loop;
1385 }
1386 return true;
1387}
1388
1389// input[i] should move from position i -> permMap[i]. Returns the position in
1390// `input` that becomes the new outermost loop.
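// For example (a sketch), with input = (%i, %j, %k) listed outermost to
// innermost and permMap = {2, 0, 1}, the permuted nest is (%j, %k, %i) and
// the function returns 1, since input[1] (%j) becomes the new outermost loop.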
1391unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
1392 ArrayRef<unsigned> permMap) {
1393 assert(input.size() == permMap.size() && "invalid permutation map size");
1394 // Check whether the permutation spec is valid. This is a small vector - we'll
1395 // just sort and check if it's iota.
1396 SmallVector<unsigned, 4> checkPermMap(permMap);
1397 llvm::sort(C&: checkPermMap);
1398 if (llvm::any_of(Range: llvm::enumerate(First&: checkPermMap),
1399 P: [](const auto &en) { return en.value() != en.index(); }))
1400 assert(false && "invalid permutation map");
1401
1402 // Nothing to do.
1403 if (input.size() < 2)
1404 return 0;
1405
1406 assert(isPerfectlyNested(input) && "input not perfectly nested");
1407
1408 // Compute the inverse mapping, invPermMap: since input[i] goes to position
1409 // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1410 SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1411 for (unsigned i = 0, e = input.size(); i < e; ++i)
1412 invPermMap.push_back(Elt: {permMap[i], i});
1413 llvm::sort(C&: invPermMap);
1414
1415 // Move the innermost loop body to the loop that would be the innermost in the
1416 // permuted nest (only if the innermost loop is going to change).
1417 if (permMap.back() != input.size() - 1) {
1418 Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1419 Block *srcBody = ((AffineForOp)input.back()).getBody();
1420 destBody->getOperations().splice(where: destBody->begin(),
1421 L2&: srcBody->getOperations(), first: srcBody->begin(),
1422 last: std::prev(x: srcBody->end()));
1423 }
1424
1425 // We'll move each loop in `input` in the reverse order so that its body is
1426 // empty when we are moving it; this incurs zero copies and no erasing.
1427 for (int i = input.size() - 1; i >= 0; --i) {
1428 // If this has to become the outermost loop after permutation, add it to the
1429 // parent block of the original root.
1430 if (permMap[i] == 0) {
1431 // If the root remains the same, nothing to do.
1432 if (i == 0)
1433 continue;
1434 // Make input[i] the new outermost loop moving it into parentBlock.
1435 auto *parentBlock = input[0]->getBlock();
1436 parentBlock->getOperations().splice(Block::iterator(input[0]),
1437 input[i]->getBlock()->getOperations(),
1438 Block::iterator(input[i]));
1439 continue;
1440 }
1441
1442 // If the parent in the permuted order is the same as in the original,
1443 // nothing to do.
1444 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1445 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1446 continue;
1447
1448 // Move input[i] to its surrounding loop in the transformed nest.
1449 auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1450 destBody->getOperations().splice(destBody->begin(),
1451 input[i]->getBlock()->getOperations(),
1452 Block::iterator(input[i]));
1453 }
1454
1455 return invPermMap[0].second;
1456}
1457
// Sinks all sequential loops to the innermost levels (while preserving
// relative order among them) and moves all parallel loops to the outermost
// levels (again preserving relative order among them).
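// For example (a sketch), in a 2-d nest (%i, %j) where only %i carries a
// dependence (components [1, 0]) and %j is parallel, the computed permutation
// makes %j outermost and sinks %i innermost, provided the corresponding
// interchange does not make any dependence lexicographically negative.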
1461AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1462 SmallVector<AffineForOp, 4> loops;
1463 getPerfectlyNestedLoops(loops, forOp);
1464 if (loops.size() < 2)
1465 return forOp;
1466
1467 // Gather dependence components for dependences between all ops in loop nest
1468 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1469 unsigned maxLoopDepth = loops.size();
1470 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1471 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1472
1473 // Mark loops as either parallel or sequential.
1474 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1475 for (auto &depComps : depCompsVec) {
1476 assert(depComps.size() >= maxLoopDepth);
1477 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1478 DependenceComponent &depComp = depComps[j];
1479 assert(depComp.lb.has_value() && depComp.ub.has_value());
1480 if (*depComp.lb != 0 || *depComp.ub != 0)
1481 isParallelLoop[j] = false;
1482 }
1483 }
1484
1485 unsigned numParallelLoops = llvm::count(Range&: isParallelLoop, Element: true);
1486
1487 // Compute permutation of loops that sinks sequential loops (and thus raises
1488 // parallel loops) while preserving relative order.
1489 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1490 unsigned nextSequentialLoop = numParallelLoops;
1491 unsigned nextParallelLoop = 0;
1492 for (unsigned i = 0; i < maxLoopDepth; ++i) {
1493 if (isParallelLoop[i]) {
1494 loopPermMap[i] = nextParallelLoop++;
1495 } else {
1496 loopPermMap[i] = nextSequentialLoop++;
1497 }
1498 }
1499
1500 // Check if permutation 'loopPermMap' would violate dependences.
1501 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1502 return forOp;
1503 // Perform loop interchange according to permutation 'loopPermMap'.
1504 unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1505 return loops[loopNestRootIndex];
1506}
1507
// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
// lower (resp. upper) loop bound. When called for both the lower and upper
// bounds, the resulting IR resembles:
//
// ```mlir
//    affine.for %i = max (..., %iv) to min (..., %iv + offset) {
//      ...
//    }
// ```
1517static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1518 SmallVector<Value, 4> *operands,
1519 int64_t offset = 0) {
1520 auto bounds = llvm::to_vector<4>(Range: map->getResults());
1521 bounds.push_back(Elt: b.getAffineDimExpr(position: map->getNumDims()) + offset);
1522 operands->insert(I: operands->begin() + map->getNumDims(), Elt: iv);
1523 *map = AffineMap::get(dimCount: map->getNumDims() + 1, symbolCount: map->getNumSymbols(), results: bounds,
1524 context: b.getContext());
1525 canonicalizeMapAndOperands(map, operands);
1526}
1527
1528// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1529// Stripmine-sink is a primitive building block for generalized tiling of
1530// imperfectly nested loops.
1531// This transformation is purely mechanical and does not check legality,
1532// profitability or even structural correctness. It is the user's
1533// responsibility to specify `targets` that are dominated by `forOp`.
1534// Returns the new AffineForOps, one per `targets`, nested immediately under
1535// each of the `targets`.
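// For illustration (an informal sketch with made-up bounds), strip-mining a
// unit-step loop %i over [0, 128) by a factor of 4 and sinking it under a
// single target loop %j yields:
//   affine.for %i = 0 to 128 step 4 {
//     affine.for %j = ... {   // the target
//       affine.for %ii = max(0, %i) to min(128, %i + 4) {
//         ... uses of the original induction variable now use %ii ...
//       }
//     }
//   }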
1536static SmallVector<AffineForOp, 8>
1537stripmineSink(AffineForOp forOp, uint64_t factor,
1538 ArrayRef<AffineForOp> targets) {
1539 auto originalStep = forOp.getStepAsInt();
1540 auto scaledStep = originalStep * factor;
1541 forOp.setStep(scaledStep);
1542
1543 OpBuilder b(forOp->getBlock(), std::next(x: Block::iterator(forOp)));
1544
1545 // Lower-bound map creation.
1546 auto lbMap = forOp.getLowerBoundMap();
1547 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1548 augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1549
1550 // Upper-bound map creation.
1551 auto ubMap = forOp.getUpperBoundMap();
1552 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1553 augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1554 /*offset=*/scaledStep);
1555
1556 auto iv = forOp.getInductionVar();
1557 SmallVector<AffineForOp, 8> innerLoops;
1558 for (auto t : targets) {
1559 // Insert newForOp before the terminator of `t`.
1560 auto b = OpBuilder::atBlockTerminator(t.getBody());
1561 auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1562 ubOperands, ubMap, originalStep);
1563 auto begin = t.getBody()->begin();
1564 // Skip terminator and `newForOp` which is just before the terminator.
1565 auto nOps = t.getBody()->getOperations().size() - 2;
1566 newForOp.getBody()->getOperations().splice(
1567 newForOp.getBody()->getOperations().begin(),
1568 t.getBody()->getOperations(), begin, std::next(begin, nOps));
1569 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1570 newForOp.getRegion());
1571 innerLoops.push_back(newForOp);
1572 }
1573
1574 return innerLoops;
1575}
1576
// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
// Returns the new AffineForOp, nested immediately under `target`.
1579template <typename SizeType>
1580static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1581 AffineForOp target) {
1582 // TODO: Use cheap structural assertions that targets are nested under
1583 // forOp and that targets are not nested under each other when DominanceInfo
1584 // exposes the capability. It seems overkill to construct a whole function
1585 // dominance tree at this point.
1586 auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1587 assert(res.size() == 1 && "Expected 1 inner forOp");
1588 return res[0];
1589}
1590
1591SmallVector<SmallVector<AffineForOp, 8>, 8>
1592mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1593 ArrayRef<AffineForOp> targets) {
1594 SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1595 SmallVector<AffineForOp, 8> currentTargets(targets);
1596 for (auto it : llvm::zip(t&: forOps, u&: sizes)) {
1597 auto step = stripmineSink(std::get<0>(t&: it), std::get<1>(t&: it), currentTargets);
1598 res.push_back(step);
1599 currentTargets = step;
1600 }
1601 return res;
1602}
1603
1604SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1605 ArrayRef<uint64_t> sizes,
1606 AffineForOp target) {
1607 SmallVector<AffineForOp, 8> res;
1608 for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target)))
1609 res.push_back(llvm::getSingleElement(loops));
1610 return res;
1611}
1612
1613LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1614 if (loops.size() < 2)
1615 return success();
1616
1617 AffineForOp innermost = loops.back();
1618 AffineForOp outermost = loops.front();
1619 AffineBound ub = outermost.getUpperBound();
1620 AffineMap origUbMap = ub.getMap();
1621 Location loc = outermost.getLoc();
1622 OpBuilder builder(outermost);
1623 for (AffineForOp loop : loops) {
1624 // We only work on normalized loops.
1625 if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1626 loop.getConstantLowerBound() != 0)
1627 return failure();
1628 }
1629 SmallVector<Value, 4> upperBoundSymbols;
1630 SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1631 ub.getOperands().end());
1632
1633 // 1. Store the upper bound of the outermost loop in a variable.
1634 Value prev;
1635 if (!llvm::hasSingleElement(C: origUbMap.getResults()))
1636 prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1637 else
1638 prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1639 upperBoundSymbols.push_back(Elt: prev);
1640
// 2. Emit code computing the upper bound of the coalesced loop as the product
// of the number of iterations of all loops.
1643 for (AffineForOp loop : loops.drop_front()) {
1644 ub = loop.getUpperBound();
1645 origUbMap = ub.getMap();
1646 ubOperands = ub.getOperands();
1647 Value upperBound;
1648 // If upper bound map has more than one result, take their minimum.
1649 if (!llvm::hasSingleElement(origUbMap.getResults()))
1650 upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1651 else
1652 upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1653 upperBoundSymbols.push_back(upperBound);
1654 SmallVector<Value, 4> operands;
1655 operands.push_back(prev);
1656 operands.push_back(upperBound);
1657 // Maintain running product of loop upper bounds.
1658 prev = builder.create<AffineApplyOp>(
1659 loc,
1660 AffineMap::get(/*dimCount=*/1,
1661 /*symbolCount=*/1,
1662 builder.getAffineDimExpr(0) *
1663 builder.getAffineSymbolExpr(0)),
1664 operands);
1665 }
1666 // Set upper bound of the coalesced loop.
1667 AffineMap newUbMap = AffineMap::get(
1668 /*dimCount=*/0,
1669 /*symbolCount=*/1, results: builder.getAffineSymbolExpr(position: 0), context: builder.getContext());
1670 outermost.setUpperBound(prev, newUbMap);
1671
1672 builder.setInsertionPointToStart(outermost.getBody());
1673
// 3. Remap induction variables. For each original loop, the value of its
// induction variable is obtained by dividing the induction variable of the
// coalesced (linearized) loop by the product of the trip counts of the loops
// nested inside it, and then taking the result modulo this loop's own trip
// count (this removes the contribution of the outer loops):
//   iv_i = floordiv(iv_linear, product-of-trip-counts-inner-to-i) mod range_i.
// Compute these iteratively, starting from the innermost loop, by maintaining
// a "running quotient" of divisions by the ranges.
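// For example (a sketch), coalescing two normalized loops with trip counts 4
// (outer) and 5 (inner) produces a single loop of 20 iterations where
// iv_outer = floordiv(iv_linear, 5) and iv_inner = iv_linear mod 5.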
1682 Value previous = outermost.getInductionVar();
1683 for (unsigned idx = loops.size(); idx > 0; --idx) {
1684 if (idx != loops.size()) {
1685 SmallVector<Value, 4> operands;
1686 operands.push_back(Elt: previous);
1687 operands.push_back(Elt: upperBoundSymbols[idx]);
1688 previous = builder.create<AffineApplyOp>(
1689 loc,
1690 AffineMap::get(
1691 /*dimCount=*/1, /*symbolCount=*/1,
1692 result: builder.getAffineDimExpr(position: 0).floorDiv(
1693 other: builder.getAffineSymbolExpr(position: 0))),
1694 operands);
1695 }
1696 // Modified value of the induction variables of the nested loops after
1697 // coalescing.
1698 Value inductionVariable;
1699 if (idx == 1) {
1700 inductionVariable = previous;
1701 } else {
1702 SmallVector<Value, 4> applyOperands;
1703 applyOperands.push_back(Elt: previous);
1704 applyOperands.push_back(Elt: upperBoundSymbols[idx - 1]);
1705 inductionVariable = builder.create<AffineApplyOp>(
1706 loc,
1707 AffineMap::get(
1708 /*dimCount=*/1, /*symbolCount=*/1,
1709 result: builder.getAffineDimExpr(position: 0) % builder.getAffineSymbolExpr(position: 0)),
1710 applyOperands);
1711 }
1712 replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1713 inductionVariable, loops.back().getRegion());
1714 }
1715
// 4. Move the operations from the innermost loop's body to just above the
// second-outermost loop, then delete the extra terminator and the
// second-outermost loop.
1718 AffineForOp secondOutermostLoop = loops[1];
1719 innermost.getBody()->back().erase();
1720 outermost.getBody()->getOperations().splice(
1721 Block::iterator(secondOutermostLoop.getOperation()),
1722 innermost.getBody()->getOperations());
1723 secondOutermostLoop.erase();
1724 return success();
1725}
1726
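/// Distributes the iterations of `forOp` across the given (possibly
/// multi-dimensional) processor grid. As a sketch of the effect (the names
/// %bx, %tx, %nbx, %ntx are illustrative): with processor ids (%bx, %tx) and
/// grid sizes (%nbx, %ntx), the linearized id is %bx * %ntx + %tx, the new
/// lower bound becomes lb + linear_id * step, and the new step becomes
/// step * %nbx * %ntx, so each processor executes an interleaved subset of
/// the original iterations.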
1727void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1728 ArrayRef<Value> processorId,
1729 ArrayRef<Value> numProcessors) {
1730 assert(processorId.size() == numProcessors.size());
1731 if (processorId.empty())
1732 return;
1733
1734 OpBuilder b(forOp);
1735 Location loc(forOp.getLoc());
1736 AffineExpr lhs, rhs;
1737 bindSymbols(forOp.getContext(), lhs, rhs);
1738 auto mulMap = AffineMap::get(dimCount: 0, symbolCount: 2, result: lhs * rhs);
1739 auto addMap = AffineMap::get(dimCount: 0, symbolCount: 2, result: lhs + rhs);
1740
1741 Value linearIndex = processorId.front();
1742 for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1743 auto mulApplyOp = b.create<AffineApplyOp>(
1744 loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1745 linearIndex = b.create<AffineApplyOp>(
1746 loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1747 }
1748
1749 auto mulApplyOp = b.create<AffineApplyOp>(
1750 loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1751 Value lb = b.create<AffineApplyOp>(
1752 loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1753 forOp.setLowerBound(lb);
1754
1755 Value step = forOp.getStep();
1756 for (auto numProcs : numProcessors)
1757 step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1758 forOp.setStep(step);
1759}
1760
1761/// Given a memref region, determine the lowest depth at which transfers can be
1762/// placed for it, and return the corresponding block, start and end positions
1763/// in the block for placing incoming (read) and outgoing (write) copies
1764/// respectively. The lowest depth depends on whether the region being accessed
1765/// is hoistable with respect to one or more immediately surrounding loops.
1766static void
1767findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1768 Block::iterator &begin, Block::iterator &end,
1769 Block **copyPlacementBlock,
1770 Block::iterator *copyInPlacementStart,
1771 Block::iterator *copyOutPlacementStart) {
1772 const auto *cst = region.getConstraints();
1773 SmallVector<Value, 4> symbols;
1774 cst->getValues(start: cst->getNumDimVars(), end: cst->getNumDimAndSymbolVars(), values: &symbols);
1775
1776 SmallVector<Operation *, 4> enclosingAffineOps;
1777 getEnclosingAffineOps(op&: *block.begin(), ops: &enclosingAffineOps);
// Walk up the loop parents until we find an IV on which this region is
// symbolic/variant, or we can no longer hoist (e.g., past the memref's
// definition).
1780 auto it = enclosingAffineOps.rbegin();
1781 AffineForOp lastInvariantFor;
1782 for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1783 Operation *enclosingOp = *it;
1784 // We can't hoist past the definition of the memref being copied.
1785 Value memref = region.memref;
1786 if (!memref.getParentRegion()->isAncestor(other: enclosingOp->getParentRegion())) {
1787 LLVM_DEBUG(
1788 llvm::dbgs()
1789 << "memref definition will end up not dominating hoist location\n");
1790 break;
1791 }
1792
1793 auto affineFor = dyn_cast<AffineForOp>(enclosingOp);
1794 if (!affineFor)
1795 break;
1796 // TODO: also need to be checking this for regions symbols that
1797 // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1798 if (llvm::is_contained(symbols, affineFor.getInductionVar()))
1799 break;
1800 lastInvariantFor = affineFor;
1801 }
1802
1803 if (it != enclosingAffineOps.rbegin()) {
1804 *copyInPlacementStart = Block::iterator(lastInvariantFor);
1805 *copyOutPlacementStart = std::next(x: *copyInPlacementStart);
1806 *copyPlacementBlock = lastInvariantFor->getBlock();
1807 } else {
1808 *copyInPlacementStart = begin;
1809 *copyOutPlacementStart = end;
1810 *copyPlacementBlock = &block;
1811 }
1812}
1813
1814// Info comprising stride and number of elements transferred every stride.
1815struct StrideInfo {
1816 int64_t stride;
1817 int64_t numEltPerStride;
1818};
1819
1820/// Returns striding information for a copy/transfer of this region with
1821/// potentially multiple striding levels from outermost to innermost. For an
1822/// n-dimensional region, there can be at most n-1 levels of striding
1823/// successively nested.
1824// TODO: make this work with non-identity layout maps.
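/// For example (a sketch), for a fast buffer of shape 8x128 copied out of a
/// memref<8x256xf32>, a single stride level is recorded: a stride of 256
/// elements with 128 contiguous elements transferred per stride (8 such
/// strided chunks in total).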
1825static void getMultiLevelStrides(const MemRefRegion &region,
1826 ArrayRef<int64_t> bufferShape,
1827 SmallVectorImpl<StrideInfo> *strideInfos) {
1828 if (bufferShape.size() <= 1)
1829 return;
1830
1831 int64_t numEltPerStride = 1;
1832 int64_t stride = 1;
1833 for (int d = bufferShape.size() - 1; d >= 1; d--) {
1834 int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d);
1835 stride *= dimSize;
1836 numEltPerStride *= bufferShape[d];
1837 // A stride is needed only if the region has a shorter extent than the
1838 // memref along the dimension *and* has an extent greater than one along the
1839 // next major dimension.
1840 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1841 strideInfos->push_back(Elt: {.stride: stride, .numEltPerStride: numEltPerStride});
1842 }
1843 }
1844}
1845
1846/// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1847/// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1848/// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
/// lower and upper bound information for the copy loop nest. `fastBufOffsets`
/// contains the expressions to be subtracted out from the respective copy loop
/// iterators in order to index the fast buffer. If `isCopyOut` is true,
/// generates a copy-out; otherwise a copy-in. Builder `b` should be set to the
/// point at which the copy nest is to be inserted.
///
/// As an example, the copy-in nest generated for a 2-d region is:
1856/// for x = ...
1857/// for y = ...
1858/// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1859///
1860static AffineForOp
1861generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1862 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1863 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1864 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1865 OpBuilder b) {
1866 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1867 return lbMap.getNumInputs() == lbOperands.size();
1868 }));
1869 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1870 return ubMap.getNumInputs() == ubOperands.size();
1871 }));
1872
1873 unsigned rank = cast<MemRefType>(memref.getType()).getRank();
1874 // A copy nest can't be generated for 0-ranked memrefs.
1875 assert(rank != 0 && "non-zero rank memref expected");
1876 assert(lbMaps.size() == rank && "wrong number of lb maps");
1877 assert(ubMaps.size() == rank && "wrong number of ub maps");
1878
1879 SmallVector<Value, 4> memIndices;
1880 SmallVector<AffineExpr, 4> fastBufExprs;
1881 SmallVector<Value, 4> fastBufMapOperands;
1882 AffineForOp copyNestRoot;
1883 SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1884 for (unsigned d = 0; d < rank; ++d) {
1885 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1886 ubOperands, ubMaps[d]);
1887 if (d == 0)
1888 copyNestRoot = forOp;
1889
1890 b = OpBuilder::atBlockTerminator(block: forOp.getBody());
1891
1892 auto fastBufOffsetMap =
1893 AffineMap::get(dimCount: lbOperands.size(), symbolCount: 0, result: fastBufOffsets[d]);
1894 auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1895
1896 // Construct the subscript for the fast memref being copied into/from:
1897 // x - offset_x.
1898 fastBufExprs.push_back(Elt: b.getAffineDimExpr(position: 2 * d + 1) -
1899 b.getAffineDimExpr(position: 2 * d));
1900 fastBufMapOperands.push_back(Elt: offset);
1901 fastBufMapOperands.push_back(Elt: forOp.getInductionVar());
1902 mayBeDeadApplys.push_back(offset);
1903
1904 // Subscript for the slow memref being copied.
1905 memIndices.push_back(Elt: forOp.getInductionVar());
1906 }
1907
1908 auto fastBufMap =
1909 AffineMap::get(dimCount: 2 * rank, /*symbolCount=*/0, results: fastBufExprs, context: b.getContext());
1910 fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1911 fastBufMap = simplifyAffineMap(fastBufMap);
1912 canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1913
1914 // Drop any dead affine.applys.
1915 for (auto applyOp : mayBeDeadApplys)
1916 if (applyOp.use_empty())
1917 applyOp.erase();
1918
1919 if (!isCopyOut) {
1920 // Copy in.
1921 auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
1922 b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
1923 fastBufMapOperands);
1924 return copyNestRoot;
1925 }
1926
1927 // Copy out.
1928 auto load =
1929 b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
1930 b.create<AffineStoreOp>(loc, load, memref, memIndices);
1931 return copyNestRoot;
1932}
1933
1934static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1935emitRemarkForBlock(Block &block) {
1936 return block.getParentOp()->emitRemark();
1937}
1938
1939/// Creates a buffer in the faster memory space for the specified memref region
1940/// (memref has to be non-zero ranked); generates a copy from the lower memory
/// space to this one, and rewrites all loads/stores in the block range
/// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1943/// if copies could not be generated due to yet unimplemented cases.
1944/// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1945/// specify the insertion points where the incoming copies and outgoing copies,
1946/// respectively, should be inserted (the insertion happens right before the
1947/// insertion point). Since `begin` can itself be invalidated due to the memref
1948/// rewriting done from this method, the output argument `nBegin` is set to its
1949/// replacement (set to `begin` if no invalidation happens). Since outgoing
1950/// copies could have been inserted at `end`, the output argument `nEnd` is set
1951/// to the new end. `sizeInBytes` is set to the size of the fast buffer
1952/// allocated.
1953static LogicalResult generateCopy(
1954 const MemRefRegion &region, Block *block, Block::iterator begin,
1955 Block::iterator end, Block *copyPlacementBlock,
1956 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1957 const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1958 DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1959 Block::iterator *nBegin, Block::iterator *nEnd) {
1960 *nBegin = begin;
1961 *nEnd = end;
1962
1963 auto f = begin->getParentOfType<FunctionOpInterface>();
1964 OpBuilder topBuilder(f.getFunctionBody());
1965 Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
1966
1967 *sizeInBytes = 0;
1968
1969 if (begin == end)
1970 return success();
1971
1972 // Is the copy out point at the end of the block where we are doing
1973 // explicit copying.
1974 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1975
1976 // Copies for read regions are going to be inserted at 'begin'.
1977 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1978 // Copies for write regions are going to be inserted at 'end'.
1979 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1980 OpBuilder &b = region.isWrite() ? epilogue : prologue;
1981
1982 // Builder to create constants at the top level.
1983 auto func =
1984 copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1985 OpBuilder top(func.getFunctionBody());
1986
1987 auto loc = region.loc;
1988 auto memref = region.memref;
1989 auto memRefType = cast<MemRefType>(memref.getType());
1990
1991 if (!memRefType.getLayout().isIdentity()) {
1992 LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1993 return failure();
1994 }
1995
1996 // Indices to use for the copying.
1997 // Indices for the original memref being copied from/to.
1998 SmallVector<Value, 4> memIndices;
1999 // Indices for the faster buffer being copied into/from.
2000 SmallVector<Value, 4> bufIndices;
2001
2002 unsigned rank = memRefType.getRank();
2003 if (rank == 0) {
LLVM_DEBUG(llvm::dbgs() << "0-ranked memrefs are not supported\n");
2005 return failure();
2006 }
2007
2008 SmallVector<int64_t, 4> fastBufferShape;
2009
2010 // Compute the extents of the buffer.
2011 SmallVector<AffineMap, 2> lbs;
2012 lbs.reserve(N: rank);
2013 std::optional<int64_t> numElements =
2014 region.getConstantBoundingSizeAndShape(shape: &fastBufferShape, lbs: &lbs);
2015 if (!numElements) {
2016 LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2017 return failure();
2018 }
2019
2020 if (llvm::any_of(Range&: lbs, P: [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2021 // This can be supported in the future if needed.
2022 LLVM_DEBUG(llvm::dbgs()
2023 << "Max lower bound for memref region start not supported\n");
2024 return failure();
2025 }
2026
2027 if (*numElements == 0) {
2028 LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2029 return success();
2030 }
2031
2032 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2033 for (unsigned i = 0; i < rank; ++i) {
2034 region.getLowerAndUpperBound(pos: i, lbMap&: lbMaps[i], ubMap&: ubMaps[i]);
2035 if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2036 LLVM_DEBUG(llvm::dbgs()
2037 << "Missing lower or upper bound for region along dimension: "
2038 << i << '\n');
2039 return failure();
2040 }
2041 }
2042
2043 const FlatAffineValueConstraints *cst = region.getConstraints();
// 'regionSymbols' holds the values that this memory region is
// symbolic/parametric on; these typically include loop IVs surrounding the
// level at which the copy generation is being done, or other valid symbols
// in MLIR.
2047 SmallVector<Value, 8> regionSymbols;
2048 cst->getValues(start: rank, end: cst->getNumVars(), values: &regionSymbols);
2049
2050 // Construct the access expression for the fast memory buffer. The access
2051 // expression for a particular dimension of the fast buffer is obtained by
2052 // subtracting out the lower bound on the original memref's data region
2053 // along the corresponding dimension.
2054
2055 // Index start offsets for faster memory buffer relative to the original.
2056 SmallVector<AffineExpr, 4> fastBufOffsets;
2057 fastBufOffsets.reserve(N: rank);
2058 for (unsigned d = 0; d < rank; d++) {
2059 assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2060 "incorrect bound size");
2061
2062 // Set copy start location for this dimension in the lower memory space
2063 // memref.
2064 if (lbs[d].isSingleConstant()) {
2065 auto indexVal = lbs[d].getSingleConstantResult();
2066 if (indexVal == 0) {
2067 memIndices.push_back(Elt: zeroIndex);
2068 } else {
2069 memIndices.push_back(
2070 Elt: top.create<arith::ConstantIndexOp>(location: loc, args&: indexVal).getResult());
2071 }
2072 } else {
// The coordinate for the start location is just the lower bound along the
// corresponding dimension of the memory region. Remap all inputs of the map
// to dimensions uniformly, since the generated IR needs valid affine dim
// operands as opposed to values that are merely "symbols" for the purpose of
// the memref region.
2078 SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2079 for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2080 symReplacements[i] = top.getAffineDimExpr(position: i);
2081 lbs[d] = lbs[d].replaceDimsAndSymbols(
2082 /*dimReplacements=*/{}, symReplacements, numResultDims: lbs[d].getNumSymbols(),
2083 /*numResultSyms=*/0);
2084 memIndices.push_back(b.create<AffineApplyOp>(loc, lbs[d], regionSymbols));
2085 }
2086 // The fast buffer is copied into at location zero; addressing is relative.
2087 bufIndices.push_back(Elt: zeroIndex);
2088
2089 // Record the offsets since they are needed to remap the memory accesses of
2090 // the original memref further below.
2091 fastBufOffsets.push_back(Elt: lbs[d].getResult(idx: 0));
2092 }
2093
2094 // The faster memory space buffer.
2095 Value fastMemRef;
2096
2097 // Check if a buffer was already created.
2098 bool existingBuf = fastBufferMap.count(Val: memref) > 0;
2099 if (!existingBuf) {
2100 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2101 auto fastMemRefType =
2102 MemRefType::get(fastBufferShape, memRefType.getElementType(),
2103 fastBufferLayout, copyOptions.fastMemorySpace);
2104
2105 // Create the fast memory space buffer just before the 'affine.for'
2106 // operation.
2107 fastMemRef =
2108 prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2109 // Record it.
2110 fastBufferMap[memref] = fastMemRef;
2111 // fastMemRefType is a constant shaped memref.
2112 auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType);
2113 // We don't account for things of unknown size.
2114 *sizeInBytes = maySizeInBytes.value_or(0);
2115
2116 LLVM_DEBUG(emitRemarkForBlock(*block)
2117 << "Creating fast buffer of type " << fastMemRefType
2118 << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2119 << " KiB\n");
2120 } else {
2121 // Reuse the one already created.
2122 fastMemRef = fastBufferMap[memref];
2123 }
2124
2125 auto numElementsSSA = top.create<arith::ConstantIndexOp>(location: loc, args&: *numElements);
2126
2127 Value dmaStride;
2128 Value numEltPerDmaStride;
2129 if (copyOptions.generateDma) {
2130 SmallVector<StrideInfo, 4> dmaStrideInfos;
2131 getMultiLevelStrides(region, bufferShape: fastBufferShape, strideInfos: &dmaStrideInfos);
2132
2133 // TODO: use all stride levels once DmaStartOp is extended for
2134 // multi-level strides.
2135 if (dmaStrideInfos.size() > 1) {
2136 LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2137 return failure();
2138 }
2139
2140 if (!dmaStrideInfos.empty()) {
2141 dmaStride =
2142 top.create<arith::ConstantIndexOp>(location: loc, args&: dmaStrideInfos[0].stride);
2143 numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2144 location: loc, args&: dmaStrideInfos[0].numEltPerStride);
2145 }
2146 }
2147
// Record the last operation where we want the memref replacement to end. We
// later do the memref replacement only in [begin, postDomFilter] so that the
// uses of the original memref in the data movement code itself don't get
// replaced.
2152 auto postDomFilter = std::prev(x: end);
2153
2154 // Create fully composed affine maps for each memref.
2155 auto memAffineMap = b.getMultiDimIdentityMap(rank: memIndices.size());
2156 fullyComposeAffineMapAndOperands(map: &memAffineMap, operands: &memIndices);
2157 auto bufAffineMap = b.getMultiDimIdentityMap(rank: bufIndices.size());
2158 fullyComposeAffineMapAndOperands(map: &bufAffineMap, operands: &bufIndices);
2159
2160 if (!copyOptions.generateDma) {
2161 // Point-wise copy generation.
2162 auto copyNest =
2163 generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2164 /*lbOperands=*/regionSymbols, ubMaps,
2165 /*ubOperands=*/regionSymbols, fastBufOffsets,
2166 /*isCopyOut=*/region.isWrite(), b);
2167
2168 // Record this so that we can skip it from yet another copy.
2169 copyNests.insert(copyNest);
2170
// Since new ops are being appended (for copy-outs), adjust the end to mark
// the end of the block range being processed if necessary.
2173 if (region.isWrite() && isCopyOutAtEndOfBlock)
2174 *nEnd = Block::iterator(copyNest.getOperation());
2175 } else {
2176 // DMA generation.
2177 // Create a tag (single element 1-d memref) for the DMA.
2178 auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2179 copyOptions.tagMemorySpace);
2180 auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2181
2182 SmallVector<Value, 4> tagIndices({zeroIndex});
2183 auto tagAffineMap = b.getMultiDimIdentityMap(rank: tagIndices.size());
2184 fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2185 if (!region.isWrite()) {
2186 // DMA non-blocking read from original buffer to fast buffer.
2187 b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2188 fastMemRef, bufAffineMap, bufIndices,
2189 tagMemRef, tagAffineMap, tagIndices,
2190 numElementsSSA, dmaStride, numEltPerDmaStride);
2191 } else {
2192 // DMA non-blocking write from fast buffer to the original memref.
2193 auto op = b.create<AffineDmaStartOp>(
2194 loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2195 memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2196 dmaStride, numEltPerDmaStride);
2197 // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2198 // end to mark end of block range being processed.
2199 if (isCopyOutAtEndOfBlock)
2200 *nEnd = Block::iterator(op.getOperation());
2201 }
2202
2203 // Matching DMA wait to block on completion; tag always has a 0 index.
2204 b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2205 numElementsSSA);
2206
2207 // Generate dealloc for the tag.
2208 auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2209 if (*nEnd == end && isCopyOutAtEndOfBlock)
2210 // Since new ops are being appended (for outgoing DMAs), adjust the end to
2211 // mark end of range of the original.
2212 *nEnd = Block::iterator(tagDeallocOp.getOperation());
2213 }
2214
2215 // Generate dealloc for the buffer.
2216 if (!existingBuf) {
2217 auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2218 // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2219 // the fast buffer (since it marks the new end insertion point).
2220 if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2221 *nEnd = Block::iterator(bufDeallocOp.getOperation());
2222 }
2223
2224 // Replace all uses of the old memref with the faster one while remapping
2225 // access indices (subtracting out lower bound offsets for each dimension).
2226 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2227 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2228 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2229 // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2230 // d2, d3 correspond to the original indices (%i, %j).
2231 SmallVector<AffineExpr, 4> remapExprs;
2232 remapExprs.reserve(N: rank);
2233 for (unsigned i = 0; i < rank; i++) {
2234 // The starting operands of indexRemap will be regionSymbols (the symbols on
2235 // which the memref region is parametric); then those corresponding to
2236 // the memref's original indices follow.
2237 auto dimExpr = b.getAffineDimExpr(position: regionSymbols.size() + i);
2238 remapExprs.push_back(Elt: dimExpr - fastBufOffsets[i]);
2239 }
2240 auto indexRemap = AffineMap::get(dimCount: regionSymbols.size() + rank, symbolCount: 0, results: remapExprs,
2241 context: b.getContext());
2242
2243 // Record the begin since it may be invalidated by memref replacement.
2244 Block::iterator prevOfBegin;
2245 bool isBeginAtStartOfBlock = (begin == block->begin());
2246 if (!isBeginAtStartOfBlock)
2247 prevOfBegin = std::prev(x: begin);
2248
2249 // *Only* those uses within the range [begin, end) of 'block' are replaced.
2250 (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2251 /*extraIndices=*/{}, indexRemap,
2252 /*extraOperands=*/regionSymbols,
2253 /*symbolOperands=*/{},
2254 /*domOpFilter=*/&*begin,
2255 /*postDomOpFilter=*/&*postDomFilter);
2256
2257 *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(x: prevOfBegin);
2258
2259 return success();
2260}
2261
/// Constructs the memref region to just include the entire memref. Returns
/// false for dynamic-shaped memrefs for now. `numParamLoopIVs` is the number of
2264/// enclosing loop IVs of `op` (starting from the outermost) that the region
2265/// is parametric on.
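/// For example (a sketch), for an access to a memref<64x128xf32>, the region
/// constructed here is simply 0 <= d0 <= 63, 0 <= d1 <= 127, with the first
/// `numParamLoopIVs` enclosing loop IVs attached as its parameters.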
2266static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2267 MemRefRegion *region) {
2268 unsigned rank;
2269 if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2270 rank = loadOp.getMemRefType().getRank();
2271 region->memref = loadOp.getMemRef();
2272 region->setWrite(false);
2273 } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2274 rank = storeOp.getMemRefType().getRank();
2275 region->memref = storeOp.getMemRef();
2276 region->setWrite(true);
2277 } else {
2278 assert(false && "expected load or store op");
2279 return false;
2280 }
2281 auto memRefType = cast<MemRefType>(region->memref.getType());
2282 if (!memRefType.hasStaticShape())
2283 return false;
2284
2285 auto *regionCst = region->getConstraints();
2286
// Just use the first `numParamLoopIVs` IVs, which the memref region is
// parametric on.
2289 SmallVector<AffineForOp, 4> ivs;
2290 getAffineForIVs(*op, &ivs);
2291 ivs.resize(numParamLoopIVs);
2292 SmallVector<Value, 4> symbols;
2293 extractForInductionVars(ivs, &symbols);
2294 *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2295 regionCst->setValues(start: rank, end: rank + numParamLoopIVs, values: symbols);
2296
2297 // Memref dim sizes provide the bounds.
2298 for (unsigned d = 0; d < rank; d++) {
2299 auto dimSize = memRefType.getDimSize(d);
2300 assert(dimSize > 0 && "filtered dynamic shapes above");
2301 regionCst->addBound(type: BoundType::LB, pos: d, value: 0);
2302 regionCst->addBound(BoundType::UB, d, dimSize - 1);
2303 }
2304 return true;
2305}
2306
2307LogicalResult
2308mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2309 const AffineCopyOptions &copyOptions,
2310 std::optional<Value> filterMemRef,
2311 DenseSet<Operation *> &copyNests) {
2312 if (begin == end)
2313 return success();
2314
2315 assert(begin->getBlock() == std::prev(end)->getBlock() &&
2316 "Inconsistent block begin/end args");
2317 assert(end != end->getBlock()->end() && "end can't be the block terminator");
2318
2319 Block *block = begin->getBlock();
2320
// Copies will be generated for this depth, i.e., symbolic in all loops
// surrounding this block range.
2323 unsigned copyDepth = getNestingDepth(op: &*begin);
2324
2325 LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2326 << "\n");
2327 LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2328 LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2329
// List of memory regions to copy for. We need a map vector to have a
// guaranteed iteration order to write test cases; CHECK-DAG doesn't help here
// since the allocs, for example, are identical except for the SSA id.
2333 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2334 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2335
2336 // Map from original memref's to the fast buffers that their accesses are
2337 // replaced with.
2338 DenseMap<Value, Value> fastBufferMap;
2339
2340 // To check for errors when walking the block.
2341 bool error = false;
2342
2343 // Walk this range of operations to gather all memory regions.
2344 block->walk(begin, end, callback: [&](Operation *opInst) {
2345 Value memref;
2346 MemRefType memrefType;
2347 // Gather regions to allocate to buffers in faster memory space.
2348 if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2349 memref = loadOp.getMemRef();
2350 memrefType = loadOp.getMemRefType();
2351 } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2352 memref = storeOp.getMemRef();
2353 memrefType = storeOp.getMemRefType();
2354 }
2355 // Not an affine.load/store op.
2356 if (!memref)
2357 return;
2358
2359 if ((filterMemRef.has_value() && filterMemRef != memref) ||
2360 (isa_and_nonnull<IntegerAttr>(memrefType.getMemorySpace()) &&
2361 memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2362 return;
2363
2364 if (!memref.getParentRegion()->isAncestor(other: block->getParent())) {
2365 LLVM_DEBUG(llvm::dbgs() << "memref definition is inside of the depth at "
2366 "which copy-in/copy-out would happen\n");
2367 return;
2368 }
2369
2370 // Compute the MemRefRegion accessed.
2371 auto region = std::make_unique<MemRefRegion>(args: opInst->getLoc());
2372 if (failed(Result: region->compute(op: opInst, loopDepth: copyDepth, /*sliceState=*/nullptr,
2373 /*addMemRefDimBounds=*/false))) {
2374 LLVM_DEBUG(llvm::dbgs()
2375 << "Error obtaining memory region: semi-affine maps?\n");
2376 LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2377 if (!getFullMemRefAsRegion(op: opInst, numParamLoopIVs: copyDepth, region: region.get())) {
2378 LLVM_DEBUG(
2379 opInst->emitError("non-constant memref sizes not yet supported"));
2380 error = true;
2381 return;
2382 }
2383 }
2384
// Each memref has a single buffer associated with it irrespective of how
// many loads and stores happen on it.
2387 // TODO: in the future, when regions don't intersect and satisfy
2388 // other properties (based on load/store regions), we could consider
2389 // multiple buffers per memref.
2390
2391 // Add to the appropriate region if it's not already in it, or take a
2392 // bounding box union with the existing one if it's already in there.
2393 // Note that a memref may have both read and write regions - so update the
2394 // region in the other list if one exists (write in case of read and vice
2395 // versa) since there is a single bounding box for a memref across all reads
2396 // and writes that happen on it.
2397
2398 // Attempts to update; returns true if 'region' exists in targetRegions.
2399 auto updateRegion =
2400 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2401 &targetRegions) {
2402 const auto *const it = targetRegions.find(Key: region->memref);
2403 if (it == targetRegions.end())
2404 return false;
2405
2406 // Perform a union with the existing region.
2407 if (failed(Result: it->second->unionBoundingBox(other: *region))) {
2408 LLVM_DEBUG(llvm::dbgs()
2409 << "Memory region bounding box failed; "
2410 "over-approximating to the entire memref\n");
2411 // If the union fails, we will overapproximate.
2412 if (!getFullMemRefAsRegion(op: opInst, numParamLoopIVs: copyDepth, region: region.get())) {
2413 LLVM_DEBUG(opInst->emitError(
2414 "non-constant memref sizes not yet supported"));
2415 error = true;
2416 return true;
2417 }
2418 it->second->getConstraints()->clearAndCopyFrom(
2419 other: *region->getConstraints());
2420 } else {
2421 // Union was computed and stored in 'it->second': copy to 'region'.
2422 region->getConstraints()->clearAndCopyFrom(
2423 other: *it->second->getConstraints());
2424 }
2425 return true;
2426 };
2427
2428 bool existsInRead = updateRegion(readRegions);
2429 if (error)
2430 return;
2431 bool existsInWrite = updateRegion(writeRegions);
2432 if (error)
2433 return;
2434
2435 // Finally add it to the region list.
2436 if (region->isWrite() && !existsInWrite) {
2437 writeRegions[region->memref] = std::move(region);
2438 } else if (!region->isWrite() && !existsInRead) {
2439 readRegions[region->memref] = std::move(region);
2440 }
2441 });
2442
2443 if (error) {
2444 LLVM_DEBUG(begin->emitError(
2445 "copy generation failed for one or more memref's in this block\n"));
2446 return failure();
2447 }
2448
2449 uint64_t totalCopyBuffersSizeInBytes = 0;
2450 bool ret = true;
2451 auto processRegions =
2452 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2453 &regions) {
2454 for (const auto &regionEntry : regions) {
2455 // For each region, hoist copy in/out past all hoistable
2456 // 'affine.for's.
2457 Block::iterator copyInPlacementStart, copyOutPlacementStart;
2458 Block *copyPlacementBlock;
2459 findHighestBlockForPlacement(
2460 region: *regionEntry.second, block&: *block, begin, end, copyPlacementBlock: &copyPlacementBlock,
2461 copyInPlacementStart: &copyInPlacementStart, copyOutPlacementStart: &copyOutPlacementStart);
2462
2463 uint64_t sizeInBytes;
2464 Block::iterator nBegin, nEnd;
2465 LogicalResult iRet = generateCopy(
2466 region: *regionEntry.second, block, begin, end, copyPlacementBlock,
2467 copyInPlacementStart, copyOutPlacementStart, copyOptions,
2468 fastBufferMap, copyNests, sizeInBytes: &sizeInBytes, nBegin: &nBegin, nEnd: &nEnd);
2469 if (succeeded(Result: iRet)) {
2470 // begin/end could have been invalidated, and need update.
2471 begin = nBegin;
2472 end = nEnd;
2473 totalCopyBuffersSizeInBytes += sizeInBytes;
2474 }
2475 ret = ret & succeeded(Result: iRet);
2476 }
2477 };
2478 processRegions(readRegions);
2479 processRegions(writeRegions);
2480
2481 if (!ret) {
2482 LLVM_DEBUG(begin->emitError(
2483 "copy generation failed for one or more memref's in this block\n"));
2484 return failure();
2485 }
2486
2487 // For a range of operations, a note will be emitted at the caller.
2488 AffineForOp forOp;
2489 if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2490 LLVM_DEBUG(forOp.emitRemark()
2491 << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2492 << " KiB of copy buffers in fast memory space for this block");
2493 }
2494
2495 if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2496 block->getParentOp()->emitWarning(
2497 message: "total size of all copy buffers' for this block exceeds fast memory "
2498 "capacity");
2499 }
2500
2501 return success();
2502}
2503
2504// A convenience version of affineDataCopyGenerate for all ops in the body of
2505// an AffineForOp.
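// A possible invocation (a sketch; the option values below are illustrative,
// not recommendations):
//   AffineCopyOptions options{/*generateDma=*/false, /*slowMemorySpace=*/0,
//                             /*fastMemorySpace=*/1, /*tagMemorySpace=*/0,
//                             /*fastMemCapacityBytes=*/32 * 1024};
//   DenseSet<Operation *> copyNests;
//   (void)affineDataCopyGenerate(forOp, options, /*filterMemRef=*/std::nullopt,
//                                copyNests);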
2506LogicalResult mlir::affine::affineDataCopyGenerate(
2507 AffineForOp forOp, const AffineCopyOptions &copyOptions,
2508 std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2509 return affineDataCopyGenerate(forOp.getBody()->begin(),
2510 std::prev(forOp.getBody()->end()), copyOptions,
2511 filterMemRef, copyNests);
2512}
2513
2514LogicalResult mlir::affine::generateCopyForMemRegion(
2515 const MemRefRegion &memrefRegion, Operation *analyzedOp,
2516 const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2517 Block *block = analyzedOp->getBlock();
2518 auto begin = analyzedOp->getIterator();
2519 auto end = std::next(begin);
2520 DenseMap<Value, Value> fastBufferMap;
2521 DenseSet<Operation *> copyNests;
2522
2523 auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2524 copyOptions, fastBufferMap, copyNests,
2525 &result.sizeInBytes, &begin, &end);
2526 if (failed(err))
2527 return err;
2528
2529 const auto &en = fastBufferMap.find(Val: memrefRegion.memref);
2530 // In some cases (empty loops), no copy generation would have happened.
2531 if (en == fastBufferMap.end())
2532 return failure();
2533 result.alloc = en->second.getDefiningOp();
2534 assert(result.alloc && "fast buffer expected to be locally allocated");
2535 assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2536 result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2537 return success();
2538}
2539
2540/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2541static void
2542gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2543 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
// Add a new empty level to the output if it doesn't already exist.
2545 assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2546 if (currLoopDepth == depthToLoops.size())
2547 depthToLoops.emplace_back();
2548
2549 for (auto &op : *block) {
2550 if (auto forOp = dyn_cast<AffineForOp>(op)) {
2551 depthToLoops[currLoopDepth].push_back(forOp);
2552 gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2553 }
2554 }
2555}
2556
2557/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
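/// For example (a sketch), for a function containing a single 2-d loop nest,
/// depthToLoops[0] holds the outer loop and depthToLoops[1] the inner one.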
2558void mlir::affine::gatherLoops(
2559 func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2560 for (auto &block : func)
2561 gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2562
2563 // Remove last loop level from output since it's empty.
2564 if (!depthToLoops.empty()) {
2565 assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2566 depthToLoops.pop_back();
2567 }
2568}
2569
2570AffineForOp mlir::affine::createCanonicalizedAffineForOp(
2571 OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2572 ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2573 SmallVector<Value, 4> lowerOperands(lbOperands);
2574 SmallVector<Value, 4> upperOperands(ubOperands);
2575
2576 fullyComposeAffineMapAndOperands(map: &lbMap, operands: &lowerOperands);
2577 canonicalizeMapAndOperands(map: &lbMap, operands: &lowerOperands);
2578 lbMap = removeDuplicateExprs(map: lbMap);
2579 fullyComposeAffineMapAndOperands(map: &ubMap, operands: &upperOperands);
2580 canonicalizeMapAndOperands(map: &ubMap, operands: &upperOperands);
2581 ubMap = removeDuplicateExprs(map: ubMap);
2582
2583 return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2584 step);
2585}
2586
2587/// Creates an AffineIfOp that encodes the conditional to choose between
/// the constant trip count version and an unknown trip count version of this
/// nest of loops. This is used to separate partial and full tiles when `loops`
/// are the intra-tile (point) loops. The affine.if op is inserted at the builder
2591/// insertion point of `b`.
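/// For illustration (a sketch of the shape of the output, not of a specific
/// input), the generated guard resembles:
///   affine.if #set(...) {
///     // version specialized for full tiles (constant trip counts)
///   } else {
///     // general version handling partial tiles
///   }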
2592static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2593 OpBuilder b) {
2594 if (loops.empty())
2595 return nullptr;
2596
2597 auto *context = loops[0].getContext();
2598
2599 FlatAffineValueConstraints cst;
2600 SmallVector<Operation *, 8> ops;
2601 llvm::append_range(C&: ops, R&: loops);
2602 (void)getIndexSet(ops, domain: &cst);
2603
2604 // Remove constraints that are independent of these loop IVs.
2605 cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2606
2607 // Construct the constraint set representing the guard for full tiles. The
2608 // lower bound (and upper bound) corresponding to the full tile should be
2609 // larger (and resp. smaller) than any other lower (or upper bound).
  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
  for (auto loop : loops) {
    (void)loop;
    // TODO: Non-unit stride should be straightforward to generalize to.
    assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
    // Mark everything other than the point loop IV as a symbol for the
    // purpose of finding a constant diff pair.
    cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
                               1);
    unsigned fullTileLbPos, fullTileUbPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &fullTileLbPos,
                                        &fullTileUbPos)) {
      LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
      return nullptr;
    }

    SmallVector<unsigned, 4> lbIndices, ubIndices;
    cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);

    auto fLb = cst.getInequality(fullTileLbPos);
    auto fUb = cst.getInequality(fullTileUbPos);
    fullTileLb.assign(fLb.begin(), fLb.end());
    fullTileUb.assign(fUb.begin(), fUb.end());

    // The full tile lower bound should be >= any other lower bound.
    for (auto lbIndex : lbIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

    // The full tile upper bound should be <= any other upper bound.
    for (auto ubIndex : ubIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(ubIndex, i) -= fullTileUb[i];

    cst.removeVar(0);
  }

  // The previous step leaves all-zero rows at the full tile lb and ub
  // positions themselves; remove those and any other duplicates / trivial
  // redundancies.
  cst.removeTrivialRedundancy();

  // Turn everything into dims conservatively since we earlier turned all
  // trailing ids past the point loop IV into symbols. Some of these could be
  // outer loop IVs; we'll canonicalize anyway.
  cst.setDimSymbolSeparation(0);

  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
  // ifCondSet can be null if cst was empty -- this can happen if all loops
  // in the nest have constant trip counts.
  if (!ifCondSet)
    return nullptr;

  SmallVector<Value, 4> setOperands;
  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
                              /*withElseRegion=*/true);
}

/// Create the full tile loop nest (along with its body).
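/// Each full tile loop is built from the constant-difference lower/upper
/// bound pair found for the corresponding input loop, so the resulting nest
/// has constant trip counts (e.g., 32 iterations per level when the nest was
/// tiled by 32; the figure is only illustrative).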
static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
  fullTileLoops.reserve(inputNest.size());

  // For each loop in the original nest, identify a lower/upper bound pair
  // whose difference is a constant.
  FlatAffineValueConstraints cst;
  for (auto loop : inputNest) {
    // TODO: straightforward to generalize to a non-unit stride.
    if (loop.getStepAsInt() != 1) {
      LLVM_DEBUG(llvm::dbgs()
                 << "[tile separation] non-unit stride not implemented\n");
      return failure();
    }
    SmallVector<Operation *, 1> loopOp{loop.getOperation()};
    (void)getIndexSet(loopOp, &cst);
    // Mark everything other than this loop IV as a symbol so that we can find
    // an <lb, ub> pair with a constant difference.
    cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
    unsigned lbPos, ubPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &lbPos, &ubPos) ||
        lbPos == ubPos) {
      LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
                                 "equalities not yet handled\n");
      return failure();
    }

    // Set all variables as dimensions uniformly since some of those marked as
    // symbols above could be outer loop IVs (corresponding tile space IVs).
    cst.setDimSymbolSeparation(/*newSymbolCount=*/0);

    AffineValueMap lbVmap, ubVmap;
    cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
    cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
    AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
        b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
        ubVmap.getOperands(), ubVmap.getAffineMap());
    b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
    fullTileLoops.push_back(fullTileLoop);
  }

  // Add the body for the full tile loop nest.
  IRMapping operandMap;
  for (const auto &loopEn : llvm::enumerate(inputNest))
    operandMap.map(loopEn.value().getInductionVar(),
                   fullTileLoops[loopEn.index()].getInductionVar());
  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
  for (auto &op : inputNest.back().getBody()->without_terminator())
    b.clone(op, operandMap);
  return success();
}

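// Illustrative effect on a one-dimensional intra-tile nest (a structural
// sketch with placeholder map/set names, not IR copied from a test):
//
//   affine.for %i = #lb(%ii) to min #ub(%ii)[%N] { ... }
//
// becomes
//
//   affine.if #is_full_tile(%ii)[%N] {
//     affine.for %i = ... { ... }    // full tile, constant trip count
//   } else {
//     affine.for %i = #lb(%ii) to min #ub(%ii)[%N] { ... }  // partial tile
//   }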
LogicalResult
mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
                                SmallVectorImpl<AffineForOp> *fullTileNest) {
  if (inputNest.empty())
    return success();

  auto firstLoop = inputNest[0];

  // Each successive for op has to be nested inside the previous one.
  auto prevLoop = firstLoop;
  for (auto loop : inputNest.drop_front(1)) {
    assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
    prevLoop = loop;
  }

  // Create the full tile loop nest.
  SmallVector<AffineForOp, 4> fullTileLoops;
  OpBuilder b(firstLoop);
  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
    if (!fullTileLoops.empty())
      fullTileLoops.front().erase();
    return failure();
  }

  // Create and insert the version select right before the root of the nest.
  b = OpBuilder(firstLoop);
  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
  if (!ifOp) {
    fullTileLoops.front().erase();
    LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
                               "separation condition\n");
    return failure();
  }

  // Move the full tile into the then block.
  Block *thenBlock = ifOp.getThenBlock();
  AffineForOp outermostFullTileLoop = fullTileLoops[0];
  thenBlock->getOperations().splice(
      std::prev(thenBlock->end()),
      outermostFullTileLoop->getBlock()->getOperations(),
      Block::iterator(outermostFullTileLoop));

  // Move the partial tile into the else block. The partial tile is the same as
  // the original loop nest.
  Block *elseBlock = ifOp.getElseBlock();
  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
                                    firstLoop->getBlock()->getOperations(),
                                    Block::iterator(firstLoop));

  if (fullTileNest)
    *fullTileNest = std::move(fullTileLoops);

  return success();
}

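/// Walks the perfect nest rooted at `op`, looks for maximal bands of loops
/// whose bound operands are all defined above the band's outermost loop, and
/// coalesces each such band into a single loop. As an illustrative case, for
/// a 3-d nest in which only the innermost loop's bounds depend on the middle
/// loop's IV, the outer two loops form a band and are coalesced while the
/// innermost loop is left untouched.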
LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
  LogicalResult result(failure());
  SmallVector<AffineForOp> loops;
  getPerfectlyNestedLoops(loops, op);
  if (loops.size() <= 1)
    return success();

  // Look for bands of loops that can be coalesced, i.e., perfectly nested
  // loops whose bounds are all defined above some loop in the nest.
  // 1. For each loop, find the outermost loop in the nest above which its
  // operands are defined.
  SmallVector<unsigned> operandsDefinedAbove(loops.size());
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    operandsDefinedAbove[i] = i;
    for (unsigned j = 0; j < i; ++j) {
      if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
        operandsDefinedAbove[i] = j;
        break;
      }
    }
  }

  // 2. Identify bands of loops such that the operands of all of them are
  // defined above the first loop in the band. Traverse the nest bottom-up
  // so that modifications don't invalidate the inner loops.
  for (unsigned end = loops.size(); end > 0; --end) {
    unsigned start = 0;
    for (; start < end - 1; ++start) {
      auto maxPos =
          *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                            std::next(operandsDefinedAbove.begin(), end));
      if (maxPos > start)
        continue;
      assert(maxPos == start &&
             "expected loop bounds to be known at the start of the band");
      auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
      if (succeeded(coalesceLoops(band)))
        result = success();
      break;
    }
    // If a band was found and transformed, keep looking at the loops above
    // the outermost transformed loop.
    if (start != end - 1)
      end = start + 1;
  }
  return result;
}

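/// Returns the number of enclosing loop-like ops, starting from `operand`'s
/// owner and walking outwards, for which `operand`'s value is defined outside
/// the loop (i.e., loops with respect to which the operand is invariant); the
/// walk stops at the first enclosing loop where that no longer holds.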
int64_t mlir::affine::numEnclosingInvariantLoops(OpOperand &operand) {
  int64_t count = 0;
  Operation *currentOp = operand.getOwner();
  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
    if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
      break;
    currentOp = loopOp;
    count++;
  }
  return count;
}
