1 | //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements miscellaneous loop transformation routines. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "mlir/Dialect/Affine/LoopUtils.h" |
14 | #include "mlir/Analysis/SliceAnalysis.h" |
15 | #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" |
16 | #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" |
17 | #include "mlir/Dialect/Affine/Analysis/Utils.h" |
18 | #include "mlir/Dialect/Affine/IR/AffineOps.h" |
19 | #include "mlir/Dialect/Affine/IR/AffineValueMap.h" |
20 | #include "mlir/Dialect/Affine/Utils.h" |
21 | #include "mlir/Dialect/Func/IR/FuncOps.h" |
22 | #include "mlir/Dialect/MemRef/IR/MemRef.h" |
23 | #include "mlir/Dialect/SCF/IR/SCF.h" |
24 | #include "mlir/IR/IRMapping.h" |
25 | #include "mlir/IR/IntegerSet.h" |
26 | #include "mlir/Support/MathExtras.h" |
27 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h" |
28 | #include "llvm/ADT/MapVector.h" |
29 | #include "llvm/ADT/SmallPtrSet.h" |
30 | #include "llvm/Support/Debug.h" |
31 | #include "llvm/Support/raw_ostream.h" |
32 | #include <optional> |
33 | |
34 | #define DEBUG_TYPE "loop-utils" |
35 | |
36 | using namespace mlir; |
37 | using namespace affine; |
38 | using namespace presburger; |
39 | using llvm::SmallMapVector; |
40 | |
41 | namespace { |
42 | // This structure is to pass and return sets of loop parameters without |
43 | // confusing the order. |
44 | struct LoopParams { |
45 | Value lowerBound; |
46 | Value upperBound; |
47 | Value step; |
48 | }; |
49 | } // namespace |
50 | |
51 | /// Computes the cleanup loop lower bound of the loop being unrolled with |
/// the specified unroll factor; this bound will also be the upper bound of the
/// main
53 | /// part of the unrolled loop. Computes the bound as an AffineMap with its |
54 | /// operands or a null map when the trip count can't be expressed as an affine |
55 | /// expression. |
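///
/// For example (illustrative): for `affine.for %i = 0 to 101` unrolled by a
/// factor of 4, the trip count is 101, so the cleanup lower bound evaluates to
/// 0 + (101 - 101 mod 4) * 1 = 100, i.e. (after simplification) the constant
/// map `() -> (100)`.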
56 | static void |
57 | getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, |
58 | AffineMap &cleanupLbMap, |
59 | SmallVectorImpl<Value> &cleanupLbOperands) { |
60 | AffineMap tripCountMap; |
61 | SmallVector<Value, 4> tripCountOperands; |
62 | getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands); |
63 | // Trip count can't be computed. |
64 | if (!tripCountMap) { |
65 | cleanupLbMap = AffineMap(); |
66 | return; |
67 | } |
68 | |
69 | OpBuilder b(forOp); |
70 | auto lbMap = forOp.getLowerBoundMap(); |
71 | auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap, |
72 | forOp.getLowerBoundOperands()); |
73 | |
74 | // For each upper bound expr, get the range. |
75 | // Eg: affine.for %i = lb to min (ub1, ub2), |
76 | // where tripCountExprs yield (tr1, tr2), we create affine.apply's: |
77 | // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all |
78 | // these affine.apply's make up the cleanup loop lower bound. |
79 | SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults()); |
80 | SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults()); |
81 | int64_t step = forOp.getStepAsInt(); |
82 | for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) { |
    auto tripCountExpr = tripCountMap.getResult(i);
    bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
    auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
                                  tripCountMap.getNumSymbols(), bumpExprs[i]);
    bumpValues[i] =
        b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
89 | } |
90 | |
91 | SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults()); |
92 | for (unsigned i = 0, e = bumpExprs.size(); i < e; i++) |
    newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
94 | |
95 | cleanupLbOperands.clear(); |
  cleanupLbOperands.push_back(lb);
  cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
  cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
                                b.getContext());
  // Simplify the cleanupLbMap + cleanupLbOperands.
  fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
  cleanupLbMap = simplifyAffineMap(cleanupLbMap);
  canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
104 | // Remove any affine.apply's that became dead from the simplification above. |
105 | for (auto v : bumpValues) |
106 | if (v.use_empty()) |
107 | v.getDefiningOp()->erase(); |
108 | |
109 | if (lb.use_empty()) |
110 | lb.erase(); |
111 | } |
112 | |
113 | /// Helper to replace uses of loop carried values (iter_args) and loop |
114 | /// yield values while promoting single iteration affine.for ops. |
115 | static void replaceIterArgsAndYieldResults(AffineForOp forOp) { |
116 | // Replace uses of iter arguments with iter operands (initial values). |
117 | auto iterOperands = forOp.getInits(); |
118 | auto iterArgs = forOp.getRegionIterArgs(); |
119 | for (auto e : llvm::zip(iterOperands, iterArgs)) |
120 | std::get<1>(e).replaceAllUsesWith(std::get<0>(e)); |
121 | |
122 | // Replace uses of loop results with the values yielded by the loop. |
123 | auto outerResults = forOp.getResults(); |
124 | auto innerResults = forOp.getBody()->getTerminator()->getOperands(); |
125 | for (auto e : llvm::zip(outerResults, innerResults)) |
126 | std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); |
127 | } |
128 | |
/// Promotes the loop body of a forOp to its containing block if the forOp is
/// known to have a single iteration.
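///
/// For example (illustrative), `affine.for %i = 0 to 1 { ... }` has a constant
/// trip count of one: its body ops are moved into the parent block, uses of %i
/// are replaced with the constant 0, and the affine.for op is erased.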
131 | LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { |
132 | std::optional<uint64_t> tripCount = getConstantTripCount(forOp); |
133 | if (!tripCount || *tripCount != 1) |
134 | return failure(); |
135 | |
136 | // TODO: extend this for arbitrary affine bounds. |
137 | if (forOp.getLowerBoundMap().getNumResults() != 1) |
138 | return failure(); |
139 | |
140 | // Replaces all IV uses to its single iteration value. |
141 | auto iv = forOp.getInductionVar(); |
142 | auto *parentBlock = forOp->getBlock(); |
143 | if (!iv.use_empty()) { |
144 | if (forOp.hasConstantLowerBound()) { |
145 | OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody()); |
146 | auto constOp = topBuilder.create<arith::ConstantIndexOp>( |
147 | forOp.getLoc(), forOp.getConstantLowerBound()); |
148 | iv.replaceAllUsesWith(constOp); |
149 | } else { |
150 | auto lbOperands = forOp.getLowerBoundOperands(); |
151 | auto lbMap = forOp.getLowerBoundMap(); |
152 | OpBuilder builder(forOp); |
153 | if (lbMap == builder.getDimIdentityMap()) { |
        // No need to generate an affine.apply.
155 | iv.replaceAllUsesWith(lbOperands[0]); |
156 | } else { |
157 | auto affineApplyOp = |
158 | builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands); |
159 | iv.replaceAllUsesWith(affineApplyOp); |
160 | } |
161 | } |
162 | } |
163 | |
164 | replaceIterArgsAndYieldResults(forOp); |
165 | |
166 | // Move the loop body operations, except for its terminator, to the loop's |
167 | // containing block. |
168 | forOp.getBody()->back().erase(); |
169 | parentBlock->getOperations().splice(Block::iterator(forOp), |
170 | forOp.getBody()->getOperations()); |
171 | forOp.erase(); |
172 | return success(); |
173 | } |
174 | |
175 | /// Generates an affine.for op with the specified lower and upper bounds |
176 | /// while generating the right IV remappings to realize shifts for operations in |
177 | /// its body. The operations that go into the loop body are specified in |
178 | /// opGroupQueue starting from the specified offset, and in that order. The |
179 | /// first element of the pair specifies the shift applied to that group of |
180 | /// operations; the shift is multiplied by the loop step before being applied. |
181 | /// Returns nullptr if the generated loop simplifies to a single iteration one. |
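///
/// For example (illustrative), with a source loop of step 2 and an op group
/// with shift 3, uses of the source IV inside the cloned group are remapped
/// through an affine.apply computing `newIV - 3 * 2`.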
182 | static AffineForOp generateShiftedLoop( |
183 | AffineMap lbMap, AffineMap ubMap, |
184 | const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue, |
185 | unsigned offset, AffineForOp srcForOp, OpBuilder b) { |
186 | auto lbOperands = srcForOp.getLowerBoundOperands(); |
187 | auto ubOperands = srcForOp.getUpperBoundOperands(); |
188 | |
189 | assert(lbMap.getNumInputs() == lbOperands.size()); |
190 | assert(ubMap.getNumInputs() == ubOperands.size()); |
191 | |
192 | auto loopChunk = |
193 | b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, ubOperands, |
194 | ubMap, srcForOp.getStepAsInt()); |
195 | auto loopChunkIV = loopChunk.getInductionVar(); |
196 | auto srcIV = srcForOp.getInductionVar(); |
197 | |
198 | IRMapping operandMap; |
199 | |
  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
  for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
202 | uint64_t shift = it.first; |
203 | auto ops = it.second; |
204 | // All 'same shift' operations get added with their operands being |
205 | // remapped to results of cloned operations, and their IV used remapped. |
206 | // Generate the remapping if the shift is not zero: remappedIV = newIV - |
207 | // shift. |
208 | if (!srcIV.use_empty() && shift != 0) { |
209 | auto ivRemap = bodyBuilder.create<AffineApplyOp>( |
210 | srcForOp.getLoc(), |
211 | bodyBuilder.getSingleDimShiftAffineMap( |
212 | -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)), |
213 | loopChunkIV); |
214 | operandMap.map(srcIV, ivRemap); |
215 | } else { |
216 | operandMap.map(srcIV, loopChunkIV); |
217 | } |
218 | for (auto *op : ops) |
219 | bodyBuilder.clone(*op, operandMap); |
220 | }; |
221 | if (succeeded(promoteIfSingleIteration(loopChunk))) |
222 | return AffineForOp(); |
223 | return loopChunk; |
224 | } |
225 | |
226 | // The skewing of operations with respect to one another can be used for |
227 | // example to allow overlap of asynchronous operations (such as DMA |
228 | // communication) with computation, or just relative shifting of operations |
229 | // for better register reuse, locality or parallelism. As such, the shifts are |
230 | // typically expected to be at most of the order of the number of operations. |
231 | // This method should not be used as a substitute for loop distribution/fission. |
// This method uses an algorithm that runs in time linear in the number of
// operations in the body of the for loop (using the 'sweep line' paradigm).
// This method asserts preservation of SSA dominance. Checks for that, as well
// as for memory-based dependence preservation, rest with the users of this
// method.
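//
// For example (illustrative), skewing a two-op body with shifts [0, 1] on a
// loop with a constant trip count N produces a prologue loop running only the
// first op for one iteration, a steady-state loop running both ops (with the
// second op's IV uses shifted back by one step), and an epilogue loop running
// only the second op for the final iteration.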
237 | LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp, |
238 | ArrayRef<uint64_t> shifts, |
239 | bool unrollPrologueEpilogue) { |
  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
         "too few/many shifts");
242 | if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end())) |
243 | return success(); |
244 | |
245 | // If the trip counts aren't constant, we would need versioning and |
246 | // conditional guards (or context information to prevent such versioning). The |
247 | // better way to pipeline for such loops is to first tile them and extract |
248 | // constant trip count "full tiles" before applying this. |
249 | auto mayBeConstTripCount = getConstantTripCount(forOp); |
250 | if (!mayBeConstTripCount) { |
    LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
252 | return success(); |
253 | } |
254 | uint64_t tripCount = *mayBeConstTripCount; |
255 | |
  assert(isOpwiseShiftValid(forOp, shifts) &&
         "shifts will lead to an invalid transformation\n");
258 | |
259 | int64_t step = forOp.getStepAsInt(); |
260 | |
261 | unsigned numChildOps = shifts.size(); |
262 | |
263 | // Do a linear time (counting) sort for the shifts. |
  uint64_t maxShift = *llvm::max_element(shifts);
265 | if (maxShift >= numChildOps) { |
266 | // Large shifts are not the typical use case. |
    forOp.emitWarning("not shifting because shifts are unrealistically large");
268 | return success(); |
269 | } |
270 | |
271 | // An array of operation groups sorted by shift amount; each group has all |
272 | // operations with the same shift in the order in which they appear in the |
273 | // body of the 'affine.for' op. |
274 | std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1); |
275 | unsigned pos = 0; |
276 | for (auto &op : forOp.getBody()->without_terminator()) { |
277 | auto shift = shifts[pos++]; |
278 | sortedOpGroups[shift].push_back(&op); |
279 | } |
280 | |
281 | // Unless the shifts have a specific pattern (which actually would be the |
282 | // common use case), prologue and epilogue are not meaningfully defined. |
283 | // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first |
284 | // loop generated as the prologue and the last as epilogue and unroll these |
285 | // fully. |
286 | AffineForOp prologue, epilogue; |
287 | |
288 | // Do a sweep over the sorted shifts while storing open groups in a |
289 | // vector, and generating loop portions as necessary during the sweep. A block |
290 | // of operations is paired with its shift. |
291 | std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue; |
292 | |
293 | auto origLbMap = forOp.getLowerBoundMap(); |
294 | uint64_t lbShift = 0; |
295 | OpBuilder b(forOp); |
296 | for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) { |
297 | // If nothing is shifted by d, continue. |
298 | if (sortedOpGroups[d].empty()) |
299 | continue; |
300 | if (!opGroupQueue.empty()) { |
      assert(d > 0 &&
             "Queue expected to be empty when the first block is found");
303 | // The interval for which the loop needs to be generated here is: |
304 | // [lbShift, min(lbShift + tripCount, d)) and the body of the |
305 | // loop needs to have all operations in opQueue in that order. |
306 | AffineForOp res; |
307 | if (lbShift + tripCount * step < d * step) { |
308 | res = generateShiftedLoop( |
309 | b.getShiftedAffineMap(origLbMap, lbShift), |
310 | b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step), |
311 | opGroupQueue, /*offset=*/0, forOp, b); |
312 | // Entire loop for the queued op groups generated, empty it. |
313 | opGroupQueue.clear(); |
314 | lbShift += tripCount * step; |
315 | } else { |
316 | res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift), |
317 | b.getShiftedAffineMap(origLbMap, d), |
318 | opGroupQueue, /*offset=*/0, forOp, b); |
319 | lbShift = d * step; |
320 | } |
321 | |
322 | if (res) { |
323 | // Simplify/canonicalize the affine.for. |
324 | RewritePatternSet patterns(res.getContext()); |
325 | AffineForOp::getCanonicalizationPatterns(patterns, res.getContext()); |
326 | GreedyRewriteConfig config; |
327 | config.strictMode = GreedyRewriteStrictness::ExistingOps; |
328 | bool erased; |
329 | (void)applyOpPatternsAndFold(res.getOperation(), std::move(patterns), |
330 | config, /*changed=*/nullptr, &erased); |
331 | if (!erased && !prologue) |
332 | prologue = res; |
333 | if (!erased) |
334 | epilogue = res; |
335 | } |
336 | } else { |
337 | // Start of first interval. |
338 | lbShift = d * step; |
339 | } |
340 | // Augment the list of operations that get into the current open interval. |
    opGroupQueue.emplace_back(d, sortedOpGroups[d]);
342 | } |
343 | |
  // Those operation groups left in the queue now need to be processed (FIFO)
345 | // and their loops completed. |
346 | for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) { |
347 | uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step; |
348 | epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift), |
349 | b.getShiftedAffineMap(origLbMap, ubShift), |
350 | opGroupQueue, /*offset=*/i, forOp, b); |
351 | lbShift = ubShift; |
352 | if (!prologue) |
353 | prologue = epilogue; |
354 | } |
355 | |
356 | // Erase the original for op. |
357 | forOp.erase(); |
358 | |
359 | if (unrollPrologueEpilogue && prologue) |
360 | (void)loopUnrollFull(prologue); |
  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
362 | (void)loopUnrollFull(epilogue); |
363 | |
364 | return success(); |
365 | } |
366 | |
367 | /// Checks whether a loop nest is hyper-rectangular or not. |
368 | static LogicalResult |
369 | checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) { |
370 | FlatAffineValueConstraints cst; |
371 | SmallVector<Operation *, 8> ops(input.begin(), input.end()); |
372 | // 0-d or 1-d is trivially hyper-rectangular. |
373 | if (input.size() <= 1) |
374 | return success(); |
  if (failed(getIndexSet(ops, &cst))) {
    LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
    return failure();
  }
  if (!cst.isHyperRectangular(0, input.size())) {
    LLVM_DEBUG(llvm::dbgs()
               << "Non-hyperrectangular nests not supported for tiling!\n");
382 | return failure(); |
383 | } |
384 | return success(); |
385 | } |
386 | |
387 | /// Check if the input nest is supported for tiling and whether tiling would be |
388 | /// legal or not. |
389 | template <typename t> |
390 | static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input, |
391 | ArrayRef<t> tileSizes) { |
  assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
393 | |
394 | if (llvm::any_of(input, |
395 | [](AffineForOp op) { return op.getNumResults() > 0; })) { |
    LLVM_DEBUG(llvm::dbgs()
               << "Cannot tile nest where a loop has yield values\n");
398 | return failure(); |
399 | } |
400 | |
401 | // Check if the supplied `for` ops are all successively nested. |
  if (!isPerfectlyNested(input)) {
    LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
404 | return failure(); |
405 | } |
406 | |
407 | // TODO: handle non hyper-rectangular spaces. |
  if (failed(checkIfHyperRectangular(input)))
409 | return failure(); |
410 | |
411 | return success(); |
412 | } |
413 | |
414 | /// Move the loop body of AffineForOp 'src' from 'src' into the specified |
415 | /// location in destination's body, ignoring the terminator. |
416 | static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, |
417 | Block::iterator loc) { |
418 | auto &ops = src.getBody()->getOperations(); |
419 | dest.getBody()->getOperations().splice(loc, ops, ops.begin(), |
420 | std::prev(ops.end())); |
421 | } |
422 | |
423 | /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest |
424 | /// body. |
425 | static void moveLoopBody(AffineForOp src, AffineForOp dest) { |
426 | moveLoopBodyImpl(src, dest, dest.getBody()->begin()); |
427 | } |
428 | |
/// Constructs a tiled loop nest, without setting the loop bounds, and moves
/// the body of the original loop nest to the tiled loop nest.
431 | static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops, |
432 | AffineForOp rootAffineForOp, unsigned width, |
433 | MutableArrayRef<AffineForOp> tiledLoops) { |
434 | Location loc = rootAffineForOp.getLoc(); |
435 | |
  // The outermost among the loops as we add more.
437 | Operation *topLoop = rootAffineForOp.getOperation(); |
438 | AffineForOp innermostPointLoop; |
439 | |
440 | // Add intra-tile (or point) loops. |
441 | for (unsigned i = 0; i < width; i++) { |
442 | OpBuilder b(topLoop); |
443 | // Loop bounds will be set later. |
444 | AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0); |
445 | pointLoop.getBody()->getOperations().splice( |
446 | pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), |
447 | topLoop); |
448 | tiledLoops[2 * width - 1 - i] = pointLoop; |
449 | topLoop = pointLoop.getOperation(); |
450 | if (i == 0) |
451 | innermostPointLoop = pointLoop; |
452 | } |
453 | |
  // Add tile space loops.
455 | for (unsigned i = width; i < 2 * width; i++) { |
456 | OpBuilder b(topLoop); |
457 | // Loop bounds will be set later. |
458 | AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0); |
459 | tileSpaceLoop.getBody()->getOperations().splice( |
460 | tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), |
461 | topLoop); |
462 | tiledLoops[2 * width - i - 1] = tileSpaceLoop; |
463 | topLoop = tileSpaceLoop.getOperation(); |
464 | } |
465 | |
466 | // Move the loop body of the original nest to the new one. |
467 | moveLoopBody(origLoops.back(), innermostPointLoop); |
468 | } |
469 | |
470 | /// Set lower and upper bounds of intra-tile loops for parametric tiling. |
471 | // TODO: Handle non-constant lower bounds. |
472 | static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, |
473 | AffineForOp newInterTileLoop, |
474 | AffineForOp newIntraTileLoop, |
475 | Value tileSize) { |
  // The lower bound for the intra-tile loop is represented by an affine map
  // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
  // for the intra-tile loop is represented by an affine map as
  // (%i, %t0)->((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb), where
  // %i is the loop IV of the corresponding inter-tile loop, %t0 is the
  // corresponding tiling parameter, %origlb is the lower bound and
  // %origLoopStep is the loop step of the corresponding inter-tile loop.
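  //
  // For example (illustrative), with %origlb = 0 and a unit step, the lower
  // bound map reduces to (%i, %t0) -> (%i * %t0) and the upper bound map to
  // (%i, %t0) -> (%i * %t0 + %t0).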
483 | |
  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");
486 | |
487 | // Get lower bound of original loop as an affine expression. |
488 | AffineExpr origLowerBoundExpr; |
489 | origLowerBoundExpr = |
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());
491 | |
492 | // Add dim operands from original lower/upper bound. |
493 | SmallVector<Value, 4> lbOperands, ubOperands; |
494 | AffineBound lb = origLoop.getLowerBound(); |
495 | AffineBound ub = origLoop.getUpperBound(); |
  lbOperands.reserve(lb.getNumOperands() + 2);
  ubOperands.reserve(ub.getNumOperands() + 2);
498 | AffineMap origLbMap = lb.getMap(); |
499 | AffineMap origUbMap = ub.getMap(); |
500 | for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j) |
    lbOperands.push_back(lb.getOperand(j));
  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(j));
504 | |
505 | // Add a new dim operand in lb/ubOperands corresponding to the origLoop |
506 | // IV. |
  lbOperands.push_back(newInterTileLoop.getInductionVar());
  ubOperands.push_back(newInterTileLoop.getInductionVar());
509 | |
510 | // Get loop IV as an affine expression for lower/upper bound. Size of |
  // lb/ubOperands is guaranteed to be at least one.
  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
514 | |
515 | // Add symbol operands from original lower/upper bound. |
516 | for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j) |
    lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
520 | |
521 | // Add a new symbol operand which is the tile size for this loop. |
  lbOperands.push_back(tileSize);
  ubOperands.push_back(tileSize);
524 | |
525 | SmallVector<AffineExpr, 4> lbBoundExprs; |
526 | SmallVector<AffineExpr, 4> ubBoundExprs; |
  lbBoundExprs.reserve(origLbMap.getNumResults());
  ubBoundExprs.reserve(origUbMap.getNumResults());
529 | |
530 | // Get tiling parameter as an affine expression for lb/ub. |
  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
533 | |
534 | // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb. |
535 | lbBoundExprs.push_back( |
      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
537 | origLowerBoundExpr); |
538 | |
539 | // Get the origLoopStep as an affine expression. |
  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStepAsInt());
541 | |
542 | // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) + |
543 | // (tilingParameter * origLoopStep) + origlb. |
544 | ubBoundExprs.push_back( |
      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
546 | (ubTileParameter * origLoopStep) + origLowerBoundExpr); |
547 | |
  ubBoundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());
550 | |
551 | AffineMap lbMap = |
      AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
                     lbBoundExprs, b.getContext());
554 | newIntraTileLoop.setLowerBound(lbOperands, lbMap); |
555 | |
556 | AffineMap ubMap = |
      AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
                     ubBoundExprs, b.getContext());
559 | newIntraTileLoop.setUpperBound(ubOperands, ubMap); |
560 | |
561 | // Original loop step must be preserved. |
562 | newIntraTileLoop.setStep(origLoop.getStepAsInt()); |
563 | } |
564 | |
565 | /// Set lower and upper bounds of inter-tile loops for parametric tiling. |
566 | // TODO: Handle non-constant lower bounds. |
567 | static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, |
568 | AffineForOp newLoop, Value tileSize) { |
569 | OperandRange newLbOperands = origLoop.getLowerBoundOperands(); |
570 | |
  // The lower bounds for inter-tile loops are the same as the corresponding
  // lower bounds of the original loops.
573 | newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap()); |
574 | |
  // The new upper bound map for inter-tile loops, assuming constant lower
  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
  // originalLowerBound), tiling parameter), where the tiling parameter is the
  // respective tile size for that loop. For e.g., if the original ubmap was
  // ()->(1024), the new map will be
  // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
  // Therefore a new symbol operand is inserted in the map and the result
  // expression is overwritten.
583 | |
  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");
586 | |
587 | // Get lower bound of original loop as an affine expression. |
588 | AffineExpr origLowerBoundExpr; |
589 | origLowerBoundExpr = |
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());
591 | |
592 | // Add dim operands from original upper bound. |
593 | SmallVector<Value, 4> ubOperands; |
594 | AffineBound ub = origLoop.getUpperBound(); |
  ubOperands.reserve(ub.getNumOperands() + 1);
596 | AffineMap origUbMap = ub.getMap(); |
597 | for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) |
    ubOperands.push_back(ub.getOperand(j));
599 | |
600 | // Add symbol operands from original upper bound. |
601 | for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) |
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
603 | |
604 | // Add a new symbol operand which is the tile size for this loop. |
  ubOperands.push_back(tileSize);
606 | |
607 | // Get tiling parameter as an affine expression. |
  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
609 | |
610 | SmallVector<AffineExpr, 4> boundExprs; |
  boundExprs.reserve(origUbMap.getNumResults());
612 | int64_t origUpperBound; |
613 | AffineExpr origUpperBoundExpr; |
614 | |
615 | // If upper bound for the original loop is constant, then the constant can |
616 | // be obtained as an affine expression straight away. |
617 | if (origLoop.hasConstantUpperBound()) { |
618 | origUpperBound = origLoop.getConstantUpperBound(); |
619 | |
620 | // Get original constant upper bound as an affine expression. |
    origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
622 | |
    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
628 | } else { |
    // If the upper bound for the original loop is not constant then two cases
    // are possible, although their handling is the same: 1.) The result of
631 | // ubmap has only one result expression. For e.g. |
632 | // affine.for %i = 5 to %ub |
633 | // |
634 | // A symbol operand is added which represents the tiling parameter. The |
635 | // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5) |
636 | // where 's0' is the original upper bound and 's1' is the tiling |
637 | // parameter. 2.) When ubMap has more than one result expression. For e.g. |
    // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
639 | // affine.for %i = 5 to min #map0()[%s0, %s1] |
640 | // |
641 | // A symbol operand is added which represents the tiling parameter. The |
642 | // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5, |
643 | // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter. |
644 | |
645 | // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound - |
646 | // originalLowerBound), tilingParameter). |
647 | for (AffineExpr origUpperBoundExpr : origUbMap.getResults()) |
648 | boundExprs.push_back( |
649 | origLowerBoundExpr + |
650 | (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter)); |
651 | } |
652 | |
653 | AffineMap ubMap = |
      AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
                     boundExprs, b.getContext());
656 | newLoop.setUpperBound(ubOperands, ubMap); |
657 | |
658 | // Original loop step must be preserved. |
659 | newLoop.setStep(origLoop.getStepAsInt()); |
660 | } |
661 | |
662 | /// Constructs and sets new loop bounds after tiling for the case of |
663 | /// hyper-rectangular index sets, where the bounds of one dimension do not |
664 | /// depend on other dimensions and tiling parameters are captured from SSA |
665 | /// values. Bounds of each dimension can thus be treated independently, |
666 | /// and deriving the new bounds is much simpler and faster than for the case of |
667 | /// tiling arbitrary polyhedral shapes. |
668 | static void constructParametricallyTiledIndexSetHyperRect( |
669 | MutableArrayRef<AffineForOp> origLoops, |
670 | MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) { |
  assert(!origLoops.empty() && "expected at least one loop in band");
  assert(origLoops.size() == tileSizes.size() &&
         "expected tiling parameter for each loop in band.");
674 | |
675 | OpBuilder b(origLoops[0].getOperation()); |
676 | unsigned width = origLoops.size(); |
677 | |
678 | // Set bounds for tile space loops. |
679 | for (unsigned i = 0; i < width; ++i) { |
680 | setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]); |
681 | } |
682 | |
683 | // Set bounds for intra-tile loops. |
684 | for (unsigned i = 0; i < width; ++i) { |
685 | setIntraTileBoundsParametric(b, origLoops[i], newLoops[i], |
686 | newLoops[i + width], tileSizes[i]); |
687 | } |
688 | } |
689 | |
690 | /// Constructs and sets new loop bounds after tiling for the case of |
691 | /// hyper-rectangular index sets, where the bounds of one dimension do not |
692 | /// depend on other dimensions. Bounds of each dimension can thus be treated |
693 | /// independently, and deriving the new bounds is much simpler and faster |
694 | /// than for the case of tiling arbitrary polyhedral shapes. |
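///
/// For example (illustrative), tiling `affine.for %i = 0 to 1024 step 2` with
/// a tile size of 32 yields a tile-space loop with the same bounds but step
/// 64 (= 32 * 2), and an intra-tile loop running from the tile-space IV %ii to
/// %ii + 64 with the original step 2.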
695 | static void |
696 | constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops, |
697 | MutableArrayRef<AffineForOp> newLoops, |
698 | ArrayRef<unsigned> tileSizes) { |
699 | assert(!origLoops.empty()); |
700 | assert(origLoops.size() == tileSizes.size()); |
701 | |
702 | OpBuilder b(origLoops[0].getOperation()); |
703 | unsigned width = origLoops.size(); |
704 | |
705 | // Bounds for tile space loops. |
706 | for (unsigned i = 0; i < width; i++) { |
707 | OperandRange newLbOperands = origLoops[i].getLowerBoundOperands(); |
708 | OperandRange newUbOperands = origLoops[i].getUpperBoundOperands(); |
709 | newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap()); |
710 | newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap()); |
711 | // If the step size of original loop is x and tileSize is y then after |
712 | // tiling the tile space loops' step size becomes x*y. |
713 | newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt()); |
714 | } |
715 | // Bounds for intra-tile loops. |
716 | for (unsigned i = 0; i < width; i++) { |
717 | int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]); |
718 | std::optional<uint64_t> mayBeConstantCount = |
719 | getConstantTripCount(origLoops[i]); |
720 | // The lower bound is just the tile-space loop. |
721 | AffineMap lbMap = b.getDimIdentityMap(); |
722 | newLoops[width + i].setLowerBound( |
723 | /*operands=*/newLoops[i].getInductionVar(), lbMap); |
    // The step size of intra-tile loops is just the original loops' step size.
725 | newLoops[width + i].setStep(origLoops[i].getStepAsInt()); |
726 | |
727 | // Set the upper bound. |
728 | if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) { |
729 | // Trip count is less than the tile size: upper bound is lower bound + |
730 | // trip count * stepSize. |
      AffineMap ubMap = b.getSingleDimShiftAffineMap(
          *mayBeConstantCount * origLoops[i].getStepAsInt());
733 | newLoops[width + i].setUpperBound( |
734 | /*operands=*/newLoops[i].getInductionVar(), ubMap); |
735 | } else if (largestDiv % tileSizes[i] != 0) { |
736 | // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i). |
737 | // Construct the upper bound map; the operands are the original operands |
738 | // with 'i' (tile-space loop) appended to it. The new upper bound map is |
739 | // the original one with an additional expression i + tileSize * stepSize |
740 | // appended. |
741 | |
742 | // Add dim operands from original upper bound. |
743 | SmallVector<Value, 4> ubOperands; |
744 | AffineBound ub = origLoops[i].getUpperBound(); |
      ubOperands.reserve(ub.getNumOperands() + 1);
746 | AffineMap origUbMap = ub.getMap(); |
747 | for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) |
        ubOperands.push_back(ub.getOperand(j));
749 | |
750 | // Add dim operand for new loop upper bound. |
      ubOperands.push_back(newLoops[i].getInductionVar());
752 | |
753 | // Add symbol operands from original upper bound. |
754 | for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) |
        ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
756 | |
757 | SmallVector<AffineExpr, 4> boundExprs; |
      boundExprs.reserve(1 + origUbMap.getNumResults());
      AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
760 | // The new upper bound map is the original one with an additional |
761 | // expression i + tileSize * stepSize (of original loop) appended. |
      boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStepAsInt());
      boundExprs.append(origUbMap.getResults().begin(),
                        origUbMap.getResults().end());
      AffineMap ubMap =
          AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
                         boundExprs, b.getContext());
768 | newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap); |
769 | } else { |
      // No need for the min expression.
      AffineExpr dim = b.getAffineDimExpr(0);
772 | AffineMap ubMap = AffineMap::get( |
773 | 1, 0, dim + tileSizes[i] * origLoops[i].getStepAsInt()); |
774 | newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap); |
775 | } |
776 | } |
777 | } |
778 | |
779 | LogicalResult |
780 | mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input, |
781 | ArrayRef<unsigned> tileSizes, |
782 | SmallVectorImpl<AffineForOp> *tiledNest) { |
783 | if (input.empty()) |
784 | return success(); |
785 | |
  if (failed(performPreTilingChecks(input, tileSizes)))
787 | return failure(); |
788 | |
789 | MutableArrayRef<AffineForOp> origLoops = input; |
790 | AffineForOp rootAffineForOp = origLoops[0]; |
791 | |
792 | // Note that width is at least one since the band isn't empty. |
793 | unsigned width = input.size(); |
794 | SmallVector<AffineForOp, 6> tiledLoops(2 * width); |
795 | |
796 | // Construct a tiled loop nest without setting their bounds. Bounds are |
797 | // set later. |
798 | constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops); |
799 | |
800 | SmallVector<Value, 8> origLoopIVs; |
  extractForInductionVars(input, &origLoopIVs);
802 | |
803 | // Set loop bounds for the tiled loop nest. |
804 | constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes); |
805 | |
806 | // Replace original IVs with intra-tile loop IVs. |
807 | for (unsigned i = 0; i < width; i++) |
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
809 | |
810 | // Erase the old loop nest. |
811 | rootAffineForOp.erase(); |
812 | |
813 | if (tiledNest) |
814 | *tiledNest = std::move(tiledLoops); |
815 | |
816 | return success(); |
817 | } |
818 | |
819 | /// Tiles the specified band of perfectly nested loops creating tile-space |
820 | /// loops and intra-tile loops, using SSA values as tiling parameters. A band |
821 | /// is a contiguous set of loops. |
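///
/// For example (illustrative), tiling `affine.for %i = 0 to %N` with an SSA
/// tile size %ts yields an inter-tile loop from 0 to ceildiv(%N, %ts) and an
/// intra-tile loop from %ii * %ts to min(%ii * %ts + %ts, %N), where %ii is
/// the inter-tile loop IV; both loops keep the original step.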
822 | LogicalResult mlir::affine::tilePerfectlyNestedParametric( |
823 | MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes, |
824 | SmallVectorImpl<AffineForOp> *tiledNest) { |
825 | if (input.empty()) |
826 | return success(); |
827 | |
  if (failed(performPreTilingChecks(input, tileSizes)))
829 | return failure(); |
830 | |
831 | MutableArrayRef<AffineForOp> origLoops = input; |
832 | AffineForOp rootAffineForOp = origLoops[0]; |
833 | unsigned width = input.size(); |
834 | SmallVector<AffineForOp, 6> tiledLoops(2 * width); |
835 | |
836 | // Construct a tiled loop nest without setting their bounds. Bounds are |
837 | // set later. |
838 | constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops); |
839 | |
840 | SmallVector<Value, 8> origLoopIVs; |
  extractForInductionVars(input, &origLoopIVs);
842 | |
843 | // Set loop bounds for the tiled loop nest. |
844 | constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops, |
845 | tileSizes); |
846 | |
847 | // Replace original IVs with intra-tile loop IVs. |
848 | for (unsigned i = 0; i < width; i++) |
849 | origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar()); |
850 | |
851 | // Erase the old loop nest. |
852 | rootAffineForOp.erase(); |
853 | |
854 | if (tiledNest) |
855 | *tiledNest = std::move(tiledLoops); |
856 | |
857 | return success(); |
858 | } |
859 | |
/// Get the perfectly nested sequence of loops starting at the root of the loop
/// nest. A loop is perfectly nested iff the first op in its body is another
/// AffineForOp and the second op is a terminator.
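///
/// For example (illustrative), for a nest `affine.for %i { affine.for %j {
/// affine.for %k { ... } } }` where each body holds only the next loop and a
/// terminator, this collects the loops for %i, %j and %k, in that order.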
864 | void mlir::affine::getPerfectlyNestedLoops( |
865 | SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) { |
866 | for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) { |
867 | nestedLoops.push_back(root); |
868 | Block &body = root.getRegion().front(); |
    if (body.begin() != std::prev(body.end(), 2))
870 | return; |
871 | |
872 | root = dyn_cast<AffineForOp>(&body.front()); |
873 | if (!root) |
874 | return; |
875 | } |
876 | } |
877 | |
878 | /// Identify valid and profitable bands of loops to tile. This is currently just |
879 | /// a temporary placeholder to test the mechanics of tiled code generation. |
880 | /// Returns all maximal outermost perfect loop nests to tile. |
881 | void mlir::affine::getTileableBands( |
882 | func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) { |
883 | // Get maximal perfect nest of 'affine.for' insts starting from root |
884 | // (inclusive). |
885 | for (AffineForOp forOp : f.getOps<AffineForOp>()) { |
886 | SmallVector<AffineForOp, 6> band; |
887 | getPerfectlyNestedLoops(band, forOp); |
888 | bands->push_back(band); |
889 | } |
890 | } |
891 | |
892 | /// Unrolls this loop completely. |
893 | LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) { |
894 | std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); |
895 | if (mayBeConstantTripCount.has_value()) { |
896 | uint64_t tripCount = *mayBeConstantTripCount; |
897 | if (tripCount == 0) |
898 | return success(); |
899 | if (tripCount == 1) |
900 | return promoteIfSingleIteration(forOp); |
901 | return loopUnrollByFactor(forOp, tripCount); |
902 | } |
903 | return failure(); |
904 | } |
905 | |
/// Unrolls this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
908 | LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp, |
909 | uint64_t unrollFactor) { |
910 | std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); |
911 | if (mayBeConstantTripCount.has_value() && |
912 | *mayBeConstantTripCount < unrollFactor) |
913 | return loopUnrollByFactor(forOp, *mayBeConstantTripCount); |
914 | return loopUnrollByFactor(forOp, unrollFactor); |
915 | } |
916 | |
/// Generates unrolled copies of the loop body in 'loopBodyBlock', whose
/// induction variable is 'forOpIV', by 'unrollFactor', calling 'ivRemapFn' to
/// remap 'forOpIV' for each unrolled body. If specified, annotates the Ops in
/// each unrolled iteration using 'annotateFn'.
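///
/// For example (illustrative), with an unroll factor of 4 and an `ivRemapFn`
/// computing `iv + i * step`, every op in the original body gets three extra
/// clones whose IV uses read `iv + step`, `iv + 2 * step` and `iv + 3 * step`.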
921 | static void generateUnrolledLoop( |
922 | Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, |
923 | function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn, |
924 | function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn, |
925 | ValueRange iterArgs, ValueRange yieldedValues) { |
926 | // Builder to insert unrolled bodies just before the terminator of the body of |
927 | // 'forOp'. |
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
929 | |
930 | if (!annotateFn) |
931 | annotateFn = [](unsigned, Operation *, OpBuilder) {}; |
932 | |
933 | // Keep a pointer to the last non-terminator operation in the original block |
934 | // so that we know what to clone (since we are doing this in-place). |
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
936 | |
937 | // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies). |
938 | SmallVector<Value, 4> lastYielded(yieldedValues); |
939 | |
940 | for (unsigned i = 1; i < unrollFactor; i++) { |
941 | IRMapping operandMap; |
942 | |
943 | // Prepare operand map. |
    operandMap.map(iterArgs, lastYielded);
945 | |
946 | // If the induction variable is used, create a remapping to the value for |
947 | // this unrolled instance. |
948 | if (!forOpIV.use_empty()) { |
949 | Value ivUnroll = ivRemapFn(i, forOpIV, builder); |
      operandMap.map(forOpIV, ivUnroll);
951 | } |
952 | |
953 | // Clone the original body of 'forOp'. |
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
956 | annotateFn(i, clonedOp, builder); |
957 | } |
958 | |
959 | // Update yielded values. If the yielded value is defined outside the |
960 | // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus |
961 | // the `lastYielded` value remains unchanged. Else, update the `lastYielded` |
962 | // value with the clone corresponding to the yielded value. |
963 | for (unsigned i = 0, e = lastYielded.size(); i < e; i++) { |
964 | Operation *defOp = yieldedValues[i].getDefiningOp(); |
965 | if (defOp && defOp->getBlock() == loopBodyBlock) |
        lastYielded[i] = operandMap.lookup(yieldedValues[i]);
967 | } |
968 | } |
969 | |
970 | // Make sure we annotate the Ops in the original body. We do this last so that |
971 | // any annotations are not copied into the cloned Ops above. |
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
973 | annotateFn(0, &*it, builder); |
974 | |
975 | // Update operands of the yield statement. |
976 | loopBodyBlock->getTerminator()->setOperands(lastYielded); |
977 | } |
978 | |
979 | /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip |
980 | /// count is not a multiple of `unrollFactor`. |
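///
/// For example (illustrative), unrolling `affine.for %i = 0 to 10` by a factor
/// of 4 shrinks the main loop to `affine.for %i = 0 to 8` (whose step the
/// caller then scales to 4) and creates a cleanup loop `affine.for %i = 8 to
/// 10` for the remaining two iterations.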
981 | static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, |
982 | uint64_t unrollFactor) { |
983 | // Insert the cleanup loop right after 'forOp'. |
  OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
985 | auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp)); |
986 | |
987 | // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result |
988 | // and produce results for the original users of `forOp` results. |
989 | auto results = forOp.getResults(); |
990 | auto cleanupResults = cleanupForOp.getResults(); |
991 | auto cleanupIterOperands = cleanupForOp.getInits(); |
992 | |
993 | for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) { |
994 | std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); |
995 | cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e)); |
996 | } |
997 | |
998 | AffineMap cleanupMap; |
999 | SmallVector<Value, 4> cleanupOperands; |
1000 | getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands); |
1001 | if (!cleanupMap) |
1002 | return failure(); |
1003 | |
1004 | cleanupForOp.setLowerBound(cleanupOperands, cleanupMap); |
1005 | // Promote the loop body up if this has turned into a single iteration loop. |
1006 | (void)promoteIfSingleIteration(cleanupForOp); |
1007 | |
1008 | // Adjust upper bound of the original loop; this is the same as the lower |
1009 | // bound of the cleanup loop. |
1010 | forOp.setUpperBound(cleanupOperands, cleanupMap); |
1011 | return success(); |
1012 | } |
1013 | |
1014 | /// Unrolls this loop by the specified factor. Returns success if the loop |
1015 | /// is successfully unrolled. |
1016 | LogicalResult mlir::affine::loopUnrollByFactor( |
1017 | AffineForOp forOp, uint64_t unrollFactor, |
1018 | function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn, |
1019 | bool cleanUpUnroll) { |
  assert(unrollFactor > 0 && "unroll factor should be positive");
1021 | |
1022 | std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); |
1023 | if (unrollFactor == 1) { |
1024 | if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 && |
1025 | failed(promoteIfSingleIteration(forOp))) |
1026 | return failure(); |
1027 | return success(); |
1028 | } |
1029 | |
1030 | // Nothing in the loop body other than the terminator. |
1031 | if (llvm::hasSingleElement(forOp.getBody()->getOperations())) |
1032 | return success(); |
1033 | |
1034 | // If the trip count is lower than the unroll factor, no unrolled body. |
1035 | if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) { |
1036 | if (cleanUpUnroll) { |
1037 | // Unroll the cleanup loop if cleanUpUnroll is specified. |
1038 | return loopUnrollFull(forOp); |
1039 | } |
1040 | |
1041 | return failure(); |
1042 | } |
1043 | |
1044 | // Generate the cleanup loop if trip count isn't a multiple of unrollFactor. |
1045 | if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) { |
1046 | // Loops where the lower bound is a max expression or the upper bound is |
1047 | // a min expression and the trip count doesn't divide the unroll factor |
1048 | // can't be unrolled since the lower bound of the cleanup loop in such cases |
1049 | // cannot be expressed as an affine function or a max over affine functions. |
1050 | if (forOp.getLowerBoundMap().getNumResults() != 1 || |
1051 | forOp.getUpperBoundMap().getNumResults() != 1) |
1052 | return failure(); |
1053 | if (cleanUpUnroll) |
1054 | // Force unroll including cleanup loop |
1055 | return loopUnrollFull(forOp); |
1056 | if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor))) |
    assert(false && "cleanup loop lower bound map for single result lower "
                    "and upper bound maps can always be determined");
1059 | } |
1060 | |
1061 | ValueRange iterArgs(forOp.getRegionIterArgs()); |
1062 | auto yieldedValues = forOp.getBody()->getTerminator()->getOperands(); |
1063 | |
1064 | // Scale the step of loop being unrolled by unroll factor. |
1065 | int64_t step = forOp.getStepAsInt(); |
1066 | forOp.setStep(step * unrollFactor); |
1067 | generateUnrolledLoop( |
1068 | forOp.getBody(), forOp.getInductionVar(), unrollFactor, |
1069 | [&](unsigned i, Value iv, OpBuilder b) { |
1070 | // iv' = iv + i * step |
        auto d0 = b.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1073 | return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv); |
1074 | }, |
1075 | /*annotateFn=*/annotateFn, |
1076 | /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues); |
1077 | |
1078 | // Promote the loop body up if this has turned into a single iteration loop. |
1079 | (void)promoteIfSingleIteration(forOp); |
1080 | return success(); |
1081 | } |
1082 | |
1083 | LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp, |
1084 | uint64_t unrollJamFactor) { |
1085 | std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); |
1086 | if (mayBeConstantTripCount.has_value() && |
1087 | *mayBeConstantTripCount < unrollJamFactor) |
1088 | return loopUnrollJamByFactor(forOp, *mayBeConstantTripCount); |
1089 | return loopUnrollJamByFactor(forOp, unrollJamFactor); |
1090 | } |
1091 | |
1092 | /// Check if all control operands of all loops are defined outside of `forOp` |
1093 | /// and return false if not. |
1094 | static bool areInnerBoundsInvariant(AffineForOp forOp) { |
1095 | auto walkResult = forOp.walk([&](AffineForOp aForOp) { |
1096 | for (auto controlOperand : aForOp.getControlOperands()) { |
1097 | if (!forOp.isDefinedOutsideOfLoop(controlOperand)) |
1098 | return WalkResult::interrupt(); |
1099 | } |
1100 | return WalkResult::advance(); |
1101 | }); |
1102 | return !walkResult.wasInterrupted(); |
1103 | } |
1104 | |
1105 | // Gathers all maximal sub-blocks of operations that do not themselves |
// include a for op (an operation could have a descendant for op though
1107 | // in its tree). Ignore the block terminators. |
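//
// For example (illustrative), for a loop body containing `op0; op1;
// affine.for ...; op2` (plus the terminator), the gathered sub-blocks are
// [op0, op1] and [op2]; sub-blocks inside the nested loop are gathered
// recursively.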
1108 | struct JamBlockGatherer { |
1109 | // Store iterators to the first and last op of each sub-block found. |
1110 | std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks; |
1111 | |
1112 | // This is a linear time walk. |
1113 | void walk(Operation *op) { |
1114 | for (auto ®ion : op->getRegions()) |
1115 | for (auto &block : region) |
1116 | walk(block); |
1117 | } |
1118 | |
1119 | void walk(Block &block) { |
    for (auto it = block.begin(), e = std::prev(block.end()); it != e;) {
      auto subBlockStart = it;
      while (it != e && !isa<AffineForOp>(&*it))
        ++it;
      if (it != subBlockStart)
        subBlocks.emplace_back(subBlockStart, std::prev(it));
      // Process all for ops that appear next.
      while (it != e && isa<AffineForOp>(&*it))
        walk(&*it++);
1129 | } |
1130 | } |
1131 | }; |
1132 | |
1133 | /// Unrolls and jams this loop by the specified factor. |
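///
/// For example (illustrative), unroll-jamming
///   affine.for %i = 0 to 100 {
///     affine.for %j = 0 to 50 { "op"(%i, %j) }
///   }
/// by a factor of 2 scales the outer step to 2 and places a second copy of the
/// inner loop body, with %i remapped to %i + 1, inside the same inner loop.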
1134 | LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp, |
1135 | uint64_t unrollJamFactor) { |
  assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1137 | |
1138 | std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); |
1139 | if (unrollJamFactor == 1) { |
1140 | if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 && |
1141 | failed(promoteIfSingleIteration(forOp))) |
1142 | return failure(); |
1143 | return success(); |
1144 | } |
1145 | |
1146 | // Nothing in the loop body other than the terminator. |
1147 | if (llvm::hasSingleElement(forOp.getBody()->getOperations())) |
1148 | return success(); |
1149 | |
1150 | // If the trip count is lower than the unroll jam factor, no unroll jam. |
1151 | if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) { |
    LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1153 | return failure(); |
1154 | } |
1155 | |
1156 | // If any control operand of any inner loop of `forOp` is defined within |
1157 | // `forOp`, no unroll jam. |
1158 | if (!areInnerBoundsInvariant(forOp)) |
1159 | return failure(); |
1160 | |
1161 | // Gather all sub-blocks to jam upon the loop being unrolled. |
1162 | JamBlockGatherer jbg; |
1163 | jbg.walk(forOp); |
1164 | auto &subBlocks = jbg.subBlocks; |
1165 | |
1166 | // Collect loops with iter_args. |
1167 | SmallVector<AffineForOp, 4> loopsWithIterArgs; |
1168 | forOp.walk([&](AffineForOp aForOp) { |
1169 | if (aForOp.getNumIterOperands() > 0) |
1170 | loopsWithIterArgs.push_back(aForOp); |
1171 | }); |
1172 | |
1173 | // Get supported reductions to be used for creating reduction ops at the end. |
1174 | SmallVector<LoopReduction> reductions; |
1175 | if (forOp.getNumIterOperands() > 0) |
1176 | getSupportedReductions(forOp, reductions); |
1177 | |
1178 | // Generate the cleanup loop if trip count isn't a multiple of |
1179 | // unrollJamFactor. |
1180 | if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) { |
1181 | // Loops where the lower bound is a max expression or the upper bound is |
1182 | // a min expression and the trip count doesn't divide the unroll factor |
1183 | // can't be unrolled since the lower bound of the cleanup loop in such cases |
1184 | // cannot be expressed as an affine function or a max over affine functions. |
1185 | if (forOp.getLowerBoundMap().getNumResults() != 1 || |
1186 | forOp.getUpperBoundMap().getNumResults() != 1) |
1187 | return failure(); |
1188 | if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor))) |
    assert(false && "cleanup loop lower bound map for single result lower "
                    "and upper bound maps can always be determined");
1191 | } |
1192 | |
1193 | // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled |
1194 | // iteration. There are (`unrollJamFactor` - 1) iterations. |
1195 | SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1); |
1196 | |
1197 | // For any loop with iter_args, replace it with a new loop that has |
1198 | // `unrollJamFactor` copies of its iterOperands, iter_args and yield |
1199 | // operands. |
1200 | SmallVector<AffineForOp, 4> newLoopsWithIterArgs; |
1201 | IRRewriter rewriter(forOp.getContext()); |
1202 | for (AffineForOp oldForOp : loopsWithIterArgs) { |
1203 | SmallVector<Value> dupIterOperands, dupYieldOperands; |
1204 | ValueRange oldIterOperands = oldForOp.getInits(); |
1205 | ValueRange oldIterArgs = oldForOp.getRegionIterArgs(); |
1206 | ValueRange oldYieldOperands = |
1207 | cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands(); |
1208 | // Get additional iterOperands, iterArgs, and yield operands. We will |
1209 | // fix iterOperands and yield operands after cloning of sub-blocks. |
1210 | for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { |
1211 | dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end()); |
1212 | dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end()); |
1213 | } |
1214 | // Create a new loop with additional iterOperands, iter_args and yield |
1215 | // operands. This new loop will take the loop body of the original loop. |
1216 | bool forOpReplaced = oldForOp == forOp; |
1217 | AffineForOp newForOp = |
1218 | cast<AffineForOp>(*oldForOp.replaceWithAdditionalYields( |
1219 | rewriter, dupIterOperands, /*replaceInitOperandUsesInLoop=*/false, |
1220 | [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) { |
1221 | return dupYieldOperands; |
1222 | })); |
1223 | newLoopsWithIterArgs.push_back(newForOp); |
1224 | // `forOp` has been replaced with a new loop. |
1225 | if (forOpReplaced) |
1226 | forOp = newForOp; |
1227 | // Update `operandMaps` for `newForOp` iterArgs and results. |
1228 | ValueRange newIterArgs = newForOp.getRegionIterArgs(); |
1229 | unsigned oldNumIterArgs = oldIterArgs.size(); |
1230 | ValueRange newResults = newForOp.getResults(); |
1231 | unsigned oldNumResults = newResults.size() / unrollJamFactor; |
1232 | assert(oldNumIterArgs == oldNumResults && |
1233 | "oldNumIterArgs must be the same as oldNumResults" ); |
1234 | for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { |
1235 | for (unsigned j = 0; j < oldNumIterArgs; ++j) { |
1236 | // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and |
1237 | // results. Update `operandMaps[i - 1]` to map old iterArgs and results |
1238 | // to those in the `i`th new set. |
1239 | operandMaps[i - 1].map(newIterArgs[j], |
1240 | newIterArgs[i * oldNumIterArgs + j]); |
1241 | operandMaps[i - 1].map(newResults[j], |
1242 | newResults[i * oldNumResults + j]); |
1243 | } |
1244 | } |
1245 | } |
1246 | |
  // Scale the step of the loop being unroll-jammed by the unroll-jam factor.
1248 | int64_t step = forOp.getStepAsInt(); |
1249 | forOp.setStep(step * unrollJamFactor); |
1250 | |
1251 | auto forOpIV = forOp.getInductionVar(); |
1252 | // Unroll and jam (appends unrollJamFactor - 1 additional copies). |
1253 | for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { |
1254 | for (auto &subBlock : subBlocks) { |
1255 | // Builder to insert unroll-jammed bodies. Insert right at the end of |
1256 | // sub-block. |
      OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1258 | |
1259 | // If the induction variable is used, create a remapping to the value for |
1260 | // this unrolled instance. |
1261 | if (!forOpIV.use_empty()) { |
1262 | // iv' = iv + i * step, i = 1 to unrollJamFactor-1. |
        auto d0 = builder.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1265 | auto ivUnroll = |
1266 | builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV); |
1267 | operandMaps[i - 1].map(forOpIV, ivUnroll); |
1268 | } |
1269 | // Clone the sub-block being unroll-jammed. |
      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
        builder.clone(*it, operandMaps[i - 1]);
1272 | } |
1273 | // Fix iterOperands and yield op operands of newly created loops. |
1274 | for (auto newForOp : newLoopsWithIterArgs) { |
1275 | unsigned oldNumIterOperands = |
1276 | newForOp.getNumIterOperands() / unrollJamFactor; |
1277 | unsigned numControlOperands = newForOp.getNumControlOperands(); |
1278 | auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator()); |
1279 | unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor; |
1280 | assert(oldNumIterOperands == oldNumYieldOperands && |
1281 | "oldNumIterOperands must be the same as oldNumYieldOperands" ); |
1282 | for (unsigned j = 0; j < oldNumIterOperands; ++j) { |
1283 | // The `i`th duplication of an old iterOperand or yield op operand |
1284 | // needs to be replaced with a mapped value from `operandMaps[i - 1]` |
1285 | // if such mapped value exists. |
1286 | newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j, |
1287 | operandMaps[i - 1].lookupOrDefault( |
1288 | newForOp.getOperand(numControlOperands + j))); |
1289 | yieldOp.setOperand( |
1290 | i * oldNumYieldOperands + j, |
1291 | operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j))); |
1292 | } |
1293 | } |
1294 | } |
1295 | if (forOp.getNumResults() > 0) { |
1296 | // Create reduction ops to combine every `unrollJamFactor` related results |
1297 | // into one value. For example, for %0:2 = affine.for ... and addf, we add |
1298 | // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with |
1299 | // %1. |
1300 | rewriter.setInsertionPointAfter(forOp); |
1301 | auto loc = forOp.getLoc(); |
1302 | unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor; |
1303 | for (LoopReduction &reduction : reductions) { |
1304 | unsigned pos = reduction.iterArgPosition; |
1305 | Value lhs = forOp.getResult(pos); |
1306 | Value rhs; |
1307 | SmallPtrSet<Operation *, 4> newOps; |
1308 | for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { |
1309 | rhs = forOp.getResult(i * oldNumResults + pos); |
1310 | // Create ops based on reduction type. |
1311 | lhs = arith::getReductionOp(reduction.kind, rewriter, loc, lhs, rhs); |
1312 | if (!lhs) |
1313 | return failure(); |
1314 | Operation *op = lhs.getDefiningOp(); |
        assert(op && "Reduction op should have been created");
        newOps.insert(op);
1317 | } |
1318 | // Replace all uses except those in newly created reduction ops. |
1319 | forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps); |
1320 | } |
1321 | } |
1322 | |
1323 | // Promote the loop body up if this has turned into a single iteration loop. |
1324 | (void)promoteIfSingleIteration(forOp); |
1325 | return success(); |
1326 | } |
1327 | |
1328 | /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is |
1329 | /// nested within 'forOpA' as the only non-terminator operation in its block. |
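/// For example (illustrative IR, assuming neither loop carries iter_args):
///
/// ```mlir
/// affine.for %i = 0 to 10 {
///   affine.for %j = 0 to 20 {
///     "test.use"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// becomes:
///
/// ```mlir
/// affine.for %j = 0 to 20 {
///   affine.for %i = 0 to 10 {
///     "test.use"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```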
1330 | void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) { |
1331 | assert(&*forOpA.getBody()->begin() == forOpB.getOperation()); |
1332 | auto &forOpABody = forOpA.getBody()->getOperations(); |
1333 | auto &forOpBBody = forOpB.getBody()->getOperations(); |
1334 | |
  // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
  // before forOpA (in forOpA's parent's block); this leaves forOpA's body
  // containing only the terminator.
1338 | forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA), |
1339 | forOpABody, forOpABody.begin(), |
1340 | std::prev(forOpABody.end())); |
1341 | // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's |
1342 | // body (this leaves forOpB's body containing only the terminator). |
1343 | forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(), |
1344 | std::prev(forOpBBody.end())); |
1345 | // 3) Splice forOpA into the beginning of forOpB's body. |
1346 | forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(), |
1347 | Block::iterator(forOpA)); |
1348 | } |
1349 | |
1350 | // Checks each dependence component against the permutation to see if the |
1351 | // desired loop interchange would violate dependences by making the |
1352 | // dependence component lexicographically negative. |
1353 | static bool checkLoopInterchangeDependences( |
1354 | const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec, |
1355 | ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) { |
1356 | // Invert permutation map. |
1357 | unsigned maxLoopDepth = loops.size(); |
1358 | SmallVector<unsigned, 4> loopPermMapInv; |
  loopPermMapInv.resize(maxLoopDepth);
1360 | for (unsigned i = 0; i < maxLoopDepth; ++i) |
1361 | loopPermMapInv[loopPermMap[i]] = i; |
1362 | |
1363 | // Check each dependence component against the permutation to see if the |
1364 | // desired loop interchange permutation would make the dependence vectors |
1365 | // lexicographically negative. |
1366 | // Example 1: [-1, 1][0, 0] |
1367 | // Example 2: [0, 0][-1, 1] |
1368 | for (const auto &depComps : depCompsVec) { |
1369 | assert(depComps.size() >= maxLoopDepth); |
1370 | // Check if the first non-zero dependence component is positive. |
1371 | // This iterates through loops in the desired order. |
1372 | for (unsigned j = 0; j < maxLoopDepth; ++j) { |
1373 | unsigned permIndex = loopPermMapInv[j]; |
1374 | assert(depComps[permIndex].lb); |
1375 | int64_t depCompLb = *depComps[permIndex].lb; |
1376 | if (depCompLb > 0) |
1377 | break; |
1378 | if (depCompLb < 0) |
1379 | return false; |
1380 | } |
1381 | } |
1382 | return true; |
1383 | } |
1384 | |
1385 | /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly |
1386 | /// nested sequence of loops in 'loops' would violate dependences. |
1387 | bool mlir::affine::isValidLoopInterchangePermutation( |
1388 | ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) { |
1389 | // Gather dependence components for dependences between all ops in loop nest |
1390 | // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth]. |
1391 | assert(loopPermMap.size() == loops.size()); |
1392 | unsigned maxLoopDepth = loops.size(); |
1393 | std::vector<SmallVector<DependenceComponent, 2>> depCompsVec; |
1394 | getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec); |
1395 | return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap); |
1396 | } |
1397 | |
1398 | /// Returns true if `loops` is a perfectly nested loop nest, where loops appear |
1399 | /// in it from outermost to innermost. |
1400 | bool LLVM_ATTRIBUTE_UNUSED |
1401 | mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) { |
1402 | assert(!loops.empty() && "no loops provided" ); |
1403 | |
1404 | // We already know that the block can't be empty. |
1405 | auto hasTwoElements = [](Block *block) { |
    auto secondOpIt = std::next(block->begin());
1407 | return secondOpIt != block->end() && &*secondOpIt == &block->back(); |
1408 | }; |
1409 | |
1410 | auto enclosingLoop = loops.front(); |
1411 | for (auto loop : loops.drop_front()) { |
1412 | auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp()); |
1413 | // parentForOp's body should be just this loop and the terminator. |
1414 | if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody())) |
1415 | return false; |
1416 | enclosingLoop = loop; |
1417 | } |
1418 | return true; |
1419 | } |
1420 | |
1421 | // input[i] should move from position i -> permMap[i]. Returns the position in |
1422 | // `input` that becomes the new outermost loop. |
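//
// For example (illustrative), with input = {%i, %j, %k} nested in that order
// and permMap = {1, 2, 0}, the permuted nest is %k -> %i -> %j, and the
// function returns 2, i.e., the position of %k in `input`.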
1423 | unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input, |
1424 | ArrayRef<unsigned> permMap) { |
1425 | assert(input.size() == permMap.size() && "invalid permutation map size" ); |
1426 | // Check whether the permutation spec is valid. This is a small vector - we'll |
1427 | // just sort and check if it's iota. |
1428 | SmallVector<unsigned, 4> checkPermMap(permMap.begin(), permMap.end()); |
  llvm::sort(checkPermMap);
  if (llvm::any_of(llvm::enumerate(checkPermMap),
                   [](const auto &en) { return en.value() != en.index(); }))
    assert(false && "invalid permutation map");
1433 | |
1434 | // Nothing to do. |
1435 | if (input.size() < 2) |
1436 | return 0; |
1437 | |
1438 | assert(isPerfectlyNested(input) && "input not perfectly nested" ); |
1439 | |
1440 | // Compute the inverse mapping, invPermMap: since input[i] goes to position |
1441 | // permMap[i], position i of the permuted nest is at input[invPermMap[i]]. |
1442 | SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap; |
1443 | for (unsigned i = 0, e = input.size(); i < e; ++i) |
    invPermMap.push_back({permMap[i], i});
  llvm::sort(invPermMap);
1446 | |
1447 | // Move the innermost loop body to the loop that would be the innermost in the |
1448 | // permuted nest (only if the innermost loop is going to change). |
1449 | if (permMap.back() != input.size() - 1) { |
1450 | auto *destBody = input[invPermMap.back().second].getBody(); |
1451 | auto *srcBody = input.back().getBody(); |
1452 | destBody->getOperations().splice(destBody->begin(), |
1453 | srcBody->getOperations(), srcBody->begin(), |
1454 | std::prev(srcBody->end())); |
1455 | } |
1456 | |
1457 | // We'll move each loop in `input` in the reverse order so that its body is |
1458 | // empty when we are moving it; this incurs zero copies and no erasing. |
1459 | for (int i = input.size() - 1; i >= 0; --i) { |
1460 | // If this has to become the outermost loop after permutation, add it to the |
1461 | // parent block of the original root. |
1462 | if (permMap[i] == 0) { |
1463 | // If the root remains the same, nothing to do. |
1464 | if (i == 0) |
1465 | continue; |
1466 | // Make input[i] the new outermost loop moving it into parentBlock. |
1467 | auto *parentBlock = input[0]->getBlock(); |
1468 | parentBlock->getOperations().splice(Block::iterator(input[0]), |
1469 | input[i]->getBlock()->getOperations(), |
1470 | Block::iterator(input[i])); |
1471 | continue; |
1472 | } |
1473 | |
1474 | // If the parent in the permuted order is the same as in the original, |
1475 | // nothing to do. |
1476 | unsigned parentPosInInput = invPermMap[permMap[i] - 1].second; |
1477 | if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput) |
1478 | continue; |
1479 | |
1480 | // Move input[i] to its surrounding loop in the transformed nest. |
1481 | auto *destBody = input[parentPosInInput].getBody(); |
1482 | destBody->getOperations().splice(destBody->begin(), |
1483 | input[i]->getBlock()->getOperations(), |
1484 | Block::iterator(input[i])); |
1485 | } |
1486 | |
1487 | return invPermMap[0].second; |
1488 | } |
1489 | |
1490 | // Sinks all sequential loops to the innermost levels (while preserving |
1491 | // relative order among them) and moves all parallel loops to the |
1492 | // outermost (while again preserving relative order among them). |
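//
// For example (illustrative), for a 3-d nest whose outermost and innermost
// loops are found parallel and whose middle loop is sequential, the
// permutation computed below is {0, 2, 1}: the sequential loop is sunk to the
// innermost position while the two parallel loops keep their relative order.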
1493 | AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) { |
1494 | SmallVector<AffineForOp, 4> loops; |
1495 | getPerfectlyNestedLoops(loops, forOp); |
1496 | if (loops.size() < 2) |
1497 | return forOp; |
1498 | |
1499 | // Gather dependence components for dependences between all ops in loop nest |
1500 | // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth]. |
1501 | unsigned maxLoopDepth = loops.size(); |
1502 | std::vector<SmallVector<DependenceComponent, 2>> depCompsVec; |
1503 | getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec); |
1504 | |
1505 | // Mark loops as either parallel or sequential. |
1506 | SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true); |
1507 | for (auto &depComps : depCompsVec) { |
1508 | assert(depComps.size() >= maxLoopDepth); |
1509 | for (unsigned j = 0; j < maxLoopDepth; ++j) { |
1510 | DependenceComponent &depComp = depComps[j]; |
1511 | assert(depComp.lb.has_value() && depComp.ub.has_value()); |
1512 | if (*depComp.lb != 0 || *depComp.ub != 0) |
1513 | isParallelLoop[j] = false; |
1514 | } |
1515 | } |
1516 | |
  unsigned numParallelLoops = llvm::count(isParallelLoop, true);
1518 | |
1519 | // Compute permutation of loops that sinks sequential loops (and thus raises |
1520 | // parallel loops) while preserving relative order. |
1521 | SmallVector<unsigned, 4> loopPermMap(maxLoopDepth); |
1522 | unsigned nextSequentialLoop = numParallelLoops; |
1523 | unsigned nextParallelLoop = 0; |
1524 | for (unsigned i = 0; i < maxLoopDepth; ++i) { |
1525 | if (isParallelLoop[i]) { |
1526 | loopPermMap[i] = nextParallelLoop++; |
1527 | } else { |
1528 | loopPermMap[i] = nextSequentialLoop++; |
1529 | } |
1530 | } |
1531 | |
1532 | // Check if permutation 'loopPermMap' would violate dependences. |
1533 | if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap)) |
1534 | return forOp; |
1535 | // Perform loop interchange according to permutation 'loopPermMap'. |
1536 | unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap); |
1537 | return loops[loopNestRootIndex]; |
1538 | } |
1539 | |
1540 | // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the |
1541 | // lower (resp. upper) loop bound. When called for both the lower and upper |
1542 | // bounds, the resulting IR resembles: |
1543 | // |
1544 | // ```mlir |
//   affine.for %i = max(..., `iv`) to min(..., `iv` + `offset`) {
//     ...
//   }
1548 | // ``` |
1549 | static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, |
1550 | SmallVector<Value, 4> *operands, |
1551 | int64_t offset = 0) { |
  auto bounds = llvm::to_vector<4>(map->getResults());
  bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
  operands->insert(operands->begin() + map->getNumDims(), iv);
  *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
                        b.getContext());
1557 | canonicalizeMapAndOperands(map, operands); |
1558 | } |
1559 | |
1560 | // Stripmines `forOp` by `factor` and sinks it under each of the `targets`. |
1561 | // Stripmine-sink is a primitive building block for generalized tiling of |
1562 | // imperfectly nested loops. |
1563 | // This transformation is purely mechanical and does not check legality, |
1564 | // profitability or even structural correctness. It is the user's |
1565 | // responsibility to specify `targets` that are dominated by `forOp`. |
1566 | // Returns the new AffineForOps, one per `targets`, nested immediately under |
1567 | // each of the `targets`. |
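//
// As a rough illustration (hypothetical, simplified IR), stripmining a
// constant-bound loop %i by a factor of 32 and sinking it under a target
// loop %j yields:
//
//   affine.for %i = 0 to 256 step 32 {
//     affine.for %j = ... {
//       affine.for %ii = max(0, %i) to min(256, %i + 32) {
//         ...
//       }
//     }
//   }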
1568 | static SmallVector<AffineForOp, 8> |
1569 | stripmineSink(AffineForOp forOp, uint64_t factor, |
1570 | ArrayRef<AffineForOp> targets) { |
1571 | auto originalStep = forOp.getStepAsInt(); |
1572 | auto scaledStep = originalStep * factor; |
1573 | forOp.setStep(scaledStep); |
1574 | |
  OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1576 | |
1577 | // Lower-bound map creation. |
1578 | auto lbMap = forOp.getLowerBoundMap(); |
1579 | SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands()); |
1580 | augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands); |
1581 | |
1582 | // Upper-bound map creation. |
1583 | auto ubMap = forOp.getUpperBoundMap(); |
1584 | SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands()); |
1585 | augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands, |
1586 | /*offset=*/scaledStep); |
1587 | |
1588 | auto iv = forOp.getInductionVar(); |
1589 | SmallVector<AffineForOp, 8> innerLoops; |
1590 | for (auto t : targets) { |
1591 | // Insert newForOp before the terminator of `t`. |
1592 | auto b = OpBuilder::atBlockTerminator(t.getBody()); |
1593 | auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap, |
1594 | ubOperands, ubMap, originalStep); |
1595 | auto begin = t.getBody()->begin(); |
1596 | // Skip terminator and `newForOp` which is just before the terminator. |
1597 | auto nOps = t.getBody()->getOperations().size() - 2; |
1598 | newForOp.getBody()->getOperations().splice( |
1599 | newForOp.getBody()->getOperations().begin(), |
1600 | t.getBody()->getOperations(), begin, std::next(begin, nOps)); |
1601 | replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(), |
1602 | newForOp.getRegion()); |
1603 | innerLoops.push_back(newForOp); |
1604 | } |
1605 | |
1606 | return innerLoops; |
1607 | } |
1608 | |
1609 | // Stripmines a `forOp` by `factor` and sinks it under a single `target`. |
1610 | // Returns the new AffineForOps, nested immediately under `target`. |
1611 | template <typename SizeType> |
1612 | static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor, |
1613 | AffineForOp target) { |
1614 | // TODO: Use cheap structural assertions that targets are nested under |
1615 | // forOp and that targets are not nested under each other when DominanceInfo |
1616 | // exposes the capability. It seems overkill to construct a whole function |
1617 | // dominance tree at this point. |
1618 | auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target)); |
1619 | assert(res.size() == 1 && "Expected 1 inner forOp" ); |
1620 | return res[0]; |
1621 | } |
1622 | |
1623 | SmallVector<SmallVector<AffineForOp, 8>, 8> |
1624 | mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes, |
1625 | ArrayRef<AffineForOp> targets) { |
1626 | SmallVector<SmallVector<AffineForOp, 8>, 8> res; |
1627 | SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end()); |
  for (auto it : llvm::zip(forOps, sizes)) {
    auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1630 | res.push_back(step); |
1631 | currentTargets = step; |
1632 | } |
1633 | return res; |
1634 | } |
1635 | |
1636 | SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps, |
1637 | ArrayRef<uint64_t> sizes, |
1638 | AffineForOp target) { |
1639 | SmallVector<AffineForOp, 8> res; |
1640 | for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) { |
1641 | assert(loops.size() == 1); |
1642 | res.push_back(loops[0]); |
1643 | } |
1644 | return res; |
1645 | } |
1646 | |
1647 | LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) { |
1648 | if (loops.size() < 2) |
1649 | return success(); |
1650 | |
1651 | AffineForOp innermost = loops.back(); |
1652 | AffineForOp outermost = loops.front(); |
1653 | AffineBound ub = outermost.getUpperBound(); |
1654 | AffineMap origUbMap = ub.getMap(); |
1655 | Location loc = outermost.getLoc(); |
1656 | OpBuilder builder(outermost); |
1657 | for (AffineForOp loop : loops) { |
1658 | // We only work on normalized loops. |
1659 | if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() || |
1660 | loop.getConstantLowerBound() != 0) |
1661 | return failure(); |
1662 | } |
1663 | SmallVector<Value, 4> upperBoundSymbols; |
1664 | SmallVector<Value, 4> ubOperands(ub.getOperands().begin(), |
1665 | ub.getOperands().end()); |
1666 | |
1667 | // 1. Store the upper bound of the outermost loop in a variable. |
1668 | Value prev; |
  if (!llvm::hasSingleElement(origUbMap.getResults()))
1670 | prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands); |
1671 | else |
1672 | prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands); |
  upperBoundSymbols.push_back(prev);
1674 | |
1675 | // 2. Emit code computing the upper bound of the coalesced loop as product of |
1676 | // the number of iterations of all loops. |
1677 | for (AffineForOp loop : loops.drop_front()) { |
1678 | ub = loop.getUpperBound(); |
1679 | origUbMap = ub.getMap(); |
1680 | ubOperands = ub.getOperands(); |
1681 | Value upperBound; |
1682 | // If upper bound map has more than one result, take their minimum. |
1683 | if (!llvm::hasSingleElement(origUbMap.getResults())) |
1684 | upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands); |
1685 | else |
1686 | upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands); |
1687 | upperBoundSymbols.push_back(upperBound); |
1688 | SmallVector<Value, 4> operands; |
1689 | operands.push_back(prev); |
1690 | operands.push_back(upperBound); |
1691 | // Maintain running product of loop upper bounds. |
1692 | prev = builder.create<AffineApplyOp>( |
1693 | loc, |
1694 | AffineMap::get(/*dimCount=*/1, |
1695 | /*symbolCount=*/1, |
1696 | builder.getAffineDimExpr(0) * |
1697 | builder.getAffineSymbolExpr(0)), |
1698 | operands); |
1699 | } |
1700 | // Set upper bound of the coalesced loop. |
1701 | AffineMap newUbMap = AffineMap::get( |
1702 | /*dimCount=*/0, |
      /*symbolCount=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1704 | outermost.setUpperBound(prev, newUbMap); |
1705 | |
1706 | builder.setInsertionPointToStart(outermost.getBody()); |
1707 | |
1708 | // 3. Remap induction variables. For each original loop, the value of the |
1709 | // induction variable can be obtained by dividing the induction variable of |
1710 | // the linearized loop by the total number of iterations of the loops nested |
1711 | // in it modulo the number of iterations in this loop (remove the values |
1712 | // related to the outer loops): |
1713 | // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i. |
1714 | // Compute these iteratively from the innermost loop by creating a "running |
1715 | // quotient" of division by the range. |
1716 | Value previous = outermost.getInductionVar(); |
1717 | for (unsigned idx = loops.size(); idx > 0; --idx) { |
1718 | if (idx != loops.size()) { |
1719 | SmallVector<Value, 4> operands; |
      operands.push_back(previous);
      operands.push_back(upperBoundSymbols[idx]);
1722 | previous = builder.create<AffineApplyOp>( |
1723 | loc, |
1724 | AffineMap::get( |
1725 | /*dimCount=*/1, /*symbolCount=*/1, |
              builder.getAffineDimExpr(0).floorDiv(
                  builder.getAffineSymbolExpr(0))),
1728 | operands); |
1729 | } |
1730 | // Modified value of the induction variables of the nested loops after |
1731 | // coalescing. |
1732 | Value inductionVariable; |
1733 | if (idx == 1) { |
1734 | inductionVariable = previous; |
1735 | } else { |
1736 | SmallVector<Value, 4> applyOperands; |
      applyOperands.push_back(previous);
      applyOperands.push_back(upperBoundSymbols[idx - 1]);
1739 | inductionVariable = builder.create<AffineApplyOp>( |
1740 | loc, |
1741 | AffineMap::get( |
1742 | /*dimCount=*/1, /*symbolCount=*/1, |
              builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1744 | applyOperands); |
1745 | } |
1746 | replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(), |
1747 | inductionVariable, loops.back().getRegion()); |
1748 | } |
1749 | |
  // 4. Move the operations from the innermost loop's body (sans terminator)
  // into the outermost loop's body, just before the second-outermost loop,
  // and then erase the now-redundant second-outermost loop.
1752 | AffineForOp secondOutermostLoop = loops[1]; |
1753 | innermost.getBody()->back().erase(); |
1754 | outermost.getBody()->getOperations().splice( |
1755 | Block::iterator(secondOutermostLoop.getOperation()), |
1756 | innermost.getBody()->getOperations()); |
1757 | secondOutermostLoop.erase(); |
1758 | return success(); |
1759 | } |
1760 | |
1761 | void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp, |
1762 | ArrayRef<Value> processorId, |
1763 | ArrayRef<Value> numProcessors) { |
1764 | assert(processorId.size() == numProcessors.size()); |
1765 | if (processorId.empty()) |
1766 | return; |
1767 | |
1768 | OpBuilder b(forOp); |
1769 | Location loc(forOp.getLoc()); |
1770 | AffineExpr lhs, rhs; |
1771 | bindSymbols(forOp.getContext(), lhs, rhs); |
  auto mulMap = AffineMap::get(0, 2, lhs * rhs);
  auto addMap = AffineMap::get(0, 2, lhs + rhs);
1774 | |
1775 | Value linearIndex = processorId.front(); |
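  // Compute the linear index of the processor: each step of the loop below
  // folds in one more grid dimension, yielding
  // (..((p0 * n1 + p1) * n2 + p2)..) for ids p_i and grid sizes n_i.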
1776 | for (unsigned i = 1, e = processorId.size(); i < e; ++i) { |
1777 | auto mulApplyOp = b.create<AffineApplyOp>( |
1778 | loc, mulMap, ValueRange{linearIndex, numProcessors[i]}); |
1779 | linearIndex = b.create<AffineApplyOp>( |
1780 | loc, addMap, ValueRange{mulApplyOp, processorId[i]}); |
1781 | } |
1782 | |
1783 | auto mulApplyOp = b.create<AffineApplyOp>( |
1784 | loc, mulMap, ValueRange{linearIndex, forOp.getStep()}); |
1785 | Value lb = b.create<AffineApplyOp>( |
1786 | loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()}); |
1787 | forOp.setLowerBound(lb); |
1788 | |
1789 | Value step = forOp.getStep(); |
1790 | for (auto numProcs : numProcessors) |
1791 | step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step}); |
1792 | forOp.setStep(step); |
1793 | } |
1794 | |
1795 | /// Given a memref region, determine the lowest depth at which transfers can be |
1796 | /// placed for it, and return the corresponding block, start and end positions |
1797 | /// in the block for placing incoming (read) and outgoing (write) copies |
1798 | /// respectively. The lowest depth depends on whether the region being accessed |
1799 | /// is hoistable with respect to one or more immediately surrounding loops. |
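///
/// For example (illustrative), if the region is invariant in the two
/// innermost loops surrounding `block`, the copy-in is placed just before the
/// outer of those two loops and the copy-out just after it.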
1800 | static void |
1801 | findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block, |
1802 | Block::iterator &begin, Block::iterator &end, |
1803 | Block **copyPlacementBlock, |
1804 | Block::iterator *copyInPlacementStart, |
1805 | Block::iterator *copyOutPlacementStart) { |
1806 | const auto *cst = region.getConstraints(); |
1807 | SmallVector<Value, 4> symbols; |
  cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1809 | |
1810 | SmallVector<AffineForOp, 4> enclosingFors; |
1811 | getAffineForIVs(*block.begin(), &enclosingFors); |
1812 | // Walk up loop parents till we find an IV on which this region is |
1813 | // symbolic/variant. |
1814 | auto it = enclosingFors.rbegin(); |
1815 | for (auto e = enclosingFors.rend(); it != e; ++it) { |
1816 | // TODO: also need to be checking this for regions symbols that |
1817 | // aren't loop IVs, whether we are within their resp. defs' dominance scope. |
1818 | if (llvm::is_contained(symbols, it->getInductionVar())) |
1819 | break; |
1820 | } |
1821 | |
1822 | if (it != enclosingFors.rbegin()) { |
1823 | auto lastInvariantIV = *std::prev(it); |
1824 | *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation()); |
    *copyOutPlacementStart = std::next(*copyInPlacementStart);
1826 | *copyPlacementBlock = lastInvariantIV->getBlock(); |
1827 | } else { |
1828 | *copyInPlacementStart = begin; |
1829 | *copyOutPlacementStart = end; |
1830 | *copyPlacementBlock = █ |
1831 | } |
1832 | } |
1833 | |
1834 | // Info comprising stride and number of elements transferred every stride. |
1835 | struct StrideInfo { |
1836 | int64_t stride; |
1837 | int64_t numEltPerStride; |
1838 | }; |
1839 | |
1840 | /// Returns striding information for a copy/transfer of this region with |
1841 | /// potentially multiple striding levels from outermost to innermost. For an |
1842 | /// n-dimensional region, there can be at most n-1 levels of striding |
1843 | /// successively nested. |
1844 | // TODO: make this work with non-identity layout maps. |
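//
// For example (illustrative), copying a 4x128 region out of a memref whose
// trailing dimension has size 512 records a single level of striding with
// stride = 512 and numEltPerStride = 128.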
1845 | static void getMultiLevelStrides(const MemRefRegion ®ion, |
1846 | ArrayRef<int64_t> bufferShape, |
1847 | SmallVectorImpl<StrideInfo> *strideInfos) { |
1848 | if (bufferShape.size() <= 1) |
1849 | return; |
1850 | |
1851 | int64_t numEltPerStride = 1; |
1852 | int64_t stride = 1; |
1853 | for (int d = bufferShape.size() - 1; d >= 1; d--) { |
1854 | int64_t dimSize = cast<MemRefType>(region.memref.getType()).getDimSize(d); |
1855 | stride *= dimSize; |
1856 | numEltPerStride *= bufferShape[d]; |
1857 | // A stride is needed only if the region has a shorter extent than the |
1858 | // memref along the dimension *and* has an extent greater than one along the |
1859 | // next major dimension. |
1860 | if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) { |
      strideInfos->push_back({stride, numEltPerStride});
1862 | } |
1863 | } |
1864 | } |
1865 | |
1866 | /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and |
1867 | /// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and |
1868 | /// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper |
1869 | /// bound information for the copy loop nest. `fastBufOffsets` contain the |
1870 | /// expressions to be subtracted out from the respective copy loop iterators in |
1871 | /// order to index the fast buffer. If `copyOut' is true, generates a copy-out; |
1872 | /// otherwise a copy-in. Builder `b` should be set to the point the copy nest is |
1873 | /// inserted. |
1874 | // |
1875 | /// The copy-in nest is generated as follows as an example for a 2-d region: |
1876 | /// for x = ... |
1877 | /// for y = ... |
1878 | /// fast_buf[x - offset_x][y - offset_y] = memref[x][y] |
1879 | /// |
1880 | static AffineForOp |
1881 | generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, |
1882 | ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands, |
1883 | ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands, |
1884 | ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut, |
1885 | OpBuilder b) { |
1886 | assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) { |
1887 | return lbMap.getNumInputs() == lbOperands.size(); |
1888 | })); |
1889 | assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) { |
1890 | return ubMap.getNumInputs() == ubOperands.size(); |
1891 | })); |
1892 | |
1893 | unsigned rank = cast<MemRefType>(memref.getType()).getRank(); |
1894 | assert(lbMaps.size() == rank && "wrong number of lb maps" ); |
1895 | assert(ubMaps.size() == rank && "wrong number of ub maps" ); |
1896 | |
1897 | SmallVector<Value, 4> memIndices; |
1898 | SmallVector<AffineExpr, 4> fastBufExprs; |
1899 | SmallVector<Value, 4> fastBufMapOperands; |
1900 | AffineForOp copyNestRoot; |
1901 | SmallVector<AffineApplyOp, 4> mayBeDeadApplys; |
1902 | for (unsigned d = 0; d < rank; ++d) { |
1903 | auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d], |
1904 | ubOperands, ubMaps[d]); |
1905 | if (d == 0) |
1906 | copyNestRoot = forOp; |
1907 | |
    b = OpBuilder::atBlockTerminator(forOp.getBody());
1909 | |
    auto fastBufOffsetMap =
        AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1912 | auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands); |
1913 | |
1914 | // Construct the subscript for the fast memref being copied into/from: |
1915 | // x - offset_x. |
    fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
                           b.getAffineDimExpr(2 * d));
    fastBufMapOperands.push_back(offset);
    fastBufMapOperands.push_back(forOp.getInductionVar());
1920 | mayBeDeadApplys.push_back(offset); |
1921 | |
1922 | // Subscript for the slow memref being copied. |
    memIndices.push_back(forOp.getInductionVar());
1924 | } |
1925 | |
  auto fastBufMap =
      AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1928 | fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands); |
1929 | fastBufMap = simplifyAffineMap(fastBufMap); |
1930 | canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands); |
1931 | |
1932 | // Drop any dead affine.applys. |
1933 | for (auto applyOp : mayBeDeadApplys) |
1934 | if (applyOp.use_empty()) |
1935 | applyOp.erase(); |
1936 | |
1937 | if (!isCopyOut) { |
1938 | // Copy in. |
1939 | auto load = b.create<AffineLoadOp>(loc, memref, memIndices); |
1940 | b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap, |
1941 | fastBufMapOperands); |
1942 | return copyNestRoot; |
1943 | } |
1944 | |
1945 | // Copy out. |
1946 | auto load = |
1947 | b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands); |
1948 | b.create<AffineStoreOp>(loc, load, memref, memIndices); |
1949 | return copyNestRoot; |
1950 | } |
1951 | |
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
emitRemarkForBlock(Block &block) {
1954 | return block.getParentOp()->emitRemark(); |
1955 | } |
1956 | |
1957 | /// Creates a buffer in the faster memory space for the specified memref region; |
1958 | /// generates a copy from the lower memory space to this one, and replaces all |
1959 | /// loads/stores in the block range [`begin', `end') of `block' to load/store |
1960 | /// from that buffer. Returns failure if copies could not be generated due to |
1961 | /// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart` |
1962 | /// in copyPlacementBlock specify the insertion points where the incoming copies |
1963 | /// and outgoing copies, respectively, should be inserted (the insertion happens |
1964 | /// right before the insertion point). Since `begin` can itself be invalidated |
1965 | /// due to the memref rewriting done from this method, the output argument |
1966 | /// `nBegin` is set to its replacement (set to `begin` if no invalidation |
1967 | /// happens). Since outgoing copies could have been inserted at `end`, the |
1968 | /// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the |
1969 | /// size of the fast buffer allocated. |
1970 | static LogicalResult generateCopy( |
1971 | const MemRefRegion ®ion, Block *block, Block::iterator begin, |
1972 | Block::iterator end, Block *copyPlacementBlock, |
1973 | Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, |
1974 | const AffineCopyOptions ©Options, DenseMap<Value, Value> &fastBufferMap, |
1975 | DenseSet<Operation *> ©Nests, uint64_t *sizeInBytes, |
1976 | Block::iterator *nBegin, Block::iterator *nEnd) { |
1977 | *nBegin = begin; |
1978 | *nEnd = end; |
1979 | |
1980 | func::FuncOp f = begin->getParentOfType<func::FuncOp>(); |
1981 | OpBuilder topBuilder(f.getBody()); |
1982 | Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0); |
1983 | |
1984 | *sizeInBytes = 0; |
1985 | |
1986 | if (begin == end) |
1987 | return success(); |
1988 | |
1989 | // Is the copy out point at the end of the block where we are doing |
1990 | // explicit copying. |
1991 | bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart); |
1992 | |
1993 | // Copies for read regions are going to be inserted at 'begin'. |
1994 | OpBuilder prologue(copyPlacementBlock, copyInPlacementStart); |
1995 | // Copies for write regions are going to be inserted at 'end'. |
1996 | OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart); |
1997 | OpBuilder &b = region.isWrite() ? epilogue : prologue; |
1998 | |
1999 | // Builder to create constants at the top level. |
2000 | auto func = copyPlacementBlock->getParent()->getParentOfType<func::FuncOp>(); |
2001 | OpBuilder top(func.getBody()); |
2002 | |
2003 | auto loc = region.loc; |
2004 | auto memref = region.memref; |
2005 | auto memRefType = cast<MemRefType>(memref.getType()); |
2006 | |
2007 | if (!memRefType.getLayout().isIdentity()) { |
2008 | LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n" ); |
2009 | return failure(); |
2010 | } |
2011 | |
2012 | // Indices to use for the copying. |
2013 | // Indices for the original memref being copied from/to. |
2014 | SmallVector<Value, 4> memIndices; |
2015 | // Indices for the faster buffer being copied into/from. |
2016 | SmallVector<Value, 4> bufIndices; |
2017 | |
2018 | unsigned rank = memRefType.getRank(); |
2019 | SmallVector<int64_t, 4> fastBufferShape; |
2020 | |
2021 | // Compute the extents of the buffer. |
2022 | std::vector<SmallVector<int64_t, 4>> lbs; |
2023 | SmallVector<int64_t, 8> lbDivisors; |
  lbs.reserve(rank);
  std::optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
      &fastBufferShape, &lbs, &lbDivisors);
2027 | if (!numElements) { |
2028 | LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n" ); |
2029 | return failure(); |
2030 | } |
2031 | |
2032 | if (*numElements == 0) { |
2033 | LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n" ); |
2034 | return success(); |
2035 | } |
2036 | |
2037 | SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank); |
2038 | for (unsigned i = 0; i < rank; ++i) |
    region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2040 | |
2041 | const FlatAffineValueConstraints *cst = region.getConstraints(); |
2042 | // 'regionSymbols' hold values that this memory region is symbolic/parametric |
2043 | // on; these typically include loop IVs surrounding the level at which the |
2044 | // copy generation is being done or other valid symbols in MLIR. |
2045 | SmallVector<Value, 8> regionSymbols; |
  cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2047 | |
2048 | // Construct the index expressions for the fast memory buffer. The index |
2049 | // expression for a particular dimension of the fast buffer is obtained by |
2050 | // subtracting out the lower bound on the original memref's data region |
2051 | // along the corresponding dimension. |
2052 | |
2053 | // Index start offsets for faster memory buffer relative to the original. |
2054 | SmallVector<AffineExpr, 4> fastBufOffsets; |
  fastBufOffsets.reserve(rank);
  for (unsigned d = 0; d < rank; d++) {
    assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");

    AffineExpr offset = top.getAffineConstantExpr(0);
    for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
      offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
    assert(lbDivisors[d] > 0);
    offset =
        (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2065 | |
2066 | // Set copy start location for this dimension in the lower memory space |
2067 | // memref. |
2068 | if (auto caf = dyn_cast<AffineConstantExpr>(offset)) { |
2069 | auto indexVal = caf.getValue(); |
2070 | if (indexVal == 0) { |
        memIndices.push_back(zeroIndex);
      } else {
        memIndices.push_back(
            top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2075 | } |
2076 | } else { |
2077 | // The coordinate for the start location is just the lower bound along the |
2078 | // corresponding dimension on the memory region (stored in 'offset'). |
      auto map = AffineMap::get(
          cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset);
      memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
2082 | } |
2083 | // The fast buffer is copied into at location zero; addressing is relative. |
    bufIndices.push_back(zeroIndex);
2085 | |
2086 | // Record the offsets since they are needed to remap the memory accesses of |
2087 | // the original memref further below. |
    fastBufOffsets.push_back(offset);
2089 | } |
2090 | |
2091 | // The faster memory space buffer. |
2092 | Value fastMemRef; |
2093 | |
2094 | // Check if a buffer was already created. |
  bool existingBuf = fastBufferMap.count(memref) > 0;
2096 | if (!existingBuf) { |
2097 | AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank); |
2098 | auto fastMemRefType = |
2099 | MemRefType::get(fastBufferShape, memRefType.getElementType(), |
2100 | fastBufferLayout, copyOptions.fastMemorySpace); |
2101 | |
2102 | // Create the fast memory space buffer just before the 'affine.for' |
2103 | // operation. |
2104 | fastMemRef = |
2105 | prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult(); |
2106 | // Record it. |
2107 | fastBufferMap[memref] = fastMemRef; |
2108 | // fastMemRefType is a constant shaped memref. |
2109 | auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(fastMemRefType); |
2110 | // We don't account for things of unknown size. |
2111 | *sizeInBytes = maySizeInBytes ? *maySizeInBytes : 0; |
2112 | |
2113 | LLVM_DEBUG(emitRemarkForBlock(*block) |
2114 | << "Creating fast buffer of type " << fastMemRefType |
2115 | << " and size " << llvm::divideCeil(*sizeInBytes, 1024) |
2116 | << " KiB\n" ); |
2117 | } else { |
2118 | // Reuse the one already created. |
2119 | fastMemRef = fastBufferMap[memref]; |
2120 | } |
2121 | |
  auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2123 | |
2124 | Value dmaStride = nullptr; |
2125 | Value numEltPerDmaStride = nullptr; |
2126 | if (copyOptions.generateDma) { |
2127 | SmallVector<StrideInfo, 4> dmaStrideInfos; |
    getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2129 | |
2130 | // TODO: use all stride levels once DmaStartOp is extended for |
2131 | // multi-level strides. |
2132 | if (dmaStrideInfos.size() > 1) { |
2133 | LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n" ); |
2134 | return failure(); |
2135 | } |
2136 | |
2137 | if (!dmaStrideInfos.empty()) { |
      dmaStride =
          top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
      numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
          loc, dmaStrideInfos[0].numEltPerStride);
2142 | } |
2143 | } |
2144 | |
  // Record the last operation where we want the memref replacement to end. We
  // later do the memref replacement only in [begin, postDomFilter] so
  // that the original memrefs used in the data movement code themselves don't
  // get replaced.
  auto postDomFilter = std::prev(end);
2150 | |
2151 | // Create fully composed affine maps for each memref. |
  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2156 | |
2157 | if (!copyOptions.generateDma) { |
2158 | // Point-wise copy generation. |
2159 | auto copyNest = |
2160 | generatePointWiseCopy(loc, memref, fastMemRef, lbMaps, |
2161 | /*lbOperands=*/regionSymbols, ubMaps, |
2162 | /*ubOperands=*/regionSymbols, fastBufOffsets, |
2163 | /*isCopyOut=*/region.isWrite(), b); |
2164 | |
2165 | // Record this so that we can skip it from yet another copy. |
2166 | copyNests.insert(copyNest); |
2167 | |
2168 | // Since new ops are being appended (for copy out's), adjust the end to |
2169 | // mark end of block range being processed if necessary. |
2170 | if (region.isWrite() && isCopyOutAtEndOfBlock) |
2171 | *nEnd = Block::iterator(copyNest.getOperation()); |
2172 | } else { |
2173 | // DMA generation. |
2174 | // Create a tag (single element 1-d memref) for the DMA. |
2175 | auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {}, |
2176 | copyOptions.tagMemorySpace); |
2177 | auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType); |
2178 | |
2179 | SmallVector<Value, 4> tagIndices({zeroIndex}); |
    auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2181 | fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices); |
2182 | if (!region.isWrite()) { |
2183 | // DMA non-blocking read from original buffer to fast buffer. |
2184 | b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices, |
2185 | fastMemRef, bufAffineMap, bufIndices, |
2186 | tagMemRef, tagAffineMap, tagIndices, |
2187 | numElementsSSA, dmaStride, numEltPerDmaStride); |
2188 | } else { |
2189 | // DMA non-blocking write from fast buffer to the original memref. |
2190 | auto op = b.create<AffineDmaStartOp>( |
2191 | loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap, |
2192 | memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA, |
2193 | dmaStride, numEltPerDmaStride); |
2194 | // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the |
2195 | // end to mark end of block range being processed. |
2196 | if (isCopyOutAtEndOfBlock) |
2197 | *nEnd = Block::iterator(op.getOperation()); |
2198 | } |
2199 | |
2200 | // Matching DMA wait to block on completion; tag always has a 0 index. |
2201 | b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex, |
2202 | numElementsSSA); |
2203 | |
2204 | // Generate dealloc for the tag. |
2205 | auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef); |
2206 | if (*nEnd == end && isCopyOutAtEndOfBlock) |
2207 | // Since new ops are being appended (for outgoing DMAs), adjust the end to |
2208 | // mark end of range of the original. |
2209 | *nEnd = Block::iterator(tagDeallocOp.getOperation()); |
2210 | } |
2211 | |
2212 | // Generate dealloc for the buffer. |
2213 | if (!existingBuf) { |
2214 | auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef); |
2215 | // When generating pointwise copies, `nEnd' has to be set to deallocOp on |
2216 | // the fast buffer (since it marks the new end insertion point). |
2217 | if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock) |
2218 | *nEnd = Block::iterator(bufDeallocOp.getOperation()); |
2219 | } |
2220 | |
2221 | // Replace all uses of the old memref with the faster one while remapping |
2222 | // access indices (subtracting out lower bound offsets for each dimension). |
2223 | // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT], |
2224 | // index remap will be (%i, %j) -> (%i - %iT, %j - %jT), |
2225 | // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j), |
2226 | // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'. |
2227 | // d2, d3 correspond to the original indices (%i, %j). |
2228 | SmallVector<AffineExpr, 4> remapExprs; |
  remapExprs.reserve(rank);
2230 | for (unsigned i = 0; i < rank; i++) { |
2231 | // The starting operands of indexRemap will be regionSymbols (the symbols on |
2232 | // which the memref region is parametric); then those corresponding to |
2233 | // the memref's original indices follow. |
    auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
    remapExprs.push_back(dimExpr - fastBufOffsets[i]);
  }
  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
                                   b.getContext());
2239 | |
2240 | // Record the begin since it may be invalidated by memref replacement. |
2241 | Block::iterator prevOfBegin; |
2242 | bool isBeginAtStartOfBlock = (begin == block->begin()); |
2243 | if (!isBeginAtStartOfBlock) |
    prevOfBegin = std::prev(begin);
2245 | |
2246 | // *Only* those uses within the range [begin, end) of 'block' are replaced. |
2247 | (void)replaceAllMemRefUsesWith(memref, fastMemRef, |
2248 | /*extraIndices=*/{}, indexRemap, |
2249 | /*extraOperands=*/regionSymbols, |
2250 | /*symbolOperands=*/{}, |
2251 | /*domOpFilter=*/&*begin, |
2252 | /*postDomOpFilter=*/&*postDomFilter); |
2253 | |
  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2255 | |
2256 | return success(); |
2257 | } |
2258 | |
/// Construct the memref region to just include the entire memref. Returns
/// false for dynamically shaped memrefs for now. `numParamLoopIVs` is the
/// number of
2261 | /// enclosing loop IVs of `op` (starting from the outermost) that the region |
2262 | /// is parametric on. |
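///
/// For example (illustrative), for an access to a memref<32x64xf32> with
/// `numParamLoopIVs` = 0, the region constructed below is simply
/// 0 <= d0 <= 31, 0 <= d1 <= 63.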
2263 | static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, |
2264 | MemRefRegion *region) { |
2265 | unsigned rank; |
2266 | if (auto loadOp = dyn_cast<AffineLoadOp>(op)) { |
2267 | rank = loadOp.getMemRefType().getRank(); |
2268 | region->memref = loadOp.getMemRef(); |
2269 | region->setWrite(false); |
2270 | } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) { |
2271 | rank = storeOp.getMemRefType().getRank(); |
2272 | region->memref = storeOp.getMemRef(); |
2273 | region->setWrite(true); |
2274 | } else { |
2275 | assert(false && "expected load or store op" ); |
2276 | return false; |
2277 | } |
2278 | auto memRefType = cast<MemRefType>(region->memref.getType()); |
2279 | if (!memRefType.hasStaticShape()) |
2280 | return false; |
2281 | |
2282 | auto *regionCst = region->getConstraints(); |
2283 | |
2284 | // Just get the first numSymbols IVs, which the memref region is parametric |
2285 | // on. |
2286 | SmallVector<AffineForOp, 4> ivs; |
2287 | getAffineForIVs(*op, &ivs); |
2288 | ivs.resize(numParamLoopIVs); |
2289 | SmallVector<Value, 4> symbols; |
2290 | extractForInductionVars(ivs, &symbols); |
2291 | *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0); |
  regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2293 | |
2294 | // Memref dim sizes provide the bounds. |
2295 | for (unsigned d = 0; d < rank; d++) { |
2296 | auto dimSize = memRefType.getDimSize(d); |
2297 | assert(dimSize > 0 && "filtered dynamic shapes above" ); |
    regionCst->addBound(BoundType::LB, d, 0);
2299 | regionCst->addBound(BoundType::UB, d, dimSize - 1); |
2300 | } |
2301 | return true; |
2302 | } |
2303 | |
2304 | LogicalResult |
2305 | mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end, |
2306 | const AffineCopyOptions ©Options, |
2307 | std::optional<Value> filterMemRef, |
2308 | DenseSet<Operation *> ©Nests) { |
2309 | if (begin == end) |
2310 | return success(); |
2311 | |
2312 | assert(begin->getBlock() == std::prev(end)->getBlock() && |
2313 | "Inconsistent block begin/end args" ); |
2314 | assert(end != end->getBlock()->end() && "end can't be the block terminator" ); |
2315 | |
2316 | Block *block = begin->getBlock(); |
2317 | |
  // Copies will be generated for this depth, i.e., symbolic in all loops
  // surrounding this block range.
  unsigned copyDepth = getNestingDepth(&*begin);
2321 | |
2322 | LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth |
2323 | << "\n" ); |
2324 | LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n" ); |
2325 | LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n" ); |
2326 | |
2327 | // List of memory regions to copy for. We need a map vector to have a |
2328 | // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here |
2329 | // since the alloc's for example are identical except for the SSA id. |
2330 | SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions; |
2331 | SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions; |
2332 | |
2333 | // Map from original memref's to the fast buffers that their accesses are |
2334 | // replaced with. |
2335 | DenseMap<Value, Value> fastBufferMap; |
2336 | |
2337 | // To check for errors when walking the block. |
2338 | bool error = false; |
2339 | |
2340 | // Walk this range of operations to gather all memory regions. |
  block->walk(begin, end, [&](Operation *opInst) {
2342 | // Gather regions to allocate to buffers in faster memory space. |
2343 | if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) { |
2344 | if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) || |
2345 | (loadOp.getMemRefType().getMemorySpaceAsInt() != |
2346 | copyOptions.slowMemorySpace)) |
2347 | return; |
2348 | } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) { |
2349 | if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) || |
2350 | storeOp.getMemRefType().getMemorySpaceAsInt() != |
2351 | copyOptions.slowMemorySpace) |
2352 | return; |
2353 | } else { |
2354 | // Neither load nor a store op. |
2355 | return; |
2356 | } |
2357 | |
2358 | // Compute the MemRefRegion accessed. |
    auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
    if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
                               /*addMemRefDimBounds=*/false))) {
2362 | LLVM_DEBUG(llvm::dbgs() |
2363 | << "Error obtaining memory region: semi-affine maps?\n" ); |
2364 | LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n" ); |
      if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2366 | LLVM_DEBUG( |
2367 | opInst->emitError("non-constant memref sizes not yet supported" )); |
2368 | error = true; |
2369 | return; |
2370 | } |
2371 | } |
2372 | |
2373 | // Each memref has a single buffer associated with it irrespective of how |
2374 | // many load's and store's happen on it. |
2375 | // TODO: in the future, when regions don't intersect and satisfy |
2376 | // other properties (based on load/store regions), we could consider |
2377 | // multiple buffers per memref. |
2378 | |
2379 | // Add to the appropriate region if it's not already in it, or take a |
2380 | // bounding box union with the existing one if it's already in there. |
2381 | // Note that a memref may have both read and write regions - so update the |
2382 | // region in the other list if one exists (write in case of read and vice |
2383 | // versa) since there is a single bounding box for a memref across all reads |
2384 | // and writes that happen on it. |
2385 | |
2386 | // Attempts to update; returns true if 'region' exists in targetRegions. |
2387 | auto updateRegion = |
2388 | [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> |
2389 | &targetRegions) { |
          const auto *const it = targetRegions.find(region->memref);
          if (it == targetRegions.end())
            return false;

          // Perform a union with the existing region.
          if (failed(it->second->unionBoundingBox(*region))) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Memory region bounding box failed; "
                          "over-approximating to the entire memref\n");
            // If the union fails, we will overapproximate.
            if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
              LLVM_DEBUG(opInst->emitError(
                  "non-constant memref sizes not yet supported"));
              error = true;
              return true;
            }
            it->second->getConstraints()->clearAndCopyFrom(
                *region->getConstraints());
          } else {
            // Union was computed and stored in 'it->second': copy to 'region'.
            region->getConstraints()->clearAndCopyFrom(
                *it->second->getConstraints());
          }
2413 | return true; |
2414 | }; |
2415 | |
2416 | bool existsInRead = updateRegion(readRegions); |
2417 | if (error) |
2418 | return; |
2419 | bool existsInWrite = updateRegion(writeRegions); |
2420 | if (error) |
2421 | return; |
2422 | |
2423 | // Finally add it to the region list. |
2424 | if (region->isWrite() && !existsInWrite) { |
2425 | writeRegions[region->memref] = std::move(region); |
2426 | } else if (!region->isWrite() && !existsInRead) { |
2427 | readRegions[region->memref] = std::move(region); |
2428 | } |
2429 | }); |
2430 | |
2431 | if (error) { |
    LLVM_DEBUG(begin->emitError(
        "copy generation failed for one or more memref's in this block\n"));
2434 | return failure(); |
2435 | } |
2436 | |
  uint64_t totalCopyBuffersSizeInBytes = 0;
2438 | bool ret = true; |
2439 | auto processRegions = |
2440 | [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> |
2441 | ®ions) { |
2442 | for (const auto ®ionEntry : regions) { |
2443 | // For each region, hoist copy in/out past all hoistable |
2444 | // 'affine.for's. |
2445 | Block::iterator copyInPlacementStart, copyOutPlacementStart; |
2446 | Block *copyPlacementBlock; |
2447 | findHighestBlockForPlacement( |
              *regionEntry.second, *block, begin, end, &copyPlacementBlock,
              &copyInPlacementStart, &copyOutPlacementStart);
2450 | |
2451 | uint64_t sizeInBytes; |
2452 | Block::iterator nBegin, nEnd; |
          LogicalResult iRet = generateCopy(
              *regionEntry.second, block, begin, end, copyPlacementBlock,
              copyInPlacementStart, copyOutPlacementStart, copyOptions,
              fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
          if (succeeded(iRet)) {
2458 | // begin/end could have been invalidated, and need update. |
2459 | begin = nBegin; |
2460 | end = nEnd; |
2461 | totalCopyBuffersSizeInBytes += sizeInBytes; |
2462 | } |
          ret = ret & succeeded(iRet);
2464 | } |
2465 | }; |
2466 | processRegions(readRegions); |
2467 | processRegions(writeRegions); |
2468 | |
2469 | if (!ret) { |
    LLVM_DEBUG(begin->emitError(
        "copy generation failed for one or more memref's in this block\n"));
2472 | return failure(); |
2473 | } |
2474 | |
2475 | // For a range of operations, a note will be emitted at the caller. |
2476 | AffineForOp forOp; |
2477 | if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) { |
    LLVM_DEBUG(forOp.emitRemark()
               << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
               << " KiB of copy buffers in fast memory space for this block");
2481 | } |
2482 | |
2483 | if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) { |
    block->getParentOp()->emitWarning(
        "total size of all copy buffers for this block exceeds fast memory "
        "capacity");
2487 | } |
2488 | |
2489 | return success(); |
2490 | } |
2491 | |
2492 | // A convenience version of affineDataCopyGenerate for all ops in the body of |
2493 | // an AffineForOp. |
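// A minimal usage sketch from a pass (option values below are purely
// illustrative, not recommendations):
//
//   AffineCopyOptions options;
//   options.generateDma = false;
//   options.slowMemorySpace = 0;
//   options.fastMemorySpace = 1;
//   options.tagMemorySpace = 0;
//   options.fastMemCapacityBytes = 32 * 1024;
//   DenseSet<Operation *> copyNests;
//   if (failed(affineDataCopyGenerate(forOp, options,
//                                     /*filterMemRef=*/std::nullopt,
//                                     copyNests)))
//     return; // Copy generation failed for some memref in the loop body.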
2494 | LogicalResult mlir::affine::affineDataCopyGenerate( |
2495 | AffineForOp forOp, const AffineCopyOptions ©Options, |
2496 | std::optional<Value> filterMemRef, DenseSet<Operation *> ©Nests) { |
2497 | return affineDataCopyGenerate(forOp.getBody()->begin(), |
2498 | std::prev(forOp.getBody()->end()), copyOptions, |
2499 | filterMemRef, copyNests); |
2500 | } |
2501 | |
2502 | LogicalResult mlir::affine::generateCopyForMemRegion( |
2503 | const MemRefRegion &memrefRegion, Operation *analyzedOp, |
2504 | const AffineCopyOptions ©Options, CopyGenerateResult &result) { |
2505 | Block *block = analyzedOp->getBlock(); |
2506 | auto begin = analyzedOp->getIterator(); |
2507 | auto end = std::next(begin); |
2508 | DenseMap<Value, Value> fastBufferMap; |
2509 | DenseSet<Operation *> copyNests; |
2510 | |
2511 | auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end, |
2512 | copyOptions, fastBufferMap, copyNests, |
2513 | &result.sizeInBytes, &begin, &end); |
2514 | if (failed(err)) |
2515 | return err; |
2516 | |
  const auto &en = fastBufferMap.find(memrefRegion.memref);
2518 | // In some cases (empty loops), no copy generation would have happened. |
2519 | if (en == fastBufferMap.end()) |
2520 | return failure(); |
2521 | result.alloc = en->second.getDefiningOp(); |
  assert(result.alloc && "fast buffer expected to be locally allocated");
  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2524 | result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin(); |
2525 | return success(); |
2526 | } |
2527 | |
2528 | /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'. |
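/// Eg (illustrative): starting at 'currLoopDepth' 0 on a block containing
///   affine.for %i ... {
///     affine.for %j ... { ... }
///     affine.for %k ... { ... }
///   }
/// this populates depthToLoops[0] with %i's loop and depthToLoops[1] with
/// %j's and %k's loops.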
2529 | static void |
2530 | gatherLoopsInBlock(Block *block, unsigned currLoopDepth, |
2531 | std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) { |
  // Add a new empty level to the output if it doesn't already exist.
  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2534 | if (currLoopDepth == depthToLoops.size()) |
2535 | depthToLoops.emplace_back(); |
2536 | |
2537 | for (auto &op : *block) { |
2538 | if (auto forOp = dyn_cast<AffineForOp>(op)) { |
2539 | depthToLoops[currLoopDepth].push_back(forOp); |
2540 | gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops); |
2541 | } |
2542 | } |
2543 | } |
2544 | |
2545 | /// Gathers all AffineForOps in 'func.func' grouped by loop depth. |
2546 | void mlir::affine::gatherLoops( |
2547 | func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) { |
2548 | for (auto &block : func) |
2549 | gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops); |
2550 | |
2551 | // Remove last loop level from output since it's empty. |
2552 | if (!depthToLoops.empty()) { |
    assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2554 | depthToLoops.pop_back(); |
2555 | } |
2556 | } |
2557 | |
2558 | AffineForOp mlir::affine::createCanonicalizedAffineForOp( |
2559 | OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, |
2560 | ValueRange ubOperands, AffineMap ubMap, int64_t step) { |
2561 | SmallVector<Value, 4> lowerOperands(lbOperands); |
2562 | SmallVector<Value, 4> upperOperands(ubOperands); |
2563 | |
  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
  lbMap = removeDuplicateExprs(lbMap);
  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
  canonicalizeMapAndOperands(&ubMap, &upperOperands);
  ubMap = removeDuplicateExprs(ubMap);
2570 | |
2571 | return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap, |
2572 | step); |
2573 | } |
2574 | |
2575 | /// Creates an AffineIfOp that encodes the conditional to choose between |
2576 | /// the constant trip count version and an unknown trip count version of this |
2577 | /// nest of loops. This is used to separate partial and full tiles if `loops` |
2578 | /// has the intra-tile loops. The affine.if op is inserted at the builder |
2579 | /// insertion point of `b`. |
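/// Eg (illustrative): for an intra-tile loop such as
///   affine.for %i = #lb(%ti) to min #ub(%ti)[%N]
/// the op created is of the form
///   affine.if #set(%ti)[%N] { ... } else { ... }
/// where #set holds exactly for full tiles; the caller (see separateFullTiles)
/// places the constant trip count version in the then block and the original
/// nest in the else block.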
2580 | static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops, |
2581 | OpBuilder b) { |
2582 | if (loops.empty()) |
2583 | return nullptr; |
2584 | |
2585 | auto *context = loops[0].getContext(); |
2586 | |
2587 | FlatAffineValueConstraints cst; |
2588 | SmallVector<Operation *, 8> ops; |
  llvm::append_range(ops, loops);
  (void)getIndexSet(ops, &cst);
2591 | |
2592 | // Remove constraints that are independent of these loop IVs. |
2593 | cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size()); |
2594 | |
  // Construct the constraint set representing the guard for full tiles. The
  // lower bound (resp. upper bound) corresponding to the full tile should be
  // larger (resp. smaller) than any other lower (resp. upper) bound.
2598 | SmallVector<int64_t, 8> fullTileLb, fullTileUb; |
2599 | for (auto loop : loops) { |
2600 | (void)loop; |
    // TODO: generalize to non-unit strides; this is not a fundamental issue.
    assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
2603 | // Mark everything symbols for the purpose of finding a constant diff pair. |
2604 | cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() - |
2605 | 1); |
2606 | unsigned fullTileLbPos, fullTileUbPos; |
2607 | if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr, |
2608 | /*boundFloorDivisor=*/nullptr, |
2609 | /*ub=*/nullptr, &fullTileLbPos, |
2610 | &fullTileUbPos)) { |
2611 | LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n" ); |
2612 | return nullptr; |
2613 | } |
2614 | |
2615 | SmallVector<unsigned, 4> lbIndices, ubIndices; |
2616 | cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices); |
2617 | |
2618 | auto fLb = cst.getInequality(fullTileLbPos); |
2619 | auto fUb = cst.getInequality(fullTileUbPos); |
2620 | fullTileLb.assign(fLb.begin(), fLb.end()); |
2621 | fullTileUb.assign(fUb.begin(), fUb.end()); |
2622 | |
2623 | // Full tile lower bound should be >= than any other lower bound. |
2624 | for (auto lbIndex : lbIndices) |
2625 | for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i) |
2626 | cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i); |
2627 | |
2628 | // Full tile upper bound should be <= any other upper bound. |
2629 | for (auto ubIndex : ubIndices) |
2630 | for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i) |
2631 | cst.atIneq(ubIndex, i) -= fullTileUb[i]; |
2632 | |
2633 | cst.removeVar(0); |
2634 | } |
2635 | |
2636 | // The previous step leads to all zeros for the full tile lb and ub position |
2637 | // itself; remove those and any other duplicates / trivial redundancies. |
2638 | cst.removeTrivialRedundancy(); |
2639 | |
  // Turn everything into dims conservatively since we earlier turned all
  // trailing ids past the point loop IV into symbols. Some of these could be
  // outer loop IVs; we'll canonicalize anyway.
2643 | cst.setDimSymbolSeparation(0); |
2644 | |
  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2646 | // ifCondSet can be null if cst was empty -- this can happen if all loops |
2647 | // in the nest have constant trip counts. |
2648 | if (!ifCondSet) |
2649 | return nullptr; |
2650 | |
2651 | SmallVector<Value, 4> setOperands; |
  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2654 | return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands, |
2655 | /*withElseRegion=*/true); |
2656 | } |
2657 | |
2658 | /// Create the full tile loop nest (along with its body). |
2659 | static LogicalResult |
2660 | createFullTiles(MutableArrayRef<AffineForOp> inputNest, |
2661 | SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) { |
  fullTileLoops.reserve(inputNest.size());
2663 | |
2664 | // For each loop in the original nest identify a lower/upper bound pair such |
2665 | // that their difference is a constant. |
2666 | FlatAffineValueConstraints cst; |
2667 | for (auto loop : inputNest) { |
2668 | // TODO: straightforward to generalize to a non-unit stride. |
2669 | if (loop.getStepAsInt() != 1) { |
      LLVM_DEBUG(llvm::dbgs()
                 << "[tile separation] non-unit stride not implemented\n");
2672 | return failure(); |
2673 | } |
2674 | SmallVector<Operation *, 1> loopOp{loop.getOperation()}; |
2675 | (void)getIndexSet(loopOp, &cst); |
2676 | // We will mark everything other than this loop IV as symbol for getting a |
2677 | // pair of <lb, ub> with a constant difference. |
2678 | cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1); |
2679 | unsigned lbPos, ubPos; |
2680 | if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr, |
2681 | /*boundFloorDivisor=*/nullptr, |
2682 | /*ub=*/nullptr, &lbPos, &ubPos) || |
2683 | lbPos == ubPos) { |
2684 | LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / " |
2685 | "equalities not yet handled\n" ); |
2686 | return failure(); |
2687 | } |
2688 | |
2689 | // Set all variables as dimensions uniformly since some of those marked as |
2690 | // symbols above could be outer loop IVs (corresponding tile space IVs). |
2691 | cst.setDimSymbolSeparation(/*newSymbolCount=*/0); |
2692 | |
2693 | AffineValueMap lbVmap, ubVmap; |
2694 | cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext()); |
2695 | cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext()); |
2696 | AffineForOp fullTileLoop = createCanonicalizedAffineForOp( |
2697 | b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(), |
2698 | ubVmap.getOperands(), ubVmap.getAffineMap()); |
2699 | b = OpBuilder::atBlockTerminator(fullTileLoop.getBody()); |
2700 | fullTileLoops.push_back(fullTileLoop); |
2701 | } |
2702 | |
2703 | // Add the body for the full tile loop nest. |
2704 | IRMapping operandMap; |
2705 | for (const auto &loopEn : llvm::enumerate(inputNest)) |
2706 | operandMap.map(loopEn.value().getInductionVar(), |
2707 | fullTileLoops[loopEn.index()].getInductionVar()); |
2708 | b = OpBuilder::atBlockTerminator(block: fullTileLoops.back().getBody()); |
2709 | for (auto &op : inputNest.back().getBody()->without_terminator()) |
2710 | b.clone(op, operandMap); |
2711 | return success(); |
2712 | } |
2713 | |
2714 | LogicalResult |
2715 | mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest, |
2716 | SmallVectorImpl<AffineForOp> *fullTileNest) { |
2717 | if (inputNest.empty()) |
2718 | return success(); |
2719 | |
2720 | auto firstLoop = inputNest[0]; |
2721 | |
2722 | // Each successive for op has to be nested in the other. |
2723 | auto prevLoop = firstLoop; |
2724 | for (auto loop : inputNest.drop_front(1)) { |
    assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2726 | prevLoop = loop; |
2727 | } |
2728 | |
2729 | // Create the full tile loop nest. |
2730 | SmallVector<AffineForOp, 4> fullTileLoops; |
2731 | OpBuilder b(firstLoop); |
  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2733 | if (!fullTileLoops.empty()) |
2734 | fullTileLoops.front().erase(); |
2735 | return failure(); |
2736 | } |
2737 | |
2738 | // Create and insert the version select right before the root of the nest. |
2739 | b = OpBuilder(firstLoop); |
2740 | AffineIfOp ifOp = createSeparationCondition(inputNest, b); |
2741 | if (!ifOp) { |
2742 | fullTileLoops.front().erase(); |
2743 | LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating " |
2744 | "separation condition\n" ); |
2745 | return failure(); |
2746 | } |
2747 | |
2748 | // Move the full tile into the then block. |
2749 | Block *thenBlock = ifOp.getThenBlock(); |
2750 | AffineForOp outermostFullTileLoop = fullTileLoops[0]; |
2751 | thenBlock->getOperations().splice( |
      std::prev(thenBlock->end()),
2753 | outermostFullTileLoop->getBlock()->getOperations(), |
2754 | Block::iterator(outermostFullTileLoop)); |
2755 | |
2756 | // Move the partial tile into the else block. The partial tile is the same as |
2757 | // the original loop nest. |
2758 | Block *elseBlock = ifOp.getElseBlock(); |
  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2760 | firstLoop->getBlock()->getOperations(), |
2761 | Block::iterator(firstLoop)); |
2762 | |
2763 | if (fullTileNest) |
2764 | *fullTileNest = std::move(fullTileLoops); |
2765 | |
2766 | return success(); |
2767 | } |
2768 | |
2769 | LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) { |
2770 | LogicalResult result(failure()); |
2771 | SmallVector<AffineForOp> loops; |
2772 | getPerfectlyNestedLoops(loops, op); |
2773 | if (loops.size() <= 1) |
2774 | return success(); |
2775 | |
2776 | // Look for a band of loops that can be coalesced, i.e. perfectly nested |
2777 | // loops with bounds defined above some loop. |
2778 | // 1. For each loop, find above which parent loop its operands are |
2779 | // defined. |
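  // Eg (illustrative): for a nest (%l0, %l1, %l2) where %l1's bounds are
  //    defined above the whole nest and only %l2's bounds use %l0's IV,
  //    operandsDefinedAbove is {0, 0, 1}; a band starting at %l0 cannot
  //    include %l2, while the band (%l1, %l2) passes the check in step 2
  //    below.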
2780 | SmallVector<unsigned> operandsDefinedAbove(loops.size()); |
2781 | for (unsigned i = 0, e = loops.size(); i < e; ++i) { |
2782 | operandsDefinedAbove[i] = i; |
2783 | for (unsigned j = 0; j < i; ++j) { |
2784 | if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) { |
2785 | operandsDefinedAbove[i] = j; |
2786 | break; |
2787 | } |
2788 | } |
2789 | } |
2790 | |
2791 | // 2. Identify bands of loops such that the operands of all of them are |
2792 | // defined above the first loop in the band. Traverse the nest bottom-up |
2793 | // so that modifications don't invalidate the inner loops. |
2794 | for (unsigned end = loops.size(); end > 0; --end) { |
2795 | unsigned start = 0; |
2796 | for (; start < end - 1; ++start) { |
2797 | auto maxPos = |
          *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                            std::next(operandsDefinedAbove.begin(), end));
2800 | if (maxPos > start) |
2801 | continue; |
      assert(maxPos == start &&
             "expected loop bounds to be known at the start of the band");
2804 | auto band = llvm::MutableArrayRef(loops.data() + start, end - start); |
2805 | if (succeeded(coalesceLoops(band))) |
2806 | result = success(); |
2807 | break; |
2808 | } |
2809 | // If a band was found and transformed, keep looking at the loops above |
2810 | // the outermost transformed loop. |
2811 | if (start != end - 1) |
2812 | end = start + 1; |
2813 | } |
2814 | return result; |
2815 | } |
2816 | |