//===- LoopUtils.h - Loop transformation utilities --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes for various loop transformation utility
// methods: these are not passes by themselves but are used either by passes,
// optimization sequences, or in turn by other transformation utilities.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
#define MLIR_DIALECT_AFFINE_LOOPUTILS_H

#include "mlir/IR/Block.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include <optional>

namespace mlir {
class AffineMap;
class LoopLikeOpInterface;
class OpBuilder;
class Value;
class ValueRange;

namespace func {
class FuncOp;
} // namespace func

namespace scf {
class ForOp;
class ParallelOp;
} // namespace scf

namespace affine {
class AffineForOp;
struct MemRefRegion;

/// Unrolls this for operation completely if the trip count is known to be
/// constant. Returns failure otherwise.
LogicalResult loopUnrollFull(AffineForOp forOp);

/// Unrolls this for operation by the specified unroll factor. Returns failure
/// if the loop cannot be unrolled either due to restrictions or due to invalid
/// unroll factors. Requires positive loop bounds and step. If specified,
/// annotates the Ops in each unrolled iteration by applying `annotateFn`.
/// When `cleanUpUnroll` is true, the cleanup loop is unrolled regardless of
/// the unroll factor.
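///
/// For illustration, a minimal sketch of how this utility might be driven from
/// a pass (the helper name and the factor of 4 are hypothetical):
///
/// ```
/// // Unroll by 4 every affine.for whose body contains no other affine.for,
/// // ignoring loops for which unrolling fails.
/// void unrollInnermostLoops(func::FuncOp f) {
///   f.walk([](AffineForOp forOp) {
///     if (forOp.getBody()->getOps<AffineForOp>().empty())
///       (void)loopUnrollByFactor(forOp, /*unrollFactor=*/4);
///   });
/// }
/// ```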
LogicalResult loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
    bool cleanUpUnroll = false);

/// Unrolls this loop by the specified unroll factor or its trip count,
/// whichever is lower.
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);

/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
/// in it from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);

/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested iff the first op in its body is
/// another AffineForOp and the second op is a terminator.
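///
/// A minimal usage sketch (`rootForOp` is an assumed outermost AffineForOp):
///
/// ```
/// SmallVector<AffineForOp, 4> nest;
/// getPerfectlyNestedLoops(nest, rootForOp);
/// // `nest` now holds the perfectly nested band rooted at `rootForOp`,
/// // outermost loop first, and satisfies isPerfectlyNested(nest).
/// ```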
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                             AffineForOp root);

/// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
/// with iteration arguments performing supported reductions and its inner loops
/// can have iteration arguments. Returns success if the loop is successfully
/// unroll-jammed.
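///
/// A minimal sketch, assuming `forOp` is the outer loop of a 2-d nest and a
/// factor of 2 is desired:
///
/// ```
/// // Unroll-and-jam the outer loop by 2: the outer body is duplicated and the
/// // two copies of the inner loop are fused ("jammed") into one.
/// if (failed(loopUnrollJamByFactor(forOp, /*unrollJamFactor=*/2)))
///   return; // Bail out: the nest could not be unroll-jammed.
/// ```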
LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
                                    uint64_t unrollJamFactor);

/// Unrolls and jams this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
                                      uint64_t unrollJamFactor);

/// Promotes the loop body of an AffineForOp to its containing block if the
/// loop is known to have a single iteration.
LogicalResult promoteIfSingleIteration(AffineForOp forOp);

/// Promotes all single-iteration AffineForOps in the function, i.e., moves
/// their bodies into the containing block.
void promoteSingleIterationLoops(func::FuncOp f);

/// Skew the operations in an affine.for's body with the specified
/// operation-wise shifts. The shifts are with respect to the original execution
/// order, and are multiplied by the loop 'step' before being applied. If
/// `unrollPrologueEpilogue` is set, fully unroll the prologue and epilogue
/// loops when possible.
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                  bool unrollPrologueEpilogue = false);

/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
void getTileableBands(func::FuncOp f,
                      std::vector<SmallVector<AffineForOp, 6>> *bands);

/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops. This utility
/// doesn't check for the validity of tiling itself, but just performs it.
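///
/// A minimal sketch that tiles every tileable band of a function `f` with a
/// fixed tile size (the size of 32 is arbitrary):
///
/// ```
/// std::vector<SmallVector<AffineForOp, 6>> bands;
/// getTileableBands(f, &bands);
/// for (auto &band : bands) {
///   SmallVector<unsigned, 6> tileSizes(band.size(), 32);
///   (void)tilePerfectlyNested(band, tileSizes);
/// }
/// ```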
LogicalResult
tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                    ArrayRef<unsigned> tileSizes,
                    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LogicalResult tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
/// and 'forOpB' are part of a perfectly nested sequence of loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);

/// Checks if the loop interchange permutation 'loopPermMap', of the perfectly
/// nested sequence of loops in 'loops', would violate dependences (loop 'i' in
/// 'loops' is mapped to location 'j = loopPermMap[i]' in the interchange).
bool isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
                                       ArrayRef<unsigned> loopPermMap);

/// Performs a loop permutation on a perfectly nested loop nest `inputNest`
/// (where the contained loops appear from outer to inner) as specified by the
/// permutation `permMap`: loop 'i' in `inputNest` is mapped to location
/// 'permMap[i]', where positions 0, 1, ... are from the outermost position to
/// inner. Returns the position in `inputNest` of the AffineForOp that becomes
/// the new outermost loop of this nest. This method always succeeds; it
/// asserts on invalid input / specifications.
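///
/// A minimal sketch that interchanges the two loops of a 2-d nest `nest`
/// (an assumed two-element array of AffineForOp) only when it is legal:
///
/// ```
/// unsigned perm[] = {1, 0}; // Outer loop goes to position 1, inner to 0.
/// if (isValidLoopInterchangePermutation(nest, perm))
///   permuteLoops(nest, perm);
/// ```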
unsigned permuteLoops(ArrayRef<AffineForOp> inputNest,
                      ArrayRef<unsigned> permMap);

/// Sinks all sequential loops to the innermost levels (while preserving
/// relative order among them) and moves all parallel loops to the outermost
/// (while again preserving relative order among them). Returns the AffineForOp
/// that is the root of the new loop nest after the interchanges.
AffineForOp sinkSequentialLoops(AffineForOp forOp);

/// Performs tiling of imperfectly nested loops (with interchange) by
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new AffineForOps, one per each of (`forOps`, `targets`) pair,
/// nested immediately under each of `targets`.
SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                 ArrayRef<uint64_t> sizes,
                                                 ArrayRef<AffineForOp> targets);

/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
/// Returns the new AffineForOps, one per `forOps`, nested immediately under
/// `target`.
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                 ArrayRef<uint64_t> sizes, AffineForOp target);

/// Explicit copy / DMA generation options for affineDataCopyGenerate.
struct AffineCopyOptions {
  // True if DMAs should be generated instead of point-wise copies.
  bool generateDma;
  // The slower memory space from which data is to be moved.
  unsigned slowMemorySpace;
  // Memory space of the faster one (typically a scratchpad).
  unsigned fastMemorySpace;
  // Memory space to place tags in: only meaningful for DMAs.
  unsigned tagMemorySpace;
  // Capacity of the fast memory space in bytes.
  uint64_t fastMemCapacityBytes;
};

/// Performs explicit copying for the contiguous sequence of operations in the
/// block iterator range [`begin`, `end`), where `end` can't be past the
/// terminator of the block (since additional operations are potentially
/// inserted right before `end`). `copyOptions` provides various parameters,
/// and the output argument `copyNests` is the set of all copy nests inserted,
/// each represented by its root affine.for. Since we generate allocs and
/// deallocs for all fast buffers (before and after the range of operations,
/// respectively, or at a hoisted position), all of the fast memory capacity
/// is assumed to be available for processing this block range. When
/// `filterMemRef` is specified, copies are only generated for the provided
/// MemRef. Returns success if the explicit copying succeeded for all memrefs
/// on which affine loads/stores were encountered. For memrefs whose element
/// type's size in bytes can't be computed (e.g., the `index` type), their
/// capacity is not accounted for and the `fastMemCapacityBytes` copy option
/// is non-functional in such cases.
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// A convenience version of affineDataCopyGenerate for all ops in the body of
/// an AffineForOp.
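///
/// A minimal sketch that copies the data accessed in `forOp`'s body into a
/// faster memory space (all option values below are illustrative):
///
/// ```
/// AffineCopyOptions options = {/*generateDma=*/false, /*slowMemorySpace=*/0,
///                              /*fastMemorySpace=*/1, /*tagMemorySpace=*/0,
///                              /*fastMemCapacityBytes=*/64 * 1024};
/// DenseSet<Operation *> copyNests;
/// (void)affineDataCopyGenerate(forOp, options, /*filterMemRef=*/std::nullopt,
///                              copyNests);
/// ```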
LogicalResult affineDataCopyGenerate(AffineForOp forOp,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// Result of calling generateCopyForMemRegion.
struct CopyGenerateResult {
  // Number of bytes used by alloc.
  uint64_t sizeInBytes;

  // The newly created buffer allocation.
  Operation *alloc;

  // Generated loop nest for copying data between the allocated buffer and the
  // original memref.
  Operation *copyNest;
};

/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
/// with a single memref region. `memrefRegion` is supposed to contain analysis
/// information within `analyzedOp`. The generated prologue and epilogue always
/// surround `analyzedOp`.
///
/// Note that `analyzedOp` is a single op for API convenience, and the
/// [begin, end) version can be added as needed.
///
/// Also note that certain options in `copyOptions`, such as `slowMemorySpace`,
/// are no longer taken into account.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
                                       Operation *analyzedOp,
                                       const AffineCopyOptions &copyOptions,
                                       CopyGenerateResult &result);

/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
/// `loops` contains a list of perfectly nested loops outermost to innermost
/// that are normalized (step one and lower bound of zero) and with bounds and
/// steps independent of any loop induction variable involved in the nest.
/// Coalescing affine.for loops is not always possible, i.e., the result may not
/// be representable using affine.for.
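///
/// A minimal sketch, assuming `root` is the outermost loop of such a
/// normalized perfect nest:
///
/// ```
/// SmallVector<AffineForOp, 4> nest;
/// getPerfectlyNestedLoops(nest, root);
/// if (failed(coalesceLoops(nest)))
///   return; // The nest could not be expressed as a single affine.for.
/// ```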
LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);

/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
/// size given by `numProcessors`. This is achieved by embedding the SSA values
/// corresponding to `processorIds` and `numProcessors` into the bounds and step
/// of the `forOp`. No check is performed on the legality of the rewrite; it is
/// the caller's responsibility to ensure legality.
///
/// Requires that `processorIds` and `numProcessors` have the same size and that
/// for each idx, `processorIds[idx]` takes, at runtime, all values between 0
/// and `numProcessors[idx] - 1`. This corresponds to traditional use cases for:
///   1. GPU (threadIdx, get_local_id(), ...)
///   2. MPI (MPI_Comm_rank)
///   3. OpenMP (omp_get_thread_num)
///
/// Example:
/// Assuming a 2-d grid with processorIds = [blockIdx.x, threadIdx.x] and
/// numProcessors = [gridDim.x, blockDim.x], the loop:
///
/// ```
///    scf.for %i = %lb to %ub step %step {
///      ...
///    }
/// ```
///
/// is rewritten into a version resembling the following pseudo-IR:
///
/// ```
///    scf.for %i = %lb + %step * (threadIdx.x + blockIdx.x * blockDim.x)
///       to %ub step %gridDim.x * blockDim.x * %step {
///      ...
///    }
/// ```
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
                           ArrayRef<Value> numProcessors);

/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
void gatherLoops(func::FuncOp func,
                 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops);

/// Creates an AffineForOp while ensuring that the lower and upper bounds are
/// canonicalized, i.e., unused and duplicate operands are removed, any constant
/// operands propagated/folded in, and duplicate bound maps dropped.
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc,
                                           ValueRange lbOperands,
                                           AffineMap lbMap,
                                           ValueRange ubOperands,
                                           AffineMap ubMap, int64_t step = 1);

/// Separates full tiles from partial tiles for a perfect nest `nest` by
/// generating a conditional guard that selects between the full tile version
/// and the partial tile version using an AffineIfOp. The original loop nest
/// is replaced by this guarded two-version form.
///
///   affine.if (cond)
///     // full tile
///   else
///     // partial tile
///
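/// A minimal sketch, assuming `nest` holds a tiled perfect nest collected via
/// getPerfectlyNestedLoops:
///
/// ```
/// SmallVector<AffineForOp, 4> fullTileNest;
/// if (failed(separateFullTiles(nest, &fullTileNest)))
///   return; // Separation was not possible.
/// // `fullTileNest` now refers to the full-tile version of the nest.
/// ```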
LogicalResult
separateFullTiles(MutableArrayRef<AffineForOp> nest,
                  SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);

/// Walks an affine.for to find a perfectly nested band of loops and coalesces
/// it into a single loop.
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);

/// Counts the number of loops surrounding `operand` such that `operand` could
/// be hoisted above them, stopping at the first loop over which the operand
/// cannot be hoisted. This counts any LoopLikeOpInterface, not just
/// affine.for.
int64_t numEnclosingInvariantLoops(OpOperand &operand);
} // namespace affine
} // namespace mlir

#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H