//===- LoopUtils.h - Loop transformation utilities --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes for various loop transformation utility
// methods: these are not passes by themselves but are used either by passes,
// optimization sequences, or in turn by other transformation utilities.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
#define MLIR_DIALECT_AFFINE_LOOPUTILS_H

#include "mlir/IR/Block.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/RegionUtils.h"
#include <optional>

namespace mlir {
class AffineMap;
class LoopLikeOpInterface;
class OpBuilder;
class Value;
class ValueRange;

namespace func {
class FuncOp;
} // namespace func

namespace scf {
class ForOp;
class ParallelOp;
} // namespace scf

namespace affine {
class AffineForOp;
struct MemRefRegion;

/// Unrolls this for operation completely if the trip count is known to be
/// constant. Returns failure otherwise.
LogicalResult loopUnrollFull(AffineForOp forOp);

/// Unrolls this for operation by the specified unroll factor. Returns failure
/// if the loop cannot be unrolled either due to restrictions or due to invalid
/// unroll factors. Requires positive loop bounds and step. If specified,
/// annotates the Ops in each unrolled iteration by applying `annotateFn`.
/// When `cleanUpUnroll` is true, the cleanup loop that handles any remainder
/// iterations is itself fully unrolled, regardless of the unroll factor.
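///
/// For example, a sketch of the effect of unrolling a hypothetical loop by a
/// factor of 2 ("test.foo" is a placeholder op; the exact `affine.apply` maps
/// generated may differ):
///
/// ```
/// // Input:
/// affine.for %i = 0 to 8 {
///   "test.foo"(%i) : (index) -> ()
/// }
///
/// // After loopUnrollByFactor(forOp, /*unrollFactor=*/2):
/// affine.for %i = 0 to 8 step 2 {
///   "test.foo"(%i) : (index) -> ()
///   %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///   "test.foo"(%i1) : (index) -> ()
/// }
/// ```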
LogicalResult loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
    bool cleanUpUnroll = false);

/// Unrolls this loop by the specified unroll factor or its trip count,
/// whichever is lower.
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);

/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
/// in it from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);

/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested iff the first op in its body is
/// another AffineForOp and the second op is the terminator.
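///
/// For example, in the following hypothetical nest ("test.foo" is a
/// placeholder op), the loops over %i, %j, and %k form the perfectly nested
/// band collected into `nestedLoops` when `root` is the %i loop:
///
/// ```
/// affine.for %i = 0 to 32 {
///   affine.for %j = 0 to 32 {
///     affine.for %k = 0 to 32 {
///       "test.foo"(%i, %j, %k) : (index, index, index) -> ()
///     }
///   }
/// }
/// ```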
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                             AffineForOp root);

/// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
/// with iteration arguments performing supported reductions and its inner loops
/// can have iteration arguments. Returns success if the loop is successfully
/// unroll-jammed.
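///
/// For example, a sketch of unroll-and-jam by a factor of 2 on a hypothetical
/// two-level nest ("test.foo" is a placeholder op; the exact `affine.apply`
/// maps generated may differ):
///
/// ```
/// // Input:
/// affine.for %i = 0 to 8 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
///
/// // After loopUnrollJamByFactor(forOp, /*unrollJamFactor=*/2): the outer
/// // loop is unrolled and the copies of the inner loop body are jammed into
/// // a single inner loop.
/// affine.for %i = 0 to 8 step 2 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///     "test.foo"(%i1, %j) : (index, index) -> ()
///   }
/// }
/// ```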
LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
                                    uint64_t unrollJamFactor);

/// Unrolls and jams this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
                                      uint64_t unrollJamFactor);

/// Promotes the loop body of an AffineForOp to its containing block if the
/// loop is known to have a single iteration.
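///
/// For example, a sketch on a hypothetical single-iteration loop ("test.foo"
/// is a placeholder op; the induction variable is replaced by the lower bound
/// value):
///
/// ```
/// // Input:
/// affine.for %i = 0 to 1 {
///   "test.foo"(%i) : (index) -> ()
/// }
///
/// // After promoteIfSingleIteration(forOp):
/// %c0 = arith.constant 0 : index
/// "test.foo"(%c0) : (index) -> ()
/// ```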
LogicalResult promoteIfSingleIteration(AffineForOp forOp);

/// Promotes all single-iteration AffineForOps in the function, i.e., moves
/// their bodies into their containing blocks.
void promoteSingleIterationLoops(func::FuncOp f);

/// Skews the operations in an affine.for's body with the specified
/// operation-wise shifts. The shifts are with respect to the original execution
/// order, and are multiplied by the loop 'step' before being applied. If
/// `unrollPrologueEpilogue` is set, fully unroll the prologue and epilogue
/// loops when possible.
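///
/// For example, a sketch of skewing a hypothetical two-operation body with
/// shifts {0, 1} ("test.A"/"test.B" are placeholder ops; this is pseudo-IR,
/// and the actual prologue/epilogue structure and maps may differ):
///
/// ```
/// // Input:
/// affine.for %i = 0 to 8 {
///   "test.A"(%i) : (index) -> ()
///   "test.B"(%i) : (index) -> ()
/// }
///
/// // After affineForOpBodySkew(forOp, {0, 1}): "test.B" now executes one
/// // iteration later than "test.A", with a prologue and epilogue peeled off.
/// affine.for %i = 0 to 1 {       // prologue
///   "test.A"(%i) : (index) -> ()
/// }
/// affine.for %i = 1 to 8 {       // steady state
///   "test.A"(%i) : (index) -> ()
///   %im1 = affine.apply affine_map<(d0) -> (d0 - 1)>(%i)
///   "test.B"(%im1) : (index) -> ()
/// }
/// affine.for %i = 8 to 9 {       // epilogue
///   %im1 = affine.apply affine_map<(d0) -> (d0 - 1)>(%i)
///   "test.B"(%im1) : (index) -> ()
/// }
/// ```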
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                  bool unrollPrologueEpilogue = false);

/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
void getTileableBands(func::FuncOp f,
                      std::vector<SmallVector<AffineForOp, 6>> *bands);

/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops. This utility
/// doesn't check for the validity of tiling itself, but just performs it.
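///
/// For example, a sketch of tiling a hypothetical 2-d nest with tile sizes
/// {32, 32} ("test.foo" is a placeholder op); the result resembles the
/// following pseudo-IR:
///
/// ```
/// // Input:
/// affine.for %i = 0 to 256 {
///   affine.for %j = 0 to 256 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
///
/// // After tilePerfectlyNested(nest, {32, 32}): two tile-space loops stepping
/// // by the tile sizes, with intra-tile loops iterating within each tile.
/// #lb = affine_map<(d0) -> (d0)>
/// #ub = affine_map<(d0) -> (d0 + 32)>
/// affine.for %i = 0 to 256 step 32 {
///   affine.for %j = 0 to 256 step 32 {
///     affine.for %ii = #lb(%i) to #ub(%i) {
///       affine.for %jj = #lb(%j) to #ub(%j) {
///         "test.foo"(%ii, %jj) : (index, index) -> ()
///       }
///     }
///   }
/// }
/// ```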
LogicalResult
tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                    ArrayRef<unsigned> tileSizes,
                    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LogicalResult tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
/// and 'forOpB' are part of a perfectly nested sequence of loops.
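///
/// For example, on a hypothetical nest ("test.foo" is a placeholder op):
///
/// ```
/// // Input (forOpA = %i loop, forOpB = %j loop):
/// affine.for %i = 0 to 10 {
///   affine.for %j = 0 to 20 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
///
/// // After interchangeLoops(forOpA, forOpB):
/// affine.for %j = 0 to 20 {
///   affine.for %i = 0 to 10 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```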
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);

/// Checks if the loop interchange permutation 'loopPermMap', of the perfectly
/// nested sequence of loops in 'loops', would violate dependences: loop 'i' in
/// 'loops' is mapped to location 'j = loopPermMap[i]' in the interchange.
/// Returns true if the permutation does not violate any dependences.
bool isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
                                       ArrayRef<unsigned> loopPermMap);

/// Performs a loop permutation on a perfectly nested loop nest `inputNest`
/// (where the contained loops appear from outer to inner) as specified by the
/// permutation `permMap`: loop 'i' in `inputNest` is mapped to location
/// 'permMap[i]', where positions 0, 1, ... run from the outermost position
/// inwards. Returns the position in `inputNest` of the AffineForOp that
/// becomes the new outermost loop of this nest. This method always succeeds;
/// it asserts on invalid input / specifications.
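///
/// For example, with a hypothetical 3-d nest `inputNest` = (%i, %j, %k),
/// `permMap` = {1, 2, 0}, and a placeholder op "test.foo":
///
/// ```
/// // Input:
/// affine.for %i = 0 to 4 {
///   affine.for %j = 0 to 8 {
///     affine.for %k = 0 to 16 {
///       "test.foo"(%i, %j, %k) : (index, index, index) -> ()
///     }
///   }
/// }
///
/// // After permuteLoops(inputNest, {1, 2, 0}): %k becomes outermost, %i moves
/// // to position 1, and %j becomes innermost; the returned value is 2, the
/// // position of the %k loop in `inputNest`.
/// affine.for %k = 0 to 16 {
///   affine.for %i = 0 to 4 {
///     affine.for %j = 0 to 8 {
///       "test.foo"(%i, %j, %k) : (index, index, index) -> ()
///     }
///   }
/// }
/// ```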
unsigned permuteLoops(MutableArrayRef<AffineForOp> inputNest,
                      ArrayRef<unsigned> permMap);

/// Sinks all sequential loops to the innermost levels (while preserving
/// relative order among them) and moves all parallel loops to the outermost
/// (while again preserving relative order among them). Returns the AffineForOp
/// at the root of the new loop nest after the loop interchanges.
AffineForOp sinkSequentialLoops(AffineForOp forOp);

/// Performs tiling of imperfectly nested loops (with interchange) by
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new AffineForOps, one per (`forOps`, `targets`) pair, nested
/// immediately under each of the `targets`.
SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                 ArrayRef<uint64_t> sizes,
                                                 ArrayRef<AffineForOp> targets);

/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
/// Returns the new AffineForOps, one per `forOps`, nested immediately under
/// `target`.
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                 ArrayRef<uint64_t> sizes, AffineForOp target);

/// Explicit copy / DMA generation options for affine::affineDataCopyGenerate.
struct AffineCopyOptions {
  // True if DMAs should be generated instead of point-wise copies.
  bool generateDma;
  // The slower memory space from which data is to be moved.
  unsigned slowMemorySpace;
  // Memory space of the faster one (typically a scratchpad).
  unsigned fastMemorySpace;
  // Memory space to place tags in: only meaningful for DMAs.
  unsigned tagMemorySpace;
  // Capacity of the fast memory space in bytes.
  uint64_t fastMemCapacityBytes;
};

/// Performs explicit copying for the contiguous sequence of operations in the
/// block iterator range [`begin`, `end`), where `end` can't be past the
/// terminator of the block (since additional operations are potentially
/// inserted right before `end`). `copyOptions` provides various parameters, and
/// the output argument `copyNests` is the set of all copy nests inserted, each
/// represented by its root affine.for. Since we generate allocs and deallocs
/// for all fast buffers (before and after the range of operations respectively,
/// or at a hoisted position), all of the fast memory capacity is assumed to be
/// available for processing this block range. When `filterMemRef` is specified,
/// copies are only generated for the provided memref. Returns success if the
/// explicit copying succeeded for all memrefs on which affine loads/stores were
/// encountered. For memrefs whose element type's size in bytes can't be
/// computed (e.g., `index` type), the capacity is not accounted for, and the
/// `fastMemCapacityBytes` copy option is non-functional in such cases.
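///
/// For example, a rough sketch for a hypothetical read-only memref %A with
/// `fastMemorySpace` = 1 and `generateDma` = false; the generated IR resembles:
///
/// ```
/// %buf = memref.alloc() : memref<256xf32, 1>
/// // Copy-in nest:
/// affine.for %i = 0 to 256 {
///   %v = affine.load %A[%i] : memref<256xf32>
///   affine.store %v, %buf[%i] : memref<256xf32, 1>
/// }
/// // ... original operations, now rewritten to use %buf instead of %A ...
/// memref.dealloc %buf : memref<256xf32, 1>
/// ```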
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// A convenience version of affineDataCopyGenerate for all ops in the body of
/// an AffineForOp.
LogicalResult affineDataCopyGenerate(AffineForOp forOp,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// Result for calling generateCopyForMemRegion.
struct CopyGenerateResult {
  // Number of bytes used by alloc.
  uint64_t sizeInBytes;

  // The newly created buffer allocation.
  Operation *alloc;

  // Generated loop nest for copying data between the allocated buffer and the
  // original memref.
  Operation *copyNest;
};

/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
/// with a single memref region. `memrefRegion` is supposed to contain analysis
/// information within `analyzedOp`. The generated prologue and epilogue always
/// surround `analyzedOp`.
///
/// Note that `analyzedOp` is a single op for API convenience, and the
/// [begin, end) version can be added as needed.
///
/// Also note that certain options in `copyOptions`, such as `slowMemorySpace`,
/// are no longer used.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
                                       Operation *analyzedOp,
                                       const AffineCopyOptions &copyOptions,
                                       CopyGenerateResult &result);

/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
/// `loops` contains a list of perfectly nested loops outermost to innermost
/// that are normalized (step one and lower bound of zero) and with bounds and
/// steps independent of any loop induction variable involved in the nest.
/// Coalescing affine.for loops is not always possible, i.e., the result may not
/// be representable using affine.for.
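///
/// For example, a sketch of coalescing a hypothetical normalized 2-d nest
/// ("test.foo" is a placeholder op); the result resembles the following
/// pseudo-IR:
///
/// ```
/// // Input:
/// affine.for %i = 0 to 16 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
///
/// // After coalesceLoops(loops): a single loop over the product of the trip
/// // counts, with the original induction variables recovered by
/// // delinearization.
/// affine.for %iv = 0 to 128 {
///   %i = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%iv)
///   %j = affine.apply affine_map<(d0) -> (d0 mod 8)>(%iv)
///   "test.foo"(%i, %j) : (index, index) -> ()
/// }
/// ```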
LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);

/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
/// size given by `numProcessors`. This is achieved by embedding the SSA values
/// corresponding to `processorIds` and `numProcessors` into the bounds and step
/// of the `forOp`. No check is performed on the legality of the rewrite; it is
/// the caller's responsibility to ensure legality.
///
/// Requires that `processorIds` and `numProcessors` have the same size and that
/// for each idx, `processorIds`[idx] takes, at runtime, all values between 0
/// and `numProcessors`[idx] - 1. This corresponds to traditional use cases for:
///   1. GPU (threadIdx, get_local_id(), ...)
///   2. MPI (MPI_Comm_rank)
///   3. OpenMP (omp_get_thread_num)
///
/// Example:
/// Assuming a 2-d grid with processorIds = [blockIdx.x, threadIdx.x] and
/// numProcessors = [gridDim.x, blockDim.x], the loop:
///
/// ```
///    scf.for %i = %lb to %ub step %step {
///      ...
///    }
/// ```
///
/// is rewritten into a version resembling the following pseudo-IR:
///
/// ```
///    scf.for %i = %lb + %step * (threadIdx.x + blockIdx.x * blockDim.x)
///       to %ub step %gridDim.x * blockDim.x * %step {
///      ...
///    }
/// ```
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
                           ArrayRef<Value> numProcessors);

/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
void gatherLoops(func::FuncOp func,
                 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops);

/// Creates an AffineForOp while ensuring that the lower and upper bounds are
/// canonicalized, i.e., unused and duplicate operands are removed, any constant
/// operands propagated/folded in, and duplicate bound maps dropped.
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc,
                                           ValueRange lbOperands,
                                           AffineMap lbMap,
                                           ValueRange ubOperands,
                                           AffineMap ubMap, int64_t step = 1);

/// Separates full tiles from partial tiles for a perfect nest `nest` by
/// generating a conditional guard that selects between the full tile version
/// and the partial tile version using an AffineIfOp. The original loop nest
/// is replaced by this guarded two-version form.
///
/// affine.if (cond)
///   // full_tile
/// else
///   // partial tile
///
LogicalResult
separateFullTiles(MutableArrayRef<AffineForOp> nest,
                  SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);

/// Walks an affine.for op to find a perfectly nested band of loops and
/// coalesces them into a single loop.
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);

} // namespace affine
} // namespace mlir

#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H