//===- LoopUtils.h - Loop transformation utilities --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes for various loop transformation utility
// methods: these are not passes by themselves but are used either by passes,
// optimization sequences, or in turn by other transformation utilities.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
#define MLIR_DIALECT_AFFINE_LOOPUTILS_H

#include "mlir/IR/Block.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include <optional>

namespace mlir {
class AffineMap;
class LoopLikeOpInterface;
class OpBuilder;
class Value;
class ValueRange;

namespace func {
class FuncOp;
} // namespace func

namespace scf {
class ForOp;
class ParallelOp;
} // namespace scf

namespace affine {
class AffineForOp;
struct MemRefRegion;

/// Unrolls this for operation completely if the trip count is known to be
/// constant. Returns failure otherwise.
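///
/// For example (an illustrative sketch; "test.foo" is a placeholder op and
/// the exact IR depends on subsequent folding), a loop with a constant trip
/// count of 2:
///
/// ```
/// affine.for %i = 0 to 2 {
///   "test.foo"(%i) : (index) -> ()
/// }
/// ```
///
/// is fully unrolled into approximately:
///
/// ```
/// %c0 = arith.constant 0 : index
/// "test.foo"(%c0) : (index) -> ()
/// %0 = affine.apply affine_map<(d0) -> (d0 + 1)>(%c0)
/// "test.foo"(%0) : (index) -> ()
/// ```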
LogicalResult loopUnrollFull(AffineForOp forOp);

/// Unrolls this for operation by the specified unroll factor. Returns failure
/// if the loop cannot be unrolled either due to restrictions or due to invalid
/// unroll factors. Requires positive loop bounds and step. If specified,
/// annotates the Ops in each unrolled iteration by applying `annotateFn`.
/// When `cleanUpUnroll` is true, the cleanup loop (which handles the remaining
/// iterations when the trip count is not a multiple of the factor) is
/// guaranteed to be fully unrolled, regardless of the unroll factor.
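///
/// For example (a sketch with a placeholder "test.foo" op), unrolling the
/// loop below by a factor of 2 widens the step and replicates the body, with
/// the second copy using a shifted induction value:
///
/// ```
/// affine.for %i = 0 to 16 {
///   "test.foo"(%i) : (index) -> ()
/// }
/// ```
///
/// becomes approximately:
///
/// ```
/// affine.for %i = 0 to 16 step 2 {
///   "test.foo"(%i) : (index) -> ()
///   %0 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///   "test.foo"(%0) : (index) -> ()
/// }
/// ```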
LogicalResult loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
    bool cleanUpUnroll = false);

/// Unrolls this loop by the specified unroll factor or its trip count,
/// whichever is lower.
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);

/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
/// in it from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);

/// Gets the perfectly nested sequence of loops starting at `root`, from
/// outermost to innermost. A loop nest is perfectly nested iff each loop's
/// body contains only the next loop in the sequence followed by the
/// terminator.
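///
/// For example (with placeholder "test.*" ops), the nest below is perfectly
/// nested and the gathered sequence is [%i, %j]; the innermost loop's body
/// may contain arbitrary ops:
///
/// ```
/// affine.for %i = 0 to 10 {
///   affine.for %j = 0 to 20 {
///     "test.foo"() : () -> ()
///     "test.bar"() : () -> ()
///   }
/// }
/// ```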
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                             AffineForOp root);

/// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
/// with iteration arguments performing supported reductions and its inner loops
/// can have iteration arguments. Returns success if the loop is successfully
/// unroll-jammed.
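///
/// For example (a sketch with a placeholder "test.foo" op), unroll-and-jam by
/// 2 widens the outer step, duplicates the outer-loop body, and fuses
/// ("jams") the copies inside the inner loop:
///
/// ```
/// affine.for %i = 0 to 16 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// becomes approximately:
///
/// ```
/// affine.for %i = 0 to 16 step 2 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///     %0 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///     "test.foo"(%0, %j) : (index, index) -> ()
///   }
/// }
/// ```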
LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
                                    uint64_t unrollJamFactor);

/// Unrolls and jams this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
                                      uint64_t unrollJamFactor);

/// Promotes the loop body of an AffineForOp to its containing block if the
/// loop is known to have a single iteration.
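///
/// For example (a sketch with a placeholder "test.foo" op), the
/// single-iteration loop:
///
/// ```
/// affine.for %i = 4 to 5 {
///   "test.foo"(%i) : (index) -> ()
/// }
/// ```
///
/// is promoted to its containing block, with the induction variable replaced
/// by the lower bound:
///
/// ```
/// %c4 = arith.constant 4 : index
/// "test.foo"(%c4) : (index) -> ()
/// ```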
LogicalResult promoteIfSingleIteration(AffineForOp forOp);

/// Promotes all single-iteration AffineForOps in the function, i.e., moves
/// their bodies into their containing blocks.
void promoteSingleIterationLoops(func::FuncOp f);

/// Skews the operations in an affine.for's body with the specified
/// operation-wise shifts. The shifts are with respect to the original execution
/// order, and are multiplied by the loop 'step' before being applied. If
/// `unrollPrologueEpilogue` is set, fully unrolls the prologue and epilogue
/// loops when possible.
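///
/// For example (an illustrative sketch with placeholder "test.a"/"test.b"
/// ops), applying shifts [0, 1] to the two ops below delays "test.b" by one
/// iteration relative to "test.a":
///
/// ```
/// affine.for %i = 0 to 8 {
///   "test.a"(%i) : (index) -> ()
///   "test.b"(%i) : (index) -> ()
/// }
/// ```
///
/// The result is a prologue executing "test.a" for the first iteration, a
/// steady-state loop executing "test.a"(%i) together with "test.b"(%i - 1),
/// and an epilogue executing "test.b" for the last iteration.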
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                  bool unrollPrologueEpilogue = false);

/// Identifies valid and profitable bands of loops to tile. This is currently
/// just a temporary placeholder to test the mechanics of tiled code
/// generation. Returns all maximal outermost perfect loop nests to tile.
void getTileableBands(func::FuncOp f,
                      std::vector<SmallVector<AffineForOp, 6>> *bands);

/// Tiles the specified band of perfectly nested loops, creating tile-space
/// loops and intra-tile loops. A band is a contiguous set of loops. This
/// utility doesn't check the validity of tiling itself; it just performs it.
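///
/// For example (a sketch with hypothetical map names #lb/#ub and a
/// placeholder "test.foo" op), tiling the 2-d nest below with tile sizes
/// [32, 32]:
///
/// ```
/// affine.for %i = 0 to 256 {
///   affine.for %j = 0 to 512 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// produces two tile-space loops stepping by 32 and two intra-tile loops:
///
/// ```
/// #lb = affine_map<(d0) -> (d0)>
/// #ub = affine_map<(d0) -> (d0 + 32)>
/// affine.for %ii = 0 to 256 step 32 {
///   affine.for %jj = 0 to 512 step 32 {
///     affine.for %i = #lb(%ii) to #ub(%ii) {
///       affine.for %j = #lb(%jj) to #ub(%jj) {
///         "test.foo"(%i, %j) : (index, index) -> ()
///       }
///     }
///   }
/// }
/// ```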
LogicalResult
tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                    ArrayRef<unsigned> tileSizes,
                    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Tiles the specified band of perfectly nested loops, creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LogicalResult tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Performs loop interchange on `forOpA` and `forOpB`. Requires that `forOpA`
/// and `forOpB` are part of a perfectly nested sequence of loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);

/// Checks if the loop interchange permutation `loopPermMap`, of the perfectly
/// nested sequence of loops in `loops`, would violate dependences (loop 'i' in
/// `loops` is mapped to location 'j = loopPermMap[i]' in the interchange).
bool isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
                                       ArrayRef<unsigned> loopPermMap);

/// Performs a loop permutation on a perfectly nested loop nest `inputNest`
/// (where the contained loops appear from outer to inner) as specified by the
/// permutation `permMap`: loop 'i' in `inputNest` is mapped to location
/// 'permMap[i]', where positions 0, 1, ... run from the outermost position
/// inwards. Returns the position in `inputNest` of the AffineForOp that
/// becomes the new outermost loop of this nest. This method always succeeds;
/// it asserts on invalid input / specifications.
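///
/// For example, for `inputNest` = [%i, %j, %k] (outermost to innermost) and
/// `permMap` = [2, 0, 1], %i moves to the innermost position, %j to the
/// outermost, and %k to the middle, yielding the order [%j, %k, %i]; the
/// return value is 1, the index of %j in `inputNest`.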
unsigned permuteLoops(ArrayRef<AffineForOp> inputNest,
                      ArrayRef<unsigned> permMap);

/// Sinks all sequential loops to the innermost levels (while preserving
/// relative order among them) and moves all parallel loops to the
/// outermost (while again preserving relative order among them). Returns the
/// AffineForOp at the root of the new loop nest after loop interchanges.
AffineForOp sinkSequentialLoops(AffineForOp forOp);

/// Performs tiling of imperfectly nested loops (with interchange) by
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new AffineForOps, one per each (`forOps`, `targets`) pair,
/// nested immediately under each of `targets`.
SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                 ArrayRef<uint64_t> sizes,
                                                 ArrayRef<AffineForOp> targets);

/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
/// Returns the new AffineForOps, one per `forOps`, nested immediately under
/// `target`.
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                 ArrayRef<uint64_t> sizes, AffineForOp target);

/// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
struct AffineCopyOptions {
  // True if DMAs should be generated instead of point-wise copies.
  bool generateDma;
  // The slower memory space from which data is to be moved.
  unsigned slowMemorySpace;
  // Memory space of the faster one (typically a scratchpad).
  unsigned fastMemorySpace;
  // Memory space to place tags in: only meaningful for DMAs.
  unsigned tagMemorySpace;
  // Capacity of the fast memory space in bytes.
  uint64_t fastMemCapacityBytes;
};

/// Performs explicit copying for the contiguous sequence of operations in the
/// block iterator range [`begin`, `end`), where `end` can't be past the
/// terminator of the block (since additional operations are potentially
/// inserted right before `end`). `copyOptions` provides various parameters,
/// and the output argument `copyNests` is the set of all copy nests inserted,
/// each represented by its root affine.for. Since we generate allocs and
/// deallocs for all fast buffers (before and after the range of operations
/// resp., or at a hoisted position), all of the fast memory capacity is
/// assumed to be available for processing this block range. When
/// `filterMemRef` is specified, copies are only generated for the provided
/// memref. Returns success if the explicit copying succeeded for all memrefs
/// on which affine loads/stores were encountered. For memrefs whose element
/// types have no computable size in bytes (e.g., `index` type), their
/// capacity is not accounted for, and the `fastMemCapacityBytes` copy option
/// is non-functional in such cases.
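///
/// For example (an illustrative sketch, assuming memory space 1 is the fast
/// space and %A is an affine-accessed memref<256xf32>), the generated IR
/// resembles:
///
/// ```
/// %buf = memref.alloc() : memref<256xf32, 1>
/// affine.for %i = 0 to 256 {   // copy-in nest
///   %v = affine.load %A[%i] : memref<256xf32>
///   affine.store %v, %buf[%i] : memref<256xf32, 1>
/// }
/// ...                          // original ops, now using %buf
/// memref.dealloc %buf : memref<256xf32, 1>
/// ```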
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// A convenience version of affineDataCopyGenerate for all ops in the body of
/// an AffineForOp.
LogicalResult affineDataCopyGenerate(AffineForOp forOp,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// Result for calling generateCopyForMemRegion.
struct CopyGenerateResult {
  // Number of bytes used by alloc.
  uint64_t sizeInBytes;

  // The newly created buffer allocation.
  Operation *alloc;

  // Generated loop nest for copying data between the allocated buffer and the
  // original memref.
  Operation *copyNest;
};

/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
/// with a single memref region. `memrefRegion` is expected to contain the
/// analysis information gathered within `analyzedOp`. The generated prologue
/// and epilogue always surround `analyzedOp`.
///
/// Note that `analyzedOp` is a single op for API convenience, and the
/// [begin, end) version can be added as needed.
///
/// Also note that certain options in `copyOptions` aren't looked at anymore,
/// like `slowMemorySpace`.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
                                       Operation *analyzedOp,
                                       const AffineCopyOptions &copyOptions,
                                       CopyGenerateResult &result);

/// Replaces a perfect nest of "for" loops with a single linearized loop.
/// Assumes `loops` contains a list of perfectly nested loops, outermost to
/// innermost, that are normalized (step one and lower bound of zero) and have
/// bounds and steps independent of any loop induction variable involved in
/// the nest. Coalescing affine.for loops is not always possible, i.e., the
/// result may not be representable using affine.for.
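///
/// For example (a sketch with a placeholder "test.foo" op), the normalized
/// 2-d nest below is replaced by a single loop of 16 * 64 = 1024 iterations
/// whose induction variable is de-linearized back into the original ones:
///
/// ```
/// affine.for %i = 0 to 16 {
///   affine.for %j = 0 to 64 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// becomes approximately:
///
/// ```
/// affine.for %iv = 0 to 1024 {
///   %i = affine.apply affine_map<(d0) -> (d0 floordiv 64)>(%iv)
///   %j = affine.apply affine_map<(d0) -> (d0 mod 64)>(%iv)
///   "test.foo"(%i, %j) : (index, index) -> ()
/// }
/// ```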
LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);

/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
/// size given by `numProcessors`. This is achieved by embedding the SSA values
/// corresponding to `processorIds` and `numProcessors` into the bounds and
/// step of the `forOp`. No check is performed on the legality of the rewrite;
/// it is the caller's responsibility to ensure legality.
///
/// Requires that `processorIds` and `numProcessors` have the same size and
/// that for each idx, `processorIds`[idx] takes, at runtime, all values
/// between 0 and `numProcessors`[idx] - 1. This corresponds to traditional
/// use cases for:
///   1. GPU (threadIdx, get_local_id(), ...)
///   2. MPI (MPI_Comm_rank)
///   3. OpenMP (omp_get_thread_num)
///
/// Example:
/// Assuming a 2-d grid with processorIds = [blockIdx.x, threadIdx.x] and
/// numProcessors = [gridDim.x, blockDim.x], the loop:
///
/// ```
/// scf.for %i = %lb to %ub step %step {
///   ...
/// }
/// ```
///
/// is rewritten into a version resembling the following pseudo-IR:
///
/// ```
/// scf.for %i = %lb + %step * (threadIdx.x + blockIdx.x * blockDim.x)
///     to %ub step %gridDim.x * blockDim.x * %step {
///   ...
/// }
/// ```
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
                           ArrayRef<Value> numProcessors);

/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
void gatherLoops(func::FuncOp func,
                 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops);

/// Creates an AffineForOp while ensuring that the lower and upper bounds are
/// canonicalized, i.e., unused and duplicate operands are removed, any constant
/// operands propagated/folded in, and duplicate bound maps dropped.
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc,
                                           ValueRange lbOperands,
                                           AffineMap lbMap,
                                           ValueRange ubOperands,
                                           AffineMap ubMap, int64_t step = 1);

/// Separates full tiles from partial tiles for a perfect nest `nest` by
/// generating a conditional guard that selects between the full tile version
/// and the partial tile version using an AffineIfOp. The original loop nest
/// is replaced by this guarded two-version form.
///
///    affine.if (cond)
///      // full_tile
///    else
///      // partial tile
///
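/// For example (an illustrative sketch), for a tile-space loop %ii with upper
/// bound 50 and a tile size of 32, the guard tests whether a full 32-wide
/// tile fits, i.e., %ii + 32 <= 50:
///
/// ```
/// affine.if affine_set<(d0) : (-d0 + 18 >= 0)>(%ii) {
///   // full tile: intra-tile loop runs exactly 32 iterations
/// } else {
///   // partial tile: intra-tile loop is bounded by min(%ii + 32, 50)
/// }
/// ```
///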
LogicalResult
separateFullTiles(MutableArrayRef<AffineForOp> nest,
                  SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);

/// Walks an affine.for to find a band to coalesce.
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);

/// Counts the number of loops surrounding `operand` that `operand` could be
/// hoisted above. Stops counting at the first loop over which the operand
/// cannot be hoisted. This counts any LoopLikeOpInterface, not just
/// affine.for.
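///
/// For example (a sketch with a placeholder "test.foo" op), the count for the
/// use of %v below is 2, since %v is defined above both loops and the use can
/// be hoisted over each of them:
///
/// ```
/// %v = arith.constant 1 : index
/// affine.for %i = 0 to 8 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%v) : (index) -> ()
///   }
/// }
/// ```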
int64_t numEnclosingInvariantLoops(OpOperand &operand);
} // namespace affine
} // namespace mlir

#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H