//===- LoopUtils.h - Loop transformation utilities --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes for various loop transformation utility
// methods: these are not passes by themselves but are used either by passes,
// optimization sequences, or in turn by other transformation utilities.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
#define MLIR_DIALECT_AFFINE_LOOPUTILS_H

#include "mlir/IR/Block.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include <optional>

namespace mlir {
class AffineMap;
class LoopLikeOpInterface;
class OpBuilder;
class Value;
class ValueRange;

namespace func {
class FuncOp;
} // namespace func

namespace scf {
class ForOp;
class ParallelOp;
} // namespace scf

namespace affine {
class AffineForOp;
struct MemRefRegion;

/// Unrolls this for operation completely if the trip count is known to be
/// constant. Returns failure otherwise.
LogicalResult loopUnrollFull(AffineForOp forOp);

/// Unrolls this for operation by the specified unroll factor. Returns failure
/// if the loop cannot be unrolled either due to restrictions or due to invalid
/// unroll factors. Requires positive loop bounds and step. If specified,
/// annotates the Ops in each unrolled iteration by applying `annotateFn`.
/// When `cleanUpUnroll` is true, the cleanup (residual) loop is unrolled
/// regardless of the unroll factor.
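///
/// For illustration, a sketch of unrolling a simple loop by a factor of 2
/// (pseudo-IR; "test.foo" stands in for an arbitrary operation):
///
/// ```
/// affine.for %i = 0 to 16 {
///   "test.foo"(%i) : (index) -> ()
/// }
/// ```
///
/// becomes
///
/// ```
/// affine.for %i = 0 to 16 step 2 {
///   "test.foo"(%i) : (index) -> ()
///   %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///   "test.foo"(%i1) : (index) -> ()
/// }
/// ```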
LogicalResult loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
    bool cleanUpUnroll = false);

/// Unrolls this loop by the specified unroll factor or its trip count,
/// whichever is lower.
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);

/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
/// in it from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);

/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested iff the first op in its body is
/// another AffineForOp and the second op is a terminator.
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                             AffineForOp root);

/// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
/// with iteration arguments performing supported reductions and its inner loops
/// can have iteration arguments. Returns success if the loop is successfully
/// unroll-jammed.
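///
/// For illustration, a sketch of unroll-and-jam by a factor of 2 applied to
/// the outer loop of a 2-d nest (pseudo-IR; "test.foo" stands in for an
/// arbitrary operation):
///
/// ```
/// affine.for %i = 0 to 8 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// becomes
///
/// ```
/// affine.for %i = 0 to 8 step 2 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///     "test.foo"(%i1, %j) : (index, index) -> ()
///   }
/// }
/// ```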
LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
                                    uint64_t unrollJamFactor);

/// Unrolls and jams this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
                                      uint64_t unrollJamFactor);

/// Promotes the loop body of an AffineForOp to its containing block if the
/// loop is known to have a single iteration.
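///
/// For illustration (pseudo-IR; "test.foo" stands in for an arbitrary
/// operation), a single-iteration loop such as
///
/// ```
/// affine.for %i = 0 to 1 {
///   "test.foo"(%i) : (index) -> ()
/// }
/// ```
///
/// is promoted into the enclosing block, with %i replaced by its constant
/// lower bound value:
///
/// ```
/// %c0 = arith.constant 0 : index
/// "test.foo"(%c0) : (index) -> ()
/// ```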
LogicalResult promoteIfSingleIteration(AffineForOp forOp);

/// Promotes all single-iteration AffineForOps in the function, i.e., moves
/// their bodies into the containing block.
void promoteSingleIterationLoops(func::FuncOp f);

/// Skew the operations in an affine.for's body with the specified
/// operation-wise shifts. The shifts are with respect to the original execution
/// order, and are multiplied by the loop 'step' before being applied. If
/// `unrollPrologueEpilogue` is set, fully unroll the prologue and epilogue
/// loops when possible.
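///
/// For illustration, a rough sketch with shifts = [0, 1] and unit step
/// (pseudo-IR; "test.a" and "test.b" stand in for arbitrary operations, and
/// %c0/%c7 denote index constants 0 and 7): the second operation is delayed
/// by one iteration relative to the first, yielding a prologue, a steady-state
/// loop, and an epilogue:
///
/// ```
/// affine.for %i = 0 to 8 {
///   "test.a"(%i) : (index) -> ()
///   "test.b"(%i) : (index) -> ()
/// }
/// ```
///
/// becomes, roughly,
///
/// ```
/// "test.a"(%c0) : (index) -> ()
/// affine.for %i = 1 to 8 {
///   "test.a"(%i) : (index) -> ()
///   %im1 = affine.apply affine_map<(d0) -> (d0 - 1)>(%i)
///   "test.b"(%im1) : (index) -> ()
/// }
/// "test.b"(%c7) : (index) -> ()
/// ```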
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                  bool unrollPrologueEpilogue = false);

/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
void getTileableBands(func::FuncOp f,
                      std::vector<SmallVector<AffineForOp, 6>> *bands);

/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops. This utility
/// doesn't check for the validity of tiling itself, but just performs it.
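///
/// For illustration, a sketch of tiling a 2-d band with tile sizes {32, 32}
/// (pseudo-IR; "test.foo" stands in for an arbitrary operation):
///
/// ```
/// affine.for %i = 0 to 128 {
///   affine.for %j = 0 to 128 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// becomes
///
/// ```
/// affine.for %ii = 0 to 128 step 32 {
///   affine.for %jj = 0 to 128 step 32 {
///     affine.for %i = %ii to affine_map<(d0) -> (d0 + 32)>(%ii) {
///       affine.for %j = %jj to affine_map<(d0) -> (d0 + 32)>(%jj) {
///         "test.foo"(%i, %j) : (index, index) -> ()
///       }
///     }
///   }
/// }
/// ```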
LogicalResult
tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                    ArrayRef<unsigned> tileSizes,
                    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LogicalResult tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
/// and 'forOpB' are part of a perfectly nested sequence of loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);

/// Checks if the loop interchange permutation 'loopPermMap', of the perfectly
/// nested sequence of loops in 'loops', would violate dependences: loop 'i' in
/// 'loops' is mapped to location 'j = loopPermMap[i]' in the interchange.
/// Returns true if the permutation does not violate any dependences.
bool isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
                                       ArrayRef<unsigned> loopPermMap);

/// Performs a loop permutation on a perfectly nested loop nest `inputNest`
/// (where the contained loops appear from outer to inner) as specified by the
/// permutation `permMap`: loop 'i' in `inputNest` is mapped to location
/// 'permMap[i]', where positions 0, 1, ... run from the outermost position
/// inwards. Returns the position in `inputNest` of the AffineForOp that
/// becomes the new outermost loop of this nest. This method always succeeds
/// and asserts on invalid input / specifications.
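///
/// For illustration, with permMap = {1, 2, 0} on the 3-d nest below (pseudo-IR
/// with bounds elided), %i moves to position 1, %j to position 2, and %k to
/// position 0:
///
/// ```
/// affine.for %i ... {
///   affine.for %j ... {
///     affine.for %k ... {
///       ...
///     }
///   }
/// }
/// ```
///
/// becomes
///
/// ```
/// affine.for %k ... {
///   affine.for %i ... {
///     affine.for %j ... {
///       ...
///     }
///   }
/// }
/// ```
///
/// and the returned value is 2: the position in the input nest of %k, the new
/// outermost loop.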
unsigned permuteLoops(ArrayRef<AffineForOp> inputNest,
                      ArrayRef<unsigned> permMap);

/// Sinks all sequential loops to the innermost levels (while preserving
/// relative order among them) and moves all parallel loops to the outermost
/// (while again preserving relative order among them). Returns the AffineForOp
/// that is the root of the new loop nest after loop interchanges.
AffineForOp sinkSequentialLoops(AffineForOp forOp);

/// Performs tiling of imperfectly nested loops (with interchange) by
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new AffineForOps, one per (`forOps`, `targets`) pair, nested
/// immediately under each of `targets`.
SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                 ArrayRef<uint64_t> sizes,
                                                 ArrayRef<AffineForOp> targets);

/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
/// Returns the new AffineForOps, one per `forOps`, nested immediately under
/// `target`.
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                 ArrayRef<uint64_t> sizes, AffineForOp target);

/// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
struct AffineCopyOptions {
  // True if DMAs should be generated instead of point-wise copies.
  bool generateDma;
  // The slower memory space from which data is to be moved.
  unsigned slowMemorySpace;
  // The faster memory space to which data is moved (typically a scratchpad).
  unsigned fastMemorySpace;
  // Memory space to place tags in: only meaningful for DMAs.
  unsigned tagMemorySpace;
  // Capacity of the fast memory space in bytes.
  uint64_t fastMemCapacityBytes;
};

/// Performs explicit copying for the contiguous sequence of operations in the
/// block iterator range [`begin`, `end`), where `end` can't be past the
/// terminator of the block (since additional operations are potentially
/// inserted right before `end`). `copyOptions` provides various parameters, and
/// the output argument `copyNests` is the set of all copy nests inserted, each
/// represented by its root affine.for. Since allocs and deallocs are generated
/// for all fast buffers (before and after the range of operations resp., or at
/// a hoisted position), all of the fast memory capacity is assumed to be
/// available for processing this block range. When `filterMemRef` is specified,
/// copies are only generated for the provided memref. Returns success if the
/// explicit copying succeeded for all memrefs on which affine loads/stores were
/// encountered. For memrefs whose element type's size in bytes can't be
/// computed (e.g., `index` type), their capacity is not accounted for and the
/// `fastMemCapacityBytes` copy option is non-functional in such cases.
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// A convenience version of affineDataCopyGenerate for all ops in the body of
/// an AffineForOp.
LogicalResult affineDataCopyGenerate(AffineForOp forOp,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// Result of a call to generateCopyForMemRegion.
struct CopyGenerateResult {
  // Number of bytes used by alloc.
  uint64_t sizeInBytes;

  // The newly created buffer allocation.
  Operation *alloc;

  // Generated loop nest for copying data between the allocated buffer and the
  // original memref.
  Operation *copyNest;
};

/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
/// with a single memref region. `memrefRegion` is expected to contain analysis
/// information gathered within `analyzedOp`. The generated prologue and
/// epilogue always surround `analyzedOp`.
///
/// Note that `analyzedOp` is a single op for API convenience, and the
/// [begin, end) version can be added as needed.
///
/// Also note that certain options in `copyOptions`, such as `slowMemorySpace`,
/// are no longer used.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
                                       Operation *analyzedOp,
                                       const AffineCopyOptions &copyOptions,
                                       CopyGenerateResult &result);

/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
/// `loops` contains a list of perfectly nested loops outermost to innermost
/// that are normalized (step one and lower bound of zero) and with bounds and
/// steps independent of any loop induction variable involved in the nest.
/// Coalescing affine.for loops is not always possible, i.e., the result may not
/// be representable using affine.for.
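///
/// For illustration, a sketch of coalescing a normalized 2-d nest (pseudo-IR;
/// "test.foo" stands in for an arbitrary operation):
///
/// ```
/// affine.for %i = 0 to 16 {
///   affine.for %j = 0 to 64 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// becomes a single loop over the product of the trip counts, with the
/// original induction variables recovered by de-linearizing the new one:
///
/// ```
/// affine.for %iv = 0 to 1024 {
///   %i = affine.apply affine_map<(d0) -> (d0 floordiv 64)>(%iv)
///   %j = affine.apply affine_map<(d0) -> (d0 mod 64)>(%iv)
///   "test.foo"(%i, %j) : (index, index) -> ()
/// }
/// ```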
LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);

/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
/// size given by `numProcessors`. This is achieved by embedding the SSA values
/// corresponding to `processorIds` and `numProcessors` into the bounds and step
/// of the `forOp`. No check is performed on the legality of the rewrite; it is
/// the caller's responsibility to ensure legality.
///
/// Requires that `processorIds` and `numProcessors` have the same size and that
/// for each idx, `processorIds`[idx] takes, at runtime, all values between 0
/// and `numProcessors`[idx] - 1. This corresponds to traditional use cases for:
///   1. GPU (threadIdx, get_local_id(), ...)
///   2. MPI (MPI_Comm_rank)
///   3. OpenMP (omp_get_thread_num)
///
/// Example:
/// Assuming a 2-d grid with processorIds = [blockIdx.x, threadIdx.x] and
/// numProcessors = [gridDim.x, blockDim.x], the loop:
///
/// ```
/// scf.for %i = %lb to %ub step %step {
///   ...
/// }
/// ```
///
/// is rewritten into a version resembling the following pseudo-IR:
///
/// ```
/// scf.for %i = %lb + %step * (threadIdx.x + blockIdx.x * blockDim.x)
///    to %ub step %gridDim.x * blockDim.x * %step {
///   ...
/// }
/// ```
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorIds,
                           ArrayRef<Value> numProcessors);

/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
void gatherLoops(func::FuncOp func,
                 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops);

/// Creates an AffineForOp while ensuring that the lower and upper bounds are
/// canonicalized, i.e., unused and duplicate operands are removed, any constant
/// operands propagated/folded in, and duplicate bound maps dropped.
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc,
                                           ValueRange lbOperands,
                                           AffineMap lbMap,
                                           ValueRange ubOperands,
                                           AffineMap ubMap, int64_t step = 1);

/// Separates full tiles from partial tiles for a perfect nest `nest` by
/// generating a conditional guard that selects between the full tile version
/// and the partial tile version using an AffineIfOp. The original loop nest
/// is replaced by this guarded two-version form.
///
///    affine.if (cond)
///      // full_tile
///    else
///      // partial tile
///
LogicalResult
separateFullTiles(MutableArrayRef<AffineForOp> nest,
                  SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);

/// Walks `op` to find a perfectly nested band of affine.for loops and
/// coalesces that band into a single loop.
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);

/// Counts the number of loops surrounding `operand` above which the operand
/// could be hoisted. Counting stops at the first loop over which the operand
/// cannot be hoisted. Any LoopLikeOpInterface is counted, not just affine.for
/// operations.
int64_t numEnclosingInvariantLoops(OpOperand &operand);
} // namespace affine
} // namespace mlir

#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H