//===- LoopUtils.h - Loop transformation utilities --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes for various loop transformation utility
// methods: these are not passes by themselves but are used either by passes,
// optimization sequences, or in turn by other transformation utilities.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AFFINE_LOOPUTILS_H
#define MLIR_DIALECT_AFFINE_LOOPUTILS_H

#include "mlir/IR/Block.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include <optional>

namespace mlir {
class AffineMap;
class LoopLikeOpInterface;
class OpBuilder;
class Value;
class ValueRange;

namespace func {
class FuncOp;
} // namespace func

namespace scf {
class ForOp;
class ParallelOp;
} // namespace scf

namespace affine {
class AffineForOp;
struct MemRefRegion;

/// Unrolls this for operation completely if the trip count is known to be
/// constant. Returns failure otherwise.
LogicalResult loopUnrollFull(AffineForOp forOp);

/// Unrolls this for operation by the specified unroll factor. Returns failure
/// if the loop cannot be unrolled either due to restrictions or due to invalid
/// unroll factors. Requires positive loop bounds and step. If specified,
/// annotates the Ops in each unrolled iteration by applying `annotateFn`.
/// When `cleanUpUnroll` is true, the cleanup (residual) loop is unrolled
/// regardless of the unroll factor.
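///
/// For illustration, a sketch of unrolling a simple loop by a factor of 2
/// (pseudo-IR; "test.foo" stands in for an arbitrary operation):
///
/// ```
/// affine.for %i = 0 to 16 {
///   "test.foo"(%i) : (index) -> ()
/// }
/// ```
///
/// becomes
///
/// ```
/// affine.for %i = 0 to 16 step 2 {
///   "test.foo"(%i) : (index) -> ()
///   %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///   "test.foo"(%i1) : (index) -> ()
/// }
/// ```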
LogicalResult loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
    bool cleanUpUnroll = false);

/// Unrolls this loop by the specified unroll factor or its trip count,
/// whichever is lower.
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);

/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
/// in it from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);

/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested iff the first op in its body is
/// another AffineForOp and the second op is a terminator.
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                             AffineForOp root);

/// Unrolls and jams this loop by the specified factor. `forOp` can be a loop
/// with iteration arguments performing supported reductions and its inner loops
/// can have iteration arguments. Returns success if the loop is successfully
/// unroll-jammed.
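///
/// For illustration, a sketch of unroll-and-jam by a factor of 2 applied to
/// the outer loop of a 2-d nest (pseudo-IR; "test.foo" stands in for an
/// arbitrary operation):
///
/// ```
/// affine.for %i = 0 to 8 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// becomes
///
/// ```
/// affine.for %i = 0 to 8 step 2 {
///   affine.for %j = 0 to 8 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///     %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///     "test.foo"(%i1, %j) : (index, index) -> ()
///   }
/// }
/// ```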
LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
                                    uint64_t unrollJamFactor);

/// Unrolls and jams this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
                                      uint64_t unrollJamFactor);

/// Promotes the loop body of an AffineForOp to its containing block if the
/// loop is known to have a single iteration.
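///
/// For illustration (pseudo-IR; "test.foo" stands in for an arbitrary
/// operation), a single-iteration loop such as
///
/// ```
/// affine.for %i = 0 to 1 {
///   "test.foo"(%i) : (index) -> ()
/// }
/// ```
///
/// is promoted into the enclosing block, with %i replaced by its constant
/// lower bound value:
///
/// ```
/// %c0 = arith.constant 0 : index
/// "test.foo"(%c0) : (index) -> ()
/// ```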
LogicalResult promoteIfSingleIteration(AffineForOp forOp);

/// Promotes all single-iteration AffineForOps in the function, i.e., moves
/// their bodies into the containing block.
void promoteSingleIterationLoops(func::FuncOp f);

/// Skew the operations in an affine.for's body with the specified
/// operation-wise shifts. The shifts are with respect to the original execution
/// order, and are multiplied by the loop 'step' before being applied. If
/// `unrollPrologueEpilogue` is set, fully unroll the prologue and epilogue
/// loops when possible.
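///
/// For illustration, a rough sketch with shifts = [0, 1] and unit step
/// (pseudo-IR; "test.a" and "test.b" stand in for arbitrary operations, and
/// %c0/%c7 denote index constants 0 and 7): the second operation is delayed
/// by one iteration relative to the first, yielding a prologue, a steady-state
/// loop, and an epilogue:
///
/// ```
/// affine.for %i = 0 to 8 {
///   "test.a"(%i) : (index) -> ()
///   "test.b"(%i) : (index) -> ()
/// }
/// ```
///
/// becomes, roughly,
///
/// ```
/// "test.a"(%c0) : (index) -> ()
/// affine.for %i = 1 to 8 {
///   "test.a"(%i) : (index) -> ()
///   %im1 = affine.apply affine_map<(d0) -> (d0 - 1)>(%i)
///   "test.b"(%im1) : (index) -> ()
/// }
/// "test.b"(%c7) : (index) -> ()
/// ```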
LogicalResult affineForOpBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                                  bool unrollPrologueEpilogue = false);

/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
void getTileableBands(func::FuncOp f,
                      std::vector<SmallVector<AffineForOp, 6>> *bands);

/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops. This utility
/// doesn't check for the validity of tiling itself, but just performs it.
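///
/// For illustration, a sketch of tiling a 2-d band with tile sizes {32, 32}
/// (pseudo-IR; "test.foo" stands in for an arbitrary operation):
///
/// ```
/// affine.for %i = 0 to 128 {
///   affine.for %j = 0 to 128 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// becomes
///
/// ```
/// affine.for %ii = 0 to 128 step 32 {
///   affine.for %jj = 0 to 128 step 32 {
///     affine.for %i = %ii to affine_map<(d0) -> (d0 + 32)>(%ii) {
///       affine.for %j = %jj to affine_map<(d0) -> (d0 + 32)>(%jj) {
///         "test.foo"(%i, %j) : (index, index) -> ()
///       }
///     }
///   }
/// }
/// ```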
LogicalResult
tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                    ArrayRef<unsigned> tileSizes,
                    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LogicalResult tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest = nullptr);

/// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
/// and 'forOpB' are part of a perfectly nested sequence of loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);

/// Checks if the loop interchange permutation 'loopPermMap', of the perfectly
/// nested sequence of loops in 'loops', would violate dependences: loop 'i' in
/// 'loops' is mapped to location 'j = loopPermMap[i]' in the interchange.
/// Returns true if the permutation does not violate any dependences.
bool isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
                                       ArrayRef<unsigned> loopPermMap);

/// Performs a loop permutation on a perfectly nested loop nest `inputNest`
/// (where the contained loops appear from outer to inner) as specified by the
/// permutation `permMap`: loop 'i' in `inputNest` is mapped to location
/// 'permMap[i]', where positions 0, 1, ... run from the outermost position
/// inwards. Returns the position in `inputNest` of the AffineForOp that
/// becomes the new outermost loop of this nest. This method always succeeds
/// and asserts on invalid input / specifications.
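///
/// For illustration, with permMap = {1, 2, 0} on the 3-d nest below (pseudo-IR
/// with bounds elided), %i moves to position 1, %j to position 2, and %k to
/// position 0:
///
/// ```
/// affine.for %i ... {
///   affine.for %j ... {
///     affine.for %k ... {
///       ...
///     }
///   }
/// }
/// ```
///
/// becomes
///
/// ```
/// affine.for %k ... {
///   affine.for %i ... {
///     affine.for %j ... {
///       ...
///     }
///   }
/// }
/// ```
///
/// and the returned value is 2: the position in the input nest of %k, the new
/// outermost loop.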
unsigned permuteLoops(ArrayRef<AffineForOp> inputNest,
                      ArrayRef<unsigned> permMap);

/// Sinks all sequential loops to the innermost levels (while preserving
/// relative order among them) and moves all parallel loops to the outermost
/// (while again preserving relative order among them). Returns the AffineForOp
/// that is the root of the new loop nest after loop interchanges.
AffineForOp sinkSequentialLoops(AffineForOp forOp);

/// Performs tiling of imperfectly nested loops (with interchange) by
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new AffineForOps, one per (`forOps`, `targets`) pair, nested
/// immediately under each of `targets`.
SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                 ArrayRef<uint64_t> sizes,
                                                 ArrayRef<AffineForOp> targets);

/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
/// Returns the new AffineForOps, one per `forOps`, nested immediately under
/// `target`.
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                 ArrayRef<uint64_t> sizes, AffineForOp target);

/// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
struct AffineCopyOptions {
  // True if DMAs should be generated instead of point-wise copies.
  bool generateDma;
  // The slower memory space from which data is to be moved.
  unsigned slowMemorySpace;
  // The faster memory space to which data is moved (typically a scratchpad).
  unsigned fastMemorySpace;
  // Memory space to place tags in: only meaningful for DMAs.
  unsigned tagMemorySpace;
  // Capacity of the fast memory space in bytes.
  uint64_t fastMemCapacityBytes;
};

/// Performs explicit copying for the contiguous sequence of operations in the
/// block iterator range [`begin`, `end`), where `end` can't be past the
/// terminator of the block (since additional operations are potentially
/// inserted right before `end`). `copyOptions` provides various parameters, and
/// the output argument `copyNests` is the set of all copy nests inserted, each
/// represented by its root affine.for. Since allocs and deallocs are generated
/// for all fast buffers (before and after the range of operations resp., or at
/// a hoisted position), all of the fast memory capacity is assumed to be
/// available for processing this block range. When `filterMemRef` is specified,
/// copies are only generated for the provided memref. Returns success if the
/// explicit copying succeeded for all memrefs on which affine loads/stores were
/// encountered. For memrefs whose element type's size in bytes can't be
/// computed (e.g., `index` type), their capacity is not accounted for and the
/// `fastMemCapacityBytes` copy option is non-functional in such cases.
LogicalResult affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// A convenience version of affineDataCopyGenerate for all ops in the body of
/// an AffineForOp.
LogicalResult affineDataCopyGenerate(AffineForOp forOp,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests);

/// Result of a call to generateCopyForMemRegion.
struct CopyGenerateResult {
  // Number of bytes used by alloc.
  uint64_t sizeInBytes;

  // The newly created buffer allocation.
  Operation *alloc;

  // Generated loop nest for copying data between the allocated buffer and the
  // original memref.
  Operation *copyNest;
};

/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
/// with a single memref region. `memrefRegion` is expected to contain analysis
/// information gathered within `analyzedOp`. The generated prologue and
/// epilogue always surround `analyzedOp`.
///
/// Note that `analyzedOp` is a single op for API convenience, and the
/// [begin, end) version can be added as needed.
///
/// Also note that certain options in `copyOptions`, such as `slowMemorySpace`,
/// are no longer used.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
                                       Operation *analyzedOp,
                                       const AffineCopyOptions &copyOptions,
                                       CopyGenerateResult &result);

/// Replace a perfect nest of "for" loops with a single linearized loop. Assumes
/// `loops` contains a list of perfectly nested loops outermost to innermost
/// that are normalized (step one and lower bound of zero) and with bounds and
/// steps independent of any loop induction variable involved in the nest.
/// Coalescing affine.for loops is not always possible, i.e., the result may not
/// be representable using affine.for.
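///
/// For illustration, a sketch of coalescing a normalized 2-d nest (pseudo-IR;
/// "test.foo" stands in for an arbitrary operation):
///
/// ```
/// affine.for %i = 0 to 16 {
///   affine.for %j = 0 to 64 {
///     "test.foo"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// becomes a single loop over the product of the trip counts, with the
/// original induction variables recovered by de-linearizing the new one:
///
/// ```
/// affine.for %iv = 0 to 1024 {
///   %i = affine.apply affine_map<(d0) -> (d0 floordiv 64)>(%iv)
///   %j = affine.apply affine_map<(d0) -> (d0 mod 64)>(%iv)
///   "test.foo"(%i, %j) : (index, index) -> ()
/// }
/// ```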
LogicalResult coalesceLoops(MutableArrayRef<AffineForOp> loops);

/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
/// size given by `numProcessors`. This is achieved by embedding the SSA values
/// corresponding to `processorIds` and `numProcessors` into the bounds and step
/// of the `forOp`. No check is performed on the legality of the rewrite; it is
/// the caller's responsibility to ensure legality.
///
/// Requires that `processorIds` and `numProcessors` have the same size and that
/// for each idx, `processorIds`[idx] takes, at runtime, all values between 0
/// and `numProcessors`[idx] - 1. This corresponds to traditional use cases for:
///   1. GPU (threadIdx, get_local_id(), ...)
///   2. MPI (MPI_Comm_rank)
///   3. OpenMP (omp_get_thread_num)
///
/// Example:
/// Assuming a 2-d grid with processorIds = [blockIdx.x, threadIdx.x] and
/// numProcessors = [gridDim.x, blockDim.x], the loop:
///
/// ```
/// scf.for %i = %lb to %ub step %step {
///   ...
/// }
/// ```
///
/// is rewritten into a version resembling the following pseudo-IR:
///
/// ```
/// scf.for %i = %lb + %step * (threadIdx.x + blockIdx.x * blockDim.x)
///    to %ub step %gridDim.x * blockDim.x * %step {
///   ...
/// }
/// ```
void mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorIds,
                           ArrayRef<Value> numProcessors);

/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
void gatherLoops(func::FuncOp func,
                 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops);

/// Creates an AffineForOp while ensuring that the lower and upper bounds are
/// canonicalized, i.e., unused and duplicate operands are removed, any constant
/// operands propagated/folded in, and duplicate bound maps dropped.
AffineForOp createCanonicalizedAffineForOp(OpBuilder b, Location loc,
                                           ValueRange lbOperands,
                                           AffineMap lbMap,
                                           ValueRange ubOperands,
                                           AffineMap ubMap, int64_t step = 1);

/// Separates full tiles from partial tiles for a perfect nest `nest` by
/// generating a conditional guard that selects between the full tile version
/// and the partial tile version using an AffineIfOp. The original loop nest
/// is replaced by this guarded two-version form.
///
///    affine.if (cond)
///      // full_tile
///    else
///      // partial tile
///
LogicalResult
separateFullTiles(MutableArrayRef<AffineForOp> nest,
                  SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);

/// Walks `op` to find a perfectly nested band of affine.for loops and
/// coalesces that band into a single loop.
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);

/// Counts the number of loops surrounding `operand` above which the operand
/// could be hoisted. Counting stops at the first loop over which the operand
/// cannot be hoisted. Any LoopLikeOpInterface is counted, not just affine.for
/// operations.
int64_t numEnclosingInvariantLoops(OpOperand &operand);
} // namespace affine
} // namespace mlir

#endif // MLIR_DIALECT_AFFINE_LOOPUTILS_H