| 1 | //===- LoopTiling.cpp --- Loop tiling pass ------------------------------*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements a pass to tile loop nests. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "mlir/Dialect/Affine/Passes.h" |
| 14 | |
| 15 | #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" |
| 16 | #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" |
| 17 | #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" |
| 18 | #include "mlir/Dialect/Affine/Analysis/Utils.h" |
| 19 | #include "mlir/Dialect/Affine/IR/AffineOps.h" |
| 20 | #include "mlir/Dialect/Affine/IR/AffineValueMap.h" |
| 21 | #include "mlir/Dialect/Affine/LoopUtils.h" |
| 22 | #include "mlir/Dialect/Affine/Utils.h" |
| 23 | #include "mlir/Dialect/Func/IR/FuncOps.h" |
| 24 | #include "mlir/IR/Builders.h" |
| 25 | #include "mlir/IR/IRMapping.h" |
| 26 | #include "llvm/Support/CommandLine.h" |
| 27 | #include "llvm/Support/Debug.h" |
| 28 | #include <optional> |
| 29 | |
| 30 | namespace mlir { |
| 31 | namespace affine { |
| 32 | #define GEN_PASS_DEF_AFFINELOOPTILING |
| 33 | #include "mlir/Dialect/Affine/Passes.h.inc" |
| 34 | } // namespace affine |
| 35 | } // namespace mlir |
| 36 | |
| 37 | using namespace mlir; |
| 38 | using namespace mlir::affine; |
| 39 | |
| 40 | #define DEBUG_TYPE "affine-loop-tile" |
| 41 | |
| 42 | namespace { |
| 43 | |
| 44 | /// A pass to perform loop tiling on all suitable loop nests of a Function. |
| 45 | struct LoopTiling : public affine::impl::AffineLoopTilingBase<LoopTiling> { |
| 46 | LoopTiling() = default; |
| 47 | explicit LoopTiling(uint64_t cacheSizeBytes, bool avoidMaxMinBounds = true) |
| 48 | : avoidMaxMinBounds(avoidMaxMinBounds) { |
| 49 | this->cacheSizeInKiB = cacheSizeBytes / 1024; |
| 50 | } |
| 51 | |
| 52 | void runOnOperation() override; |
| 53 | void getTileSizes(ArrayRef<AffineForOp> band, |
| 54 | SmallVectorImpl<unsigned> *tileSizes); |
| 55 | |
| 56 | // Default tile size if nothing is provided. |
| 57 | constexpr static unsigned kDefaultTileSize = 4; |
| 58 | |
| 59 | // If true, tile sizes are set to avoid max/min in bounds if possible. |
| 60 | bool avoidMaxMinBounds = true; |
| 61 | }; |
| 62 | |
| 63 | } // namespace |
| 64 | |
| 65 | /// Creates a pass to perform loop tiling on all suitable loop nests of a |
| 66 | /// Function. |
| 67 | std::unique_ptr<OperationPass<func::FuncOp>> |
| 68 | mlir::affine::createLoopTilingPass(uint64_t cacheSizeBytes) { |
| 69 | return std::make_unique<LoopTiling>(args&: cacheSizeBytes); |
| 70 | } |
| 71 | std::unique_ptr<OperationPass<func::FuncOp>> |
| 72 | mlir::affine::createLoopTilingPass() { |
| 73 | return std::make_unique<LoopTiling>(); |
| 74 | } |
| 75 | |
| 76 | /// Reduces each tile size to the largest divisor of the corresponding trip |
| 77 | /// count (if the trip count is known). |
| 78 | static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band, |
| 79 | SmallVectorImpl<unsigned> *tileSizes) { |
| 80 | assert(band.size() == tileSizes->size() && "invalid tile size count" ); |
| 81 | for (unsigned i = 0, e = band.size(); i < e; i++) { |
| 82 | unsigned &tSizeAdjusted = (*tileSizes)[i]; |
| 83 | std::optional<uint64_t> mayConst = getConstantTripCount(band[i]); |
| 84 | if (!mayConst) |
| 85 | continue; |
| 86 | // Adjust the tile size to largest factor of the trip count less than |
| 87 | // tSize. |
| 88 | uint64_t constTripCount = *mayConst; |
| 89 | if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2) |
| 90 | tSizeAdjusted = constTripCount / 2; |
| 91 | while (constTripCount % tSizeAdjusted != 0) |
| 92 | tSizeAdjusted--; |
| 93 | } |
| 94 | } |
| 95 | |
| 96 | // Returns tile sizes to use. Checks CL options; if none are specified, sets it |
| 97 | // based on a simple model that looks at the memory footprint and determines |
| 98 | // tile sizes assuming identity accesses / 1:1 tile size proportional footprint |
| 99 | // along each of the dimensions being tiled. |
| 100 | // TODO: evolve this model. Tile size determination is a large area |
| 101 | // to play with in general. |
| 102 | void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band, |
| 103 | SmallVectorImpl<unsigned> *tileSizes) { |
| 104 | if (band.empty()) |
| 105 | return; |
| 106 | |
| 107 | // Use command-line tileSize for all loops if specified. |
| 108 | if (tileSize) { |
| 109 | tileSizes->assign(band.size(), tileSize); |
| 110 | return; |
| 111 | } |
| 112 | |
| 113 | // Use supplied tile sizes and fill them with default tile size if it's short. |
| 114 | if (!this->tileSizes.empty()) { |
| 115 | tileSizes->assign(this->tileSizes.begin(), this->tileSizes.end()); |
| 116 | tileSizes->resize(N: band.size(), NV: kDefaultTileSize); |
| 117 | return; |
| 118 | } |
| 119 | tileSizes->resize(N: band.size()); |
| 120 | |
| 121 | // If the cache size is zero, set the minimum valid tile size. No good reason |
| 122 | // to pick another specific size over this. |
| 123 | if (cacheSizeInKiB == 0) { |
| 124 | std::fill(first: tileSizes->begin(), last: tileSizes->end(), value: 1); |
| 125 | return; |
| 126 | } |
| 127 | |
| 128 | // The first loop in the band. |
| 129 | AffineForOp rootForOp = band[0]; |
| 130 | (void)rootForOp; |
| 131 | |
| 132 | // Obtain memory footprint and set tile sizes so that a tile fits in |
| 133 | // the cache size. This is an approximation with the assumption that the |
| 134 | // footprint increases with the tile size linearly in that dimension (i.e., |
| 135 | // assumes one-to-one access function). |
| 136 | std::optional<int64_t> fp = getMemoryFootprintBytes(band[0], 0); |
| 137 | if (!fp) { |
| 138 | // Fill with default tile sizes if footprint is unknown. |
| 139 | std::fill(first: tileSizes->begin(), last: tileSizes->end(), |
| 140 | value: LoopTiling::kDefaultTileSize); |
| 141 | if (avoidMaxMinBounds) |
| 142 | adjustToDivisorsOfTripCounts(band, tileSizes); |
| 143 | LLVM_DEBUG( |
| 144 | rootForOp.emitWarning("memory footprint unknown: using default tile " |
| 145 | "sizes adjusted to trip count divisors" )); |
| 146 | return; |
| 147 | } |
| 148 | |
| 149 | // Check how many times larger the cache size is when compared to footprint. |
| 150 | uint64_t cacheSizeBytes = cacheSizeInKiB * 1024; |
| 151 | uint64_t excessFactor = llvm::divideCeil(Numerator: *fp, Denominator: cacheSizeBytes); |
| 152 | if (excessFactor <= 1) { |
| 153 | // No need of any tiling - set tile size to 1. |
| 154 | std::fill(first: tileSizes->begin(), last: tileSizes->end(), value: 1); |
| 155 | return; |
| 156 | } |
| 157 | |
| 158 | // Divide all loops equally in an attempt to reduce footprint. |
| 159 | // TODO: this is approximate. Ideally, obtain reuse factor / |
| 160 | // profitability along each dimension and weight tile sizes based on that as |
| 161 | // one possible approach. Or compute a polynomial in tile sizes and solve for |
| 162 | // it. |
| 163 | |
| 164 | // For an n-d tileable band, compute the n^th root of the excess. |
| 165 | unsigned tSize = |
| 166 | static_cast<unsigned>(floorl(x: std::pow(x: excessFactor, y: 1.0 / band.size()))); |
| 167 | // We'll keep a running product to determine the last tile size better. |
| 168 | unsigned cumulProductOfTileSizes = 1; |
| 169 | for (unsigned i = 0, e = band.size(); i < e; i++) { |
| 170 | if (i < e - 1) |
| 171 | (*tileSizes)[i] = tSize; |
| 172 | else |
| 173 | // Set last tile size to cover the balance. |
| 174 | (*tileSizes)[i] = std::max( |
| 175 | a: 1U, b: static_cast<unsigned>(excessFactor / cumulProductOfTileSizes)); |
| 176 | cumulProductOfTileSizes *= (*tileSizes)[i]; |
| 177 | } |
| 178 | if (avoidMaxMinBounds) |
| 179 | adjustToDivisorsOfTripCounts(band, tileSizes); |
| 180 | } |
| 181 | |
| 182 | void LoopTiling::runOnOperation() { |
| 183 | // Bands of loops to tile. |
| 184 | std::vector<SmallVector<AffineForOp, 6>> bands; |
| 185 | getTileableBands(getOperation(), &bands); |
| 186 | |
| 187 | // Tile each band. |
| 188 | for (auto &band : bands) { |
| 189 | if (!isTilingValid(band)) { |
| 190 | band.front().emitRemark("tiling nest is invalid due to dependences" ); |
| 191 | continue; |
| 192 | } |
| 193 | |
| 194 | // Set up tile sizes; fill missing tile sizes at the end with default tile |
| 195 | // size or tileSize if one was provided. |
| 196 | SmallVector<unsigned, 6> tileSizes; |
| 197 | getTileSizes(band, &tileSizes); |
| 198 | if (llvm::DebugFlag) { |
| 199 | auto diag = band[0].emitRemark("using tile sizes [" ); |
| 200 | for (unsigned tSize : tileSizes) |
| 201 | diag << tSize << ' '; |
| 202 | diag << "]\n" ; |
| 203 | } |
| 204 | SmallVector<AffineForOp, 6> tiledNest; |
| 205 | if (failed(Result: tilePerfectlyNested(band, tileSizes, &tiledNest))) { |
| 206 | // An empty band always succeeds. |
| 207 | assert(!band.empty() && "guaranteed to succeed on empty bands" ); |
| 208 | LLVM_DEBUG(band.front()->emitRemark("loop tiling failed!\n" )); |
| 209 | continue; |
| 210 | } |
| 211 | |
| 212 | // Separate full and partial tiles. |
| 213 | if (separate) { |
| 214 | auto intraTileLoops = |
| 215 | MutableArrayRef<AffineForOp>(tiledNest).drop_front(N: band.size()); |
| 216 | if (failed(separateFullTiles(intraTileLoops))) { |
| 217 | assert(!intraTileLoops.empty() && |
| 218 | "guaranteed to succeed on empty bands" ); |
| 219 | LLVM_DEBUG(intraTileLoops.front()->emitRemark( |
| 220 | "separation post tiling failed!\n" )); |
| 221 | } |
| 222 | } |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | constexpr unsigned LoopTiling::kDefaultTileSize; |
| 227 | |