1 | //===- LoopTiling.cpp --- Loop tiling pass ------------------------------*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements a pass to tile loop nests. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "mlir/Dialect/Affine/Passes.h" |
14 | |
15 | #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" |
16 | #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" |
17 | #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" |
18 | #include "mlir/Dialect/Affine/Analysis/Utils.h" |
19 | #include "mlir/Dialect/Affine/IR/AffineOps.h" |
20 | #include "mlir/Dialect/Affine/IR/AffineValueMap.h" |
21 | #include "mlir/Dialect/Affine/LoopUtils.h" |
22 | #include "mlir/Dialect/Affine/Utils.h" |
23 | #include "mlir/Dialect/Func/IR/FuncOps.h" |
24 | #include "mlir/IR/Builders.h" |
25 | #include "mlir/IR/IRMapping.h" |
26 | #include "llvm/Support/CommandLine.h" |
27 | #include "llvm/Support/Debug.h" |
28 | #include <optional> |
29 | |
30 | namespace mlir { |
31 | namespace affine { |
32 | #define GEN_PASS_DEF_AFFINELOOPTILING |
33 | #include "mlir/Dialect/Affine/Passes.h.inc" |
34 | } // namespace affine |
35 | } // namespace mlir |
36 | |
37 | using namespace mlir; |
38 | using namespace mlir::affine; |
39 | |
40 | #define DEBUG_TYPE "affine-loop-tile" |
41 | |
42 | namespace { |
43 | |
44 | /// A pass to perform loop tiling on all suitable loop nests of a Function. |
45 | struct LoopTiling : public affine::impl::AffineLoopTilingBase<LoopTiling> { |
46 | LoopTiling() = default; |
47 | explicit LoopTiling(uint64_t cacheSizeBytes, bool avoidMaxMinBounds = true) |
48 | : avoidMaxMinBounds(avoidMaxMinBounds) { |
49 | this->cacheSizeInKiB = cacheSizeBytes / 1024; |
50 | } |
51 | |
52 | void runOnOperation() override; |
53 | void getTileSizes(ArrayRef<AffineForOp> band, |
54 | SmallVectorImpl<unsigned> *tileSizes); |
55 | |
56 | // Default tile size if nothing is provided. |
57 | constexpr static unsigned kDefaultTileSize = 4; |
58 | |
59 | // If true, tile sizes are set to avoid max/min in bounds if possible. |
60 | bool avoidMaxMinBounds = true; |
61 | }; |
62 | |
63 | } // namespace |
64 | |
65 | /// Creates a pass to perform loop tiling on all suitable loop nests of a |
66 | /// Function. |
67 | std::unique_ptr<OperationPass<func::FuncOp>> |
68 | mlir::affine::createLoopTilingPass(uint64_t cacheSizeBytes) { |
69 | return std::make_unique<LoopTiling>(args&: cacheSizeBytes); |
70 | } |
71 | std::unique_ptr<OperationPass<func::FuncOp>> |
72 | mlir::affine::createLoopTilingPass() { |
73 | return std::make_unique<LoopTiling>(); |
74 | } |
75 | |
76 | /// Reduces each tile size to the largest divisor of the corresponding trip |
77 | /// count (if the trip count is known). |
78 | static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band, |
79 | SmallVectorImpl<unsigned> *tileSizes) { |
80 | assert(band.size() == tileSizes->size() && "invalid tile size count" ); |
81 | for (unsigned i = 0, e = band.size(); i < e; i++) { |
82 | unsigned &tSizeAdjusted = (*tileSizes)[i]; |
83 | std::optional<uint64_t> mayConst = getConstantTripCount(band[i]); |
84 | if (!mayConst) |
85 | continue; |
86 | // Adjust the tile size to largest factor of the trip count less than |
87 | // tSize. |
88 | uint64_t constTripCount = *mayConst; |
89 | if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2) |
90 | tSizeAdjusted = constTripCount / 2; |
91 | while (constTripCount % tSizeAdjusted != 0) |
92 | tSizeAdjusted--; |
93 | } |
94 | } |
95 | |
96 | // Returns tile sizes to use. Checks CL options; if none are specified, sets it |
97 | // based on a simple model that looks at the memory footprint and determines |
98 | // tile sizes assuming identity accesses / 1:1 tile size proportional footprint |
99 | // along each of the dimensions being tiled. |
100 | // TODO: evolve this model. Tile size determination is a large area |
101 | // to play with in general. |
102 | void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band, |
103 | SmallVectorImpl<unsigned> *tileSizes) { |
104 | if (band.empty()) |
105 | return; |
106 | |
107 | // Use command-line tileSize for all loops if specified. |
108 | if (tileSize) { |
109 | tileSizes->assign(band.size(), tileSize); |
110 | return; |
111 | } |
112 | |
113 | // Use supplied tile sizes and fill them with default tile size if it's short. |
114 | if (!this->tileSizes.empty()) { |
115 | tileSizes->assign(this->tileSizes.begin(), this->tileSizes.end()); |
116 | tileSizes->resize(N: band.size(), NV: kDefaultTileSize); |
117 | return; |
118 | } |
119 | tileSizes->resize(N: band.size()); |
120 | |
121 | // If the cache size is zero, set the minimum valid tile size. No good reason |
122 | // to pick another specific size over this. |
123 | if (cacheSizeInKiB == 0) { |
124 | std::fill(first: tileSizes->begin(), last: tileSizes->end(), value: 1); |
125 | return; |
126 | } |
127 | |
128 | // The first loop in the band. |
129 | AffineForOp rootForOp = band[0]; |
130 | (void)rootForOp; |
131 | |
132 | // Obtain memory footprint and set tile sizes so that a tile fits in |
133 | // the cache size. This is an approximation with the assumption that the |
134 | // footprint increases with the tile size linearly in that dimension (i.e., |
135 | // assumes one-to-one access function). |
136 | std::optional<int64_t> fp = getMemoryFootprintBytes(band[0], 0); |
137 | if (!fp) { |
138 | // Fill with default tile sizes if footprint is unknown. |
139 | std::fill(first: tileSizes->begin(), last: tileSizes->end(), |
140 | value: LoopTiling::kDefaultTileSize); |
141 | if (avoidMaxMinBounds) |
142 | adjustToDivisorsOfTripCounts(band, tileSizes); |
143 | LLVM_DEBUG( |
144 | rootForOp.emitWarning("memory footprint unknown: using default tile " |
145 | "sizes adjusted to trip count divisors" )); |
146 | return; |
147 | } |
148 | |
149 | // Check how many times larger the cache size is when compared to footprint. |
150 | uint64_t cacheSizeBytes = cacheSizeInKiB * 1024; |
151 | uint64_t excessFactor = llvm::divideCeil(Numerator: *fp, Denominator: cacheSizeBytes); |
152 | if (excessFactor <= 1) { |
153 | // No need of any tiling - set tile size to 1. |
154 | std::fill(first: tileSizes->begin(), last: tileSizes->end(), value: 1); |
155 | return; |
156 | } |
157 | |
158 | // Divide all loops equally in an attempt to reduce footprint. |
159 | // TODO: this is approximate. Ideally, obtain reuse factor / |
160 | // profitability along each dimension and weight tile sizes based on that as |
161 | // one possible approach. Or compute a polynomial in tile sizes and solve for |
162 | // it. |
163 | |
164 | // For an n-d tileable band, compute the n^th root of the excess. |
165 | unsigned tSize = |
166 | static_cast<unsigned>(floorl(x: std::pow(x: excessFactor, y: 1.0 / band.size()))); |
167 | // We'll keep a running product to determine the last tile size better. |
168 | unsigned cumulProductOfTileSizes = 1; |
169 | for (unsigned i = 0, e = band.size(); i < e; i++) { |
170 | if (i < e - 1) |
171 | (*tileSizes)[i] = tSize; |
172 | else |
173 | // Set last tile size to cover the balance. |
174 | (*tileSizes)[i] = std::max( |
175 | a: 1U, b: static_cast<unsigned>(excessFactor / cumulProductOfTileSizes)); |
176 | cumulProductOfTileSizes *= (*tileSizes)[i]; |
177 | } |
178 | if (avoidMaxMinBounds) |
179 | adjustToDivisorsOfTripCounts(band, tileSizes); |
180 | } |
181 | |
182 | void LoopTiling::runOnOperation() { |
183 | // Bands of loops to tile. |
184 | std::vector<SmallVector<AffineForOp, 6>> bands; |
185 | getTileableBands(getOperation(), &bands); |
186 | |
187 | // Tile each band. |
188 | for (auto &band : bands) { |
189 | if (!isTilingValid(band)) { |
190 | band.front().emitRemark("tiling nest is invalid due to dependences" ); |
191 | continue; |
192 | } |
193 | |
194 | // Set up tile sizes; fill missing tile sizes at the end with default tile |
195 | // size or tileSize if one was provided. |
196 | SmallVector<unsigned, 6> tileSizes; |
197 | getTileSizes(band, &tileSizes); |
198 | if (llvm::DebugFlag) { |
199 | auto diag = band[0].emitRemark("using tile sizes [" ); |
200 | for (unsigned tSize : tileSizes) |
201 | diag << tSize << ' '; |
202 | diag << "]\n" ; |
203 | } |
204 | SmallVector<AffineForOp, 6> tiledNest; |
205 | if (failed(Result: tilePerfectlyNested(band, tileSizes, &tiledNest))) { |
206 | // An empty band always succeeds. |
207 | assert(!band.empty() && "guaranteed to succeed on empty bands" ); |
208 | LLVM_DEBUG(band.front()->emitRemark("loop tiling failed!\n" )); |
209 | continue; |
210 | } |
211 | |
212 | // Separate full and partial tiles. |
213 | if (separate) { |
214 | auto intraTileLoops = |
215 | MutableArrayRef<AffineForOp>(tiledNest).drop_front(N: band.size()); |
216 | if (failed(separateFullTiles(intraTileLoops))) { |
217 | assert(!intraTileLoops.empty() && |
218 | "guaranteed to succeed on empty bands" ); |
219 | LLVM_DEBUG(intraTileLoops.front()->emitRemark( |
220 | "separation post tiling failed!\n" )); |
221 | } |
222 | } |
223 | } |
224 | } |
225 | |
226 | constexpr unsigned LoopTiling::kDefaultTileSize; |
227 | |