1 | //===- LoopTiling.cpp --- Loop tiling pass ------------------------------*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements a pass to tile loop nests. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "mlir/Dialect/Affine/Passes.h" |
14 | |
15 | #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" |
16 | #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" |
17 | #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" |
18 | #include "mlir/Dialect/Affine/Analysis/Utils.h" |
19 | #include "mlir/Dialect/Affine/IR/AffineOps.h" |
20 | #include "mlir/Dialect/Affine/IR/AffineValueMap.h" |
21 | #include "mlir/Dialect/Affine/LoopUtils.h" |
22 | #include "mlir/Dialect/Affine/Utils.h" |
23 | #include "mlir/Dialect/Func/IR/FuncOps.h" |
24 | #include "mlir/IR/Builders.h" |
25 | #include "mlir/IR/IRMapping.h" |
26 | #include "llvm/Support/CommandLine.h" |
27 | #include "llvm/Support/Debug.h" |
28 | #include <optional> |
29 | |
30 | namespace mlir { |
31 | namespace affine { |
32 | #define GEN_PASS_DEF_AFFINELOOPTILING |
33 | #include "mlir/Dialect/Affine/Passes.h.inc" |
34 | } // namespace affine |
35 | } // namespace mlir |
36 | |
37 | using namespace mlir; |
38 | using namespace mlir::affine; |
39 | |
40 | #define DEBUG_TYPE "affine-loop-tile" |
41 | |
42 | namespace { |
43 | |
44 | /// A pass to perform loop tiling on all suitable loop nests of a Function. |
45 | struct LoopTiling : public affine::impl::AffineLoopTilingBase<LoopTiling> { |
46 | LoopTiling() = default; |
47 | explicit LoopTiling(uint64_t cacheSizeBytes, bool avoidMaxMinBounds = true) |
48 | : avoidMaxMinBounds(avoidMaxMinBounds) { |
49 | this->cacheSizeInKiB = cacheSizeBytes / 1024; |
50 | } |
51 | |
52 | void runOnOperation() override; |
53 | void getTileSizes(ArrayRef<AffineForOp> band, |
54 | SmallVectorImpl<unsigned> *tileSizes); |
55 | |
56 | // Default tile size if nothing is provided. |
57 | constexpr static unsigned kDefaultTileSize = 4; |
58 | |
59 | // If true, tile sizes are set to avoid max/min in bounds if possible. |
60 | bool avoidMaxMinBounds = true; |
61 | }; |
62 | |
63 | } // namespace |
64 | |
65 | /// Creates a pass to perform loop tiling on all suitable loop nests of a |
66 | /// Function. |
67 | std::unique_ptr<OperationPass<func::FuncOp>> |
68 | mlir::affine::createLoopTilingPass(uint64_t cacheSizeBytes) { |
69 | return std::make_unique<LoopTiling>(args&: cacheSizeBytes); |
70 | } |
71 | std::unique_ptr<OperationPass<func::FuncOp>> |
72 | mlir::affine::createLoopTilingPass() { |
73 | return std::make_unique<LoopTiling>(); |
74 | } |
75 | |
76 | /// Reduces each tile size to the largest divisor of the corresponding trip |
77 | /// count (if the trip count is known). |
78 | static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band, |
79 | SmallVectorImpl<unsigned> *tileSizes) { |
80 | assert(band.size() == tileSizes->size() && "invalid tile size count" ); |
81 | for (unsigned i = 0, e = band.size(); i < e; i++) { |
82 | unsigned &tSizeAdjusted = (*tileSizes)[i]; |
83 | std::optional<uint64_t> mayConst = getConstantTripCount(band[i]); |
84 | if (!mayConst) |
85 | continue; |
86 | // Adjust the tile size to largest factor of the trip count less than |
87 | // tSize. |
88 | uint64_t constTripCount = *mayConst; |
89 | if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2) |
90 | tSizeAdjusted = constTripCount / 2; |
91 | while (constTripCount % tSizeAdjusted != 0) |
92 | tSizeAdjusted--; |
93 | } |
94 | } |
95 | |
96 | /// Checks whether hyper-rectangular loop tiling of the nest represented by |
97 | /// `origLoops` is valid. The validity condition is from Irigoin and Triolet, |
98 | /// which states that two tiles cannot depend on each other. We simplify such |
99 | /// condition to just checking whether there is any negative dependence |
100 | /// direction, since we have the prior knowledge that the tiling results will be |
101 | /// hyper-rectangles, which are scheduled in the lexicographically increasing |
102 | /// order on the vector of loop indices. This function will return failure when |
103 | /// any dependence component is negative along any of `origLoops`. |
104 | static bool checkTilingLegality(MutableArrayRef<AffineForOp> origLoops) { |
105 | assert(!origLoops.empty() && "no original loops provided" ); |
106 | |
107 | // We first find out all dependences we intend to check. |
108 | SmallVector<Operation *, 8> loadAndStoreOps; |
109 | origLoops[0]->walk([&](Operation *op) { |
110 | if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) |
111 | loadAndStoreOps.push_back(op); |
112 | }); |
113 | |
114 | unsigned numOps = loadAndStoreOps.size(); |
115 | unsigned numLoops = origLoops.size(); |
116 | for (unsigned d = 1; d <= numLoops + 1; ++d) { |
117 | for (unsigned i = 0; i < numOps; ++i) { |
118 | Operation *srcOp = loadAndStoreOps[i]; |
119 | MemRefAccess srcAccess(srcOp); |
120 | for (unsigned j = 0; j < numOps; ++j) { |
121 | Operation *dstOp = loadAndStoreOps[j]; |
122 | MemRefAccess dstAccess(dstOp); |
123 | |
124 | SmallVector<DependenceComponent, 2> depComps; |
125 | DependenceResult result = checkMemrefAccessDependence( |
126 | srcAccess, dstAccess, loopDepth: d, /*dependenceConstraints=*/nullptr, |
127 | dependenceComponents: &depComps); |
128 | |
129 | // Skip if there is no dependence in this case. |
130 | if (!hasDependence(result)) |
131 | continue; |
132 | |
133 | // Check whether there is any negative direction vector in the |
134 | // dependence components found above, which means that dependence is |
135 | // violated by the default hyper-rect tiling method. |
136 | LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated " |
137 | "for dependence at depth: " |
138 | << Twine(d) << " between:\n" ;); |
139 | LLVM_DEBUG(srcAccess.opInst->dump();); |
140 | LLVM_DEBUG(dstAccess.opInst->dump();); |
141 | for (const DependenceComponent &depComp : depComps) { |
142 | if (depComp.lb.has_value() && depComp.ub.has_value() && |
143 | *depComp.lb < *depComp.ub && *depComp.ub < 0) { |
144 | LLVM_DEBUG(llvm::dbgs() |
145 | << "Dependence component lb = " << Twine(*depComp.lb) |
146 | << " ub = " << Twine(*depComp.ub) |
147 | << " is negative at depth: " << Twine(d) |
148 | << " and thus violates the legality rule.\n" ); |
149 | return false; |
150 | } |
151 | } |
152 | } |
153 | } |
154 | } |
155 | |
156 | return true; |
157 | } |
158 | |
159 | // Returns tile sizes to use. Checks CL options; if none are specified, sets it |
160 | // based on a simple model that looks at the memory footprint and determines |
161 | // tile sizes assuming identity accesses / 1:1 tile size proportional footprint |
162 | // along each of the dimensions being tiled. |
163 | // TODO: evolve this model. Tile size determination is a large area |
164 | // to play with in general. |
165 | void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band, |
166 | SmallVectorImpl<unsigned> *tileSizes) { |
167 | if (band.empty()) |
168 | return; |
169 | |
170 | // Use command-line tileSize for all loops if specified. |
171 | if (tileSize) { |
172 | tileSizes->assign(band.size(), tileSize); |
173 | return; |
174 | } |
175 | |
176 | // Use tileSizes and fill them with default tile size if it's short. |
177 | if (!this->tileSizes.empty()) { |
178 | tileSizes->assign(this->tileSizes.begin(), this->tileSizes.end()); |
179 | tileSizes->resize(N: band.size(), NV: kDefaultTileSize); |
180 | return; |
181 | } |
182 | tileSizes->resize(N: band.size()); |
183 | |
184 | // The first loop in the band. |
185 | AffineForOp rootForOp = band[0]; |
186 | (void)rootForOp; |
187 | |
188 | // Obtain memory footprint and set tile sizes so that a tile fits in |
189 | // the cache size. This is an approximation with the assumption that the |
190 | // footprint increases with the tile size linearly in that dimension (i.e., |
191 | // assumes one-to-one access function). |
192 | std::optional<int64_t> fp = getMemoryFootprintBytes(band[0], 0); |
193 | if (!fp) { |
194 | // Fill with default tile sizes if footprint is unknown. |
195 | std::fill(first: tileSizes->begin(), last: tileSizes->end(), |
196 | value: LoopTiling::kDefaultTileSize); |
197 | if (avoidMaxMinBounds) |
198 | adjustToDivisorsOfTripCounts(band, tileSizes); |
199 | LLVM_DEBUG( |
200 | rootForOp.emitWarning("memory footprint unknown: using default tile " |
201 | "sizes adjusted to trip count divisors" )); |
202 | return; |
203 | } |
204 | |
205 | // Check how many times larger the cache size is when compared to footprint. |
206 | uint64_t cacheSizeBytes = cacheSizeInKiB * 1024; |
207 | uint64_t excessFactor = llvm::divideCeil(Numerator: *fp, Denominator: cacheSizeBytes); |
208 | if (excessFactor <= 1) { |
209 | // No need of any tiling - set tile size to 1. |
210 | std::fill(first: tileSizes->begin(), last: tileSizes->end(), value: 1); |
211 | return; |
212 | } |
213 | |
214 | // Divide all loops equally in an attempt to reduce footprint. |
215 | // TODO: this is approximate. Ideally, obtain reuse factor / |
216 | // profitability along each dimension and weight tile sizes based on that as |
217 | // one possible approach. Or compute a polynomial in tile sizes and solve for |
218 | // it. |
219 | |
220 | // For an n-d tileable band, compute the n^th root of the excess. |
221 | unsigned tSize = |
222 | static_cast<unsigned>(floorl(x: std::pow(x: excessFactor, y: 1.0 / band.size()))); |
223 | // We'll keep a running product to determine the last tile size better. |
224 | unsigned cumulProductOfTileSizes = 1; |
225 | for (unsigned i = 0, e = band.size(); i < e; i++) { |
226 | if (i < e - 1) |
227 | (*tileSizes)[i] = tSize; |
228 | else |
229 | // Set last tile size to cover the balance. |
230 | (*tileSizes)[i] = std::max( |
231 | a: 1U, b: static_cast<unsigned>(excessFactor / cumulProductOfTileSizes)); |
232 | cumulProductOfTileSizes *= (*tileSizes)[i]; |
233 | } |
234 | if (avoidMaxMinBounds) |
235 | adjustToDivisorsOfTripCounts(band, tileSizes); |
236 | } |
237 | |
238 | void LoopTiling::runOnOperation() { |
239 | // Bands of loops to tile. |
240 | std::vector<SmallVector<AffineForOp, 6>> bands; |
241 | getTileableBands(getOperation(), &bands); |
242 | |
243 | // Tile each band. |
244 | for (auto &band : bands) { |
245 | if (!checkTilingLegality(band)) { |
246 | band.front().emitRemark("tiling code is illegal due to dependences" ); |
247 | continue; |
248 | } |
249 | |
250 | // Set up tile sizes; fill missing tile sizes at the end with default tile |
251 | // size or tileSize if one was provided. |
252 | SmallVector<unsigned, 6> tileSizes; |
253 | getTileSizes(band, &tileSizes); |
254 | if (llvm::DebugFlag) { |
255 | auto diag = band[0].emitRemark("using tile sizes [" ); |
256 | for (unsigned tSize : tileSizes) |
257 | diag << tSize << ' '; |
258 | diag << "]\n" ; |
259 | } |
260 | SmallVector<AffineForOp, 6> tiledNest; |
261 | if (failed(result: tilePerfectlyNested(band, tileSizes, &tiledNest))) { |
262 | // An empty band always succeeds. |
263 | assert(!band.empty() && "guaranteed to succeed on empty bands" ); |
264 | LLVM_DEBUG(band.front()->emitRemark("loop tiling failed!\n" )); |
265 | continue; |
266 | } |
267 | |
268 | // Separate full and partial tiles. |
269 | if (separate) { |
270 | auto intraTileLoops = |
271 | MutableArrayRef<AffineForOp>(tiledNest).drop_front(N: band.size()); |
272 | if (failed(separateFullTiles(intraTileLoops))) { |
273 | assert(!intraTileLoops.empty() && |
274 | "guaranteed to succeed on empty bands" ); |
275 | LLVM_DEBUG(intraTileLoops.front()->emitRemark( |
276 | "separation post tiling failed!\n" )); |
277 | } |
278 | } |
279 | } |
280 | } |
281 | |
282 | constexpr unsigned LoopTiling::kDefaultTileSize; |
283 | |