1//===- LoopTiling.cpp --- Loop tiling pass ------------------------------*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a pass to tile loop nests.
10//
11//===----------------------------------------------------------------------===//
12
13#include "mlir/Dialect/Affine/Passes.h"
14
15#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
16#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
17#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
18#include "mlir/Dialect/Affine/Analysis/Utils.h"
19#include "mlir/Dialect/Affine/IR/AffineOps.h"
20#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
21#include "mlir/Dialect/Affine/LoopUtils.h"
22#include "mlir/Dialect/Affine/Utils.h"
23#include "mlir/Dialect/Func/IR/FuncOps.h"
24#include "mlir/IR/Builders.h"
25#include "mlir/IR/IRMapping.h"
26#include "llvm/Support/CommandLine.h"
27#include "llvm/Support/Debug.h"
28#include <optional>
29
30namespace mlir {
31namespace affine {
32#define GEN_PASS_DEF_AFFINELOOPTILING
33#include "mlir/Dialect/Affine/Passes.h.inc"
34} // namespace affine
35} // namespace mlir
36
37using namespace mlir;
38using namespace mlir::affine;
39
40#define DEBUG_TYPE "affine-loop-tile"
41
42namespace {
43
44/// A pass to perform loop tiling on all suitable loop nests of a Function.
45struct LoopTiling : public affine::impl::AffineLoopTilingBase<LoopTiling> {
46 LoopTiling() = default;
47 explicit LoopTiling(uint64_t cacheSizeBytes, bool avoidMaxMinBounds = true)
48 : avoidMaxMinBounds(avoidMaxMinBounds) {
49 this->cacheSizeInKiB = cacheSizeBytes / 1024;
50 }
51
52 void runOnOperation() override;
53 void getTileSizes(ArrayRef<AffineForOp> band,
54 SmallVectorImpl<unsigned> *tileSizes);
55
56 // Default tile size if nothing is provided.
57 constexpr static unsigned kDefaultTileSize = 4;
58
59 // If true, tile sizes are set to avoid max/min in bounds if possible.
60 bool avoidMaxMinBounds = true;
61};
62
63} // namespace
64
65/// Creates a pass to perform loop tiling on all suitable loop nests of a
66/// Function.
67std::unique_ptr<OperationPass<func::FuncOp>>
68mlir::affine::createLoopTilingPass(uint64_t cacheSizeBytes) {
69 return std::make_unique<LoopTiling>(args&: cacheSizeBytes);
70}
71std::unique_ptr<OperationPass<func::FuncOp>>
72mlir::affine::createLoopTilingPass() {
73 return std::make_unique<LoopTiling>();
74}
75
76/// Reduces each tile size to the largest divisor of the corresponding trip
77/// count (if the trip count is known).
78static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
79 SmallVectorImpl<unsigned> *tileSizes) {
80 assert(band.size() == tileSizes->size() && "invalid tile size count");
81 for (unsigned i = 0, e = band.size(); i < e; i++) {
82 unsigned &tSizeAdjusted = (*tileSizes)[i];
83 std::optional<uint64_t> mayConst = getConstantTripCount(band[i]);
84 if (!mayConst)
85 continue;
86 // Adjust the tile size to largest factor of the trip count less than
87 // tSize.
88 uint64_t constTripCount = *mayConst;
89 if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2)
90 tSizeAdjusted = constTripCount / 2;
91 while (constTripCount % tSizeAdjusted != 0)
92 tSizeAdjusted--;
93 }
94}
95
96/// Checks whether hyper-rectangular loop tiling of the nest represented by
97/// `origLoops` is valid. The validity condition is from Irigoin and Triolet,
98/// which states that two tiles cannot depend on each other. We simplify such
99/// condition to just checking whether there is any negative dependence
100/// direction, since we have the prior knowledge that the tiling results will be
101/// hyper-rectangles, which are scheduled in the lexicographically increasing
102/// order on the vector of loop indices. This function will return failure when
103/// any dependence component is negative along any of `origLoops`.
104static bool checkTilingLegality(MutableArrayRef<AffineForOp> origLoops) {
105 assert(!origLoops.empty() && "no original loops provided");
106
107 // We first find out all dependences we intend to check.
108 SmallVector<Operation *, 8> loadAndStoreOps;
109 origLoops[0]->walk([&](Operation *op) {
110 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
111 loadAndStoreOps.push_back(op);
112 });
113
114 unsigned numOps = loadAndStoreOps.size();
115 unsigned numLoops = origLoops.size();
116 for (unsigned d = 1; d <= numLoops + 1; ++d) {
117 for (unsigned i = 0; i < numOps; ++i) {
118 Operation *srcOp = loadAndStoreOps[i];
119 MemRefAccess srcAccess(srcOp);
120 for (unsigned j = 0; j < numOps; ++j) {
121 Operation *dstOp = loadAndStoreOps[j];
122 MemRefAccess dstAccess(dstOp);
123
124 SmallVector<DependenceComponent, 2> depComps;
125 DependenceResult result = checkMemrefAccessDependence(
126 srcAccess, dstAccess, loopDepth: d, /*dependenceConstraints=*/nullptr,
127 dependenceComponents: &depComps);
128
129 // Skip if there is no dependence in this case.
130 if (!hasDependence(result))
131 continue;
132
133 // Check whether there is any negative direction vector in the
134 // dependence components found above, which means that dependence is
135 // violated by the default hyper-rect tiling method.
136 LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
137 "for dependence at depth: "
138 << Twine(d) << " between:\n";);
139 LLVM_DEBUG(srcAccess.opInst->dump(););
140 LLVM_DEBUG(dstAccess.opInst->dump(););
141 for (const DependenceComponent &depComp : depComps) {
142 if (depComp.lb.has_value() && depComp.ub.has_value() &&
143 *depComp.lb < *depComp.ub && *depComp.ub < 0) {
144 LLVM_DEBUG(llvm::dbgs()
145 << "Dependence component lb = " << Twine(*depComp.lb)
146 << " ub = " << Twine(*depComp.ub)
147 << " is negative at depth: " << Twine(d)
148 << " and thus violates the legality rule.\n");
149 return false;
150 }
151 }
152 }
153 }
154 }
155
156 return true;
157}
158
159// Returns tile sizes to use. Checks CL options; if none are specified, sets it
160// based on a simple model that looks at the memory footprint and determines
161// tile sizes assuming identity accesses / 1:1 tile size proportional footprint
162// along each of the dimensions being tiled.
163// TODO: evolve this model. Tile size determination is a large area
164// to play with in general.
165void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
166 SmallVectorImpl<unsigned> *tileSizes) {
167 if (band.empty())
168 return;
169
170 // Use command-line tileSize for all loops if specified.
171 if (tileSize) {
172 tileSizes->assign(band.size(), tileSize);
173 return;
174 }
175
176 // Use tileSizes and fill them with default tile size if it's short.
177 if (!this->tileSizes.empty()) {
178 tileSizes->assign(this->tileSizes.begin(), this->tileSizes.end());
179 tileSizes->resize(N: band.size(), NV: kDefaultTileSize);
180 return;
181 }
182 tileSizes->resize(N: band.size());
183
184 // The first loop in the band.
185 AffineForOp rootForOp = band[0];
186 (void)rootForOp;
187
188 // Obtain memory footprint and set tile sizes so that a tile fits in
189 // the cache size. This is an approximation with the assumption that the
190 // footprint increases with the tile size linearly in that dimension (i.e.,
191 // assumes one-to-one access function).
192 std::optional<int64_t> fp = getMemoryFootprintBytes(band[0], 0);
193 if (!fp) {
194 // Fill with default tile sizes if footprint is unknown.
195 std::fill(first: tileSizes->begin(), last: tileSizes->end(),
196 value: LoopTiling::kDefaultTileSize);
197 if (avoidMaxMinBounds)
198 adjustToDivisorsOfTripCounts(band, tileSizes);
199 LLVM_DEBUG(
200 rootForOp.emitWarning("memory footprint unknown: using default tile "
201 "sizes adjusted to trip count divisors"));
202 return;
203 }
204
205 // Check how many times larger the cache size is when compared to footprint.
206 uint64_t cacheSizeBytes = cacheSizeInKiB * 1024;
207 uint64_t excessFactor = llvm::divideCeil(Numerator: *fp, Denominator: cacheSizeBytes);
208 if (excessFactor <= 1) {
209 // No need of any tiling - set tile size to 1.
210 std::fill(first: tileSizes->begin(), last: tileSizes->end(), value: 1);
211 return;
212 }
213
214 // Divide all loops equally in an attempt to reduce footprint.
215 // TODO: this is approximate. Ideally, obtain reuse factor /
216 // profitability along each dimension and weight tile sizes based on that as
217 // one possible approach. Or compute a polynomial in tile sizes and solve for
218 // it.
219
220 // For an n-d tileable band, compute the n^th root of the excess.
221 unsigned tSize =
222 static_cast<unsigned>(floorl(x: std::pow(x: excessFactor, y: 1.0 / band.size())));
223 // We'll keep a running product to determine the last tile size better.
224 unsigned cumulProductOfTileSizes = 1;
225 for (unsigned i = 0, e = band.size(); i < e; i++) {
226 if (i < e - 1)
227 (*tileSizes)[i] = tSize;
228 else
229 // Set last tile size to cover the balance.
230 (*tileSizes)[i] = std::max(
231 a: 1U, b: static_cast<unsigned>(excessFactor / cumulProductOfTileSizes));
232 cumulProductOfTileSizes *= (*tileSizes)[i];
233 }
234 if (avoidMaxMinBounds)
235 adjustToDivisorsOfTripCounts(band, tileSizes);
236}
237
238void LoopTiling::runOnOperation() {
239 // Bands of loops to tile.
240 std::vector<SmallVector<AffineForOp, 6>> bands;
241 getTileableBands(getOperation(), &bands);
242
243 // Tile each band.
244 for (auto &band : bands) {
245 if (!checkTilingLegality(band)) {
246 band.front().emitRemark("tiling code is illegal due to dependences");
247 continue;
248 }
249
250 // Set up tile sizes; fill missing tile sizes at the end with default tile
251 // size or tileSize if one was provided.
252 SmallVector<unsigned, 6> tileSizes;
253 getTileSizes(band, &tileSizes);
254 if (llvm::DebugFlag) {
255 auto diag = band[0].emitRemark("using tile sizes [");
256 for (unsigned tSize : tileSizes)
257 diag << tSize << ' ';
258 diag << "]\n";
259 }
260 SmallVector<AffineForOp, 6> tiledNest;
261 if (failed(result: tilePerfectlyNested(band, tileSizes, &tiledNest))) {
262 // An empty band always succeeds.
263 assert(!band.empty() && "guaranteed to succeed on empty bands");
264 LLVM_DEBUG(band.front()->emitRemark("loop tiling failed!\n"));
265 continue;
266 }
267
268 // Separate full and partial tiles.
269 if (separate) {
270 auto intraTileLoops =
271 MutableArrayRef<AffineForOp>(tiledNest).drop_front(N: band.size());
272 if (failed(separateFullTiles(intraTileLoops))) {
273 assert(!intraTileLoops.empty() &&
274 "guaranteed to succeed on empty bands");
275 LLVM_DEBUG(intraTileLoops.front()->emitRemark(
276 "separation post tiling failed!\n"));
277 }
278 }
279 }
280}
281
282constexpr unsigned LoopTiling::kDefaultTileSize;
283

source code of mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp