1//===- LoopTiling.cpp --- Loop tiling pass ------------------------------*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a pass to tile loop nests.
10//
11//===----------------------------------------------------------------------===//
12
13#include "mlir/Dialect/Affine/Passes.h"
14
15#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
16#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
17#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
18#include "mlir/Dialect/Affine/Analysis/Utils.h"
19#include "mlir/Dialect/Affine/IR/AffineOps.h"
20#include "mlir/Dialect/Affine/LoopUtils.h"
21#include "mlir/Dialect/Affine/Utils.h"
22#include "mlir/Dialect/Func/IR/FuncOps.h"
23#include "llvm/Support/CommandLine.h"
24#include "llvm/Support/Debug.h"
25#include <optional>
26
27namespace mlir {
28namespace affine {
29#define GEN_PASS_DEF_AFFINELOOPTILING
30#include "mlir/Dialect/Affine/Passes.h.inc"
31} // namespace affine
32} // namespace mlir
33
34using namespace mlir;
35using namespace mlir::affine;
36
37#define DEBUG_TYPE "affine-loop-tile"
38
39namespace {
40
41/// A pass to perform loop tiling on all suitable loop nests of a Function.
42struct LoopTiling : public affine::impl::AffineLoopTilingBase<LoopTiling> {
43 LoopTiling() = default;
44 explicit LoopTiling(uint64_t cacheSizeBytes, bool avoidMaxMinBounds = true)
45 : avoidMaxMinBounds(avoidMaxMinBounds) {
46 this->cacheSizeInKiB = cacheSizeBytes / 1024;
47 }
48
49 void runOnOperation() override;
50 void getTileSizes(ArrayRef<AffineForOp> band,
51 SmallVectorImpl<unsigned> *tileSizes);
52
53 // Default tile size if nothing is provided.
54 constexpr static unsigned kDefaultTileSize = 4;
55
56 // If true, tile sizes are set to avoid max/min in bounds if possible.
57 bool avoidMaxMinBounds = true;
58};
59
60} // namespace
61
62/// Creates a pass to perform loop tiling on all suitable loop nests of a
63/// Function.
64std::unique_ptr<OperationPass<func::FuncOp>>
65mlir::affine::createLoopTilingPass(uint64_t cacheSizeBytes) {
66 return std::make_unique<LoopTiling>(args&: cacheSizeBytes);
67}
68std::unique_ptr<OperationPass<func::FuncOp>>
69mlir::affine::createLoopTilingPass() {
70 return std::make_unique<LoopTiling>();
71}
72
73/// Reduces each tile size to the largest divisor of the corresponding trip
74/// count (if the trip count is known).
75static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
76 SmallVectorImpl<unsigned> *tileSizes) {
77 assert(band.size() == tileSizes->size() && "invalid tile size count");
78 for (unsigned i = 0, e = band.size(); i < e; i++) {
79 unsigned &tSizeAdjusted = (*tileSizes)[i];
80 std::optional<uint64_t> mayConst = getConstantTripCount(forOp: band[i]);
81 if (!mayConst)
82 continue;
83 // Adjust the tile size to largest factor of the trip count less than
84 // tSize.
85 uint64_t constTripCount = *mayConst;
86 if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2)
87 tSizeAdjusted = constTripCount / 2;
88 while (constTripCount % tSizeAdjusted != 0)
89 tSizeAdjusted--;
90 }
91}
92
93// Returns tile sizes to use. Checks CL options; if none are specified, sets it
94// based on a simple model that looks at the memory footprint and determines
95// tile sizes assuming identity accesses / 1:1 tile size proportional footprint
96// along each of the dimensions being tiled.
97// TODO: evolve this model. Tile size determination is a large area
98// to play with in general.
99void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
100 SmallVectorImpl<unsigned> *tileSizes) {
101 if (band.empty())
102 return;
103
104 // Use command-line tileSize for all loops if specified.
105 if (tileSize) {
106 tileSizes->assign(NumElts: band.size(), Elt: tileSize);
107 return;
108 }
109
110 // Use supplied tile sizes and fill them with default tile size if it's short.
111 if (!this->tileSizes.empty()) {
112 tileSizes->assign(in_start: this->tileSizes.begin(), in_end: this->tileSizes.end());
113 tileSizes->resize(N: band.size(), NV: kDefaultTileSize);
114 return;
115 }
116 tileSizes->resize(N: band.size());
117
118 // If the cache size is zero, set the minimum valid tile size. No good reason
119 // to pick another specific size over this.
120 if (cacheSizeInKiB == 0) {
121 llvm::fill(Range&: *tileSizes, Value: 1);
122 return;
123 }
124
125 // The first loop in the band.
126 AffineForOp rootForOp = band[0];
127 (void)rootForOp;
128
129 // Obtain memory footprint and set tile sizes so that a tile fits in
130 // the cache size. This is an approximation with the assumption that the
131 // footprint increases with the tile size linearly in that dimension (i.e.,
132 // assumes one-to-one access function).
133 std::optional<int64_t> fp = getMemoryFootprintBytes(forOp: band[0], memorySpace: 0);
134 if (!fp) {
135 // Fill with default tile sizes if footprint is unknown.
136 llvm::fill(Range&: *tileSizes, Value: LoopTiling::kDefaultTileSize);
137 if (avoidMaxMinBounds)
138 adjustToDivisorsOfTripCounts(band, tileSizes);
139 LLVM_DEBUG(
140 rootForOp.emitWarning("memory footprint unknown: using default tile "
141 "sizes adjusted to trip count divisors"));
142 return;
143 }
144
145 // Check how many times larger the cache size is when compared to footprint.
146 uint64_t cacheSizeBytes = cacheSizeInKiB * 1024;
147 uint64_t excessFactor = llvm::divideCeil(Numerator: *fp, Denominator: cacheSizeBytes);
148 if (excessFactor <= 1) {
149 // No need of any tiling - set tile size to 1.
150 llvm::fill(Range&: *tileSizes, Value: 1);
151 return;
152 }
153
154 // Divide all loops equally in an attempt to reduce footprint.
155 // TODO: this is approximate. Ideally, obtain reuse factor /
156 // profitability along each dimension and weight tile sizes based on that as
157 // one possible approach. Or compute a polynomial in tile sizes and solve for
158 // it.
159
160 // For an n-d tileable band, compute the n^th root of the excess.
161 unsigned tSize =
162 static_cast<unsigned>(floorl(x: std::pow(x: excessFactor, y: 1.0 / band.size())));
163 // We'll keep a running product to determine the last tile size better.
164 unsigned cumulProductOfTileSizes = 1;
165 for (unsigned i = 0, e = band.size(); i < e; i++) {
166 if (i < e - 1)
167 (*tileSizes)[i] = tSize;
168 else
169 // Set last tile size to cover the balance.
170 (*tileSizes)[i] = std::max(
171 a: 1U, b: static_cast<unsigned>(excessFactor / cumulProductOfTileSizes));
172 cumulProductOfTileSizes *= (*tileSizes)[i];
173 }
174 if (avoidMaxMinBounds)
175 adjustToDivisorsOfTripCounts(band, tileSizes);
176}
177
178void LoopTiling::runOnOperation() {
179 // Bands of loops to tile.
180 std::vector<SmallVector<AffineForOp, 6>> bands;
181 getTileableBands(f: getOperation(), bands: &bands);
182
183 // Tile each band.
184 for (auto &band : bands) {
185 if (!isTilingValid(loops: band)) {
186 band.front().emitRemark(message: "tiling nest is invalid due to dependences");
187 continue;
188 }
189
190 // Set up tile sizes; fill missing tile sizes at the end with default tile
191 // size or tileSize if one was provided.
192 SmallVector<unsigned, 6> tileSizes;
193 getTileSizes(band, tileSizes: &tileSizes);
194 if (llvm::DebugFlag) {
195 auto diag = band[0].emitRemark(message: "using tile sizes [");
196 for (unsigned tSize : tileSizes)
197 diag << tSize << ' ';
198 diag << "]\n";
199 }
200 SmallVector<AffineForOp, 6> tiledNest;
201 if (failed(Result: tilePerfectlyNested(input: band, tileSizes, tiledNest: &tiledNest))) {
202 // An empty band always succeeds.
203 assert(!band.empty() && "guaranteed to succeed on empty bands");
204 LLVM_DEBUG(band.front()->emitRemark("loop tiling failed!\n"));
205 continue;
206 }
207
208 // Separate full and partial tiles.
209 if (separate) {
210 auto intraTileLoops =
211 MutableArrayRef<AffineForOp>(tiledNest).drop_front(N: band.size());
212 if (failed(Result: separateFullTiles(nest: intraTileLoops))) {
213 assert(!intraTileLoops.empty() &&
214 "guaranteed to succeed on empty bands");
215 LLVM_DEBUG(intraTileLoops.front()->emitRemark(
216 "separation post tiling failed!\n"));
217 }
218 }
219 }
220}
221
222constexpr unsigned LoopTiling::kDefaultTileSize;
223

source code of mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp