//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/stores or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses,
// reduced use of hardware prefetch streams, and reduced false sharing. It is
// also necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
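//
// As an illustrative sketch (exact output depends on the pass options), a
// nest like:
//
//   affine.for %i = 0 to 256 {
//     %v = affine.load %A[%i] : memref<256xf32>
//     affine.store %v, %B[%i] : memref<256xf32>
//   }
//
// is rewritten so that the accessed regions of %A and %B are first copied
// into fast-memory buffers, the loop body loads/stores on those buffers, and
// the written region of %B is copied back out after the loop.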
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>

namespace mlir {
namespace affine {
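// Pull in the generated base class for this pass
// (affine::impl::AffineDataCopyGenerationBase), defined declaratively in
// Passes.td.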
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace affine
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;
using namespace mlir::affine;

namespace {

/// Replaces all loads and stores on memrefs living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace`, and
/// rewrites the original loads/stores to instead access the newly allocated
/// fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses the nesting structure, recursing into inner
/// levels if necessary to determine at what depth copies need to be placed so
/// that the allocated buffers fit within the memory capacity provided.
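/// For instance (an illustrative example, not taken from a real run): with a
/// 32 KiB fast memory capacity, if a loop nest's footprint at the outermost
/// depth is 64 KiB but each outer iteration only touches 16 KiB, the pass
/// recurses past the outer loop and places the copies around the inner loop,
/// where the buffers fit.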
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public affine::impl::AffineDataCopyGenerationBase<
          AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
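    // The 'fastMemoryCapacity' pass option is expressed in KiB; convert the
    // byte count accordingly.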
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memrefs living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load ops are handled for now.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}

/// Generates copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// may have other affine.for ops nested within it).
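///
/// For example (illustrative), a block with the operation sequence
///   {affine.load; arith.addf; affine.store; affine.for; affine.load}
/// is partitioned into three ranges: [load..store], [for], and [load].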
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

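  // 'fastMemoryCapacity' is stored in KiB, with the uint64_t max value serving
  // as a "no limit" sentinel; only scale it to bytes when an explicit capacity
  // was provided.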
  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOps) is always assumed not to exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'affine.for' op, we split the range there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than the
      // fast memory capacity (when provided), we recurse to copy at an inner
      // level until we find a depth at which the footprint fits in the fast
      // memory capacity. If the footprint can't be calculated, we assume for
      // now that it fits. Recurse inside if the footprint for 'forOp' exceeds
      // capacity, or when skipNonUnitStrideLoops is set and the step size is
      // not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'.
        // Note that for the latter, the copies are placed just before this
        // loop (for incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - we thus don't update the
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-ins or copy-outs; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained loads/stores, and the latter could anyway also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
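  // Restrict the greedy rewrite to just the collected load/store ops (and any
  // ops the canonicalization patterns create from them) so the rest of the
  // function is left untouched.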
  GreedyRewriteConfig config;
  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
  (void)applyOpPatternsAndFold(copyOps, frozenPatterns, config);
}