1 | //===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements a pass to automatically promote accessed memref regions |
10 | // to buffers in a faster memory space that is explicitly managed, with the |
11 | // necessary data movement operations performed through either regular |
12 | // point-wise load/store's or DMAs. Such explicit copying (also referred to as |
13 | // array packing/unpacking in the literature), when done on arrays that exhibit |
14 | // reuse, results in near elimination of conflict misses, TLB misses, reduced |
15 | // use of hardware prefetch streams, and reduced false sharing. It is also |
// necessary for hardware with explicitly managed levels in the memory
17 | // hierarchy, and where DMAs may have to be used. This optimization is often |
18 | // performed on already tiled code. |
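//
// As an illustrative sketch only (the option names below are assumptions
// taken from this pass's tablegen registration; see Passes.td for the
// authoritative list), the pass can be exercised standalone with mlir-opt:
//
//   mlir-opt -affine-data-copy-generate="generate-dma=false fast-mem-space=2" \
//     input.mlir
//
// which generates point-wise copy loops (rather than DMAs) into buffers
// allocated in memory space 2 and rewrites the affine loads/stores to use
// those buffers.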
19 | // |
20 | //===----------------------------------------------------------------------===// |
21 | |
22 | #include "mlir/Dialect/Affine/Passes.h" |
23 | |
24 | #include "mlir/Dialect/Affine/Analysis/Utils.h" |
25 | #include "mlir/Dialect/Affine/IR/AffineOps.h" |
26 | #include "mlir/Dialect/Affine/LoopUtils.h" |
27 | #include "mlir/Dialect/Arith/IR/Arith.h" |
28 | #include "mlir/Dialect/Func/IR/FuncOps.h" |
29 | #include "mlir/Dialect/MemRef/IR/MemRef.h" |
30 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h" |
31 | #include "llvm/ADT/MapVector.h" |
32 | #include "llvm/Support/CommandLine.h" |
33 | #include "llvm/Support/Debug.h" |
34 | #include <algorithm> |
35 | #include <optional> |
36 | |
37 | namespace mlir { |
38 | namespace affine { |
39 | #define GEN_PASS_DEF_AFFINEDATACOPYGENERATION |
40 | #include "mlir/Dialect/Affine/Passes.h.inc" |
41 | } // namespace affine |
42 | } // namespace mlir |
43 | |
44 | #define DEBUG_TYPE "affine-data-copy-generate" |
45 | |
46 | using namespace mlir; |
47 | using namespace mlir::affine; |
48 | |
49 | namespace { |
50 | |
51 | /// Replaces all loads and stores on memref's living in 'slowMemorySpace' by |
52 | /// introducing copy operations to transfer data into `fastMemorySpace` and |
53 | /// rewriting the original load's/store's to instead load/store from the |
54 | /// allocated fast memory buffers. Additional options specify the identifier |
55 | /// corresponding to the fast memory space and the amount of fast memory space |
56 | /// available. The pass traverses through the nesting structure, recursing to |
57 | /// inner levels if necessary to determine at what depth copies need to be |
58 | /// placed so that the allocated buffers fit within the memory capacity |
59 | /// provided. |
60 | // TODO: We currently can't generate copies correctly when stores |
61 | // are strided. Check for strided stores. |
62 | struct AffineDataCopyGeneration |
63 | : public affine::impl::AffineDataCopyGenerationBase< |
64 | AffineDataCopyGeneration> { |
65 | AffineDataCopyGeneration() = default; |
66 | explicit AffineDataCopyGeneration(unsigned slowMemorySpace, |
67 | unsigned fastMemorySpace, |
68 | unsigned tagMemorySpace, |
69 | int minDmaTransferSize, |
70 | uint64_t fastMemCapacityBytes) { |
71 | this->slowMemorySpace = slowMemorySpace; |
72 | this->fastMemorySpace = fastMemorySpace; |
73 | this->tagMemorySpace = tagMemorySpace; |
74 | this->minDmaTransferSize = minDmaTransferSize; |
75 | this->fastMemoryCapacity = fastMemCapacityBytes / 1024; |
76 | } |
77 | |
78 | void runOnOperation() override; |
79 | void runOnBlock(Block *block, DenseSet<Operation *> ©Nests); |
80 | |
81 | // Constant zero index to avoid too many duplicates. |
82 | Value zeroIndex = nullptr; |
83 | }; |
84 | |
85 | } // namespace |
86 | |
87 | /// Generates copies for memref's living in 'slowMemorySpace' into newly created |
88 | /// buffers in 'fastMemorySpace', and replaces memory operations to the former |
/// by the latter. Only load op's are handled for now.
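///
/// As a minimal programmatic sketch (the pass manager variable and the
/// constants below are illustrative assumptions, not taken from this file),
/// the pass is typically added to a pipeline nested on func.func:
///
///   pm.addNestedPass<func::FuncOp>(
///       mlir::affine::createAffineDataCopyGenerationPass(
///           /*slowMemorySpace=*/0, /*fastMemorySpace=*/2,
///           /*tagMemorySpace=*/0, /*minDmaTransferSize=*/1024,
///           /*fastMemCapacityBytes=*/32 * 1024));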
90 | std::unique_ptr<OperationPass<func::FuncOp>> |
91 | mlir::affine::createAffineDataCopyGenerationPass( |
92 | unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace, |
93 | int minDmaTransferSize, uint64_t fastMemCapacityBytes) { |
94 | return std::make_unique<AffineDataCopyGeneration>( |
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
97 | } |
98 | std::unique_ptr<OperationPass<func::FuncOp>> |
99 | mlir::affine::createAffineDataCopyGenerationPass() { |
100 | return std::make_unique<AffineDataCopyGeneration>(); |
101 | } |
102 | |
103 | /// Generate copies for this block. The block is partitioned into separate |
104 | /// ranges: each range is either a sequence of one or more operations starting |
105 | /// and ending with an affine load or store op, or just an affine.for op (which |
106 | /// could have other affine for op's nested within). |
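///
/// For instance (an illustrative sketch, not drawn from a test case), a block
/// of the form
///
///   affine.store %v, %A[%i] : memref<128xf32>
///   %x = affine.load %A[%i] : memref<128xf32>
///   affine.for %j = 0 to 32 { ... }
///   %y = affine.load %B[%i] : memref<128xf32>
///
/// is partitioned into three ranges: the leading store/load pair, the
/// affine.for op by itself, and the trailing load.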
107 | void AffineDataCopyGeneration::runOnBlock(Block *block, |
108 | DenseSet<Operation *> ©Nests) { |
109 | if (block->empty()) |
110 | return; |
111 | |
112 | uint64_t fastMemCapacityBytes = |
113 | fastMemoryCapacity != std::numeric_limits<uint64_t>::max() |
114 | ? fastMemoryCapacity * 1024 |
115 | : fastMemoryCapacity; |
116 | AffineCopyOptions copyOptions = {generateDma, slowMemorySpace, |
117 | fastMemorySpace, tagMemorySpace, |
118 | fastMemCapacityBytes}; |
119 | |
120 | // Every affine.for op in the block starts and ends a block range for copying; |
121 | // in addition, a contiguous sequence of operations starting with a |
122 | // load/store op but not including any copy nests themselves is also |
  // identified as a copy block range. Straight-line code (a contiguous chunk
  // of operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
126 | // moment; we do a check later and report an error with location info. |
127 | |
128 | // Get to the first load, store, or for op (that is not a copy nest itself). |
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
131 | return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) && |
132 | copyNests.count(&op) == 0; |
133 | }); |
134 | |
135 | // Create [begin, end) ranges. |
136 | auto it = curBegin; |
137 | while (it != block->end()) { |
138 | AffineForOp forOp; |
    // If we hit a non-copy 'for' loop, we split the range there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
142 | (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions, |
143 | /*filterMemRef=*/std::nullopt, copyNests); |
144 | |
145 | // Returns true if the footprint is known to exceed capacity. |
146 | auto exceedsCapacity = [&](AffineForOp forOp) { |
        std::optional<int64_t> footprint =
148 | getMemoryFootprintBytes(forOp, |
149 | /*memorySpace=*/0); |
150 | return (footprint.has_value() && |
151 | static_cast<uint64_t>(*footprint) > fastMemCapacityBytes); |
152 | }; |
153 | |
154 | // If the memory footprint of the 'affine.for' loop is higher than fast |
155 | // memory capacity (when provided), we recurse to copy at an inner level |
156 | // until we find a depth at which footprint fits in fast mem capacity. If |
157 | // the footprint can't be calculated, we assume for now it fits. Recurse |
158 | // inside if footprint for 'forOp' exceeds capacity, or when |
159 | // skipNonUnitStrideLoops is set and the step size is not one. |
160 | bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1 |
161 | : exceedsCapacity(forOp); |
162 | if (recurseInner) { |
        // We'll recurse and do the copies at an inner level for 'forOp'.
164 | // Recurse onto the body of this loop. |
165 | runOnBlock(forOp.getBody(), copyNests); |
166 | } else { |
167 | // We have enough capacity, i.e., copies will be computed for the |
168 | // portion of the block until 'it', and for 'it', which is 'forOp'. Note |
169 | // that for the latter, the copies are placed just before this loop (for |
170 | // incoming copies) and right after (for outgoing ones). |
171 | |
        // Inner loop copies have their own scope - thus, we don't update the
        // consumed capacity. The footprint check above guarantees this inner
174 | // loop's footprint fits. |
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
176 | copyOptions, |
177 | /*filterMemRef=*/std::nullopt, copyNests); |
178 | } |
179 | // Get to the next load or store op after 'forOp'. |
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
181 | return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) && |
182 | copyNests.count(&op) == 0; |
183 | }); |
184 | it = curBegin; |
185 | } else { |
186 | assert(copyNests.count(&*it) == 0 && |
187 | "all copy nests generated should have been skipped above" ); |
188 | // We simply include this op in the current range and continue for more. |
189 | ++it; |
190 | } |
191 | } |
192 | |
193 | // Generate the copy for the final block range. |
194 | if (curBegin != block->end()) { |
195 | // Can't be a terminator because it would have been skipped above. |
196 | assert(!curBegin->hasTrait<OpTrait::IsTerminator>() && |
197 | "can't be a terminator" ); |
198 | // Exclude the affine.yield - hence, the std::prev. |
199 | (void)affineDataCopyGenerate(/*begin=*/curBegin, |
                                 /*end=*/std::prev(block->end()), copyOptions,
201 | /*filterMemRef=*/std::nullopt, copyNests); |
202 | } |
203 | } |
204 | |
205 | void AffineDataCopyGeneration::runOnOperation() { |
206 | func::FuncOp f = getOperation(); |
207 | OpBuilder topBuilder(f.getBody()); |
208 | zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0); |
209 | |
210 | // Nests that are copy-in's or copy-out's; the root AffineForOps of those |
211 | // nests are stored herein. |
212 | DenseSet<Operation *> copyNests; |
213 | |
214 | // Clear recorded copy nests. |
215 | copyNests.clear(); |
216 | |
217 | for (auto &block : f) |
218 | runOnBlock(&block, copyNests); |
219 | |
220 | // Promote any single iteration loops in the copy nests and collect |
221 | // load/stores to simplify. |
222 | SmallVector<Operation *, 4> copyOps; |
223 | for (Operation *nest : copyNests) |
224 | // With a post order walk, the erasure of loops does not affect |
225 | // continuation of the walk or the collection of load/store ops. |
    nest->walk([&](Operation *op) {
227 | if (auto forOp = dyn_cast<AffineForOp>(op)) |
228 | (void)promoteIfSingleIteration(forOp); |
229 | else if (isa<AffineLoadOp, AffineStoreOp>(op)) |
        copyOps.push_back(op);
231 | }); |
232 | |
233 | // Promoting single iteration loops could lead to simplification of |
234 | // contained load's/store's, and the latter could anyway also be |
235 | // canonicalized. |
236 | RewritePatternSet patterns(&getContext()); |
237 | AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext()); |
238 | AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext()); |
239 | FrozenRewritePatternSet frozenPatterns(std::move(patterns)); |
240 | GreedyRewriteConfig config; |
241 | config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps; |
  (void)applyOpPatternsAndFold(copyOps, frozenPatterns, config);
243 | } |
244 | |