//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses, TLB misses, reduced
// use of hardware prefetch streams, and reduced false sharing. It is also
// necessary for hardware with explicitly managed levels in the memory
// hierarchy, and where DMAs may have to be used. This optimization is often
// performed on already tiled code.
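//
// Example invocation (an illustrative sketch; the option names below are
// assumed from this pass's command-line registration and may differ across
// MLIR versions):
//
//   mlir-opt test.mlir \
//     -affine-data-copy-generate="generate-dma=false fast-mem-capacity=16"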
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>

namespace mlir {
namespace affine {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace affine
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;
using namespace mlir::affine;

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public affine::impl::AffineDataCopyGenerationBase<
          AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}
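
// Illustrative example (a sketch, not part of this file): the parameterized
// factory above can be used to schedule this pass on functions when building
// a custom pipeline from C++. The memory-space numbers and sizes below are
// placeholders chosen only for illustration.
//
//   mlir::PassManager pm(ctx);
//   pm.addNestedPass<func::FuncOp>(
//       mlir::affine::createAffineDataCopyGenerationPass(
//           /*slowMemorySpace=*/0, /*fastMemorySpace=*/1, /*tagMemorySpace=*/2,
//           /*minDmaTransferSize=*/1024,
//           /*fastMemCapacityBytes=*/32 * 1024));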

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine for op's nested within).
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
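  //
  // For example (an illustrative sketch; %A, %B, and %C are hypothetical
  // memrefs), a block of the form:
  //
  //   %0 = affine.load %A[%i] : memref<1024xf32>
  //   affine.store %0, %B[%i] : memref<1024xf32>
  //   affine.for %j = 0 to 1024 { ... }
  //   %1 = affine.load %C[%i] : memref<1024xf32>
  //
  // is split into three ranges: the leading load/store sequence, the
  // affine.for op by itself, and the trailing load.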

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin = llvm::find_if(*block, [&](Operation &op) {
    return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
           copyNests.count(&op) == 0;
  });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'for' loop, we split the range there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which the footprint fits in fast mem
      // capacity. If the footprint can't be calculated, we assume for now it
      // fits. Recurse inside if the footprint for 'forOp' exceeds capacity, or
      // when skipNonUnitStrideLoops is set and the step size is not one.
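      //
      // For instance (an illustrative, assumed configuration): with
      // fastMemoryCapacity set to 16 (KiB), fastMemCapacityBytes is 16384, so
      // a loop whose accessed-region footprint is computed as 32768 bytes
      // triggers recursion into its body.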
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStepAsInt() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope, so we don't update the
        // consumed capacity here. The footprint check above guarantees that
        // this inner loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
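  //
  // For instance (an illustrative sketch; %buf and %v are hypothetical), a
  // generated single-iteration copy loop such as
  //
  //   affine.for %i = 0 to 1 {
  //     affine.store %v, %buf[%i] : memref<1xf32>
  //   }
  //
  // is promoted in place by replacing the loop with its body, with %i
  // substituted by its lower bound value 0.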
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained load's/store's, and the latter could anyway also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
  (void)applyOpPatternsGreedily(
      copyOps, frozenPatterns,
      GreedyRewriteConfig().setStrictness(
          GreedyRewriteStrictness::ExistingAndNewOps));
}