//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses, TLB misses, reduced
// use of hardware prefetch streams, and reduced false sharing. It is also
// necessary for hardware with explicitly managed levels in the memory
// hierarchy, and where DMAs may have to be used. This optimization is often
// performed on already tiled code.
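//
// Example invocation (an illustrative sketch; the option names below are
// assumed from this pass's command-line registration and may differ across
// MLIR versions):
//
//   mlir-opt test.mlir \
//     -affine-data-copy-generate="generate-dma=false fast-mem-capacity=16"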
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>

namespace mlir {
namespace affine {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace affine
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;
using namespace mlir::affine;

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public affine::impl::AffineDataCopyGenerationBase<
          AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}
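
// Illustrative example (a sketch, not part of this file): the parameterized
// factory above can be used to schedule this pass on functions when building
// a custom pipeline from C++. The memory-space numbers and sizes below are
// placeholders chosen only for illustration.
//
//   mlir::PassManager pm(ctx);
//   pm.addNestedPass<func::FuncOp>(
//       mlir::affine::createAffineDataCopyGenerationPass(
//           /*slowMemorySpace=*/0, /*fastMemorySpace=*/1, /*tagMemorySpace=*/2,
//           /*minDmaTransferSize=*/1024,
//           /*fastMemCapacityBytes=*/32 * 1024));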

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine for op's nested within).
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
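  //
  // For example (an illustrative sketch; %A, %B, and %C are hypothetical
  // memrefs), a block of the form:
  //
  //   %0 = affine.load %A[%i] : memref<1024xf32>
  //   affine.store %0, %B[%i] : memref<1024xf32>
  //   affine.for %j = 0 to 1024 { ... }
  //   %1 = affine.load %C[%i] : memref<1024xf32>
  //
  // is split into three ranges: the leading load/store sequence, the
  // affine.for op by itself, and the trailing load.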

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin = llvm::find_if(*block, [&](Operation &op) {
    return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
           copyNests.count(&op) == 0;
  });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'for' loop, we split the range there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which the footprint fits in fast mem
      // capacity. If the footprint can't be calculated, we assume for now it
      // fits. Recurse inside if the footprint for 'forOp' exceeds capacity, or
      // when skipNonUnitStrideLoops is set and the step size is not one.
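      //
      // For instance (an illustrative, assumed configuration): with
      // fastMemoryCapacity set to 16 (KiB), fastMemCapacityBytes is 16384, so
      // a loop whose accessed-region footprint is computed as 32768 bytes
      // triggers recursion into its body.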
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStepAsInt() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope, so we don't update the
        // consumed capacity here. The footprint check above guarantees that
        // this inner loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
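  //
  // For instance (an illustrative sketch; %buf and %v are hypothetical), a
  // generated single-iteration copy loop such as
  //
  //   affine.for %i = 0 to 1 {
  //     affine.store %v, %buf[%i] : memref<1xf32>
  //   }
  //
  // is promoted in place by replacing the loop with its body, with %i
  // substituted by its lower bound value 0.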
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained load's/store's, and the latter could anyway also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
  (void)applyOpPatternsGreedily(
      copyOps, frozenPatterns,
      GreedyRewriteConfig().setStrictness(
          GreedyRewriteStrictness::ExistingAndNewOps));
}