1//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements miscellaneous loop transformation routines.
10//
11//===----------------------------------------------------------------------===//
12
13#include "mlir/Dialect/Affine/LoopUtils.h"
14#include "mlir/Analysis/SliceAnalysis.h"
15#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
16#include "mlir/Dialect/Affine/Analysis/Utils.h"
17#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
18#include "mlir/Dialect/Affine/Utils.h"
19#include "mlir/Dialect/Func/IR/FuncOps.h"
20#include "mlir/Dialect/MemRef/IR/MemRef.h"
21#include "mlir/Dialect/SCF/IR/SCF.h"
22#include "mlir/IR/IRMapping.h"
23#include "mlir/IR/IntegerSet.h"
24#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
25#include "llvm/ADT/MapVector.h"
26#include "llvm/ADT/SmallPtrSet.h"
27#include "llvm/Support/Debug.h"
28#include "llvm/Support/raw_ostream.h"
29#include <optional>
30
31#define DEBUG_TYPE "loop-utils"
32
33using namespace mlir;
34using namespace affine;
35using namespace presburger;
36using llvm::SmallMapVector;
37
/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be the upper bound of the
/// main part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands, or a null map when the trip count can't be expressed as an affine
/// expression.
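///
/// As an illustrative sketch (the loop below is hypothetical), unrolling
///   affine.for %i = 0 to 17
/// by a factor of 4 has trip count 17, so the cleanup lower bound evaluates to
/// lb + (17 - 17 mod 4) * step = 0 + 16 * 1 = 16; the main unrolled loop then
/// covers [0, 16) and the cleanup loop covers [16, 17).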
43static void
44getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
45 AffineMap &cleanupLbMap,
46 SmallVectorImpl<Value> &cleanupLbOperands) {
47 AffineMap tripCountMap;
48 SmallVector<Value, 4> tripCountOperands;
49 getTripCountMapAndOperands(forOp, map: &tripCountMap, operands: &tripCountOperands);
50 // Trip count can't be computed.
51 if (!tripCountMap) {
52 cleanupLbMap = AffineMap();
53 return;
54 }
55
56 OpBuilder b(forOp);
57 auto lbMap = forOp.getLowerBoundMap();
58 auto lb = b.create<AffineApplyOp>(location: forOp.getLoc(), args&: lbMap,
59 args: forOp.getLowerBoundOperands());
60
61 // For each upper bound expr, get the range.
62 // Eg: affine.for %i = lb to min (ub1, ub2),
63 // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
64 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
65 // these affine.apply's make up the cleanup loop lower bound.
66 SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
67 SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
68 int64_t step = forOp.getStepAsInt();
69 for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
70 auto tripCountExpr = tripCountMap.getResult(idx: i);
71 bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
72 auto bumpMap = AffineMap::get(dimCount: tripCountMap.getNumDims(),
73 symbolCount: tripCountMap.getNumSymbols(), result: bumpExprs[i]);
74 bumpValues[i] =
75 b.create<AffineApplyOp>(location: forOp.getLoc(), args&: bumpMap, args&: tripCountOperands);
76 }
77
78 SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
79 for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
80 newUbExprs[i] = b.getAffineDimExpr(position: 0) + b.getAffineDimExpr(position: i + 1);
81
82 cleanupLbOperands.clear();
83 cleanupLbOperands.push_back(Elt: lb);
84 cleanupLbOperands.append(in_start: bumpValues.begin(), in_end: bumpValues.end());
85 cleanupLbMap = AffineMap::get(dimCount: 1 + tripCountMap.getNumResults(), symbolCount: 0, results: newUbExprs,
86 context: b.getContext());
87 // Simplify the cleanupLbMap + cleanupLbOperands.
88 fullyComposeAffineMapAndOperands(map: &cleanupLbMap, operands: &cleanupLbOperands);
89 cleanupLbMap = simplifyAffineMap(map: cleanupLbMap);
90 canonicalizeMapAndOperands(map: &cleanupLbMap, operands: &cleanupLbOperands);
91 // Remove any affine.apply's that became dead from the simplification above.
92 for (auto v : bumpValues)
93 if (v.use_empty())
94 v.getDefiningOp()->erase();
95
96 if (lb.use_empty())
97 lb.erase();
98}
99
100/// Helper to replace uses of loop carried values (iter_args) and loop
101/// yield values while promoting single iteration affine.for ops.
102static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
103 // Replace uses of iter arguments with iter operands (initial values).
104 auto iterOperands = forOp.getInits();
105 auto iterArgs = forOp.getRegionIterArgs();
106 for (auto e : llvm::zip(t&: iterOperands, u&: iterArgs))
107 std::get<1>(t&: e).replaceAllUsesWith(newValue: std::get<0>(t&: e));
108
109 // Replace uses of loop results with the values yielded by the loop.
110 auto outerResults = forOp.getResults();
111 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
112 for (auto e : llvm::zip(t&: outerResults, u&: innerResults))
113 std::get<0>(t&: e).replaceAllUsesWith(newValue: std::get<1>(t&: e));
114}
115
/// Promotes the loop body of a forOp to its containing block if the forOp
/// is known to have a single iteration.
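///
/// For example (sketch), given the hypothetical loop
///   affine.for %i = 4 to 5 { "foo"(%i) : (index) -> () }
/// the body is moved into the enclosing block and %i is replaced with a
/// constant index value of 4 (or with an affine.apply of the lower bound map
/// when the lower bound is not constant).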
118LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
119 std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
120 if (!tripCount || *tripCount != 1)
121 return failure();
122
123 // TODO: extend this for arbitrary affine bounds.
124 if (forOp.getLowerBoundMap().getNumResults() != 1)
125 return failure();
126
  // Replace all uses of the IV with its single-iteration value.
128 auto iv = forOp.getInductionVar();
129 auto *parentBlock = forOp->getBlock();
130 if (!iv.use_empty()) {
131 if (forOp.hasConstantLowerBound()) {
132 auto func = forOp->getParentOfType<FunctionOpInterface>();
133 OpBuilder builder(forOp->getContext());
134 if (func)
135 builder.setInsertionPointToStart(&func.getFunctionBody().front());
136 else
137 builder.setInsertionPoint(forOp);
138 auto constOp = builder.create<arith::ConstantIndexOp>(
139 location: forOp.getLoc(), args: forOp.getConstantLowerBound());
140 iv.replaceAllUsesWith(newValue: constOp);
141 } else {
142 auto lbOperands = forOp.getLowerBoundOperands();
143 auto lbMap = forOp.getLowerBoundMap();
144 OpBuilder builder(forOp);
145 if (lbMap == builder.getDimIdentityMap()) {
146 // No need of generating an affine.apply.
147 iv.replaceAllUsesWith(newValue: lbOperands[0]);
148 } else {
149 auto affineApplyOp =
150 builder.create<AffineApplyOp>(location: forOp.getLoc(), args&: lbMap, args&: lbOperands);
151 iv.replaceAllUsesWith(newValue: affineApplyOp);
152 }
153 }
154 }
155
156 replaceIterArgsAndYieldResults(forOp);
157
158 // Move the loop body operations, except for its terminator, to the loop's
159 // containing block.
160 forOp.getBody()->back().erase();
161 parentBlock->getOperations().splice(where: Block::iterator(forOp),
162 L2&: forOp.getBody()->getOperations());
163 forOp.erase();
164 return success();
165}
166
167/// Generates an affine.for op with the specified lower and upper bounds
168/// while generating the right IV remappings to realize shifts for operations in
169/// its body. The operations that go into the loop body are specified in
170/// opGroupQueue starting from the specified offset, and in that order. The
171/// first element of the pair specifies the shift applied to that group of
172/// operations; the shift is multiplied by the loop step before being applied.
173/// Returns nullptr if the generated loop simplifies to a single iteration one.
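///
/// For instance (sketch), an op group queued with shift 2 in a source loop of
/// step 1 is cloned into the new loop with its IV remapped through an
/// affine.apply computing %newIV - 2, so the group executes two iterations
/// "later" in the shifted iteration space.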
174static AffineForOp generateShiftedLoop(
175 AffineMap lbMap, AffineMap ubMap,
176 const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
177 unsigned offset, AffineForOp srcForOp, OpBuilder b) {
178 auto lbOperands = srcForOp.getLowerBoundOperands();
179 auto ubOperands = srcForOp.getUpperBoundOperands();
180
181 assert(lbMap.getNumInputs() == lbOperands.size());
182 assert(ubMap.getNumInputs() == ubOperands.size());
183
184 auto loopChunk =
185 b.create<AffineForOp>(location: srcForOp.getLoc(), args&: lbOperands, args&: lbMap, args&: ubOperands,
186 args&: ubMap, args: srcForOp.getStepAsInt());
187 auto loopChunkIV = loopChunk.getInductionVar();
188 auto srcIV = srcForOp.getInductionVar();
189
190 IRMapping operandMap;
191
192 auto bodyBuilder = OpBuilder::atBlockTerminator(block: loopChunk.getBody());
193 for (const auto &it : llvm::drop_begin(RangeOrContainer: opGroupQueue, N: offset)) {
194 uint64_t shift = it.first;
195 auto ops = it.second;
196 // All 'same shift' operations get added with their operands being
197 // remapped to results of cloned operations, and their IV used remapped.
198 // Generate the remapping if the shift is not zero: remappedIV = newIV -
199 // shift.
200 if (!srcIV.use_empty() && shift != 0) {
201 auto ivRemap = bodyBuilder.create<AffineApplyOp>(
202 location: srcForOp.getLoc(),
203 args: bodyBuilder.getSingleDimShiftAffineMap(
204 shift: -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
205 args&: loopChunkIV);
206 operandMap.map(from: srcIV, to: ivRemap);
207 } else {
208 operandMap.map(from: srcIV, to: loopChunkIV);
209 }
210 for (auto *op : ops)
211 bodyBuilder.clone(op&: *op, mapper&: operandMap);
212 };
213 if (succeeded(Result: promoteIfSingleIteration(forOp: loopChunk)))
214 return AffineForOp();
215 return loopChunk;
216}
217
// The skewing of operations with respect to one another can be used, for
// example, to allow overlap of asynchronous operations (such as DMA
// communication) with computation, or just relative shifting of operations
// for better register reuse, locality, or parallelism. As such, the shifts are
// typically expected to be at most of the order of the number of operations.
// This method should not be used as a substitute for loop
// distribution/fission. It uses an algorithm that runs in time linear in the
// number of operations in the body of the for loop (the 'sweep line'
// paradigm). This method asserts preservation of SSA dominance; checks for
// that, as well as for memory-based dependence preservation, rest with the
// users of this method.
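//
// As a hedged sketch of the typical use: with shifts [0, 1] on a two-operation
// body consisting of a DMA start followed by a dependent compute op, the
// compute is delayed by one iteration relative to the DMA, yielding a prologue
// loop (DMA only), a steady-state loop where the DMA for iteration i overlaps
// with the compute for iteration i - 1, and an epilogue loop (compute only).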
229LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
230 ArrayRef<uint64_t> shifts,
231 bool unrollPrologueEpilogue) {
232 assert(forOp.getBody()->getOperations().size() == shifts.size() &&
233 "too few/many shifts");
234 if (forOp.getBody()->begin() == std::prev(x: forOp.getBody()->end()))
235 return success();
236
237 // If the trip counts aren't constant, we would need versioning and
238 // conditional guards (or context information to prevent such versioning). The
239 // better way to pipeline for such loops is to first tile them and extract
240 // constant trip count "full tiles" before applying this.
241 auto mayBeConstTripCount = getConstantTripCount(forOp);
242 if (!mayBeConstTripCount) {
243 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
244 return success();
245 }
246 uint64_t tripCount = *mayBeConstTripCount;
247
248 assert(isOpwiseShiftValid(forOp, shifts) &&
249 "shifts will lead to an invalid transformation\n");
250
251 int64_t step = forOp.getStepAsInt();
252
253 unsigned numChildOps = shifts.size();
254
255 // Do a linear time (counting) sort for the shifts.
256 uint64_t maxShift = *llvm::max_element(Range&: shifts);
257 if (maxShift >= numChildOps) {
258 // Large shifts are not the typical use case.
259 forOp.emitWarning(message: "not shifting because shifts are unrealistically large");
260 return success();
261 }
262
263 // An array of operation groups sorted by shift amount; each group has all
264 // operations with the same shift in the order in which they appear in the
265 // body of the 'affine.for' op.
266 std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
267 unsigned pos = 0;
268 for (auto &op : forOp.getBody()->without_terminator()) {
269 auto shift = shifts[pos++];
270 sortedOpGroups[shift].push_back(x: &op);
271 }
272
273 // Unless the shifts have a specific pattern (which actually would be the
274 // common use case), prologue and epilogue are not meaningfully defined.
275 // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
276 // loop generated as the prologue and the last as epilogue and unroll these
277 // fully.
278 AffineForOp prologue, epilogue;
279
280 // Do a sweep over the sorted shifts while storing open groups in a
281 // vector, and generating loop portions as necessary during the sweep. A block
282 // of operations is paired with its shift.
283 std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
284
285 auto origLbMap = forOp.getLowerBoundMap();
286 uint64_t lbShift = 0;
287 OpBuilder b(forOp);
288 for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
289 // If nothing is shifted by d, continue.
290 if (sortedOpGroups[d].empty())
291 continue;
292 if (!opGroupQueue.empty()) {
293 assert(d > 0 &&
294 "Queue expected to be empty when the first block is found");
295 // The interval for which the loop needs to be generated here is:
296 // [lbShift, min(lbShift + tripCount, d)) and the body of the
297 // loop needs to have all operations in opQueue in that order.
298 AffineForOp res;
299 if (lbShift + tripCount * step < d * step) {
300 res = generateShiftedLoop(
301 lbMap: b.getShiftedAffineMap(map: origLbMap, shift: lbShift),
302 ubMap: b.getShiftedAffineMap(map: origLbMap, shift: lbShift + tripCount * step),
303 opGroupQueue, /*offset=*/0, srcForOp: forOp, b);
304 // Entire loop for the queued op groups generated, empty it.
305 opGroupQueue.clear();
306 lbShift += tripCount * step;
307 } else {
308 res = generateShiftedLoop(lbMap: b.getShiftedAffineMap(map: origLbMap, shift: lbShift),
309 ubMap: b.getShiftedAffineMap(map: origLbMap, shift: d),
310 opGroupQueue, /*offset=*/0, srcForOp: forOp, b);
311 lbShift = d * step;
312 }
313
314 if (res) {
315 // Simplify/canonicalize the affine.for.
316 RewritePatternSet patterns(res.getContext());
317 AffineForOp::getCanonicalizationPatterns(results&: patterns, context: res.getContext());
318 bool erased;
319 (void)applyOpPatternsGreedily(
320 ops: res.getOperation(), patterns: std::move(patterns),
321 config: GreedyRewriteConfig().setStrictness(
322 GreedyRewriteStrictness::ExistingAndNewOps),
323 /*changed=*/nullptr, allErased: &erased);
324 if (!erased && !prologue)
325 prologue = res;
326 if (!erased)
327 epilogue = res;
328 }
329 } else {
330 // Start of first interval.
331 lbShift = d * step;
332 }
333 // Augment the list of operations that get into the current open interval.
334 opGroupQueue.emplace_back(args&: d, args&: sortedOpGroups[d]);
335 }
336
337 // Those operations groups left in the queue now need to be processed (FIFO)
338 // and their loops completed.
339 for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
340 uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
341 epilogue = generateShiftedLoop(lbMap: b.getShiftedAffineMap(map: origLbMap, shift: lbShift),
342 ubMap: b.getShiftedAffineMap(map: origLbMap, shift: ubShift),
343 opGroupQueue, /*offset=*/i, srcForOp: forOp, b);
344 lbShift = ubShift;
345 if (!prologue)
346 prologue = epilogue;
347 }
348
349 // Erase the original for op.
350 forOp.erase();
351
  if (unrollPrologueEpilogue && prologue)
    (void)loopUnrollFull(prologue);
  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
    (void)loopUnrollFull(epilogue);
356
357 return success();
358}
359
360/// Checks whether a loop nest is hyper-rectangular or not.
361static LogicalResult
362checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
363 FlatAffineValueConstraints cst;
364 SmallVector<Operation *, 8> ops(input.begin(), input.end());
365 // 0-d or 1-d is trivially hyper-rectangular.
366 if (input.size() <= 1)
367 return success();
368 if (failed(Result: getIndexSet(ops, domain: &cst))) {
369 LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
370 return failure();
371 }
372 if (!cst.isHyperRectangular(pos: 0, num: input.size())) {
373 LLVM_DEBUG(llvm::dbgs()
374 << "Non-hyperrectangular nests not supported for tiling!\n");
375 return failure();
376 }
377 return success();
378}
379
380/// Check if the input nest is supported for tiling and whether tiling would be
381/// legal or not.
382template <typename t>
383static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
384 ArrayRef<t> tileSizes) {
385 assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
386
387 if (llvm::any_of(input,
388 [](AffineForOp op) { return op.getNumResults() > 0; })) {
389 LLVM_DEBUG(llvm::dbgs()
390 << "Cannot tile nest where a loop has yield values\n");
391 return failure();
392 }
393
394 // Check if the supplied `for` ops are all successively nested.
395 if (!isPerfectlyNested(loops: input)) {
396 LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
397 return failure();
398 }
399
400 // TODO: handle non hyper-rectangular spaces.
401 if (failed(Result: checkIfHyperRectangular(input)))
402 return failure();
403
404 return success();
405}
406
407/// Move the loop body of AffineForOp 'src' from 'src' into the specified
408/// location in destination's body, ignoring the terminator.
409static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
410 Block::iterator loc) {
411 auto &ops = src.getBody()->getOperations();
412 dest.getBody()->getOperations().splice(where: loc, L2&: ops, first: ops.begin(),
413 last: std::prev(x: ops.end()));
414}
415
416/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
417/// body.
418static void moveLoopBody(AffineForOp src, AffineForOp dest) {
419 moveLoopBodyImpl(src, dest, loc: dest.getBody()->begin());
420}
421
/// Constructs a tiled loop nest without setting the loop bounds, and moves the
/// body of the original loop nest into the innermost loop of the tiled nest.
424static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
425 AffineForOp rootAffineForOp, unsigned width,
426 MutableArrayRef<AffineForOp> tiledLoops) {
427 Location loc = rootAffineForOp.getLoc();
428
429 // The outermost among the loops as we add more..
430 Operation *topLoop = rootAffineForOp.getOperation();
431 AffineForOp innermostPointLoop;
432
433 // Add intra-tile (or point) loops.
434 for (unsigned i = 0; i < width; i++) {
435 OpBuilder b(topLoop);
436 // Loop bounds will be set later.
437 AffineForOp pointLoop = b.create<AffineForOp>(location: loc, args: 0, args: 0);
438 pointLoop.getBody()->getOperations().splice(
439 where: pointLoop.getBody()->begin(), L2&: topLoop->getBlock()->getOperations(),
440 N: topLoop);
441 tiledLoops[2 * width - 1 - i] = pointLoop;
442 topLoop = pointLoop.getOperation();
443 if (i == 0)
444 innermostPointLoop = pointLoop;
445 }
446
447 // Add tile space loops;
448 for (unsigned i = width; i < 2 * width; i++) {
449 OpBuilder b(topLoop);
450 // Loop bounds will be set later.
451 AffineForOp tileSpaceLoop = b.create<AffineForOp>(location: loc, args: 0, args: 0);
452 tileSpaceLoop.getBody()->getOperations().splice(
453 where: tileSpaceLoop.getBody()->begin(), L2&: topLoop->getBlock()->getOperations(),
454 N: topLoop);
455 tiledLoops[2 * width - i - 1] = tileSpaceLoop;
456 topLoop = tileSpaceLoop.getOperation();
457 }
458
459 // Move the loop body of the original nest to the new one.
460 moveLoopBody(src: origLoops.back(), dest: innermostPointLoop);
461}
462
463/// Set lower and upper bounds of intra-tile loops for parametric tiling.
464// TODO: Handle non-constant lower bounds.
465static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
466 AffineForOp newInterTileLoop,
467 AffineForOp newIntraTileLoop,
468 Value tileSize) {
  // The lower bound for the intra-tile loop is represented by an affine map
  // as (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, the upper
  // bound for the intra-tile loop is represented by an affine map as
  // (%i, %t0) -> ((%i - %origlb) * %t0 + (%t0 * %origLoopStep) + %origlb),
  // where %i is the loop IV of the corresponding inter-tile loop, %t0 is the
  // corresponding tiling parameter, %origlb is the lower bound, and
  // %origLoopStep is the loop step of the corresponding inter-tile loop.
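  //
  // As a concrete (illustrative) instance of the maps above, with %origlb = 0
  // and an original step of 1, the bounds reduce to
  //   lb: (%i, %t0) -> (%i * %t0)
  //   ub: (%i, %t0) -> (%i * %t0 + %t0)
  // i.e. each intra-tile loop scans one tile of parametric size %t0.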
476
477 assert(origLoop.hasConstantLowerBound() &&
478 "expected input loops to have constant lower bound.");
479
480 // Get lower bound of original loop as an affine expression.
481 AffineExpr origLowerBoundExpr;
482 origLowerBoundExpr =
483 b.getAffineConstantExpr(constant: origLoop.getConstantLowerBound());
484
485 // Add dim operands from original lower/upper bound.
486 SmallVector<Value, 4> lbOperands, ubOperands;
487 AffineBound lb = origLoop.getLowerBound();
488 AffineBound ub = origLoop.getUpperBound();
489 lbOperands.reserve(N: lb.getNumOperands() + 2);
490 ubOperands.reserve(N: ub.getNumOperands() + 2);
491 AffineMap origLbMap = lb.getMap();
492 AffineMap origUbMap = ub.getMap();
493 for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
494 lbOperands.push_back(Elt: lb.getOperand(idx: j));
495 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
496 ubOperands.push_back(Elt: ub.getOperand(idx: j));
497
498 // Add a new dim operand in lb/ubOperands corresponding to the origLoop
499 // IV.
500 lbOperands.push_back(Elt: newInterTileLoop.getInductionVar());
501 ubOperands.push_back(Elt: newInterTileLoop.getInductionVar());
502
  // Get the loop IV as an affine expression for the lower/upper bound. The
  // size of lb/ubOperands is guaranteed to be at least one.
505 AffineExpr lbLoopIvExpr = b.getAffineDimExpr(position: lbOperands.size() - 1);
506 AffineExpr ubLoopIvExpr = b.getAffineDimExpr(position: ubOperands.size() - 1);
507
508 // Add symbol operands from original lower/upper bound.
509 for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
510 lbOperands.push_back(Elt: lb.getOperand(idx: origLbMap.getNumDims() + j));
511 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
512 ubOperands.push_back(Elt: ub.getOperand(idx: origUbMap.getNumDims() + j));
513
514 // Add a new symbol operand which is the tile size for this loop.
515 lbOperands.push_back(Elt: tileSize);
516 ubOperands.push_back(Elt: tileSize);
517
518 SmallVector<AffineExpr, 4> lbBoundExprs;
519 SmallVector<AffineExpr, 4> ubBoundExprs;
520 lbBoundExprs.reserve(N: origLbMap.getNumResults());
521 ubBoundExprs.reserve(N: origUbMap.getNumResults());
522
523 // Get tiling parameter as an affine expression for lb/ub.
524 AffineExpr lbTileParameter = b.getAffineSymbolExpr(position: origLbMap.getNumSymbols());
525 AffineExpr ubTileParameter = b.getAffineSymbolExpr(position: origUbMap.getNumSymbols());
526
527 // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
528 lbBoundExprs.push_back(
529 Elt: ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
530 origLowerBoundExpr);
531
532 // Get the origLoopStep as an affine expression.
533 AffineExpr origLoopStep = b.getAffineConstantExpr(constant: origLoop.getStepAsInt());
534
535 // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
536 // (tilingParameter * origLoopStep) + origlb.
537 ubBoundExprs.push_back(
538 Elt: ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
539 (ubTileParameter * origLoopStep) + origLowerBoundExpr);
540
541 ubBoundExprs.append(in_start: origUbMap.getResults().begin(),
542 in_end: origUbMap.getResults().end());
543
544 AffineMap lbMap =
545 AffineMap::get(dimCount: origLbMap.getNumDims() + 1, symbolCount: origLbMap.getNumSymbols() + 1,
546 results: lbBoundExprs, context: b.getContext());
547 newIntraTileLoop.setLowerBound(operands: lbOperands, map: lbMap);
548
549 AffineMap ubMap =
550 AffineMap::get(dimCount: origUbMap.getNumDims() + 1, symbolCount: origUbMap.getNumSymbols() + 1,
551 results: ubBoundExprs, context: b.getContext());
552 newIntraTileLoop.setUpperBound(operands: ubOperands, map: ubMap);
553
554 // Original loop step must be preserved.
555 newIntraTileLoop.setStep(origLoop.getStepAsInt());
556}
557
558/// Set lower and upper bounds of inter-tile loops for parametric tiling.
559// TODO: Handle non-constant lower bounds.
560static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
561 AffineForOp newLoop, Value tileSize) {
562 OperandRange newLbOperands = origLoop.getLowerBoundOperands();
563
564 // The lower bounds for inter-tile loops are same as the corresponding lower
565 // bounds of original loops.
566 newLoop.setLowerBound(operands: newLbOperands, map: origLoop.getLowerBoundMap());
567
  // The new upper bound map for inter-tile loops, assuming constant lower
  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
  // originalLowerBound), tiling parameter), where the tiling parameter is the
  // respective tile size for that loop. For e.g., if the original ubMap was
  // () -> (1024) and the lower bound was 0, the new map will be
  // ()[s0] -> (1024 ceildiv s0), where s0 is the tiling parameter.
  // Therefore a new symbol operand is inserted in the map and the result
  // expression is overwritten.
576
577 assert(origLoop.hasConstantLowerBound() &&
578 "expected input loops to have constant lower bound.");
579
580 // Get lower bound of original loop as an affine expression.
581 AffineExpr origLowerBoundExpr;
582 origLowerBoundExpr =
583 b.getAffineConstantExpr(constant: origLoop.getConstantLowerBound());
584
585 // Add dim operands from original upper bound.
586 SmallVector<Value, 4> ubOperands;
587 AffineBound ub = origLoop.getUpperBound();
588 ubOperands.reserve(N: ub.getNumOperands() + 1);
589 AffineMap origUbMap = ub.getMap();
590 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
591 ubOperands.push_back(Elt: ub.getOperand(idx: j));
592
593 // Add symbol operands from original upper bound.
594 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
595 ubOperands.push_back(Elt: ub.getOperand(idx: origUbMap.getNumDims() + j));
596
597 // Add a new symbol operand which is the tile size for this loop.
598 ubOperands.push_back(Elt: tileSize);
599
600 // Get tiling parameter as an affine expression.
601 AffineExpr tileParameter = b.getAffineSymbolExpr(position: origUbMap.getNumSymbols());
602
603 SmallVector<AffineExpr, 4> boundExprs;
604 boundExprs.reserve(N: origUbMap.getNumResults());
605 int64_t origUpperBound;
606 AffineExpr origUpperBoundExpr;
607
608 // If upper bound for the original loop is constant, then the constant can
609 // be obtained as an affine expression straight away.
610 if (origLoop.hasConstantUpperBound()) {
611 origUpperBound = origLoop.getConstantUpperBound();
612
613 // Get original constant upper bound as an affine expression.
614 origUpperBoundExpr = b.getAffineConstantExpr(constant: origUpperBound);
615
    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
618 boundExprs.push_back(
619 Elt: origLowerBoundExpr +
620 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(other: tileParameter));
621 } else {
    // If the upper bound of the original loop is not constant, then two cases
    // are possible, although their handling is the same: 1) ubMap has only one
    // result expression. For e.g.
625 // affine.for %i = 5 to %ub
626 //
627 // A symbol operand is added which represents the tiling parameter. The
628 // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
629 // where 's0' is the original upper bound and 's1' is the tiling
    // parameter. 2) When ubMap has more than one result expression. For e.g.
631 // #map0 = affine_map<()[s0, s1] -> (s0, s1)
632 // affine.for %i = 5 to min #map0()[%s0, %s1]
633 //
634 // A symbol operand is added which represents the tiling parameter. The
635 // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
636 // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter.
637
638 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
639 // originalLowerBound), tilingParameter).
640 for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
641 boundExprs.push_back(
642 Elt: origLowerBoundExpr +
643 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(other: tileParameter));
644 }
645
646 AffineMap ubMap =
647 AffineMap::get(dimCount: origUbMap.getNumDims(), symbolCount: origUbMap.getNumSymbols() + 1,
648 results: boundExprs, context: b.getContext());
649 newLoop.setUpperBound(operands: ubOperands, map: ubMap);
650
651 // Original loop step must be preserved.
652 newLoop.setStep(origLoop.getStepAsInt());
653}
654
655/// Constructs and sets new loop bounds after tiling for the case of
656/// hyper-rectangular index sets, where the bounds of one dimension do not
657/// depend on other dimensions and tiling parameters are captured from SSA
658/// values. Bounds of each dimension can thus be treated independently,
659/// and deriving the new bounds is much simpler and faster than for the case of
660/// tiling arbitrary polyhedral shapes.
661static void constructParametricallyTiledIndexSetHyperRect(
662 MutableArrayRef<AffineForOp> origLoops,
663 MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
  assert(!origLoops.empty() && "expected at least one loop in band");
665 assert(origLoops.size() == tileSizes.size() &&
666 "expected tiling parameter for each loop in band.");
667
668 OpBuilder b(origLoops[0].getOperation());
669 unsigned width = origLoops.size();
670
671 // Set bounds for tile space loops.
672 for (unsigned i = 0; i < width; ++i) {
673 setInterTileBoundsParametric(b, origLoop: origLoops[i], newLoop: newLoops[i], tileSize: tileSizes[i]);
674 }
675
676 // Set bounds for intra-tile loops.
677 for (unsigned i = 0; i < width; ++i) {
678 setIntraTileBoundsParametric(b, origLoop: origLoops[i], newInterTileLoop: newLoops[i],
679 newIntraTileLoop: newLoops[i + width], tileSize: tileSizes[i]);
680 }
681}
682
683/// Constructs and sets new loop bounds after tiling for the case of
684/// hyper-rectangular index sets, where the bounds of one dimension do not
685/// depend on other dimensions. Bounds of each dimension can thus be treated
686/// independently, and deriving the new bounds is much simpler and faster
687/// than for the case of tiling arbitrary polyhedral shapes.
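///
/// For example (illustrative only), tiling
///   affine.for %i = 0 to 256 step 1
/// with a tile size of 32 produces a tile-space loop iterating from 0 to 256
/// with step 32 and an intra-tile loop iterating from the tile-space IV %ii to
/// %ii + 32 with step 1 (a min with the original upper bound is added when the
/// tile size does not evenly divide the trip count).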
688static void
689constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
690 MutableArrayRef<AffineForOp> newLoops,
691 ArrayRef<unsigned> tileSizes) {
692 assert(!origLoops.empty());
693 assert(origLoops.size() == tileSizes.size());
694
695 OpBuilder b(origLoops[0].getOperation());
696 unsigned width = origLoops.size();
697
698 // Bounds for tile space loops.
699 for (unsigned i = 0; i < width; i++) {
700 OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
701 OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
702 newLoops[i].setLowerBound(operands: newLbOperands, map: origLoops[i].getLowerBoundMap());
703 newLoops[i].setUpperBound(operands: newUbOperands, map: origLoops[i].getUpperBoundMap());
    // If the step size of the original loop is x and the tile size is y, then
    // after tiling the tile-space loop's step size becomes x * y.
706 newLoops[i].setStep(tileSizes[i] * origLoops[i].getStepAsInt());
707 }
708 // Bounds for intra-tile loops.
709 for (unsigned i = 0; i < width; i++) {
710 int64_t largestDiv = getLargestDivisorOfTripCount(forOp: origLoops[i]);
711 std::optional<uint64_t> mayBeConstantCount =
712 getConstantTripCount(forOp: origLoops[i]);
    // The lower bound is just the tile-space loop's induction variable.
714 AffineMap lbMap = b.getDimIdentityMap();
715 newLoops[width + i].setLowerBound(
716 /*operands=*/newLoops[i].getInductionVar(), map: lbMap);
    // The step size of an intra-tile loop is just the original loop's step size.
718 newLoops[width + i].setStep(origLoops[i].getStepAsInt());
719
720 // Set the upper bound.
721 if (mayBeConstantCount && *mayBeConstantCount < tileSizes[i]) {
722 // Trip count is less than the tile size: upper bound is lower bound +
723 // trip count * stepSize.
724 AffineMap ubMap = b.getSingleDimShiftAffineMap(
725 shift: *mayBeConstantCount * origLoops[i].getStepAsInt());
726 newLoops[width + i].setUpperBound(
727 /*operands=*/newLoops[i].getInductionVar(), map: ubMap);
728 } else if (largestDiv % tileSizes[i] != 0) {
729 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
730 // Construct the upper bound map; the operands are the original operands
731 // with 'i' (tile-space loop) appended to it. The new upper bound map is
732 // the original one with an additional expression i + tileSize * stepSize
733 // appended.
734
735 // Add dim operands from original upper bound.
736 SmallVector<Value, 4> ubOperands;
737 AffineBound ub = origLoops[i].getUpperBound();
738 ubOperands.reserve(N: ub.getNumOperands() + 1);
739 AffineMap origUbMap = ub.getMap();
740 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
741 ubOperands.push_back(Elt: ub.getOperand(idx: j));
742
743 // Add dim operand for new loop upper bound.
744 ubOperands.push_back(Elt: newLoops[i].getInductionVar());
745
746 // Add symbol operands from original upper bound.
747 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
748 ubOperands.push_back(Elt: ub.getOperand(idx: origUbMap.getNumDims() + j));
749
750 SmallVector<AffineExpr, 4> boundExprs;
751 boundExprs.reserve(N: 1 + origUbMap.getNumResults());
752 AffineExpr dim = b.getAffineDimExpr(position: origUbMap.getNumDims());
753 // The new upper bound map is the original one with an additional
754 // expression i + tileSize * stepSize (of original loop) appended.
755 boundExprs.push_back(Elt: dim + tileSizes[i] * origLoops[i].getStepAsInt());
756 boundExprs.append(in_start: origUbMap.getResults().begin(),
757 in_end: origUbMap.getResults().end());
758 AffineMap ubMap =
759 AffineMap::get(dimCount: origUbMap.getNumDims() + 1, symbolCount: origUbMap.getNumSymbols(),
760 results: boundExprs, context: b.getContext());
761 newLoops[width + i].setUpperBound(/*operands=*/ubOperands, map: ubMap);
762 } else {
763 // No need of the min expression.
764 AffineExpr dim = b.getAffineDimExpr(position: 0);
765 AffineMap ubMap = AffineMap::get(
766 dimCount: 1, symbolCount: 0, result: dim + tileSizes[i] * origLoops[i].getStepAsInt());
767 newLoops[width + i].setUpperBound(operands: newLoops[i].getInductionVar(), map: ubMap);
768 }
769 }
770}
771
772LogicalResult
773mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
774 ArrayRef<unsigned> tileSizes,
775 SmallVectorImpl<AffineForOp> *tiledNest) {
776 if (input.empty())
777 return success();
778
779 if (failed(Result: performPreTilingChecks(input, tileSizes)))
780 return failure();
781
782 MutableArrayRef<AffineForOp> origLoops = input;
783 AffineForOp rootAffineForOp = origLoops[0];
784
785 // Note that width is at least one since the band isn't empty.
786 unsigned width = input.size();
787 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
788
789 // Construct a tiled loop nest without setting their bounds. Bounds are
790 // set later.
791 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
792
793 SmallVector<Value, 8> origLoopIVs;
794 extractForInductionVars(forInsts: input, ivs: &origLoopIVs);
795
796 // Set loop bounds for the tiled loop nest.
797 constructTiledIndexSetHyperRect(origLoops, newLoops: tiledLoops, tileSizes);
798
799 // Replace original IVs with intra-tile loop IVs.
800 for (unsigned i = 0; i < width; i++)
801 origLoopIVs[i].replaceAllUsesWith(newValue: tiledLoops[i + width].getInductionVar());
802
803 // Erase the old loop nest.
804 rootAffineForOp.erase();
805
806 if (tiledNest)
807 *tiledNest = std::move(tiledLoops);
808
809 return success();
810}
811
812/// Tiles the specified band of perfectly nested loops creating tile-space
813/// loops and intra-tile loops, using SSA values as tiling parameters. A band
814/// is a contiguous set of loops.
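///
/// For example (a sketch with a hypothetical SSA tile size %ts), tiling
///   affine.for %i = 0 to %N
/// produces an inter-tile loop whose upper bound map computes %N ceildiv %ts
/// (given the constant lower bound 0) and an intra-tile loop whose bounds scale
/// the inter-tile IV by %ts, mirroring setInterTileBoundsParametric and
/// setIntraTileBoundsParametric above.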
815LogicalResult mlir::affine::tilePerfectlyNestedParametric(
816 MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
817 SmallVectorImpl<AffineForOp> *tiledNest) {
818 if (input.empty())
819 return success();
820
821 if (failed(Result: performPreTilingChecks(input, tileSizes)))
822 return failure();
823
824 MutableArrayRef<AffineForOp> origLoops = input;
825 AffineForOp rootAffineForOp = origLoops[0];
826 unsigned width = input.size();
827 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
828
829 // Construct a tiled loop nest without setting their bounds. Bounds are
830 // set later.
831 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
832
833 SmallVector<Value, 8> origLoopIVs;
834 extractForInductionVars(forInsts: input, ivs: &origLoopIVs);
835
836 // Set loop bounds for the tiled loop nest.
837 constructParametricallyTiledIndexSetHyperRect(origLoops, newLoops: tiledLoops,
838 tileSizes);
839
840 // Replace original IVs with intra-tile loop IVs.
841 for (unsigned i = 0; i < width; i++)
842 origLoopIVs[i].replaceAllUsesWith(newValue: tiledLoops[i + width].getInductionVar());
843
844 // Erase the old loop nest.
845 rootAffineForOp.erase();
846
847 if (tiledNest)
848 *tiledNest = std::move(tiledLoops);
849
850 return success();
851}
852
/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested within its parent iff the first op in
/// the parent's body is that loop and the second op is the parent's
/// terminator.
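///
/// For example (sketch, bounds elided), for a nest
///   affine.for %i { affine.for %j { affine.for %k { ... } "op"() : () -> () } }
/// rooted at %i, this collects the %i and %j loops but stops before %k's loop,
/// because %j's body holds more than one op besides the terminator.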
857void mlir::affine::getPerfectlyNestedLoops(
858 SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) {
859 for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
860 nestedLoops.push_back(Elt: root);
861 Block &body = root.getRegion().front();
862 if (body.begin() != std::prev(x: body.end(), n: 2))
863 return;
864
865 root = dyn_cast<AffineForOp>(Val: &body.front());
866 if (!root)
867 return;
868 }
869}
870
871/// Identify valid and profitable bands of loops to tile. This is currently just
872/// a temporary placeholder to test the mechanics of tiled code generation.
873/// Returns all maximal outermost perfect loop nests to tile.
874void mlir::affine::getTileableBands(
875 func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) {
876 // Get maximal perfect nest of 'affine.for' insts starting from root
877 // (inclusive).
878 for (AffineForOp forOp : f.getOps<AffineForOp>()) {
879 SmallVector<AffineForOp, 6> band;
880 getPerfectlyNestedLoops(nestedLoops&: band, root: forOp);
881 bands->push_back(x: band);
882 }
883}
884
885/// Unrolls this loop completely.
886LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
887 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
888 if (mayBeConstantTripCount.has_value()) {
889 uint64_t tripCount = *mayBeConstantTripCount;
890 if (tripCount == 0)
891 return success();
892 if (tripCount == 1)
893 return promoteIfSingleIteration(forOp);
894 return loopUnrollByFactor(forOp, unrollFactor: tripCount);
895 }
896 return failure();
897}
898
/// Unrolls this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
901LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
902 uint64_t unrollFactor) {
903 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
904 if (mayBeConstantTripCount.has_value() &&
905 *mayBeConstantTripCount < unrollFactor)
906 return loopUnrollByFactor(forOp, unrollFactor: *mayBeConstantTripCount);
907 return loopUnrollByFactor(forOp, unrollFactor);
908}
909
/// Generates 'unrollFactor' unrolled copies of the body in 'loopBodyBlock',
/// whose induction variable is 'forOpIV', calling 'ivRemapFn' to remap
/// 'forOpIV' for each unrolled body. If specified, annotates the ops in each
/// unrolled iteration using 'annotateFn'.
914static void generateUnrolledLoop(
915 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
916 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
917 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
918 ValueRange iterArgs, ValueRange yieldedValues) {
919 // Builder to insert unrolled bodies just before the terminator of the body of
920 // 'forOp'.
921 auto builder = OpBuilder::atBlockTerminator(block: loopBodyBlock);
922
923 constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
924 if (!annotateFn)
925 annotateFn = defaultAnnotateFn;
926
927 // Keep a pointer to the last non-terminator operation in the original block
928 // so that we know what to clone (since we are doing this in-place).
929 Block::iterator srcBlockEnd = std::prev(x: loopBodyBlock->end(), n: 2);
930
931 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
932 SmallVector<Value, 4> lastYielded(yieldedValues);
933
934 for (unsigned i = 1; i < unrollFactor; i++) {
935 IRMapping operandMap;
936
937 // Prepare operand map.
938 operandMap.map(from&: iterArgs, to&: lastYielded);
939
940 // If the induction variable is used, create a remapping to the value for
941 // this unrolled instance.
942 if (!forOpIV.use_empty()) {
943 Value ivUnroll = ivRemapFn(i, forOpIV, builder);
944 operandMap.map(from: forOpIV, to: ivUnroll);
945 }
946
947 // Clone the original body of 'forOp'.
948 for (auto it = loopBodyBlock->begin(); it != std::next(x: srcBlockEnd); it++) {
949 Operation *clonedOp = builder.clone(op&: *it, mapper&: operandMap);
950 annotateFn(i, clonedOp, builder);
951 }
952
953 // Update yielded values. If the yielded value is defined outside the
954 // `loopBodyBlock` or if it is a BlockArgument then it won't be cloned, thus
955 // the `lastYielded` value remains unchanged. Else, update the `lastYielded`
956 // value with the clone corresponding to the yielded value.
957 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) {
958 Operation *defOp = yieldedValues[i].getDefiningOp();
959 if (defOp && defOp->getBlock() == loopBodyBlock)
960 lastYielded[i] = operandMap.lookup(from: yieldedValues[i]);
961 }
962 }
963
964 // Make sure we annotate the Ops in the original body. We do this last so that
965 // any annotations are not copied into the cloned Ops above.
966 for (auto it = loopBodyBlock->begin(); it != std::next(x: srcBlockEnd); it++)
967 annotateFn(0, &*it, builder);
968
969 // Update operands of the yield statement.
970 loopBodyBlock->getTerminator()->setOperands(lastYielded);
971}
972
973/// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
974/// count is not a multiple of `unrollFactor`.
975static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
976 uint64_t unrollFactor) {
977 // Insert the cleanup loop right after 'forOp'.
978 OpBuilder builder(forOp->getBlock(), std::next(x: Block::iterator(forOp)));
979 auto cleanupForOp = cast<AffineForOp>(Val: builder.clone(op&: *forOp));
980
981 // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
982 // and produce results for the original users of `forOp` results.
983 auto results = forOp.getResults();
984 auto cleanupResults = cleanupForOp.getResults();
985 auto cleanupIterOperands = cleanupForOp.getInits();
986
987 for (auto e : llvm::zip(t&: results, u&: cleanupResults, args&: cleanupIterOperands)) {
988 std::get<0>(t&: e).replaceAllUsesWith(newValue: std::get<1>(t&: e));
989 cleanupForOp->replaceUsesOfWith(from: std::get<2>(t&: e), to: std::get<0>(t&: e));
990 }
991
992 AffineMap cleanupMap;
993 SmallVector<Value, 4> cleanupOperands;
994 getCleanupLoopLowerBound(forOp, unrollFactor, cleanupLbMap&: cleanupMap, cleanupLbOperands&: cleanupOperands);
995 if (!cleanupMap)
996 return failure();
997
998 cleanupForOp.setLowerBound(operands: cleanupOperands, map: cleanupMap);
999 // Promote the loop body up if this has turned into a single iteration loop.
1000 (void)promoteIfSingleIteration(forOp: cleanupForOp);
1001
1002 // Adjust upper bound of the original loop; this is the same as the lower
1003 // bound of the cleanup loop.
1004 forOp.setUpperBound(operands: cleanupOperands, map: cleanupMap);
1005 return success();
1006}
1007
1008/// Unrolls this loop by the specified factor. Returns success if the loop
1009/// is successfully unrolled.
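///
/// For example (sketch), unrolling the hypothetical loop
///   affine.for %i = 0 to 10
/// by a factor of 4 scales the step to 4, clones the body three extra times
/// with the IV remapped to %i + 1, %i + 2 and %i + 3 via affine.apply ops, and
/// emits a cleanup loop for the two remaining iterations [8, 10).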
1010LogicalResult mlir::affine::loopUnrollByFactor(
1011 AffineForOp forOp, uint64_t unrollFactor,
1012 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1013 bool cleanUpUnroll) {
1014 assert(unrollFactor > 0 && "unroll factor should be positive");
1015
1016 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1017 if (unrollFactor == 1) {
1018 if (mayBeConstantTripCount == 1 && failed(Result: promoteIfSingleIteration(forOp)))
1019 return failure();
1020 return success();
1021 }
1022
1023 // Nothing in the loop body other than the terminator.
1024 if (llvm::hasSingleElement(C&: forOp.getBody()->getOperations()))
1025 return success();
1026
1027 // If the trip count is lower than the unroll factor, no unrolled body.
1028 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) {
1029 if (cleanUpUnroll) {
1030 // Unroll the cleanup loop if cleanUpUnroll is specified.
1031 return loopUnrollFull(forOp);
1032 }
1033
1034 return failure();
1035 }
1036
1037 // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1038 if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
    // Loops where the lower bound is a max expression or the upper bound is
    // a min expression and the trip count isn't a multiple of the unroll
    // factor can't be unrolled, since the lower bound of the cleanup loop in
    // such cases cannot be expressed as an affine function or a max over
    // affine functions.
1043 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1044 forOp.getUpperBoundMap().getNumResults() != 1)
1045 return failure();
1046 if (cleanUpUnroll)
1047 // Force unroll including cleanup loop
1048 return loopUnrollFull(forOp);
1049 if (failed(Result: generateCleanupLoopForUnroll(forOp, unrollFactor)))
1050 assert(false && "cleanup loop lower bound map for single result lower "
1051 "and upper bound maps can always be determined");
1052 }
1053
1054 ValueRange iterArgs(forOp.getRegionIterArgs());
1055 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1056
1057 // Scale the step of loop being unrolled by unroll factor.
1058 int64_t step = forOp.getStepAsInt();
1059 forOp.setStep(step * unrollFactor);
1060 generateUnrolledLoop(
1061 loopBodyBlock: forOp.getBody(), forOpIV: forOp.getInductionVar(), unrollFactor,
1062 ivRemapFn: [&](unsigned i, Value iv, OpBuilder b) {
1063 // iv' = iv + i * step
1064 auto d0 = b.getAffineDimExpr(position: 0);
1065 auto bumpMap = AffineMap::get(dimCount: 1, symbolCount: 0, result: d0 + i * step);
1066 return b.create<AffineApplyOp>(location: forOp.getLoc(), args&: bumpMap, args&: iv);
1067 },
1068 /*annotateFn=*/annotateFn,
1069 /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1070
1071 // Promote the loop body up if this has turned into a single iteration loop.
1072 (void)promoteIfSingleIteration(forOp);
1073 return success();
1074}
1075
1076LogicalResult mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
1077 uint64_t unrollJamFactor) {
1078 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1079 if (mayBeConstantTripCount.has_value() &&
1080 *mayBeConstantTripCount < unrollJamFactor)
1081 return loopUnrollJamByFactor(forOp, unrollJamFactor: *mayBeConstantTripCount);
1082 return loopUnrollJamByFactor(forOp, unrollJamFactor);
1083}
1084
1085/// Check if all control operands of all loops are defined outside of `forOp`
1086/// and return false if not.
1087static bool areInnerBoundsInvariant(AffineForOp forOp) {
1088 auto walkResult = forOp.walk(callback: [&](AffineForOp aForOp) {
1089 for (auto controlOperand : aForOp.getControlOperands()) {
1090 if (!forOp.isDefinedOutsideOfLoop(value: controlOperand))
1091 return WalkResult::interrupt();
1092 }
1093 return WalkResult::advance();
1094 });
1095 return !walkResult.wasInterrupted();
1096}
1097
1098/// Unrolls and jams this loop by the specified factor.
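///
/// For example (an illustrative sketch), unroll-and-jamming
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to %N { "op"(%i, %j) : (index, index) -> () }
///   }
/// by a factor of 2 scales the outer step to 2 and duplicates the inner loop
/// (the jammed sub-block) once, with the second copy using %i + 1 through an
/// affine.apply, so both copies share the outer loop's control overhead.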
1099LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
1100 uint64_t unrollJamFactor) {
1101 assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1102
1103 std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1104 if (unrollJamFactor == 1) {
1105 if (mayBeConstantTripCount == 1 && failed(Result: promoteIfSingleIteration(forOp)))
1106 return failure();
1107 return success();
1108 }
1109
1110 // Nothing in the loop body other than the terminator.
1111 if (llvm::hasSingleElement(C&: forOp.getBody()->getOperations()))
1112 return success();
1113
1114 // If the trip count is lower than the unroll jam factor, no unroll jam.
1115 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1116 LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1117 return failure();
1118 }
1119
1120 // If any control operand of any inner loop of `forOp` is defined within
1121 // `forOp`, no unroll jam.
1122 if (!areInnerBoundsInvariant(forOp))
1123 return failure();
1124
1125 // Gather all sub-blocks to jam upon the loop being unrolled.
1126 JamBlockGatherer<AffineForOp> jbg;
1127 jbg.walk(op: forOp);
1128 auto &subBlocks = jbg.subBlocks;
1129
1130 // Collect loops with iter_args.
1131 SmallVector<AffineForOp, 4> loopsWithIterArgs;
1132 forOp.walk(callback: [&](AffineForOp aForOp) {
1133 if (aForOp.getNumIterOperands() > 0)
1134 loopsWithIterArgs.push_back(Elt: aForOp);
1135 });
1136
1137 // Get supported reductions to be used for creating reduction ops at the end.
1138 SmallVector<LoopReduction> reductions;
1139 if (forOp.getNumIterOperands() > 0)
1140 getSupportedReductions(forOp, supportedReductions&: reductions);
1141
1142 // Generate the cleanup loop if trip count isn't a multiple of
1143 // unrollJamFactor.
1144 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
    // Loops where the lower bound is a max expression or the upper bound is
    // a min expression and the trip count isn't a multiple of the unroll-jam
    // factor can't be unroll-jammed, since the lower bound of the cleanup loop
    // in such cases cannot be expressed as an affine function or a max over
    // affine functions.
1149 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1150 forOp.getUpperBoundMap().getNumResults() != 1)
1151 return failure();
1152 if (failed(Result: generateCleanupLoopForUnroll(forOp, unrollFactor: unrollJamFactor)))
1153 assert(false && "cleanup loop lower bound map for single result lower "
1154 "and upper bound maps can always be determined");
1155 }
1156
1157 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1158 // iteration. There are (`unrollJamFactor` - 1) iterations.
1159 SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1160
1161 // For any loop with iter_args, replace it with a new loop that has
1162 // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1163 // operands.
1164 SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1165 IRRewriter rewriter(forOp.getContext());
1166 for (AffineForOp oldForOp : loopsWithIterArgs) {
1167 SmallVector<Value> dupIterOperands, dupYieldOperands;
1168 ValueRange oldIterOperands = oldForOp.getInits();
1169 ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1170 ValueRange oldYieldOperands =
1171 cast<AffineYieldOp>(Val: oldForOp.getBody()->getTerminator()).getOperands();
1172 // Get additional iterOperands, iterArgs, and yield operands. We will
1173 // fix iterOperands and yield operands after cloning of sub-blocks.
1174 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1175 dupIterOperands.append(in_start: oldIterOperands.begin(), in_end: oldIterOperands.end());
1176 dupYieldOperands.append(in_start: oldYieldOperands.begin(), in_end: oldYieldOperands.end());
1177 }
1178 // Create a new loop with additional iterOperands, iter_args and yield
1179 // operands. This new loop will take the loop body of the original loop.
1180 bool forOpReplaced = oldForOp == forOp;
1181 AffineForOp newForOp =
1182 cast<AffineForOp>(Val: *oldForOp.replaceWithAdditionalYields(
1183 rewriter, newInitOperands: dupIterOperands, /*replaceInitOperandUsesInLoop=*/false,
1184 newYieldValuesFn: [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBbArgs) {
1185 return dupYieldOperands;
1186 }));
1187 newLoopsWithIterArgs.push_back(Elt: newForOp);
1188 // `forOp` has been replaced with a new loop.
1189 if (forOpReplaced)
1190 forOp = newForOp;
1191 // Update `operandMaps` for `newForOp` iterArgs and results.
1192 ValueRange newIterArgs = newForOp.getRegionIterArgs();
1193 unsigned oldNumIterArgs = oldIterArgs.size();
1194 ValueRange newResults = newForOp.getResults();
1195 unsigned oldNumResults = newResults.size() / unrollJamFactor;
1196 assert(oldNumIterArgs == oldNumResults &&
1197 "oldNumIterArgs must be the same as oldNumResults");
1198 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1199 for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1200 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1201 // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1202 // to those in the `i`th new set.
1203 operandMaps[i - 1].map(from: newIterArgs[j],
1204 to: newIterArgs[i * oldNumIterArgs + j]);
1205 operandMaps[i - 1].map(from: newResults[j],
1206 to: newResults[i * oldNumResults + j]);
1207 }
1208 }
1209 }
1210
1211 // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1212 int64_t step = forOp.getStepAsInt();
1213 forOp.setStep(step * unrollJamFactor);
1214
1215 auto forOpIV = forOp.getInductionVar();
1216 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1217 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1218 for (auto &subBlock : subBlocks) {
1219 // Builder to insert unroll-jammed bodies. Insert right at the end of
1220 // sub-block.
1221 OpBuilder builder(subBlock.first->getBlock(), std::next(x: subBlock.second));
1222
1223 // If the induction variable is used, create a remapping to the value for
1224 // this unrolled instance.
1225 if (!forOpIV.use_empty()) {
1226 // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1227 auto d0 = builder.getAffineDimExpr(position: 0);
1228 auto bumpMap = AffineMap::get(dimCount: 1, symbolCount: 0, result: d0 + i * step);
1229 auto ivUnroll =
1230 builder.create<AffineApplyOp>(location: forOp.getLoc(), args&: bumpMap, args&: forOpIV);
1231 operandMaps[i - 1].map(from: forOpIV, to: ivUnroll);
1232 }
1233 // Clone the sub-block being unroll-jammed.
1234 for (auto it = subBlock.first; it != std::next(x: subBlock.second); ++it)
1235 builder.clone(op&: *it, mapper&: operandMaps[i - 1]);
1236 }
1237 // Fix iterOperands and yield op operands of newly created loops.
1238 for (auto newForOp : newLoopsWithIterArgs) {
1239 unsigned oldNumIterOperands =
1240 newForOp.getNumIterOperands() / unrollJamFactor;
1241 unsigned numControlOperands = newForOp.getNumControlOperands();
1242 auto yieldOp = cast<AffineYieldOp>(Val: newForOp.getBody()->getTerminator());
1243 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1244 assert(oldNumIterOperands == oldNumYieldOperands &&
1245 "oldNumIterOperands must be the same as oldNumYieldOperands");
1246 for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1247 // The `i`th duplication of an old iterOperand or yield op operand
1248 // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1249 // if such mapped value exists.
1250 newForOp.setOperand(i: numControlOperands + i * oldNumIterOperands + j,
1251 value: operandMaps[i - 1].lookupOrDefault(
1252 from: newForOp.getOperand(i: numControlOperands + j)));
1253 yieldOp.setOperand(
1254 i: i * oldNumYieldOperands + j,
1255 value: operandMaps[i - 1].lookupOrDefault(from: yieldOp.getOperand(i: j)));
1256 }
1257 }
1258 }
1259 if (forOp.getNumResults() > 0) {
1260 // Create reduction ops to combine every `unrollJamFactor` related results
1261 // into one value. For example, for %0:2 = affine.for ... and addf, we add
1262 // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1263 // %1.
1264 rewriter.setInsertionPointAfter(forOp);
1265 auto loc = forOp.getLoc();
1266 unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1267 for (LoopReduction &reduction : reductions) {
1268 unsigned pos = reduction.iterArgPosition;
1269 Value lhs = forOp.getResult(i: pos);
1270 Value rhs;
1271 SmallPtrSet<Operation *, 4> newOps;
1272 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1273 rhs = forOp.getResult(i: i * oldNumResults + pos);
1274 // Create ops based on reduction type.
1275 lhs = arith::getReductionOp(op: reduction.kind, builder&: rewriter, loc, lhs, rhs);
1276 if (!lhs)
1277 return failure();
1278 Operation *op = lhs.getDefiningOp();
1279 assert(op && "Reduction op should have been created");
1280 newOps.insert(Ptr: op);
1281 }
1282 // Replace all uses except those in newly created reduction ops.
1283 forOp.getResult(i: pos).replaceAllUsesExcept(newValue: lhs, exceptions: newOps);
1284 }
1285 }
1286
1287 // Promote the loop body up if this has turned into a single iteration loop.
1288 (void)promoteIfSingleIteration(forOp);
1289 return success();
1290}
1291
1292/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1293/// nested within 'forOpA' as the only non-terminator operation in its block.
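/// For illustration, a minimal sketch (bounds and the body op are arbitrary):
///
/// ```mlir
/// // Before: forOpA is the %i loop, forOpB the %j loop.
/// affine.for %i = 0 to 16 {
///   affine.for %j = 0 to 32 {
///     "test.use"(%i, %j) : (index, index) -> ()
///   }
/// }
/// // After interchangeLoops(forOpA, forOpB):
/// affine.for %j = 0 to 32 {
///   affine.for %i = 0 to 16 {
///     "test.use"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```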
1294void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1295 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1296 auto &forOpABody = forOpA.getBody()->getOperations();
1297 auto &forOpBBody = forOpB.getBody()->getOperations();
1298
1299 // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1300 // before forOpA (in forOpA's parent's block); this should leave forOpA's
1301 // body containing only the terminator.
1302 forOpA->getBlock()->getOperations().splice(where: Block::iterator(forOpA),
1303 L2&: forOpABody, first: forOpABody.begin(),
1304 last: std::prev(x: forOpABody.end()));
1305 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1306 // body (this leaves forOpB's body containing only the terminator).
1307 forOpABody.splice(where: forOpABody.begin(), L2&: forOpBBody, first: forOpBBody.begin(),
1308 last: std::prev(x: forOpBBody.end()));
1309 // 3) Splice forOpA into the beginning of forOpB's body.
1310 forOpBBody.splice(where: forOpBBody.begin(), L2&: forOpA->getBlock()->getOperations(),
1311 first: Block::iterator(forOpA));
1312}
1313
1314// Checks each dependence component against the permutation to see if the
1315// desired loop interchange would violate dependences by making the
1316// dependence component lexicographically negative.
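// For instance (a sketch), with two loops (%i, %j) and a dependence whose
// components are (2, -1), swapping the loops would place the -1 component
// first, making the dependence vector lexicographically negative, so this
// function returns false for that permutation.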
1317static bool checkLoopInterchangeDependences(
1318 const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1319 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1320 // Invert permutation map.
1321 unsigned maxLoopDepth = loops.size();
1322 SmallVector<unsigned, 4> loopPermMapInv;
1323 loopPermMapInv.resize(N: maxLoopDepth);
1324 for (unsigned i = 0; i < maxLoopDepth; ++i)
1325 loopPermMapInv[loopPermMap[i]] = i;
1326
1327 // Check each dependence component against the permutation to see if the
1328 // desired loop interchange permutation would make the dependence vectors
1329 // lexicographically negative.
1330 // Example 1: [-1, 1][0, 0]
1331 // Example 2: [0, 0][-1, 1]
1332 for (const auto &depComps : depCompsVec) {
1333 assert(depComps.size() >= maxLoopDepth);
1334 // Check if the first non-zero dependence component is positive.
1335 // This iterates through loops in the desired order.
1336 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1337 unsigned permIndex = loopPermMapInv[j];
1338 assert(depComps[permIndex].lb);
1339 int64_t depCompLb = *depComps[permIndex].lb;
1340 if (depCompLb > 0)
1341 break;
1342 if (depCompLb < 0)
1343 return false;
1344 }
1345 }
1346 return true;
1347}
1348
1349/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1350/// nested sequence of loops in 'loops' would violate dependences.
1351bool mlir::affine::isValidLoopInterchangePermutation(
1352 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1353 assert(loopPermMap.size() == loops.size() && "invalid loop perm map");
1354 unsigned maxLoopDepth = loops.size();
1355 if (maxLoopDepth == 1)
1356 return true;
1357 // Gather dependence components for dependences between all ops in loop nest
1358 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1359 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1360 getDependenceComponents(forOp: loops[0], maxLoopDepth, depCompsVec: &depCompsVec);
1361 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1362}
1363
1364/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1365/// in it from outermost to innermost.
1366bool LLVM_ATTRIBUTE_UNUSED
1367mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1368 assert(!loops.empty() && "no loops provided");
1369
1370 // We already know that the block can't be empty.
1371 auto hasTwoElements = [](Block *block) {
1372 auto secondOpIt = std::next(x: block->begin());
1373 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1374 };
1375
1376 auto enclosingLoop = loops.front();
1377 for (auto loop : loops.drop_front()) {
1378 auto parentForOp = dyn_cast<AffineForOp>(Val: loop->getParentOp());
1379 // parentForOp's body should be just this loop and the terminator.
1380 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1381 return false;
1382 enclosingLoop = loop;
1383 }
1384 return true;
1385}
1386
1387// input[i] should move from position i -> permMap[i]. Returns the position in
1388// `input` that becomes the new outermost loop.
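// For example, a minimal sketch (bounds are arbitrary): with `input` holding
// the loops (%i, %j, %k) from outermost to innermost and permMap = {1, 2, 0},
// %k becomes the new outermost loop and the function returns 2:
//
// ```mlir
// affine.for %k = 0 to 8 {      // was innermost (input[2])
//   affine.for %i = 0 to 16 {   // was outermost (input[0])
//     affine.for %j = 0 to 32 { // was in the middle (input[1])
//       "test.use"(%i, %j, %k) : (index, index, index) -> ()
//     }
//   }
// }
// ```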
1389unsigned mlir::affine::permuteLoops(ArrayRef<AffineForOp> input,
1390 ArrayRef<unsigned> permMap) {
1391 assert(input.size() == permMap.size() && "invalid permutation map size");
1392 // Check whether the permutation spec is valid. This is a small vector - we'll
1393 // just sort and check if it's iota.
1394 SmallVector<unsigned, 4> checkPermMap(permMap);
1395 llvm::sort(C&: checkPermMap);
1396 if (llvm::any_of(Range: llvm::enumerate(First&: checkPermMap),
1397 P: [](const auto &en) { return en.value() != en.index(); }))
1398 assert(false && "invalid permutation map");
1399
1400 // Nothing to do.
1401 if (input.size() < 2)
1402 return 0;
1403
1404 assert(isPerfectlyNested(input) && "input not perfectly nested");
1405
1406 // Compute the inverse mapping, invPermMap: since input[i] goes to position
1407 // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1408 SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1409 for (unsigned i = 0, e = input.size(); i < e; ++i)
1410 invPermMap.push_back(Elt: {permMap[i], i});
1411 llvm::sort(C&: invPermMap);
1412
1413 // Move the innermost loop body to the loop that would be the innermost in the
1414 // permuted nest (only if the innermost loop is going to change).
1415 if (permMap.back() != input.size() - 1) {
1416 Block *destBody = ((AffineForOp)input[invPermMap.back().second]).getBody();
1417 Block *srcBody = ((AffineForOp)input.back()).getBody();
1418 destBody->getOperations().splice(where: destBody->begin(),
1419 L2&: srcBody->getOperations(), first: srcBody->begin(),
1420 last: std::prev(x: srcBody->end()));
1421 }
1422
1423 // We'll move each loop in `input` in the reverse order so that its body is
1424 // empty when we are moving it; this incurs zero copies and no erasing.
1425 for (int i = input.size() - 1; i >= 0; --i) {
1426 // If this has to become the outermost loop after permutation, add it to the
1427 // parent block of the original root.
1428 if (permMap[i] == 0) {
1429 // If the root remains the same, nothing to do.
1430 if (i == 0)
1431 continue;
1432 // Make input[i] the new outermost loop moving it into parentBlock.
1433 auto *parentBlock = input[0]->getBlock();
1434 parentBlock->getOperations().splice(where: Block::iterator(input[0]),
1435 L2&: input[i]->getBlock()->getOperations(),
1436 first: Block::iterator(input[i]));
1437 continue;
1438 }
1439
1440 // If the parent in the permuted order is the same as in the original,
1441 // nothing to do.
1442 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1443 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1444 continue;
1445
1446 // Move input[i] to its surrounding loop in the transformed nest.
1447 auto *destBody = ((AffineForOp)input[parentPosInInput]).getBody();
1448 destBody->getOperations().splice(where: destBody->begin(),
1449 L2&: input[i]->getBlock()->getOperations(),
1450 first: Block::iterator(input[i]));
1451 }
1452
1453 return invPermMap[0].second;
1454}
1455
1456// Sinks all sequential loops to the innermost levels (while preserving
1457// relative order among them) and moves all parallel loops to the
1458 // outermost levels (while again preserving relative order among them).
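// For example (a sketch), if the perfect nest rooted at `forOp` is
// (parallel, sequential, parallel) from outermost to innermost, the computed
// permutation is {0, 2, 1}: both parallel loops move outward (keeping their
// relative order) and the sequential loop is sunk innermost.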
1459AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) {
1460 SmallVector<AffineForOp, 4> loops;
1461 getPerfectlyNestedLoops(nestedLoops&: loops, root: forOp);
1462 if (loops.size() < 2)
1463 return forOp;
1464
1465 // Gather dependence components for dependences between all ops in loop nest
1466 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1467 unsigned maxLoopDepth = loops.size();
1468 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1469 getDependenceComponents(forOp: loops[0], maxLoopDepth, depCompsVec: &depCompsVec);
1470
1471 // Mark loops as either parallel or sequential.
1472 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1473 for (auto &depComps : depCompsVec) {
1474 assert(depComps.size() >= maxLoopDepth);
1475 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1476 DependenceComponent &depComp = depComps[j];
1477 assert(depComp.lb.has_value() && depComp.ub.has_value());
1478 if (*depComp.lb != 0 || *depComp.ub != 0)
1479 isParallelLoop[j] = false;
1480 }
1481 }
1482
1483 unsigned numParallelLoops = llvm::count(Range&: isParallelLoop, Element: true);
1484
1485 // Compute permutation of loops that sinks sequential loops (and thus raises
1486 // parallel loops) while preserving relative order.
1487 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1488 unsigned nextSequentialLoop = numParallelLoops;
1489 unsigned nextParallelLoop = 0;
1490 for (unsigned i = 0; i < maxLoopDepth; ++i) {
1491 if (isParallelLoop[i]) {
1492 loopPermMap[i] = nextParallelLoop++;
1493 } else {
1494 loopPermMap[i] = nextSequentialLoop++;
1495 }
1496 }
1497
1498 // Check if permutation 'loopPermMap' would violate dependences.
1499 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1500 return forOp;
1501 // Perform loop interchange according to permutation 'loopPermMap'.
1502 unsigned loopNestRootIndex = permuteLoops(input: loops, permMap: loopPermMap);
1503 return loops[loopNestRootIndex];
1504}
1505
1506// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1507// lower (resp. upper) loop bound. When called for both the lower and upper
1508// bounds, the resulting IR resembles:
1509//
1510// ```mlir
1511 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
1512// ...
1513// }
1514// ```
1515static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1516 SmallVector<Value, 4> *operands,
1517 int64_t offset = 0) {
1518 auto bounds = llvm::to_vector<4>(Range: map->getResults());
1519 bounds.push_back(Elt: b.getAffineDimExpr(position: map->getNumDims()) + offset);
1520 operands->insert(I: operands->begin() + map->getNumDims(), Elt: iv);
1521 *map = AffineMap::get(dimCount: map->getNumDims() + 1, symbolCount: map->getNumSymbols(), results: bounds,
1522 context: b.getContext());
1523 canonicalizeMapAndOperands(map, operands);
1524}
1525
1526// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1527// Stripmine-sink is a primitive building block for generalized tiling of
1528// imperfectly nested loops.
1529// This transformation is purely mechanical and does not check legality,
1530// profitability or even structural correctness. It is the user's
1531// responsibility to specify `targets` that are dominated by `forOp`.
1532// Returns the new AffineForOps, one per `targets`, nested immediately under
1533// each of the `targets`.
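// For illustration, a minimal sketch (bounds are arbitrary; shown before any
// canonicalization of the new loop's bound maps):
//
// ```mlir
// // Before: %i is `forOp` with step 1; the %t loop is the single target.
// affine.for %i = 0 to 128 {
//   affine.for %t = 0 to 32 {
//     "test.use"(%i, %t) : (index, index) -> ()
//   }
// }
// // After stripmineSink(forOp, /*factor=*/4, {target}): %i now steps by 4 and
// // a new point loop over [%i, %i + 4) is nested under the target.
// affine.for %i = 0 to 128 step 4 {
//   affine.for %t = 0 to 32 {
//     affine.for %ii = max affine_map<(d0) -> (0, d0)>(%i)
//         to min affine_map<(d0) -> (128, d0 + 4)>(%i) {
//       "test.use"(%ii, %t) : (index, index) -> ()
//     }
//   }
// }
// ```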
1534static SmallVector<AffineForOp, 8>
1535stripmineSink(AffineForOp forOp, uint64_t factor,
1536 ArrayRef<AffineForOp> targets) {
1537 auto originalStep = forOp.getStepAsInt();
1538 auto scaledStep = originalStep * factor;
1539 forOp.setStep(scaledStep);
1540
1541 OpBuilder b(forOp->getBlock(), std::next(x: Block::iterator(forOp)));
1542
1543 // Lower-bound map creation.
1544 auto lbMap = forOp.getLowerBoundMap();
1545 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1546 augmentMapAndBounds(b, iv: forOp.getInductionVar(), map: &lbMap, operands: &lbOperands);
1547
1548 // Upper-bound map creation.
1549 auto ubMap = forOp.getUpperBoundMap();
1550 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1551 augmentMapAndBounds(b, iv: forOp.getInductionVar(), map: &ubMap, operands: &ubOperands,
1552 /*offset=*/scaledStep);
1553
1554 auto iv = forOp.getInductionVar();
1555 SmallVector<AffineForOp, 8> innerLoops;
1556 for (auto t : targets) {
1557 // Insert newForOp before the terminator of `t`.
1558 auto b = OpBuilder::atBlockTerminator(block: t.getBody());
1559 auto newForOp = b.create<AffineForOp>(location: t.getLoc(), args&: lbOperands, args&: lbMap,
1560 args&: ubOperands, args&: ubMap, args&: originalStep);
1561 auto begin = t.getBody()->begin();
1562 // Skip terminator and `newForOp` which is just before the terminator.
1563 auto nOps = t.getBody()->getOperations().size() - 2;
1564 newForOp.getBody()->getOperations().splice(
1565 where: newForOp.getBody()->getOperations().begin(),
1566 L2&: t.getBody()->getOperations(), first: begin, last: std::next(x: begin, n: nOps));
1567 replaceAllUsesInRegionWith(orig: iv, replacement: newForOp.getInductionVar(),
1568 region&: newForOp.getRegion());
1569 innerLoops.push_back(Elt: newForOp);
1570 }
1571
1572 return innerLoops;
1573}
1574
1575// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1576 // Returns the new AffineForOp, nested immediately under `target`.
1577template <typename SizeType>
1578static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1579 AffineForOp target) {
1580 // TODO: Use cheap structural assertions that targets are nested under
1581 // forOp and that targets are not nested under each other when DominanceInfo
1582 // exposes the capability. It seems overkill to construct a whole function
1583 // dominance tree at this point.
1584 auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1585 assert(res.size() == 1 && "Expected 1 inner forOp");
1586 return res[0];
1587}
1588
1589SmallVector<SmallVector<AffineForOp, 8>, 8>
1590mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1591 ArrayRef<AffineForOp> targets) {
1592 SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1593 SmallVector<AffineForOp, 8> currentTargets(targets);
1594 for (auto it : llvm::zip(t&: forOps, u&: sizes)) {
1595 auto step = stripmineSink(forOp: std::get<0>(t&: it), factor: std::get<1>(t&: it), targets: currentTargets);
1596 res.push_back(Elt: step);
1597 currentTargets = step;
1598 }
1599 return res;
1600}
1601
1602SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
1603 ArrayRef<uint64_t> sizes,
1604 AffineForOp target) {
1605 SmallVector<AffineForOp, 8> res;
1606 for (auto loops : tile(forOps, sizes, targets: ArrayRef<AffineForOp>(target)))
1607 res.push_back(Elt: llvm::getSingleElement(C&: loops));
1608 return res;
1609}
1610
1611LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1612 if (loops.size() < 2)
1613 return success();
1614
1615 AffineForOp innermost = loops.back();
1616 AffineForOp outermost = loops.front();
1617 AffineBound ub = outermost.getUpperBound();
1618 AffineMap origUbMap = ub.getMap();
1619 Location loc = outermost.getLoc();
1620 OpBuilder builder(outermost);
1621 for (AffineForOp loop : loops) {
1622 // We only work on normalized loops.
1623 if (loop.getStepAsInt() != 1 || !loop.hasConstantLowerBound() ||
1624 loop.getConstantLowerBound() != 0)
1625 return failure();
1626 }
1627 SmallVector<Value, 4> upperBoundSymbols;
1628 SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1629 ub.getOperands().end());
1630
1631 // 1. Store the upper bound of the outermost loop in a variable.
1632 Value prev;
1633 if (!llvm::hasSingleElement(C: origUbMap.getResults()))
1634 prev = builder.create<AffineMinOp>(location: loc, args&: origUbMap, args&: ubOperands);
1635 else
1636 prev = builder.create<AffineApplyOp>(location: loc, args&: origUbMap, args&: ubOperands);
1637 upperBoundSymbols.push_back(Elt: prev);
1638
1639 // 2. Emit code computing the upper bound of the coalesced loop as the product of
1640 // the number of iterations of all loops.
1641 for (AffineForOp loop : loops.drop_front()) {
1642 ub = loop.getUpperBound();
1643 origUbMap = ub.getMap();
1644 ubOperands = ub.getOperands();
1645 Value upperBound;
1646 // If upper bound map has more than one result, take their minimum.
1647 if (!llvm::hasSingleElement(C: origUbMap.getResults()))
1648 upperBound = builder.create<AffineMinOp>(location: loc, args&: origUbMap, args&: ubOperands);
1649 else
1650 upperBound = builder.create<AffineApplyOp>(location: loc, args&: origUbMap, args&: ubOperands);
1651 upperBoundSymbols.push_back(Elt: upperBound);
1652 SmallVector<Value, 4> operands;
1653 operands.push_back(Elt: prev);
1654 operands.push_back(Elt: upperBound);
1655 // Maintain running product of loop upper bounds.
1656 prev = builder.create<AffineApplyOp>(
1657 location: loc,
1658 args: AffineMap::get(/*dimCount=*/1,
1659 /*symbolCount=*/1,
1660 result: builder.getAffineDimExpr(position: 0) *
1661 builder.getAffineSymbolExpr(position: 0)),
1662 args&: operands);
1663 }
1664 // Set upper bound of the coalesced loop.
1665 AffineMap newUbMap = AffineMap::get(
1666 /*dimCount=*/0,
1667 /*symbolCount=*/1, results: builder.getAffineSymbolExpr(position: 0), context: builder.getContext());
1668 outermost.setUpperBound(operands: prev, map: newUbMap);
1669
1670 builder.setInsertionPointToStart(outermost.getBody());
1671
1672 // 3. Remap induction variables. For each original loop, the value of the
1673 // induction variable can be obtained by dividing the induction variable of
1674 // the linearized loop by the total number of iterations of the loops nested
1675 // in it modulo the number of iterations in this loop (remove the values
1676 // related to the outer loops):
1677 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1678 // Compute these iteratively from the innermost loop by creating a "running
1679 // quotient" of division by the range.
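// For example (a sketch), for three normalized loops with ranges 4, 5 and 6
// (outermost to innermost) and a coalesced IV %iv running over [0, 120):
//   iv_2 = %iv mod 6
//   iv_1 = (%iv floordiv 6) mod 5
//   iv_0 = (%iv floordiv 6) floordiv 5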
1680 Value previous = outermost.getInductionVar();
1681 for (unsigned idx = loops.size(); idx > 0; --idx) {
1682 if (idx != loops.size()) {
1683 SmallVector<Value, 4> operands;
1684 operands.push_back(Elt: previous);
1685 operands.push_back(Elt: upperBoundSymbols[idx]);
1686 previous = builder.create<AffineApplyOp>(
1687 location: loc,
1688 args: AffineMap::get(
1689 /*dimCount=*/1, /*symbolCount=*/1,
1690 result: builder.getAffineDimExpr(position: 0).floorDiv(
1691 other: builder.getAffineSymbolExpr(position: 0))),
1692 args&: operands);
1693 }
1694 // Modified value of the induction variables of the nested loops after
1695 // coalescing.
1696 Value inductionVariable;
1697 if (idx == 1) {
1698 inductionVariable = previous;
1699 } else {
1700 SmallVector<Value, 4> applyOperands;
1701 applyOperands.push_back(Elt: previous);
1702 applyOperands.push_back(Elt: upperBoundSymbols[idx - 1]);
1703 inductionVariable = builder.create<AffineApplyOp>(
1704 location: loc,
1705 args: AffineMap::get(
1706 /*dimCount=*/1, /*symbolCount=*/1,
1707 result: builder.getAffineDimExpr(position: 0) % builder.getAffineSymbolExpr(position: 0)),
1708 args&: applyOperands);
1709 }
1710 replaceAllUsesInRegionWith(orig: loops[idx - 1].getInductionVar(),
1711 replacement: inductionVariable, region&: loops.back().getRegion());
1712 }
1713
1714 // 4. Move the operations from the innermost loop's body to just above the
1715 // second-outermost loop, then delete the extra terminator and the second-outermost loop.
1716 AffineForOp secondOutermostLoop = loops[1];
1717 innermost.getBody()->back().erase();
1718 outermost.getBody()->getOperations().splice(
1719 where: Block::iterator(secondOutermostLoop.getOperation()),
1720 L2&: innermost.getBody()->getOperations());
1721 secondOutermostLoop.erase();
1722 return success();
1723}
1724
1725void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
1726 ArrayRef<Value> processorId,
1727 ArrayRef<Value> numProcessors) {
1728 assert(processorId.size() == numProcessors.size());
1729 if (processorId.empty())
1730 return;
1731
1732 OpBuilder b(forOp);
1733 Location loc(forOp.getLoc());
1734 AffineExpr lhs, rhs;
1735 bindSymbols(ctx: forOp.getContext(), exprs&: lhs, exprs&: rhs);
1736 auto mulMap = AffineMap::get(dimCount: 0, symbolCount: 2, result: lhs * rhs);
1737 auto addMap = AffineMap::get(dimCount: 0, symbolCount: 2, result: lhs + rhs);
1738
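 // For example (a sketch), with processor ids (%p0, %p1) and counts (%n0, %n1),
 // the loop below computes linearIndex = %p0 * %n1 + %p1; the for op's lower
 // bound then becomes lb + linearIndex * step and its step becomes
 // step * %n0 * %n1.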
1739 Value linearIndex = processorId.front();
1740 for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1741 auto mulApplyOp = b.create<AffineApplyOp>(
1742 location: loc, args&: mulMap, args: ValueRange{linearIndex, numProcessors[i]});
1743 linearIndex = b.create<AffineApplyOp>(
1744 location: loc, args&: addMap, args: ValueRange{mulApplyOp, processorId[i]});
1745 }
1746
1747 auto mulApplyOp = b.create<AffineApplyOp>(
1748 location: loc, args&: mulMap, args: ValueRange{linearIndex, forOp.getStep()});
1749 Value lb = b.create<AffineApplyOp>(
1750 location: loc, args&: addMap, args: ValueRange{mulApplyOp, forOp.getLowerBound()});
1751 forOp.setLowerBound(lb);
1752
1753 Value step = forOp.getStep();
1754 for (auto numProcs : numProcessors)
1755 step = b.create<AffineApplyOp>(location: loc, args&: mulMap, args: ValueRange{numProcs, step});
1756 forOp.setStep(step);
1757}
1758
1759/// Given a memref region, determine the lowest depth at which transfers can be
1760/// placed for it, and return the corresponding block, start and end positions
1761/// in the block for placing incoming (read) and outgoing (write) copies
1762/// respectively. The lowest depth depends on whether the region being accessed
1763/// is hoistable with respect to one or more immediately surrounding loops.
1764static void
1765findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1766 Block::iterator &begin, Block::iterator &end,
1767 Block **copyPlacementBlock,
1768 Block::iterator *copyInPlacementStart,
1769 Block::iterator *copyOutPlacementStart) {
1770 const auto *cst = region.getConstraints();
1771 SmallVector<Value, 4> symbols;
1772 cst->getValues(start: cst->getNumDimVars(), end: cst->getNumDimAndSymbolVars(), values: &symbols);
1773
1774 SmallVector<Operation *, 4> enclosingAffineOps;
1775 getEnclosingAffineOps(op&: *block.begin(), ops: &enclosingAffineOps);
1776 // Walk up loop parents till we find an IV on which this region is
1777 // symbolic/variant, or an op we cannot hoist past.
1778 auto it = enclosingAffineOps.rbegin();
1779 AffineForOp lastInvariantFor;
1780 for (auto e = enclosingAffineOps.rend(); it != e; ++it) {
1781 Operation *enclosingOp = *it;
1782 // We can't hoist past the definition of the memref being copied.
1783 Value memref = region.memref;
1784 if (!memref.getParentRegion()->isAncestor(other: enclosingOp->getParentRegion())) {
1785 LLVM_DEBUG(
1786 llvm::dbgs()
1787 << "memref definition will end up not dominating hoist location\n");
1788 break;
1789 }
1790
1791 auto affineFor = dyn_cast<AffineForOp>(Val: enclosingOp);
1792 if (!affineFor)
1793 break;
1794 // TODO: also need to be checking this for region symbols that
1795 // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1796 if (llvm::is_contained(Range&: symbols, Element: affineFor.getInductionVar()))
1797 break;
1798 lastInvariantFor = affineFor;
1799 }
1800
1801 if (it != enclosingAffineOps.rbegin()) {
1802 *copyInPlacementStart = Block::iterator(lastInvariantFor);
1803 *copyOutPlacementStart = std::next(x: *copyInPlacementStart);
1804 *copyPlacementBlock = lastInvariantFor->getBlock();
1805 } else {
1806 *copyInPlacementStart = begin;
1807 *copyOutPlacementStart = end;
1808 *copyPlacementBlock = &block;
1809 }
1810}
1811
1812// Info comprising stride and number of elements transferred every stride.
1813struct StrideInfo {
1814 int64_t stride;
1815 int64_t numEltPerStride;
1816};
1817
1818/// Returns striding information for a copy/transfer of this region with
1819/// potentially multiple striding levels from outermost to innermost. For an
1820/// n-dimensional region, there can be at most n-1 levels of striding
1821/// successively nested.
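/// For example (a sketch), copying a 64x32 region out of a memref<128x256xf32>
/// yields a single stride level: contiguous runs of 32 elements placed 256
/// elements apart in the original memref, i.e.,
/// {stride = 256, numEltPerStride = 32}.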
1822// TODO: make this work with non-identity layout maps.
1823static void getMultiLevelStrides(const MemRefRegion &region,
1824 ArrayRef<int64_t> bufferShape,
1825 SmallVectorImpl<StrideInfo> *strideInfos) {
1826 if (bufferShape.size() <= 1)
1827 return;
1828
1829 int64_t numEltPerStride = 1;
1830 int64_t stride = 1;
1831 for (int d = bufferShape.size() - 1; d >= 1; d--) {
1832 int64_t dimSize = cast<MemRefType>(Val: region.memref.getType()).getDimSize(idx: d);
1833 stride *= dimSize;
1834 numEltPerStride *= bufferShape[d];
1835 // A stride is needed only if the region has a shorter extent than the
1836 // memref along the dimension *and* has an extent greater than one along the
1837 // next major dimension.
1838 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1839 strideInfos->push_back(Elt: {.stride: stride, .numEltPerStride: numEltPerStride});
1840 }
1841 }
1842}
1843
1844/// Generates a point-wise copy from/to a non-zero ranked `memref' to/from
1845/// `fastMemRef' and returns the outermost AffineForOp of the copy loop nest.
1846/// `lbMaps` and `ubMaps` along with `lbOperands` and `ubOperands` hold the
1847/// lower and upper bound information for the copy loop nest. `fastBufOffsets`
1848/// contain the expressions to be subtracted out from the respective copy loop
1849 /// iterators in order to index the fast buffer. If `isCopyOut' is true, generates
1850/// a copy-out; otherwise a copy-in. Builder `b` should be set to the point the
1851/// copy nest is inserted.
1852//
1853/// The copy-in nest is generated as follows as an example for a 2-d region:
1854/// for x = ...
1855/// for y = ...
1856/// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1857///
1858static AffineForOp
1859generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1860 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1861 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1862 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1863 OpBuilder b) {
1864 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1865 return lbMap.getNumInputs() == lbOperands.size();
1866 }));
1867 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1868 return ubMap.getNumInputs() == ubOperands.size();
1869 }));
1870
1871 unsigned rank = cast<MemRefType>(Val: memref.getType()).getRank();
1872 // A copy nest can't be generated for 0-ranked memrefs.
1873 assert(rank != 0 && "non-zero rank memref expected");
1874 assert(lbMaps.size() == rank && "wrong number of lb maps");
1875 assert(ubMaps.size() == rank && "wrong number of ub maps");
1876
1877 SmallVector<Value, 4> memIndices;
1878 SmallVector<AffineExpr, 4> fastBufExprs;
1879 SmallVector<Value, 4> fastBufMapOperands;
1880 AffineForOp copyNestRoot;
1881 SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1882 for (unsigned d = 0; d < rank; ++d) {
1883 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMap: lbMaps[d],
1884 ubOperands, ubMap: ubMaps[d]);
1885 if (d == 0)
1886 copyNestRoot = forOp;
1887
1888 b = OpBuilder::atBlockTerminator(block: forOp.getBody());
1889
1890 auto fastBufOffsetMap =
1891 AffineMap::get(dimCount: lbOperands.size(), symbolCount: 0, result: fastBufOffsets[d]);
1892 auto offset = b.create<AffineApplyOp>(location: loc, args&: fastBufOffsetMap, args&: lbOperands);
1893
1894 // Construct the subscript for the fast memref being copied into/from:
1895 // x - offset_x.
1896 fastBufExprs.push_back(Elt: b.getAffineDimExpr(position: 2 * d + 1) -
1897 b.getAffineDimExpr(position: 2 * d));
1898 fastBufMapOperands.push_back(Elt: offset);
1899 fastBufMapOperands.push_back(Elt: forOp.getInductionVar());
1900 mayBeDeadApplys.push_back(Elt: offset);
1901
1902 // Subscript for the slow memref being copied.
1903 memIndices.push_back(Elt: forOp.getInductionVar());
1904 }
1905
1906 auto fastBufMap =
1907 AffineMap::get(dimCount: 2 * rank, /*symbolCount=*/0, results: fastBufExprs, context: b.getContext());
1908 fullyComposeAffineMapAndOperands(map: &fastBufMap, operands: &fastBufMapOperands);
1909 fastBufMap = simplifyAffineMap(map: fastBufMap);
1910 canonicalizeMapAndOperands(map: &fastBufMap, operands: &fastBufMapOperands);
1911
1912 // Drop any dead affine.applys.
1913 for (auto applyOp : mayBeDeadApplys)
1914 if (applyOp.use_empty())
1915 applyOp.erase();
1916
1917 if (!isCopyOut) {
1918 // Copy in.
1919 auto load = b.create<AffineLoadOp>(location: loc, args&: memref, args&: memIndices);
1920 b.create<AffineStoreOp>(location: loc, args&: load, args&: fastMemRef, args&: fastBufMap,
1921 args&: fastBufMapOperands);
1922 return copyNestRoot;
1923 }
1924
1925 // Copy out.
1926 auto load =
1927 b.create<AffineLoadOp>(location: loc, args&: fastMemRef, args&: fastBufMap, args&: fastBufMapOperands);
1928 b.create<AffineStoreOp>(location: loc, args&: load, args&: memref, args&: memIndices);
1929 return copyNestRoot;
1930}
1931
1932static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
1933emitRemarkForBlock(Block &block) {
1934 return block.getParentOp()->emitRemark();
1935}
1936
1937/// Creates a buffer in the faster memory space for the specified memref region
1938/// (memref has to be non-zero ranked); generates a copy from the lower memory
1939/// space to this one, and replaces all loads/stores in the block range
1940/// [`begin', `end') of `block' to load/store from that buffer. Returns failure
1941/// if copies could not be generated due to yet unimplemented cases.
1942/// `copyInPlacementStart` and `copyOutPlacementStart` in copyPlacementBlock
1943/// specify the insertion points where the incoming copies and outgoing copies,
1944/// respectively, should be inserted (the insertion happens right before the
1945/// insertion point). Since `begin` can itself be invalidated due to the memref
1946/// rewriting done from this method, the output argument `nBegin` is set to its
1947/// replacement (set to `begin` if no invalidation happens). Since outgoing
1948/// copies could have been inserted at `end`, the output argument `nEnd` is set
1949/// to the new end. `sizeInBytes` is set to the size of the fast buffer
1950/// allocated.
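/// For illustration, a rough sketch of the output for a read region covering
/// [0, 64) of a memref<256xf32> %A (the exact IR depends on the region, the
/// placement, and `copyOptions`; DMA generation emits affine.dma_start/wait
/// instead of the loop below):
///
/// ```mlir
/// %buf = memref.alloc() : memref<64xf32, 1>
/// affine.for %i = 0 to 64 {
///   %v = affine.load %A[%i] : memref<256xf32>
///   affine.store %v, %buf[%i] : memref<64xf32, 1>
/// }
/// // ... ops in [begin, end) now load from %buf instead of %A ...
/// memref.dealloc %buf : memref<64xf32, 1>
/// ```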
1951static LogicalResult generateCopy(
1952 const MemRefRegion &region, Block *block, Block::iterator begin,
1953 Block::iterator end, Block *copyPlacementBlock,
1954 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
1955 const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
1956 DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
1957 Block::iterator *nBegin, Block::iterator *nEnd) {
1958 *nBegin = begin;
1959 *nEnd = end;
1960
1961 auto f = begin->getParentOfType<FunctionOpInterface>();
1962 OpBuilder topBuilder(f.getFunctionBody());
1963 Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(location: f.getLoc(), args: 0);
1964
1965 *sizeInBytes = 0;
1966
1967 if (begin == end)
1968 return success();
1969
1970 // Record the last op in the block for which we are performing copy
1971 // generation. We later do the memref replacement only in [begin, lastCopyOp]
1972 // so that the original memrefs used in the data movement code itself
1973 // don't get replaced.
1974 Operation *lastCopyOp = end->getPrevNode();
1975
1976 // Whether the copy-out point is at the end of the block where we are doing
1977 // explicit copying.
1978 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
1979
1980 // Copies for read regions are going to be inserted at 'begin'.
1981 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
1982 // Copies for write regions are going to be inserted at 'end'.
1983 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
1984 OpBuilder &b = region.isWrite() ? epilogue : prologue;
1985
1986 // Builder to create constants at the top level.
1987 auto func =
1988 copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1989 OpBuilder top(func.getFunctionBody());
1990
1991 auto loc = region.loc;
1992 auto memref = region.memref;
1993 auto memRefType = cast<MemRefType>(Val: memref.getType());
1994
1995 if (!memRefType.getLayout().isIdentity()) {
1996 LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1997 return failure();
1998 }
1999
2000 // Indices to use for the copying.
2001 // Indices for the original memref being copied from/to.
2002 SmallVector<Value, 4> memIndices;
2003 // Indices for the faster buffer being copied into/from.
2004 SmallVector<Value, 4> bufIndices;
2005
2006 unsigned rank = memRefType.getRank();
2007 if (rank == 0) {
2008 LLVM_DEBUG(llvm::dbgs() << "Only non-zero ranked memrefs are supported\n");
2009 return failure();
2010 }
2011
2012 SmallVector<int64_t, 4> fastBufferShape;
2013
2014 // Compute the extents of the buffer.
2015 SmallVector<AffineMap, 2> lbs;
2016 lbs.reserve(N: rank);
2017 std::optional<int64_t> numElements =
2018 region.getConstantBoundingSizeAndShape(shape: &fastBufferShape, lbs: &lbs);
2019 if (!numElements) {
2020 LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2021 return failure();
2022 }
2023
2024 if (llvm::any_of(Range&: lbs, P: [](AffineMap lb) { return lb.getNumResults() > 1; })) {
2025 // This can be supported in the future if needed.
2026 LLVM_DEBUG(llvm::dbgs()
2027 << "Max lower bound for memref region start not supported\n");
2028 return failure();
2029 }
2030
2031 if (*numElements == 0) {
2032 LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2033 return success();
2034 }
2035
2036 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2037 for (unsigned i = 0; i < rank; ++i) {
2038 region.getLowerAndUpperBound(pos: i, lbMap&: lbMaps[i], ubMap&: ubMaps[i]);
2039 if (lbMaps[i].getNumResults() == 0 || ubMaps[i].getNumResults() == 0) {
2040 LLVM_DEBUG(llvm::dbgs()
2041 << "Missing lower or upper bound for region along dimension: "
2042 << i << '\n');
2043 return failure();
2044 }
2045 }
2046
2047 const FlatAffineValueConstraints *cst = region.getConstraints();
2048 // 'regionSymbols' holds values that this memory region is symbolic/parametric
2049 // on; these typically include loop IVs surrounding the level at which the
2050 // copy generation is being done or other valid symbols in MLIR.
2051 SmallVector<Value, 8> regionSymbols;
2052 cst->getValues(start: rank, end: cst->getNumVars(), values: &regionSymbols);
2053
2054 // Construct the access expression for the fast memory buffer. The access
2055 // expression for a particular dimension of the fast buffer is obtained by
2056 // subtracting out the lower bound on the original memref's data region
2057 // along the corresponding dimension.
2058
2059 // Index start offsets for faster memory buffer relative to the original.
2060 SmallVector<AffineExpr, 4> fastBufOffsets;
2061 fastBufOffsets.reserve(N: rank);
2062 for (unsigned d = 0; d < rank; d++) {
2063 assert(lbs[d].getNumSymbols() == cst->getNumCols() - rank - 1 &&
2064 "incorrect bound size");
2065
2066 // Set copy start location for this dimension in the lower memory space
2067 // memref.
2068 if (lbs[d].isSingleConstant()) {
2069 auto indexVal = lbs[d].getSingleConstantResult();
2070 if (indexVal == 0) {
2071 memIndices.push_back(Elt: zeroIndex);
2072 } else {
2073 memIndices.push_back(
2074 Elt: top.create<arith::ConstantIndexOp>(location: loc, args&: indexVal).getResult());
2075 }
2076 } else {
2077 // The coordinate for the start location is just the lower bound along the
2078 // corresponding dimension on the memory region (stored in 'offset').
2079 // Remap all inputs of the map to dimensions uniformly since in the
2080 // generated IR we need valid affine symbols as opposed to "symbols" for
2081 // the purpose of the memref region.
2082 SmallVector<AffineExpr> symReplacements(lbs[d].getNumSymbols());
2083 for (unsigned i = 0, e = lbs[d].getNumSymbols(); i < e; ++i)
2084 symReplacements[i] = top.getAffineDimExpr(position: i);
2085 lbs[d] = lbs[d].replaceDimsAndSymbols(
2086 /*dimReplacements=*/{}, symReplacements, numResultDims: lbs[d].getNumSymbols(),
2087 /*numResultSyms=*/0);
2088 memIndices.push_back(Elt: b.create<AffineApplyOp>(location: loc, args&: lbs[d], args&: regionSymbols));
2089 }
2090 // The fast buffer is copied into at location zero; addressing is relative.
2091 bufIndices.push_back(Elt: zeroIndex);
2092
2093 // Record the offsets since they are needed to remap the memory accesses of
2094 // the original memref further below.
2095 fastBufOffsets.push_back(Elt: lbs[d].getResult(idx: 0));
2096 }
2097
2098 // The faster memory space buffer.
2099 Value fastMemRef;
2100
2101 // Check if a buffer was already created.
2102 bool existingBuf = fastBufferMap.count(Val: memref) > 0;
2103 if (!existingBuf) {
2104 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2105 auto fastMemRefType =
2106 MemRefType::get(shape: fastBufferShape, elementType: memRefType.getElementType(),
2107 map: fastBufferLayout, memorySpaceInd: copyOptions.fastMemorySpace);
2108
2109 // Create the fast memory space buffer just before the 'affine.for'
2110 // operation.
2111 fastMemRef =
2112 prologue.create<memref::AllocOp>(location: loc, args&: fastMemRefType).getResult();
2113 // Record it.
2114 fastBufferMap[memref] = fastMemRef;
2115 // fastMemRefType is a constant shaped memref.
2116 auto maySizeInBytes = getIntOrFloatMemRefSizeInBytes(memRefType: fastMemRefType);
2117 // We don't account for things of unknown size.
2118 *sizeInBytes = maySizeInBytes.value_or(u: 0);
2119
2120 LLVM_DEBUG(emitRemarkForBlock(*block)
2121 << "Creating fast buffer of type " << fastMemRefType
2122 << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2123 << " KiB\n");
2124 } else {
2125 // Reuse the one already created.
2126 fastMemRef = fastBufferMap[memref];
2127 }
2128
2129 auto numElementsSSA = top.create<arith::ConstantIndexOp>(location: loc, args&: *numElements);
2130
2131 Value dmaStride;
2132 Value numEltPerDmaStride;
2133 if (copyOptions.generateDma) {
2134 SmallVector<StrideInfo, 4> dmaStrideInfos;
2135 getMultiLevelStrides(region, bufferShape: fastBufferShape, strideInfos: &dmaStrideInfos);
2136
2137 // TODO: use all stride levels once DmaStartOp is extended for
2138 // multi-level strides.
2139 if (dmaStrideInfos.size() > 1) {
2140 LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2141 return failure();
2142 }
2143
2144 if (!dmaStrideInfos.empty()) {
2145 dmaStride =
2146 top.create<arith::ConstantIndexOp>(location: loc, args&: dmaStrideInfos[0].stride);
2147 numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2148 location: loc, args&: dmaStrideInfos[0].numEltPerStride);
2149 }
2150 }
2151
2152 // Create fully composed affine maps for each memref.
2153 auto memAffineMap = b.getMultiDimIdentityMap(rank: memIndices.size());
2154 fullyComposeAffineMapAndOperands(map: &memAffineMap, operands: &memIndices);
2155 auto bufAffineMap = b.getMultiDimIdentityMap(rank: bufIndices.size());
2156 fullyComposeAffineMapAndOperands(map: &bufAffineMap, operands: &bufIndices);
2157
2158 if (!copyOptions.generateDma) {
2159 // Point-wise copy generation.
2160 auto copyNest =
2161 generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2162 /*lbOperands=*/regionSymbols, ubMaps,
2163 /*ubOperands=*/regionSymbols, fastBufOffsets,
2164 /*isCopyOut=*/region.isWrite(), b);
2165
2166 // Record this so that we can skip it from yet another copy.
2167 copyNests.insert(V: copyNest);
2168
2169 // Since new ops are being appended (for copy out's), adjust the end to
2170 // mark end of block range being processed if necessary.
2171 if (region.isWrite() && isCopyOutAtEndOfBlock)
2172 *nEnd = Block::iterator(copyNest.getOperation());
2173 } else {
2174 // DMA generation.
2175 // Create a tag (single element 1-d memref) for the DMA.
2176 auto tagMemRefType = MemRefType::get(shape: {1}, elementType: top.getIntegerType(width: 32), map: {},
2177 memorySpaceInd: copyOptions.tagMemorySpace);
2178 auto tagMemRef = prologue.create<memref::AllocOp>(location: loc, args&: tagMemRefType);
2179
2180 SmallVector<Value, 4> tagIndices({zeroIndex});
2181 auto tagAffineMap = b.getMultiDimIdentityMap(rank: tagIndices.size());
2182 fullyComposeAffineMapAndOperands(map: &tagAffineMap, operands: &tagIndices);
2183 if (!region.isWrite()) {
2184 // DMA non-blocking read from original buffer to fast buffer.
2185 b.create<AffineDmaStartOp>(location: loc, args&: memref, args&: memAffineMap, args&: memIndices,
2186 args&: fastMemRef, args&: bufAffineMap, args&: bufIndices,
2187 args&: tagMemRef, args&: tagAffineMap, args&: tagIndices,
2188 args&: numElementsSSA, args&: dmaStride, args&: numEltPerDmaStride);
2189 } else {
2190 // DMA non-blocking write from fast buffer to the original memref.
2191 auto op = b.create<AffineDmaStartOp>(
2192 location: loc, args&: fastMemRef, args&: bufAffineMap, args&: bufIndices, args&: memref, args&: memAffineMap,
2193 args&: memIndices, args&: tagMemRef, args&: tagAffineMap, args&: tagIndices, args&: numElementsSSA,
2194 args&: dmaStride, args&: numEltPerDmaStride);
2195 // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2196 // end to mark end of block range being processed.
2197 if (isCopyOutAtEndOfBlock)
2198 *nEnd = Block::iterator(op.getOperation());
2199 }
2200
2201 // Matching DMA wait to block on completion; tag always has a 0 index.
2202 b.create<AffineDmaWaitOp>(location: loc, args&: tagMemRef, args&: tagAffineMap, args&: zeroIndex,
2203 args&: numElementsSSA);
2204
2205 // Generate dealloc for the tag.
2206 auto tagDeallocOp = epilogue.create<memref::DeallocOp>(location: loc, args&: tagMemRef);
2207 if (*nEnd == end && isCopyOutAtEndOfBlock)
2208 // Since new ops are being appended (for outgoing DMAs), adjust the end to
2209 // mark end of range of the original.
2210 *nEnd = Block::iterator(tagDeallocOp.getOperation());
2211 }
2212
2213 // Generate dealloc for the buffer.
2214 if (!existingBuf) {
2215 auto bufDeallocOp = epilogue.create<memref::DeallocOp>(location: loc, args&: fastMemRef);
2216 // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2217 // the fast buffer (since it marks the new end insertion point).
2218 if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2219 *nEnd = Block::iterator(bufDeallocOp.getOperation());
2220 }
2221
2222 // Replace all uses of the old memref with the faster one while remapping
2223 // access indices (subtracting out lower bound offsets for each dimension).
2224 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2225 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2226 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2227 // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2228 // d2, d3 correspond to the original indices (%i, %j).
2229 SmallVector<AffineExpr, 4> remapExprs;
2230 remapExprs.reserve(N: rank);
2231 for (unsigned i = 0; i < rank; i++) {
2232 // The starting operands of indexRemap will be regionSymbols (the symbols on
2233 // which the memref region is parametric); then those corresponding to
2234 // the memref's original indices follow.
2235 auto dimExpr = b.getAffineDimExpr(position: regionSymbols.size() + i);
2236 remapExprs.push_back(Elt: dimExpr - fastBufOffsets[i]);
2237 }
2238 auto indexRemap = AffineMap::get(dimCount: regionSymbols.size() + rank, symbolCount: 0, results: remapExprs,
2239 context: b.getContext());
2240
2241 // Record the begin since it may be invalidated by memref replacement.
2242 Block::iterator prevOfBegin;
2243 bool isBeginAtStartOfBlock = (begin == block->begin());
2244 if (!isBeginAtStartOfBlock)
2245 prevOfBegin = std::prev(x: begin);
2246
2247 auto userFilterFn = [&](Operation *user) {
2248 auto *ancestorUser = block->findAncestorOpInBlock(op&: *user);
2249 return ancestorUser && !ancestorUser->isBeforeInBlock(other: &*begin) &&
2250 !lastCopyOp->isBeforeInBlock(other: ancestorUser);
2251 };
2252
2253 // *Only* those uses within the range [begin, end) of 'block' are replaced.
2254 (void)replaceAllMemRefUsesWith(oldMemRef: memref, newMemRef: fastMemRef,
2255 /*extraIndices=*/{}, indexRemap,
2256 /*extraOperands=*/regionSymbols,
2257 /*symbolOperands=*/{}, userFilterFn);
2258
2259 *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(x: prevOfBegin);
2260
2261 return success();
2262}
2263
2264/// Construct the memref region to just include the entire memref. Returns false
2265 /// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2266/// enclosing loop IVs of `op` (starting from the outermost) that the region
2267/// is parametric on.
2268static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2269 MemRefRegion *region) {
2270 unsigned rank;
2271 if (auto loadOp = dyn_cast<AffineLoadOp>(Val: op)) {
2272 rank = loadOp.getMemRefType().getRank();
2273 region->memref = loadOp.getMemRef();
2274 region->setWrite(false);
2275 } else if (auto storeOp = dyn_cast<AffineStoreOp>(Val: op)) {
2276 rank = storeOp.getMemRefType().getRank();
2277 region->memref = storeOp.getMemRef();
2278 region->setWrite(true);
2279 } else {
2280 assert(false && "expected load or store op");
2281 return false;
2282 }
2283 auto memRefType = cast<MemRefType>(Val: region->memref.getType());
2284 if (!memRefType.hasStaticShape())
2285 return false;
2286
2287 auto *regionCst = region->getConstraints();
2288
2289 // Just get the first numSymbols IVs, which the memref region is parametric
2290 // on.
2291 SmallVector<AffineForOp, 4> ivs;
2292 getAffineForIVs(op&: *op, loops: &ivs);
2293 ivs.resize(N: numParamLoopIVs);
2294 SmallVector<Value, 4> symbols;
2295 extractForInductionVars(forInsts: ivs, ivs: &symbols);
2296 *regionCst = FlatAffineValueConstraints(rank, numParamLoopIVs, 0);
2297 regionCst->setValues(start: rank, end: rank + numParamLoopIVs, values: symbols);
2298
2299 // Memref dim sizes provide the bounds.
2300 for (unsigned d = 0; d < rank; d++) {
2301 auto dimSize = memRefType.getDimSize(idx: d);
2302 assert(dimSize > 0 && "filtered dynamic shapes above");
2303 regionCst->addBound(type: BoundType::LB, pos: d, value: 0);
2304 regionCst->addBound(type: BoundType::UB, pos: d, value: dimSize - 1);
2305 }
2306 return true;
2307}
2308
2309LogicalResult
2310mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
2311 const AffineCopyOptions &copyOptions,
2312 std::optional<Value> filterMemRef,
2313 DenseSet<Operation *> &copyNests) {
2314 if (begin == end)
2315 return success();
2316
2317 assert(begin->getBlock() == std::prev(end)->getBlock() &&
2318 "Inconsistent block begin/end args");
2319 assert(end != end->getBlock()->end() && "end can't be the block terminator");
2320
2321 Block *block = begin->getBlock();
2322
2323 // Copies will be generated for this depth, i.e., symbolic in all loops
2324 // surrounding this block range.
2325 unsigned copyDepth = getNestingDepth(op: &*begin);
2326
2327 LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2328 << "\n");
2329 LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2330 LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2331
2332 // List of memory regions to copy for. We need a map vector to have a
2333 // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2334 // since the allocs, for example, are identical except for the SSA id.
2335 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2336 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2337
2338 // Map from original memrefs to the fast buffers that their accesses are
2339 // replaced with.
2340 DenseMap<Value, Value> fastBufferMap;
2341
2342 // To check for errors when walking the block.
2343 bool error = false;
2344
2345 // Walk this range of operations to gather all memory regions.
2346 block->walk(begin, end, callback: [&](Operation *opInst) {
2347 Value memref;
2348 MemRefType memrefType;
2349 // Gather regions to allocate to buffers in faster memory space.
2350 if (auto loadOp = dyn_cast<AffineLoadOp>(Val: opInst)) {
2351 memref = loadOp.getMemRef();
2352 memrefType = loadOp.getMemRefType();
2353 } else if (auto storeOp = dyn_cast<AffineStoreOp>(Val: opInst)) {
2354 memref = storeOp.getMemRef();
2355 memrefType = storeOp.getMemRefType();
2356 }
2357 // Not an affine.load/store op.
2358 if (!memref)
2359 return;
2360
2361 if ((filterMemRef.has_value() && filterMemRef != memref) ||
2362 (isa_and_nonnull<IntegerAttr>(Val: memrefType.getMemorySpace()) &&
2363 memrefType.getMemorySpaceAsInt() != copyOptions.slowMemorySpace))
2364 return;
2365
2366 if (!memref.getParentRegion()->isAncestor(other: block->getParent())) {
2367 LLVM_DEBUG(llvm::dbgs() << "memref definition is inside the depth at "
2368 "which copy-in/copy-out would happen\n");
2369 return;
2370 }
2371
2372 // Compute the MemRefRegion accessed.
2373 auto region = std::make_unique<MemRefRegion>(args: opInst->getLoc());
2374 if (failed(Result: region->compute(op: opInst, loopDepth: copyDepth, /*sliceState=*/nullptr,
2375 /*addMemRefDimBounds=*/false))) {
2376 LLVM_DEBUG(llvm::dbgs()
2377 << "Error obtaining memory region: semi-affine maps?\n");
2378 LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2379 if (!getFullMemRefAsRegion(op: opInst, numParamLoopIVs: copyDepth, region: region.get())) {
2380 LLVM_DEBUG(
2381 opInst->emitError("non-constant memref sizes not yet supported"));
2382 error = true;
2383 return;
2384 }
2385 }
2386
2387 // Each memref has a single buffer associated with it irrespective of how
2388 // many loads and stores happen on it.
2389 // TODO: in the future, when regions don't intersect and satisfy
2390 // other properties (based on load/store regions), we could consider
2391 // multiple buffers per memref.
2392
2393 // Add to the appropriate region if it's not already in it, or take a
2394 // bounding box union with the existing one if it's already in there.
2395 // Note that a memref may have both read and write regions - so update the
2396 // region in the other list if one exists (write in case of read and vice
2397 // versa) since there is a single bounding box for a memref across all reads
2398 // and writes that happen on it.
2399
2400 // Attempts to update; returns true if 'region' exists in targetRegions.
2401 auto updateRegion =
2402 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2403 &targetRegions) {
2404 const auto *const it = targetRegions.find(Key: region->memref);
2405 if (it == targetRegions.end())
2406 return false;
2407
2408 // Perform a union with the existing region.
2409 if (failed(Result: it->second->unionBoundingBox(other: *region))) {
2410 LLVM_DEBUG(llvm::dbgs()
2411 << "Memory region bounding box failed; "
2412 "over-approximating to the entire memref\n");
2413 // If the union fails, we will overapproximate.
2414 if (!getFullMemRefAsRegion(op: opInst, numParamLoopIVs: copyDepth, region: region.get())) {
2415 LLVM_DEBUG(opInst->emitError(
2416 "non-constant memref sizes not yet supported"));
2417 error = true;
2418 return true;
2419 }
2420 it->second->getConstraints()->clearAndCopyFrom(
2421 other: *region->getConstraints());
2422 } else {
2423 // Union was computed and stored in 'it->second': copy to 'region'.
2424 region->getConstraints()->clearAndCopyFrom(
2425 other: *it->second->getConstraints());
2426 }
2427 return true;
2428 };
2429
2430 bool existsInRead = updateRegion(readRegions);
2431 if (error)
2432 return;
2433 bool existsInWrite = updateRegion(writeRegions);
2434 if (error)
2435 return;
2436
2437 // Finally add it to the region list.
2438 if (region->isWrite() && !existsInWrite) {
2439 writeRegions[region->memref] = std::move(region);
2440 } else if (!region->isWrite() && !existsInRead) {
2441 readRegions[region->memref] = std::move(region);
2442 }
2443 });
2444
2445 if (error) {
2446 LLVM_DEBUG(begin->emitError(
2447 "copy generation failed for one or more memref's in this block\n"));
2448 return failure();
2449 }
2450
2451 uint64_t totalCopyBuffersSizeInBytes = 0;
2452 bool ret = true;
2453 auto processRegions =
2454 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2455 &regions) {
2456 for (const auto &regionEntry : regions) {
2457 // For each region, hoist copy in/out past all hoistable
2458 // 'affine.for's.
2459 Block::iterator copyInPlacementStart, copyOutPlacementStart;
2460 Block *copyPlacementBlock;
2461 findHighestBlockForPlacement(
2462 region: *regionEntry.second, block&: *block, begin, end, copyPlacementBlock: &copyPlacementBlock,
2463 copyInPlacementStart: &copyInPlacementStart, copyOutPlacementStart: &copyOutPlacementStart);
2464
2465 uint64_t sizeInBytes;
2466 Block::iterator nBegin, nEnd;
2467 LogicalResult iRet = generateCopy(
2468 region: *regionEntry.second, block, begin, end, copyPlacementBlock,
2469 copyInPlacementStart, copyOutPlacementStart, copyOptions,
2470 fastBufferMap, copyNests, sizeInBytes: &sizeInBytes, nBegin: &nBegin, nEnd: &nEnd);
2471 if (succeeded(Result: iRet)) {
2472 // begin/end could have been invalidated, and need update.
2473 begin = nBegin;
2474 end = nEnd;
2475 totalCopyBuffersSizeInBytes += sizeInBytes;
2476 }
2477 ret = ret & succeeded(Result: iRet);
2478 }
2479 };
2480 processRegions(readRegions);
2481 processRegions(writeRegions);
2482
2483 if (!ret) {
2484 LLVM_DEBUG(begin->emitError(
2485 "copy generation failed for one or more memref's in this block\n"));
2486 return failure();
2487 }
2488
2489 // For a range of operations, a note will be emitted at the caller.
2490 AffineForOp forOp;
2491 if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(Val: &*begin))) {
2492 LLVM_DEBUG(forOp.emitRemark()
2493 << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2494 << " KiB of copy buffers in fast memory space for this block");
2495 }
2496
2497 if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2498 block->getParentOp()->emitWarning(
2499 message: "total size of all copy buffers' for this block exceeds fast memory "
2500 "capacity");
2501 }
2502
2503 return success();
2504}
2505
2506// A convenience version of affineDataCopyGenerate for all ops in the body of
2507// an AffineForOp.
2508LogicalResult mlir::affine::affineDataCopyGenerate(
2509 AffineForOp forOp, const AffineCopyOptions &copyOptions,
2510 std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) {
2511 return affineDataCopyGenerate(begin: forOp.getBody()->begin(),
2512 end: std::prev(x: forOp.getBody()->end()), copyOptions,
2513 filterMemRef, copyNests);
2514}
2515
2516LogicalResult mlir::affine::generateCopyForMemRegion(
2517 const MemRefRegion &memrefRegion, Operation *analyzedOp,
2518 const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
2519 Block *block = analyzedOp->getBlock();
2520 auto begin = analyzedOp->getIterator();
2521 auto end = std::next(x: begin);
2522 DenseMap<Value, Value> fastBufferMap;
2523 DenseSet<Operation *> copyNests;
2524
2525 auto err = generateCopy(region: memrefRegion, block, begin, end, copyPlacementBlock: block, copyInPlacementStart: begin, copyOutPlacementStart: end,
2526 copyOptions, fastBufferMap, copyNests,
2527 sizeInBytes: &result.sizeInBytes, nBegin: &begin, nEnd: &end);
2528 if (failed(Result: err))
2529 return err;
2530
2531 const auto &en = fastBufferMap.find(Val: memrefRegion.memref);
2532 // In some cases (empty loops), no copy generation would have happened.
2533 if (en == fastBufferMap.end())
2534 return failure();
2535 result.alloc = en->second.getDefiningOp();
2536 assert(result.alloc && "fast buffer expected to be locally allocated");
2537 assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2538 result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2539 return success();
2540}
2541
2542/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2543static void
2544gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2545 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2546 // Add a new empty level to the output if it doesn't already exist.
2547 assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2548 if (currLoopDepth == depthToLoops.size())
2549 depthToLoops.emplace_back();
2550
2551 for (auto &op : *block) {
2552 if (auto forOp = dyn_cast<AffineForOp>(Val&: op)) {
2553 depthToLoops[currLoopDepth].push_back(Elt: forOp);
2554 gatherLoopsInBlock(block: forOp.getBody(), currLoopDepth: currLoopDepth + 1, depthToLoops);
2555 }
2556 }
2557}
2558
/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
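///
/// For example (illustrative), for a function body containing
///   affine.for %i ... { affine.for %j ... { ... } }
///   affine.for %k ... { ... }
/// depthToLoops[0] holds the loops on %i and %k, and depthToLoops[1] holds the
/// loop on %j.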
void mlir::affine::gatherLoops(
    func::FuncOp func, std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
  for (auto &block : func)
    gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);

  // Remove last loop level from output since it's empty.
  if (!depthToLoops.empty()) {
    assert(depthToLoops.back().empty() && "Last loop level is not empty?");
    depthToLoops.pop_back();
  }
}

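// Illustrative effect (a sketch): if the composed upper bound map ends up as
// affine_map<(d0, d1) -> (d0 + d1, d0 + d1)>, removeDuplicateExprs reduces it
// to affine_map<(d0, d1) -> (d0 + d1)> before the affine.for op is created.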
AffineForOp mlir::affine::createCanonicalizedAffineForOp(
    OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
    ValueRange ubOperands, AffineMap ubMap, int64_t step) {
  SmallVector<Value, 4> lowerOperands(lbOperands);
  SmallVector<Value, 4> upperOperands(ubOperands);

  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
  lbMap = removeDuplicateExprs(lbMap);
  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
  canonicalizeMapAndOperands(&ubMap, &upperOperands);
  ubMap = removeDuplicateExprs(ubMap);

  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
                               step);
}

/// Creates an AffineIfOp that encodes the conditional to choose between
/// the constant trip count version and an unknown trip count version of this
/// nest of loops. This is used to separate partial and full tiles when `loops`
/// comprises the intra-tile loops. The affine.if op is inserted at the
/// insertion point of `b`.
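///
/// For instance (illustrative), for a point loop
///   affine.for %i = %ii to min affine_map<(d0)[s0] -> (d0 + 32, s0)>(%ii)[%N]
/// the guard may take a form like
///   affine.if affine_set<(d0)[s0] : (s0 - d0 - 32 >= 0)>(%ii)[%N]
/// i.e., the full tile's upper bound does not exceed any other upper bound.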
static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
                                            OpBuilder b) {
  if (loops.empty())
    return nullptr;

  auto *context = loops[0].getContext();

  FlatAffineValueConstraints cst;
  SmallVector<Operation *, 8> ops;
  llvm::append_range(ops, loops);
  (void)getIndexSet(ops, &cst);

  // Remove constraints that are independent of these loop IVs.
  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());

  // Construct the constraint set representing the guard for full tiles. The
  // lower bound (and upper bound) corresponding to the full tile should be
  // larger (and resp. smaller) than any other lower (or upper bound).
  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
  for (auto loop : loops) {
    (void)loop;
    // TODO: generalizing to a non-unit stride is straightforward.
    assert(loop.getStepAsInt() == 1 && "point loop step expected to be one");
    // Mark everything as symbols for the purpose of finding a constant diff
    // pair.
    cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
                               1);
    unsigned fullTileLbPos, fullTileUbPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &fullTileLbPos,
                                        &fullTileUbPos)) {
      LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
      return nullptr;
    }

    SmallVector<unsigned, 4> lbIndices, ubIndices;
    cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);

    auto fLb = cst.getInequality(fullTileLbPos);
    auto fUb = cst.getInequality(fullTileUbPos);
    fullTileLb.assign(fLb.begin(), fLb.end());
    fullTileUb.assign(fUb.begin(), fUb.end());

    // The full tile lower bound should be >= any other lower bound.
    for (auto lbIndex : lbIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

    // The full tile upper bound should be <= any other upper bound.
    for (auto ubIndex : ubIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(ubIndex, i) -= fullTileUb[i];

    cst.removeVar(0);
  }

  // The previous step leads to all zeros for the full tile lb and ub position
  // itself; remove those and any other duplicates / trivial redundancies.
  cst.removeTrivialRedundancy();

  // Turn everything into dims conservatively since we earlier turned all
  // trailing ids past the point loop IV into symbols. Some of these could be
  // outer loop IVs; we'll canonicalize anyway.
  cst.setDimSymbolSeparation(0);

  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
  // ifCondSet can be null if cst was empty -- this can happen if all loops
  // in the nest have constant trip counts.
  if (!ifCondSet)
    return nullptr;

  SmallVector<Value, 4> setOperands;
  cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
                              /*withElseRegion=*/true);
}

/// Create the full tile loop nest (along with its body).
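///
/// For example (illustrative), for a point loop with bounds
/// (%ii, min(%ii + 32, %N)), the constant-difference <lb, ub> pair picked
/// below is (%ii, %ii + 32), so the corresponding full tile loop runs from
/// %ii to %ii + 32.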
static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
  fullTileLoops.reserve(inputNest.size());

  // For each loop in the original nest, identify a lower/upper bound pair such
  // that their difference is a constant.
  FlatAffineValueConstraints cst;
  for (auto loop : inputNest) {
    // TODO: straightforward to generalize to a non-unit stride.
    if (loop.getStepAsInt() != 1) {
      LLVM_DEBUG(llvm::dbgs()
                 << "[tile separation] non-unit stride not implemented\n");
      return failure();
    }
    SmallVector<Operation *, 1> loopOp{loop.getOperation()};
    (void)getIndexSet(loopOp, &cst);
    // Mark everything other than this loop IV as symbols to get a pair of
    // <lb, ub> with a constant difference.
    cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
    unsigned lbPos, ubPos;
    if (!((IntegerRelation)cst)
             .getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
                                        /*boundFloorDivisor=*/nullptr,
                                        /*ub=*/nullptr, &lbPos, &ubPos) ||
        lbPos == ubPos) {
      LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
                                 "equalities not yet handled\n");
      return failure();
    }

    // Set all variables as dimensions uniformly since some of those marked as
    // symbols above could be outer loop IVs (corresponding tile space IVs).
    cst.setDimSymbolSeparation(/*newSymbolCount=*/0);

    AffineValueMap lbVmap, ubVmap;
    cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
    cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
    AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
        b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
        ubVmap.getOperands(), ubVmap.getAffineMap());
    b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
    fullTileLoops.push_back(fullTileLoop);
  }

  // Add the body for the full tile loop nest.
  IRMapping operandMap;
  for (const auto &loopEn : llvm::enumerate(inputNest))
    operandMap.map(loopEn.value().getInductionVar(),
                   fullTileLoops[loopEn.index()].getInductionVar());
  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
  for (auto &op : inputNest.back().getBody()->without_terminator())
    b.clone(op, operandMap);
  return success();
}

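// Illustrative result (a sketch): for an intra-tile loop
//   affine.for %i = %ii to min affine_map<(d0)[s0] -> (d0 + 32, s0)>(%ii)[%N]
// the nest is rewritten roughly as
//   affine.if <full-tile guard>(%ii)[%N] {
//     affine.for %i = %ii to %ii + 32 { ... }           // full tile
//   } else {
//     affine.for %i = %ii to min(...)(%ii)[%N] { ... }  // partial tile
//   }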
LogicalResult
mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
                                SmallVectorImpl<AffineForOp> *fullTileNest) {
  if (inputNest.empty())
    return success();

  auto firstLoop = inputNest[0];

  // Each successive for op has to be nested in the previous one.
  auto prevLoop = firstLoop;
  for (auto loop : inputNest.drop_front(1)) {
    assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
    prevLoop = loop;
  }

  // Create the full tile loop nest.
  SmallVector<AffineForOp, 4> fullTileLoops;
  OpBuilder b(firstLoop);
  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
    if (!fullTileLoops.empty())
      fullTileLoops.front().erase();
    return failure();
  }

  // Create and insert the version select right before the root of the nest.
  b = OpBuilder(firstLoop);
  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
  if (!ifOp) {
    fullTileLoops.front().erase();
    LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
                               "separation condition\n");
    return failure();
  }

  // Move the full tile into the then block.
  Block *thenBlock = ifOp.getThenBlock();
  AffineForOp outermostFullTileLoop = fullTileLoops[0];
  thenBlock->getOperations().splice(
      std::prev(thenBlock->end()),
      outermostFullTileLoop->getBlock()->getOperations(),
      Block::iterator(outermostFullTileLoop));

  // Move the partial tile into the else block. The partial tile is the same as
  // the original loop nest.
  Block *elseBlock = ifOp.getElseBlock();
  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
                                    firstLoop->getBlock()->getOperations(),
                                    Block::iterator(firstLoop));

  if (fullTileNest)
    *fullTileNest = std::move(fullTileLoops);

  return success();
}

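// Coalesces maximal bands within the perfect nest rooted at `op` whose bounds
// are defined above the band, via coalesceLoops. For instance (illustrative),
// two perfectly nested loops with trip counts N and M become a single loop
// with trip count N * M whose induction variable is delinearized back into the
// two original induction variables.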
LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
  LogicalResult result(failure());
  SmallVector<AffineForOp> loops;
  getPerfectlyNestedLoops(loops, op);
  if (loops.size() <= 1)
    return success();

  // Look for a band of loops that can be coalesced, i.e. perfectly nested
  // loops with bounds defined above some loop.
  // 1. For each loop, find above which parent loop its operands are
  // defined.
  SmallVector<unsigned> operandsDefinedAbove(loops.size());
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    operandsDefinedAbove[i] = i;
    for (unsigned j = 0; j < i; ++j) {
      if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
        operandsDefinedAbove[i] = j;
        break;
      }
    }
  }

  // 2. Identify bands of loops such that the operands of all of them are
  // defined above the first loop in the band. Traverse the nest bottom-up
  // so that modifications don't invalidate the inner loops.
  for (unsigned end = loops.size(); end > 0; --end) {
    unsigned start = 0;
    for (; start < end - 1; ++start) {
      auto maxPos =
          *std::max_element(std::next(operandsDefinedAbove.begin(), start),
                            std::next(operandsDefinedAbove.begin(), end));
      if (maxPos > start)
        continue;
      assert(maxPos == start &&
             "expected loop bounds to be known at the start of the band");
      auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
      if (succeeded(coalesceLoops(band)))
        result = success();
      break;
    }
    // If a band was found and transformed, keep looking at the loops above
    // the outermost transformed loop.
    if (start != end - 1)
      end = start + 1;
  }
  return result;
}

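// Counts, walking outward from the owner of `operand`, the enclosing
// LoopLikeOpInterface ops for which `operand` is loop-invariant. For example
// (illustrative), a value defined in the function entry block and used inside
// two nested affine.for ops yields a count of 2.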
int64_t mlir::affine::numEnclosingInvariantLoops(OpOperand &operand) {
  int64_t count = 0;
  Operation *currentOp = operand.getOwner();
  while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {
    if (!loopOp.isDefinedOutsideOfLoop(operand.get()))
      break;
    currentOp = loopOp;
    count++;
  }
  return count;
}

