HoistPadding.cpp source code [mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp]

1	//===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements functions concerned with hoisting padding operations.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "mlir/Analysis/Presburger/IntegerRelation.h"
14	#include "mlir/Analysis/SliceAnalysis.h"
15	#include "mlir/Dialect/Affine/IR/AffineOps.h"
16	#include "mlir/Dialect/Affine/Transforms/Transforms.h"
17	#include "mlir/Dialect/Func/IR/FuncOps.h"
18	#include "mlir/Dialect/Linalg/IR/Linalg.h"
19	#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
20	#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
21	#include "mlir/Dialect/SCF/IR/SCF.h"
22	#include "mlir/Dialect/Tensor/Utils/Utils.h"
23	#include "mlir/Dialect/Utils/IndexingUtils.h"
24	#include "mlir/IR/AsmState.h"
25	#include "mlir/IR/Dominance.h"
26	#include "mlir/IR/Matchers.h"
27	#include "mlir/Interfaces/DestinationStyleOpInterface.h"
28	#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
29	#include "mlir/Transforms/RegionUtils.h"
30	#include "llvm/Support/Debug.h"
31
32	using llvm::dbgs;
33
34	#define DEBUG_TYPE "hoist-padding"
35
36	#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")
37
38	using namespace mlir;
39	using namespace mlir::linalg;
40	using namespace mlir::linalg::detail;
41
42	#ifndef NDEBUG
43	static bool debugPrintLoopInShortForm(Operation *op) {
44	AsmState state(op->getParentOfType<func::FuncOp>());
45	(void)state;
46	if (auto forOp = dyn_cast<scf::ForOp>(op)) {
47	forOp.getInductionVar().printAsOperand(dbgs(), state);
48	dbgs() << " @ " << forOp.getOperation();
49	return true;
50	}
51	return false;
52	}
53	#endif
54
55	static void debugPrintBackwardSlice(SetVector<Operation *> &backwardSlice) {
56	LLVM_DEBUG(llvm::interleaveComma(backwardSlice, DBGS() << "--backwardSlice:",
57	[](Operation *op) {
58	dbgs() << "\n";
59	DBGS() << "----";
60	if (debugPrintLoopInShortForm(op)) {
61	dbgs() << "\n";
62	return;
63	}
64	dbgs() << *op << "\n";
65	});
66	DBGS() << "\n";);
67	}
68
69	/// Return at most nLevels of immediately enclosing scf::ForOp loops.
70	/// Stops at the first parent that is not an scf::ForOp.
71	/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
72	/// Control-flow and other containing ops with regions are not modeled atm.
73	static void
74	getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
75	SmallVector<scf::ForOp> &reverseEnclosingLoops) {
76	scf::ForOp outermostEnclosingForOp = nullptr;
77	Operation *nextEnclosingOp = padOp->getParentOp();
78	while (nLevels-- > `0` &&
79	(outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
80	LLVM_DEBUG(DBGS() << "loops: ";
81	debugPrintLoopInShortForm(outermostEnclosingForOp);
82	dbgs() << "\n");
83	reverseEnclosingLoops.push_back(outermostEnclosingForOp);
84	nextEnclosingOp = outermostEnclosingForOp->getParentOp();
85	}
86	}
87
88	/// Return at most nLevels of immediately enclosing scf::ForOp loops.
89	/// Stops at the first parent that is not an scf::ForOp.
90	/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
91	/// Control-flow and other containing ops with regions are not modeled atm.
92	static void
93	getEnclosingLoopsUntil(tensor::PadOp padOp, scf::ForOp untilLoop,
94	SmallVector<scf::ForOp> &reverseEnclosingLoops) {
95	scf::ForOp outermostEnclosingForOp = nullptr;
96	Operation *nextEnclosingOp = padOp->getParentOp();
97	while (outermostEnclosingForOp != untilLoop &&
98	(outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
99	LLVM_DEBUG(DBGS() << "loops: ";
100	debugPrintLoopInShortForm(outermostEnclosingForOp);
101	dbgs() << "\n");
102	reverseEnclosingLoops.push_back(outermostEnclosingForOp);
103	nextEnclosingOp = outermostEnclosingForOp->getParentOp();
104	}
105	}
106
107	// Get all the ops in the backwards slice starting from `padOp` and that
108	// are dominated by the outermost enclosing loop.
109	// This also requires tracking ops defining values used in the region but
110	// defined above.
111	static void computeBackwardSlice(tensor::PadOp padOp,
112	scf::ForOp outermostEnclosingForOp,
113	SetVector<Operation *> &backwardSlice) {
114	DominanceInfo domInfo(outermostEnclosingForOp);
115	BackwardSliceOptions sliceOptions;
116	sliceOptions.filter = [&](Operation *op) {
117	return domInfo.dominates(outermostEnclosingForOp, op) &&
118	!padOp->isProperAncestor(op);
119	};
120	sliceOptions.inclusive = true;
121
122	// First, add the ops required to compute the region to the backwardSlice.
123	SetVector<Value> valuesDefinedAbove;
124	getUsedValuesDefinedAbove(padOp.getRegion(), padOp.getRegion(),
125	valuesDefinedAbove);
126	for (Value v : valuesDefinedAbove) {
127	LogicalResult result = getBackwardSlice(root: v, backwardSlice: &backwardSlice, options: sliceOptions);
128	assert(result.succeeded() && "expected a backward slice");
129	(void)result;
130	}
131	// Then, add the backward slice from padOp itself.
132	LogicalResult result =
133	getBackwardSlice(padOp.getOperation(), &backwardSlice, sliceOptions);
134	assert(result.succeeded() && "expected a backward slice");
135	(void)result;
136	}
137
138	//===----------------------------------------------------------------------===//
139	// HoistPaddingAnalysis Implementation.
140	//===----------------------------------------------------------------------===//
141
142	namespace {
143	/// Analysis class to support tensor::PadOp hoisting across multiple enclosing
144	/// loops. The failure conditions are:
145	/// 1. Pad op has a use that is not an input of a LinalgOp.
146	/// 2. Pad op does not have a constant padding value.
147	/// 3. There is no immediately enclosing scf::ForOp.
148	/// 4. The backward slice from the pad op to the scf::ForOp to hoist above
149	/// contains an unknown op with non index type operands, a region, or a
150	/// memory effect.
151	/// 5. The backward slice from the pad op to the scf::ForOp to hoist above is
152	/// empty.
153	/// 6. The source tensor of pad op is not defined by an extract slice op.
154	/// 7. The source tensor of the extract slice op is not defined outside of
155	/// the outermost enclosing scf::ForOp.
156	/// 8. There is no enclosing scf::ForOp that indexes the padded data.
157	/// Other cases succeed and will trigger hoisting of the pad op.
158	struct HoistPaddingAnalysis {
159	HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops);
160	HoistPaddingAnalysis(tensor::PadOp padOp, scf::ForOp outermostEnclosingForOp);
161
162	bool isValid() { return valid.has_value() && valid.value(); }
163	bool isInvalid() { return valid.has_value() && !valid.value(); }
164
165	/// Footprint of the hoistedPackedTensor, computed from the packingLoops.
166	SmallVector<Value> getHoistedPackedTensorSizes(RewriterBase &rewriter,
167	Location loc) const;
168
169	/// Performs optional hoisting to enable hoist padding to occur. This may be
170	/// necessary when `sliceOp` is not defined outside of the outermost enclosing
171	/// loop we want to hoist above.
172	///
173	/// Example:
174	/// ```
175	/// %source = linalg.fill(%cst, %arg0)
176	/// // %source is available for packing here!
177	/// scf.for %i
178	/// scf.for %j
179	/// scf.for %k
180	/// %slice = tensor.extract_slice %source [%i, %j]
181	/// %padded_slice = tensor.pad %slice
182	/// ```
183	void enableHoistPadding(RewriterBase &rewriter);
184
185	/// Common analysis builder to finalize the construction of the analysis once
186	/// optional `enableHoistPadding` has run.
187	/// `reverseEnclosingLoops.back()` is the loop to hoist above.
188	void finalizeHoistPaddingAnalysis();
189
190	private:
191	/// Encodes whether the analysis is valid and hoisting can proceed.
192	std::optional<bool> valid;
193
194	/// The padOp to hoist.
195	tensor::PadOp opToHoist;
196
197	/// Immediately enclosing loops considered for hoisting padding.
198	SmallVector<scf::ForOp> reverseEnclosingLoops;
199
200	/// Drop any non-index dependencies of `padOp` and `sliceOp` from
201	/// `backwardSlice`. The method follows the use-def chains of the index
202	/// operands consumed by `padOp` and `sliceOp` and drops the operations
203	/// not part of this index computation. Afterwards, the filtered
204	/// `backwardSlice` contains only the loops whose induction variable is
205	/// used, directly or indirectly, to index the padded tensor. The method
206	/// returns failure if the filtered backward slice contains an unexpected
207	/// operation.
208	///
209	/// Example:
210	/// ```
211	/// %source = linalg.fill(%cst, %arg0)
212	/// scf.for %i
213	/// %unrelated = linalg.fill(%cst, %arg1) // not used to index
214	/// %source! scf.for %j (%arg2 = %unrelated)
215	/// scf.for %k // not used to index
216	/// %source!
217	/// %ubi = affine.min #map(%i)
218	/// %ubj = affine.min #map(%j)
219	/// %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
220	/// %padded_slice = tensor.pad %slice
221	/// ```
222	/// dropNonIndexDependencies(%padded_slice, %slice)
223	/// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
224	LogicalResult dropNonIndexDependencies();
225
226	public:
227	/// The outermost loop, determined by `nLevels` above which `padOp` will
228	/// be hoisted.
229	scf::ForOp outermostEnclosingForOp;
230
231	/// Backward slice rooted at `padOp` and nested under
232	/// `outermostEnclosingForOp`.
233	SetVector<Operation *> backwardSlice;
234
235	/// The scf::ForOp immediately enclosing `padOp` such that:
236	/// 1. they are nested under `outermostEnclosingForOp` (inclusive)
237	/// 2. whose induction variable is used, directly or indirectly, in the
238	/// computation of `padOp`.
239	/// The span of these loops determines the footprint of the packed tensor.
240	SmallVector<scf::ForOp> packingLoops;
241
242	/// The ExtractSliceOp that feeds the PadOp we want to hoist.
243	tensor::ExtractSliceOp sliceOp;
244
245	/// If non-empty, this is the unique scf::ForOp that consumes the `sliceOp`.
246	scf::ForOp padConsumingForOp;
247	};
248
249	} // namespace
250
251	HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops)
252	: valid (std::nullopt), opToHoist(padOp) {
253	// Get at most `numLoops` of immediately enclosing loops.
254	getAtMostNEnclosingLoops(opToHoist, numLoops, reverseEnclosingLoops);
255	if (reverseEnclosingLoops.empty()) {
256	LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
257	valid = false;
258	return;
259	}
260	outermostEnclosingForOp = reverseEnclosingLoops.back();
261	sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
262	if (!sliceOp) {
263	LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
264	valid = false;
265	return;
266	}
267	}
268
269	HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp,
270	scf::ForOp outermostEnclosingForOp)
271	: valid (std::nullopt), opToHoist(padOp) {
272	// Get enclosing loops until outermostEnclosingForOp.
273	getEnclosingLoopsUntil(opToHoist, outermostEnclosingForOp,
274	reverseEnclosingLoops);
275	if (reverseEnclosingLoops.empty()) {
276	LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
277	valid = false;
278	return;
279	}
280	this->outermostEnclosingForOp = reverseEnclosingLoops.back();
281	if (this->outermostEnclosingForOp != outermostEnclosingForOp) {
282	LLVM_DEBUG(DBGS() << "--Unexpected outermost enclosing loop -> Skip\n");
283	valid = false;
284	return;
285	}
286	sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
287	if (!sliceOp) {
288	LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
289	valid = false;
290	return;
291	}
292	}
293
294	void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) {
295	if (isInvalid())
296	return;
297	// If the padded data is not yet available before entering the outermost
298	// enclosing loop, try to apply hoisting on this outermost loop.
299	// TODO: we may want finer-grained hoisting of only that particular `sliceOp`.
300	if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
301	outermostEnclosingForOp = cast<scf::ForOp>(
302	hoistLoopInvariantSubsets(rewriter, outermostEnclosingForOp));
303	}
304	}
305
306	void HoistPaddingAnalysis::finalizeHoistPaddingAnalysis() {
307	if (isInvalid())
308	return;
309
310	if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
311	LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n"
312	<< outermostEnclosingForOp << "\n"
313	<< "--sliceOp: " << sliceOp << "\n"
314	<< "--sliceOp.getSource(): " << sliceOp.getSource()
315	<< "\n");
316	LLVM_DEBUG(DBGS() << "----Source not defined outside of loops -> Skip\n");
317	valid = false;
318	return;
319	}
320	if (sliceOp->hasOneUse()) {
321	padConsumingForOp = dyn_cast<scf::ForOp>(*(sliceOp->getUsers().begin()));
322	}
323
324	// Check the region of `padOp` depends on a constant only. Adding hoisting
325	// support for arbitrary padding regions would require cloning all
326	// dependencies captured by the padding region.
327	Value paddingValue = opToHoist.getConstantPaddingValue();
328	if (!paddingValue \|\|
329	!isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
330	LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n");
331	valid = false;
332	return;
333	}
334
335	computeBackwardSlice(opToHoist, outermostEnclosingForOp, backwardSlice);
336	if (backwardSlice.size() <= `1`) {
337	valid = false;
338	return;
339	}
340
341	debugPrintBackwardSlice(backwardSlice);
342	// Remove all ops in the backward slice that are not used to index
343	// the padded tensor. In particular, keep `padOp`, `sliceOp`, and
344	// the loop and affine operations used for the index computation.
345	if (failed(Result: dropNonIndexDependencies())) {
346	LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n");
347	valid = false;
348	return;
349	}
350	debugPrintBackwardSlice(backwardSlice);
351
352	// Add only the loops part of the filtered `backwardSlice` to the
353	// packing loops. All other loops are not used to index the padded
354	// data and consequently access the same data in every loop
355	// iteration. Adding them to the packing loops would increase the
356	// cache footprint of the packed data by storing the same data
357	// multiple times.
358	for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
359	if (backwardSlice.contains(forOp))
360	packingLoops.push_back(forOp);
361
362	// TODO: for multiple loops we need to track the use to the innermost loop.
363	if (packingLoops.size() > `1` && padConsumingForOp) {
364	LLVM_DEBUG(DBGS() << "--Cannot hoist multiple loops through iter_args -> "
365	"Downgrade to 1 loop\n");
366	packingLoops.resize(`1`);
367	}
368
369	// Note: at this point, packing loops may be empty but we would still like
370	// to hoist the padding if so specified.
371
372	// The analysis is valid and hoisting can occur.
373	valid = true;
374	}
375
376	LogicalResult HoistPaddingAnalysis::dropNonIndexDependencies() {
377	// Set of all values used for index computation.
378	SetVector<Value> indexEdges;
379
380	// Add all index operands of `operation` to `indexEdges`. An index operand
381	// is an operand of type index.
382	auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
383	for (Value operand : operation->getOperands())
384	if (operand.getType().isIndex())
385	indexEdges.insert(X: operand);
386	};
387
388	// Check if any operation result is contained in `indexEdges`.
389	auto hasIndexResult = [&](Operation *operation) {
390	return llvm::any_of(Range: operation->getResults(), P: [&](Value result) {
391	return indexEdges.contains(key: result);
392	});
393	};
394
395	// Starting from `opToHoist` and `sliceOp` walk the use-def edges of index
396	// type in `backwardSlice`. Add the index operands of an operation to
397	// `indexEdges` and remove all operations from `backwardSlice` that are not
398	// part of the index computation.
399	//
400	// Example:
401	// ```
402	// %source = linalg.fill(%cst, %arg0)
403	// scf.for %i
404	// %unrelated = linalg.fill(%cst, %arg1) // not used to index %source!
405	// scf.for %j (%arg2 = %unrelated)
406	// scf.for %k // not used to index %source!
407	// %ubi = affine.min #map(%i)
408	// %ubj = affine.min #map(%j)
409	// %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
410	// %padded_slice = tensor.pad %slice
411	// ```
412	// After iterating `backwardSlice` we obtain:
413	// indexEdges = [%i, %j, %ubi, %ubj]
414	// backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
415	SetVector<Operation *> operationsToRemove;
416	for (Operation *op : llvm::reverse(C&: backwardSlice)) {
417	// Add the index operands of `opToHoist` and `sliceOp` to start the
418	// exploration of the index computation.
419	if (op == opToHoist \|\| op == sliceOp) {
420	addIndexOperandsToIndexEdges (op);
421	continue;
422	}
423	// Add the index operands of the loop if its induction variable is
424	// used for index computation.
425	if (auto forOp = dyn_cast<scf::ForOp>(op)) {
426	if (!hasIndexResult (op) && indexEdges.contains(key: forOp.getInductionVar())) {
427	addIndexOperandsToIndexEdges (op);
428	continue;
429	}
430	}
431	// Add the index operands of all other operations if at least one result
432	// is used for index computation.
433	if (hasIndexResult (op)) {
434	addIndexOperandsToIndexEdges (op);
435	// Check the operands of the remaining operations all have index type.
436	if (llvm::any_of(Range: op->getOperandTypes(),
437	P: [](Type type) { return !type.isIndex(); })) {
438	LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
439	<< op << " -> Skip\n");
440	return failure();
441	}
442	// Check the remaining operations do not have regions or memory effects.
443	auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
444	bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
445	if (hasMemoryEffect \|\| op->getNumRegions() != `0`) {
446	LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
447	<< op << " -> Skip\n");
448	return failure();
449	}
450	continue;
451	}
452	// Remove all other operations not used by the index computation. An
453	// exception are constant operations that may be used by `opToHoist`.
454	if (!isa<arith::ConstantOp>(op))
455	operationsToRemove.insert(X: op);
456	}
457	backwardSlice.set_subtract(operationsToRemove);
458	return success();
459	}
460
461	SmallVector<Value>
462	HoistPaddingAnalysis::getHoistedPackedTensorSizes(RewriterBase &rewriter,
463	Location loc) const {
464	SmallVector<Value> dynamicTensorSizes;
465
466	// Upper bound the packing loop lengths to size the packed tensor. Taking
467	// upper bounds can make the sizes of the packed tensor independent of the
468	// enclosing loops. This independence is a prerequisite for reusing the same
469	// buffer for all enclosing loop iterations and hoisting its allocation out
470	// of the enclosing loops.
471	for (auto forOp : packingLoops) {
472	// Compute an upper bound `ubVal` for the upper bound of `forOp`.
473	FailureOr<OpFoldResult> loopUb = affine::reifyIndexValueBound(
474	rewriter, loc, presburger::BoundType::UB, forOp.getUpperBound(),
475	/stopCondition=/
476	[&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) {
477	if (v == forOp.getUpperBound())
478	return false;
479	// Compute a bound that is independent of any affine op results.
480	Operation *op = v.getDefiningOp();
481	if (!op)
482	return true;
483	return !isa<affine::AffineMinOp, affine::AffineMaxOp,
484	affine::AffineApplyOp>(op);
485	},
486	/closedUB=/true);
487	assert(succeeded(loopUb) && "could not get upper bound");
488	Value ubVal = getValueOrCreateConstantIndexOp(rewriter, loc, *loopUb);
489
490	// Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
491	// store the result to `dynamicTensorSizes`.
492	// TODO: instead of using the lower bound of `forOp` directly, implement a
493	// lower bound computation similar to the upper bound computation.
494	AffineExpr lb, ub, step;
495	bindDims(rewriter.getContext(), lb, ub);
496	bindSymbols(rewriter.getContext(), step);
497	Value res = rewriter.createOrFold<affine::AffineApplyOp>(
498	loc, (ub - lb).ceilDiv(step),
499	ValueRange{forOp.getLowerBound(), ubVal,
500	cast<scf::ForOp>(forOp).getStep()});
501	dynamicTensorSizes.push_back(res);
502	}
503
504	return dynamicTensorSizes;
505	}
506
507	static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
508	return outer.isDefinedOutsideOfLoop(v) \|\| matchPattern(value: v, pattern: m_Constant());
509	}
510
511	//===----------------------------------------------------------------------===//
512	// buildPackingLoopNest Implementation.
513	//===----------------------------------------------------------------------===//
514
515	/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
516	/// The returned Value is guaranteed not to depend on any loop comprised in
517	/// [`outer`, `forOp`].
518	/// Return null if such a loop-independent quantity cannot be computed.
519	static Value buildLoopIterationCount(RewriterBase &rewriter, scf::ForOp outer,
520	scf::ForOp forOp) {
521	MLIRContext *ctx = forOp->getContext();
522	AffineExpr iv, lb, step;
523	bindDims(ctx, exprs&: iv, exprs&: lb);
524	bindSymbols(ctx, exprs&: step);
525	if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) \|\|
526	!isDefinedOutsideOrConstant(outer, forOp.getStep()))
527	return Value ();
528	Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
529	stepVal = forOp.getStep();
530	auto loc = forOp->getLoc();
531	return rewriter.createOrFold<affine::AffineApplyOp>(
532	loc, (iv - lb).ceilDiv(other: step), ValueRange {ivVal, lbVal, stepVal});
533	}
534
535	// Build a packing loop nest by iteratively traversing the backward slice and
536	// clone the operations, iteratively stepping into the loops that we encounter.
537	// The implementation proceeds in a stack-like fashion:
538	// 1. Iteratively clone and step into the loops, pushing the
539	// `hoistedPackedTensor`
540	// deeper in the stack.
541	// 2. At the innermost loop level, create a GenericOp if `transposeVector` is
542	// non-empty.
543	// 3. At the innermost loop level, create a InsertSliceOp.
544	// 4. Iteratively pop and yield the result of the InsertSliceOp across the
545	// cloned loops.
546	static FailureOr<PackingResult> buildPackingLoopNestImpl(
547	RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
548	ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
549	tensor::EmptyOp emptyOp, const HoistPaddingAnalysis &analysis) {
550	SmallVector<OpFoldResult> offsets, sizes, strides;
551	SmallVector<Value> clonedLoopIvs, leadingHoistedPackedTensorIndexings;
552
553	scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
554
555	Location loc = opToHoist->getLoc();
556	RankedTensorType paddedTensorType = opToHoist.getResultType();
557	int paddedRank = paddedTensorType.getRank();
558
559	// Step 0. Populate bvm with opToHoist.getSource if relevant.
560	BlockArgument bbArg = dyn_cast<BlockArgument>(opToHoist.getSource());
561	while (bbArg) {
562	auto forOp = dyn_cast<scf::ForOp>(bbArg.getOwner()->getParentOp());
563	if (!forOp)
564	break;
565	if (forOp != outerLoop && !outerLoop->isAncestor(forOp))
566	break;
567	OpOperand &operand = *forOp.getTiedLoopInit(bbArg);
568	bvm.map(from: bbArg, to: operand.get());
569	bbArg = dyn_cast<BlockArgument>(Val: operand.get());
570	}
571
572	// Step 1. iteratively clone loops and push `hoistedPackedTensor`.
573	Value hoistedPackedTensor = emptyOp.getResult();
574	OpBuilder::InsertionGuard g(rewriter);
575	for (Operation *op : analysis.backwardSlice) {
576	// Specifically sit out in the extract_slice(hoistedPackedTensor) case: this
577	// is the piece we seek to replace.
578	if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) {
579	if (bvm.lookupOrDefault(sliceOp.getSource()) == hoistedPackedTensor) {
580	LLVM_DEBUG(DBGS() << "--Skip: " << sliceOp << "\n");
581	continue;
582	}
583	}
584
585	// Clone all operations except loops which require special handling.
586	auto forOp = dyn_cast<scf::ForOp>(op);
587	if (!forOp) {
588	// We are at the right insertion point within the loop nest.
589	rewriter.clone(op&: *op, mapper&: bvm);
590	continue;
591	}
592
593	// Create a packing loop that takes `hoistedPackedTensor` as iteration
594	// argument.
595	auto clonedForOp = rewriter.create<scf::ForOp>(
596	loc, bvm.lookupOrDefault(forOp.getLowerBound()),
597	bvm.lookupOrDefault(forOp.getUpperBound()),
598	bvm.lookupOrDefault(forOp.getStep()), hoistedPackedTensor);
599
600	// Map the induction var, region args and results to the `clonedForOp`.
601	bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
602	bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
603	bvm.map(forOp.getResults(), clonedForOp.getResults());
604	assert(clonedForOp->getNumRegions() == `1`);
605	clonedLoopIvs.push_back(Elt: clonedForOp.getInductionVar());
606
607	// Do not insert guard here, we get deeper into the loop nest.
608	rewriter.setInsertionPointToStart(&clonedForOp->getRegion(`0`).front());
609	Value loopIndependentIterationCount =
610	buildLoopIterationCount(rewriter, outerLoop, clonedForOp);
611
612	// Assert the loop-independent iteration count can be computed.
613	if (!loopIndependentIterationCount)
614	llvm_unreachable("loop independence prerequisite not met");
615	leadingHoistedPackedTensorIndexings.push_back(
616	Elt: loopIndependentIterationCount);
617	hoistedPackedTensor = clonedForOp.getRegionIterArgs().front();
618	}
619
620	// Step 2. Construct offsets, sizes and strides for the innermost level of the
621	// packing loop.
622	int64_t nPackedLoops = clonedLoopIvs.size();
623	// offsets = [clonedLoopIvs, 0 .. 0].
624	offsets =
625	SmallVector<OpFoldResult>{leadingHoistedPackedTensorIndexings.begin(),
626	leadingHoistedPackedTensorIndexings.end()};
627	offsets.append(paddedRank, rewriter.getIndexAttr(`0`));
628	// sizes = [1 .. 1, transposedShape].
629	sizes = SmallVector<OpFoldResult>(nPackedLoops, rewriter.getIndexAttr(`1`));
630	for (int64_t sz : transposedTensorType.getShape()) {
631	// TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
632	if (ShapedType::isDynamic(sz))
633	return failure();
634	sizes.push_back(rewriter.getIndexAttr(sz));
635	}
636	// strides = [1 .. 1].
637	strides = SmallVector<OpFoldResult>(nPackedLoops + paddedRank,
638	rewriter.getIndexAttr(`1`));
639
640	// Step 3. Optionally transpose the padded tensor.
641	TransposeOp maybeTransposeOp;
642	Value paddedTensor = bvm.lookup(opToHoist.getResult());
643	if (!transposeVector.empty()) {
644	Value outputTensor = rewriter.create<tensor::ExtractSliceOp>(
645	loc, transposedTensorType, hoistedPackedTensor, offsets, sizes,
646	strides);
647	maybeTransposeOp = rewriter.create<linalg::TransposeOp>(
648	loc, paddedTensor, outputTensor, transposeVector);
649	paddedTensor = maybeTransposeOp.getResult()[`0`];
650	}
651
652	// Innermost tensor.insert_slice and yields are optional / need loops.
653	if (nPackedLoops > `0`) {
654	// Step 4. Create InsertSliceOp at the innermost loop level, inserting an
655	// optionally transposed padded slice into the packed tensor.
656	Value inserted = rewriter.create<tensor::InsertSliceOp>(
657	loc, paddedTensor, hoistedPackedTensor, offsets, sizes, strides);
658
659	// Step 5. Iteratively pop the stack and propagate the yield.
660	Value valueToYield = inserted;
661	for (Value iv : llvm::reverse(C&: clonedLoopIvs)) {
662	auto forOp = scf::getForInductionVarOwner(iv);
663	rewriter.setInsertionPointToEnd(&forOp.getRegion().front());
664	rewriter.create<scf::YieldOp>(loc, valueToYield);
665	valueToYield = forOp.getResult(`0`);
666	}
667	}
668
669	return PackingResult{
670	offsets,
671	sizes,
672	strides,
673	clonedLoopIvs,
674	leadingHoistedPackedTensorIndexings,
675	maybeTransposeOp,
676	cast<tensor::PadOp>(bvm.lookup(opToHoist.getResult()).getDefiningOp())};
677	}
678
679	/// Build the packing loop nest required to hoist `opToHoist` above
680	/// `outermostEnclosingForOp`.
681	/// The loop nest is built just before `outermostEnclosingForOp`.
682	static FailureOr<PackingResult> buildPackingLoopNestImpl(
683	RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
684	ArrayRef<int64_t> transposeVector, const HoistPaddingAnalysis &analysis) {
685	// Update actual number of loops, which may be smaller.
686	int nPackedLoops = analysis.packingLoops.size();
687	LLVM_DEBUG(DBGS() << "\n";
688	DBGS() << "Func:\n"
689	<< *opToHoist->getParentOfType<func::FuncOp>() << "\n";
690	DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");
691
692	Location loc = opToHoist->getLoc();
693	RankedTensorType paddedTensorType = opToHoist.getResultType();
694
695	// Compute the type of the transposed padded tensor.
696	FailureOr<RankedTensorType> transposedTensorType =
697	tensor::computeTransposedType(paddedTensorType, transposeVector);
698	if (failed(transposedTensorType)) {
699	LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
700	return failure();
701	}
702
703	// Create the packed tensor<?x?x..? x transposedShape>.
704	SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
705	// TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
706	llvm::append_range(packedShape, transposedTensorType->getShape());
707	auto hoistedPackedTensorType = RankedTensorType::get(
708	packedShape, transposedTensorType->getElementType());
709
710	// Set the insertion point right before the outer loop and start packing.
711	scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
712	OpBuilder::InsertionGuard g(rewriter);
713	rewriter.setInsertionPoint(outerLoop);
714	SmallVector<Value> dynamicTensorSizes =
715	analysis.getHoistedPackedTensorSizes(rewriter, loc);
716	auto emptyOp = rewriter.create<tensor::EmptyOp>(
717	loc, hoistedPackedTensorType.getShape(),
718	hoistedPackedTensorType.getElementType(), dynamicTensorSizes);
719
720	return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
721	*transposedTensorType, emptyOp, analysis);
722	}
723
724	/// Build the packing loop nest required to hoist `opToHoist` above
725	/// `outermostEnclosingForOp`.
726	/// The loop nest is built just before `outermostEnclosingForOp`.
727	FailureOr<PackingResult> mlir::linalg::detail::buildPackingLoopNest(
728	RewriterBase &rewriter, tensor::PadOp opToHoist,
729	scf::ForOp outermostEnclosingForOp, ArrayRef<int64_t> transposeVector) {
730	HoistPaddingAnalysis analysis(opToHoist, outermostEnclosingForOp);
731	analysis.enableHoistPadding(rewriter);
732	analysis.finalizeHoistPaddingAnalysis();
733	if (!analysis.isValid()) {
734	LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
735	return failure();
736	}
737	IRMapping bvm;
738	return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
739	analysis);
740	}
741
742	//===----------------------------------------------------------------------===//
743	// hoistPaddingOnTensors Implementation.
744	//===----------------------------------------------------------------------===//
745
746	/// Return true if we can walk back the use-def chain from `extractSliceOp` to
747	/// expectedSource going through DestinationStyleOpInterface inits only.
748	/// This is a poor man's analysis that is sufficient to check the extractSliceOp
749	/// the matches tensor.pad we want to hoist.
750	/// In the future, it will be easier to ensure this with a matching symmetric
751	/// tensor.unpad op.
752	static bool tracesBackToExpectedValue(tensor::ExtractSliceOp extractSliceOp,
753	Value expectedSource) {
754	LLVM_DEBUG(DBGS() << "Start tracesBackToExpectedValue on: " << extractSliceOp
755	<< "\n");
756	LLVM_DEBUG(DBGS() << "--with extractSlice: " << extractSliceOp << "\n");
757	Value source = extractSliceOp.getSource();
758	LLVM_DEBUG(DBGS() << "--with starting source: " << source << "\n");
759	while (source && source != expectedSource) {
760	auto destOp =
761	dyn_cast_or_null<DestinationStyleOpInterface>(source.getDefiningOp());
762	if (!destOp)
763	break;
764	LLVM_DEBUG(DBGS() << "--step dest op: " << destOp << "\n");
765	source = destOp.getDpsInitOperand(cast<OpResult>(Val&: source).getResultNumber())
766	->get();
767	}
768	LLVM_DEBUG(DBGS() << "--final source: " << source << "\n");
769	LLVM_DEBUG(DBGS() << "--expected source: " << expectedSource << "\n");
770	return source == expectedSource;
771	}
772
773	/// If the original consumer of `outerSliceOp` was a `forOp` (i.e. through an
774	/// iter arg), propagate the `hoistedPackedTensor` value through the same iter
775	/// arg.
776	/// TODO: for multiple loops we need to track the use to the innermost loop.
777	///
778	/// Match:
779	/// ```
780	/// %outerSliceOp = tensor.extract_slice ..
781	/// %f = scf.for ... iter_args(%arg0 = %outerSliceOp) {
782	/// %hoistedPackedTensor = tensor.pad %arg0
783	/// %1 = compute %hoistedPackedTensor
784	/// %2 = tensor.extract_slice %1
785	/// scf.yield %2
786	/// }
787	/// ```
788	///
789	/// and rewrite as:
790	/// ```
791	/// %outerSliceOp = tensor.extract_slice ..
792	/// %hoistedPackedTensor = tensor.pad %outerSliceOp
793	/// %f = scf.for ... iter_args(%arg0 = %hoistedPackedTensor) {
794	/// %1 = compute %arg0
795	/// scf.yield %1
796	/// }
797	/// %2 = tensor.extract_slice %forOp
798	/// ```
799	///
800	/// Return null when no rewrite happened.
801	static tensor::ExtractSliceOp
802	padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting,
803	Value hoistedPackedTensor,
804	tensor::ExtractSliceOp outerSliceOp, scf::ForOp forOp) {
805	LLVM_DEBUG(DBGS() << "Start padThroughLoopIterArg on: " << forOp << "\n");
806	LLVM_DEBUG(DBGS() << "--paddedValueBeforeHoisting: "
807	<< paddedValueBeforeHoisting << "\n");
808	OpOperand pUse = nullptr*;
809	for (OpOperand &use : outerSliceOp->getUses()) {
810	if (use.getOwner() == forOp) {
811	assert(!pUse && "Multiple slice uses in the for loop");
812	pUse = &use;
813	}
814	}
815	assert(pUse && "No slice use in the for loop");
816	OpBuilder::InsertionGuard g(rewriter);
817	rewriter.setInsertionPointAfter(hoistedPackedTensor.getDefiningOp());
818
819	unsigned iterArgNumber = forOp.getTiedLoopResult(pUse).getResultNumber();
820	auto yieldingExtractSliceOp = forOp.getYieldedValues()[iterArgNumber]
821	.getDefiningOp<tensor::ExtractSliceOp>();
822	if (!yieldingExtractSliceOp)
823	return tensor::ExtractSliceOp();
824
825	// Poor man's analysis sufficient to ensure extractSlice matches tensor.pad.
826	// In the future, it will be easier to ensure this with a matching symmetric
827	// tensor.unpad op.
828	if (!tracesBackToExpectedValue(yieldingExtractSliceOp,
829	paddedValueBeforeHoisting))
830	return tensor::ExtractSliceOp();
831
832	SmallVector<Value> initArgs = forOp.getInitArgs();
833	initArgs [iterArgNumber] = hoistedPackedTensor;
834	SmallVector<Value> yieldOperands = llvm::to_vector(forOp.getYieldedValues());
835	yieldOperands [iterArgNumber] = yieldingExtractSliceOp.getSource();
836
837	int64_t numOriginalForOpResults = initArgs.size();
838	LLVM_DEBUG(DBGS() << "numOriginalForOpResults: " << numOriginalForOpResults
839	<< "\n");
840	tensor::ExtractSliceOp extracted;
841	{
842	OpBuilder::InsertionGuard g(rewriter);
843	rewriter.setInsertionPointAfter(forOp);
844	extracted = rewriter.create<tensor::ExtractSliceOp>(
845	hoistedPackedTensor.getLoc(), hoistedPackedTensor,
846	outerSliceOp.getMixedOffsets(), outerSliceOp.getMixedSizes(),
847	outerSliceOp.getMixedStrides());
848	rewriter.replaceAllUsesWith(forOp.getResult(iterArgNumber), extracted);
849	}
850	scf::ForOp newForOp = cast<scf::ForOp>(*forOp.replaceWithAdditionalYields(
851	rewriter, initArgs, /replaceInitOperandUsesInLoop=/true,
852	[&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBBArgs) {
853	return yieldOperands;
854	}));
855
856	LLVM_DEBUG(DBGS() << "newForOp results: " << newForOp.getNumResults()
857	<< "\n");
858	LLVM_DEBUG(DBGS() << "replace source of: " << extracted << "\n");
859	LLVM_DEBUG(DBGS() << "with result #"
860	<< numOriginalForOpResults + iterArgNumber
861	<< " of forOp, giving us: " << extracted << "\n");
862	rewriter.startOpModification(op: extracted);
863	extracted.getSourceMutable().assign(
864	newForOp.getResult(numOriginalForOpResults + iterArgNumber));
865	rewriter.finalizeOpModification(op: extracted);
866
867	LLVM_DEBUG(DBGS() << "replace uses of: " << paddedValueBeforeHoisting
868	<< "\n");
869	LLVM_DEBUG(DBGS() << "with region iter arg #"
870	<< numOriginalForOpResults + iterArgNumber << "\n");
871	rewriter.replaceAllUsesWith(
872	paddedValueBeforeHoisting,
873	newForOp.getRegionIterArg(numOriginalForOpResults + iterArgNumber));
874
875	return extracted;
876	}
877
878	/// Produce a tensor extracted from the packingResult. This can be used as a
879	/// replacement for `opToHoist` in callers.
880	static Value replaceByPackingResult(RewriterBase &rewriter,
881	const IRMapping &bvm,
882	tensor::PadOp opToHoist,
883	RankedTensorType transposedTensorType,
884	const HoistPaddingAnalysis &analysis,
885	const PackingResult &packingResult) {
886	// The replacement occurs under a single insertion point within the original
887	// loop, just before opToHoist.
888	OpBuilder::InsertionGuard g(rewriter);
889	rewriter.setInsertionPoint(opToHoist);
890
891	Location loc = opToHoist->getLoc();
892	RankedTensorType paddedTensorType = opToHoist.getResultType();
893	int paddedRank = paddedTensorType.getRank();
894
895	int64_t nPackedLoops = packingResult.clonedLoopIvs.size();
896	LLVM_DEBUG(DBGS() << "nPackedLoops: " << nPackedLoops << " loops\n");
897
898	scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
899	ArrayRef<scf::ForOp> packingLoops = analysis.packingLoops;
900
901	Value hoistedPackedTensor;
902	SmallVector<Value> loopIterationCounts;
903	SmallVector<OpFoldResult> offsets(nPackedLoops + paddedRank,
904	rewriter.getIndexAttr(`0`));
905	if (nPackedLoops > `0`) {
906	loopIterationCounts =
907	llvm::to_vector<`4`>(llvm::map_range(packingLoops, [&](Operation *loop) {
908	return buildLoopIterationCount(rewriter, outerLoop,
909	cast<scf::ForOp>(loop));
910	}));
911	// Assert all loop iteration counts can be computed.
912	if (llvm ::any_of(Range&: loopIterationCounts, P: [](Value v) { return !v; }))
913	llvm_unreachable("loop independence prerequisite not met");
914
915	// offsets = [maybe_leading_ivs = originalLoopIvs, 0 .. 0].
916	std::copy(first: loopIterationCounts.begin(), last: loopIterationCounts.end(),
917	result: offsets.begin());
918	hoistedPackedTensor =
919	scf::getForInductionVarOwner(packingResult.clonedLoopIvs.front())
920	->getResult(`0`);
921	} else {
922	// If no loops were created, this is just hoisting without packing.
923	hoistedPackedTensor = bvm.lookup(opToHoist.getResult());
924	}
925
926	LLVM_DEBUG(DBGS() << "hoistedPackedTensor: " << hoistedPackedTensor << "\n");
927
928	// If the consumer of `padOp` was a `forOp`, propagate through iter args.
929	scf::ForOp forOp = analysis.padConsumingForOp;
930	if (forOp) {
931	return padThroughLoopIterArg(rewriter, opToHoist, hoistedPackedTensor,
932	analysis.sliceOp, forOp);
933	}
934
935	// offsets = [maybe_leading_ivs, 0 .. 0].
936	// sizes = [1 .. 1, transposedShape] (defined above).
937	// strides = [1 .. 1] (defined above)
938	return rewriter.create<tensor::ExtractSliceOp>(
939	loc, transposedTensorType, hoistedPackedTensor, offsets,
940	packingResult.sizes, packingResult.strides);
941	}
942
943	FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
944	RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops,
945	ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
946	SmallVectorImpl<TransposeOp> &transposeOps) {
947	LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
948	DBGS() << " by " << numLoops << " loops\n");
949
950	HoistPaddingAnalysis analysis(opToHoist, numLoops);
951	analysis.enableHoistPadding(rewriter);
952	analysis.finalizeHoistPaddingAnalysis();
953	if (!analysis.isValid()) {
954	LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
955	return failure();
956	}
957
958	/// Construct the packing loop nest.
959	IRMapping bvm;
960	FailureOr<PackingResult> packingResult = buildPackingLoopNestImpl(
961	rewriter, bvm, opToHoist, transposeVector, analysis);
962	if (failed(Result: packingResult)) {
963	LLVM_DEBUG(DBGS() << "--buildPackingLoopNestImpl failed -> Skip\n");
964	return failure();
965	}
966
967	if (!transposeVector.empty())
968	transposeOps.push_back(packingResult->maybeTransposeOp);
969
970	FailureOr<RankedTensorType> transposedTensorType =
971	tensor::computeTransposedType(rankedTensorType: opToHoist.getResultType(), transposeVector);
972	assert(succeeded(transposedTensorType) && "unexpected failure in type");
973
974	// Now the packed tensor is ready, replace the original padding op by a
975	// 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
976	Value newResult =
977	replaceByPackingResult(rewriter, bvm, opToHoist, *transposedTensorType,
978	analysis, *packingResult);
979
980	Location loc = opToHoist->getLoc();
981	RankedTensorType paddedTensorType = opToHoist.getResultType();
982	if (!transposeVector.empty()) {
983	OpBuilder::InsertionGuard g(rewriter);
984	rewriter.setInsertionPointAfter(newResult.getDefiningOp());
985	// Transpose the packed tensor back to the original storage order.
986	Value emptyTensor = rewriter.create<tensor::EmptyOp>(
987	loc, paddedTensorType.getShape(), paddedTensorType.getElementType());
988	TransposeOp unTransposeOp = rewriter.create<linalg::TransposeOp>(
989	loc, newResult, emptyTensor, transposeVector);
990	newResult = unTransposeOp.getResult()[`0`];
991	transposeOps.push_back(unTransposeOp);
992	}
993
994	LLVM_DEBUG(DBGS() << "newResult: " << newResult << "\n");
995	LLVM_DEBUG(
996	DBGS() << "After hoisting: "
997	<< newResult.getDefiningOp()->getParentOfType<func::FuncOp>()
998	<< "\n");
999
1000	// Make the newly cloned `opToHoist` available to the caller.
1001	hoistedOp = packingResult->hoistedPadOp;
1002
1003	LLVM_DEBUG(DBGS() << "--SUCCESS\n");
1004	return newResult;
1005	}
1006
1007	FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
1008	tensor::PadOp opToHoist, int64_t numLoops,
1009	ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
1010	SmallVectorImpl<TransposeOp> &transposeOps) {
1011	IRRewriter rewriter(opToHoist.getContext());
1012	return hoistPaddingOnTensors(rewriter, opToHoist, numLoops, transposeVector,
1013	hoistedOp, transposeOps);
1014	}
1015

Provided by KDAB

Definitions

source code of mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp