1//===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements functions concerned with hoisting padding operations.
10//
11//===----------------------------------------------------------------------===//
12
13#include "mlir/Analysis/Presburger/IntegerRelation.h"
14#include "mlir/Analysis/SliceAnalysis.h"
15#include "mlir/Dialect/Affine/IR/AffineOps.h"
16#include "mlir/Dialect/Affine/Transforms/Transforms.h"
17#include "mlir/Dialect/Func/IR/FuncOps.h"
18#include "mlir/Dialect/Linalg/IR/Linalg.h"
19#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
20#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
21#include "mlir/Dialect/SCF/IR/SCF.h"
22#include "mlir/Dialect/Tensor/Utils/Utils.h"
23#include "mlir/IR/AsmState.h"
24#include "mlir/IR/Dominance.h"
25#include "mlir/IR/Matchers.h"
26#include "mlir/Interfaces/DestinationStyleOpInterface.h"
27#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
28#include "mlir/Transforms/RegionUtils.h"
29#include "llvm/Support/Debug.h"
30
31using llvm::dbgs;
32
33#define DEBUG_TYPE "hoist-padding"
34
35#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")
36
37using namespace mlir;
38using namespace mlir::linalg;
39using namespace mlir::linalg::detail;
40
#ifndef NDEBUG
/// Debug-only helper: if `op` is an scf.for, print its induction variable in
/// SSA-operand short form followed by " @ <pointer>" and return true.
/// Returns false (printing nothing) for any other operation.
static bool debugPrintLoopInShortForm(Operation *op) {
  // AsmState scoped to the enclosing function so the induction variable gets
  // its proper SSA name instead of a generic placeholder.
  AsmState state(op->getParentOfType<func::FuncOp>());
  (void)state;
  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
    forOp.getInductionVar().printAsOperand(dbgs(), state);
    dbgs() << " @ " << forOp.getOperation();
    return true;
  }
  return false;
}
#endif
53
/// Debug-only dump of `backwardSlice`, one operation per line.
/// Loops are printed in short form (induction variable only) via
/// debugPrintLoopInShortForm; all other ops are printed in full.
/// Compiles to a no-op in NDEBUG builds (everything is inside LLVM_DEBUG).
static void debugPrintBackwardSlice(SetVector<Operation *> &backwardSlice) {
  LLVM_DEBUG(llvm::interleaveComma(backwardSlice, DBGS() << "--backwardSlice:",
                                   [](Operation *op) {
                                     dbgs() << "\n";
                                     DBGS() << "----";
                                     // Short form for loops; skip full print.
                                     if (debugPrintLoopInShortForm(op)) {
                                       dbgs() << "\n";
                                       return;
                                     }
                                     dbgs() << *op << "\n";
                                   });
             DBGS() << "\n";);
}
67
68/// Return at most nLevels of immediately enclosing scf::ForOp loops.
69/// Stops at the first parent that is not an scf::ForOp.
70/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
71/// Control-flow and other containing ops with regions are not modeled atm.
72static void
73getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
74 SmallVector<scf::ForOp> &reverseEnclosingLoops) {
75 scf::ForOp outermostEnclosingForOp = nullptr;
76 Operation *nextEnclosingOp = padOp->getParentOp();
77 while (nLevels-- > 0 &&
78 (outermostEnclosingForOp = dyn_cast<scf::ForOp>(Val: nextEnclosingOp))) {
79 LLVM_DEBUG(DBGS() << "loops: ";
80 debugPrintLoopInShortForm(outermostEnclosingForOp);
81 dbgs() << "\n");
82 reverseEnclosingLoops.push_back(Elt: outermostEnclosingForOp);
83 nextEnclosingOp = outermostEnclosingForOp->getParentOp();
84 }
85}
86
87/// Return at most nLevels of immediately enclosing scf::ForOp loops.
88/// Stops at the first parent that is not an scf::ForOp.
89/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
90/// Control-flow and other containing ops with regions are not modeled atm.
91static void
92getEnclosingLoopsUntil(tensor::PadOp padOp, scf::ForOp untilLoop,
93 SmallVector<scf::ForOp> &reverseEnclosingLoops) {
94 scf::ForOp outermostEnclosingForOp = nullptr;
95 Operation *nextEnclosingOp = padOp->getParentOp();
96 while (outermostEnclosingForOp != untilLoop &&
97 (outermostEnclosingForOp = dyn_cast<scf::ForOp>(Val: nextEnclosingOp))) {
98 LLVM_DEBUG(DBGS() << "loops: ";
99 debugPrintLoopInShortForm(outermostEnclosingForOp);
100 dbgs() << "\n");
101 reverseEnclosingLoops.push_back(Elt: outermostEnclosingForOp);
102 nextEnclosingOp = outermostEnclosingForOp->getParentOp();
103 }
104}
105
106// Get all the ops in the backwards slice starting from `padOp` and that
107// are dominated by the outermost enclosing loop.
108// This also requires tracking ops defining values used in the region but
109// defined above.
110static void computeBackwardSlice(tensor::PadOp padOp,
111 scf::ForOp outermostEnclosingForOp,
112 SetVector<Operation *> &backwardSlice) {
113 DominanceInfo domInfo(outermostEnclosingForOp);
114 BackwardSliceOptions sliceOptions;
115 sliceOptions.filter = [&](Operation *op) {
116 return domInfo.dominates(a: outermostEnclosingForOp, b: op) &&
117 !padOp->isProperAncestor(other: op);
118 };
119 sliceOptions.inclusive = true;
120
121 // First, add the ops required to compute the region to the backwardSlice.
122 SetVector<Value> valuesDefinedAbove;
123 getUsedValuesDefinedAbove(region&: padOp.getRegion(), limit&: padOp.getRegion(),
124 values&: valuesDefinedAbove);
125 for (Value v : valuesDefinedAbove) {
126 LogicalResult result = getBackwardSlice(root: v, backwardSlice: &backwardSlice, options: sliceOptions);
127 assert(result.succeeded() && "expected a backward slice");
128 (void)result;
129 }
130 // Then, add the backward slice from padOp itself.
131 LogicalResult result =
132 getBackwardSlice(op: padOp.getOperation(), backwardSlice: &backwardSlice, options: sliceOptions);
133 assert(result.succeeded() && "expected a backward slice");
134 (void)result;
135}
136
137//===----------------------------------------------------------------------===//
138// HoistPaddingAnalysis Implementation.
139//===----------------------------------------------------------------------===//
140
namespace {
/// Analysis class to support tensor::PadOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. Pad op does not have a constant padding value.
///   3. There is no immediately enclosing scf::ForOp.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with non index type operands, a region, or a
///      memory effect.
///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
///   6. The source tensor of pad op is not defined by an extract slice op.
///   7. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   8. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op.
///
/// Usage protocol: construct, optionally call `enableHoistPadding`, then call
/// `finalizeHoistPaddingAnalysis` and check `isValid` before using the public
/// fields.
struct HoistPaddingAnalysis {
  HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops);
  HoistPaddingAnalysis(tensor::PadOp padOp, scf::ForOp outermostEnclosingForOp);

  // `valid` is tri-state: unset until analysis completes or fails early.
  bool isValid() { return valid.has_value() && valid.value(); }
  bool isInvalid() { return valid.has_value() && !valid.value(); }

  /// Footprint of the hoistedPackedTensor, computed from the packingLoops.
  SmallVector<Value> getHoistedPackedTensorSizes(RewriterBase &rewriter,
                                                 Location loc) const;

  /// Performs optional hoisting to enable hoist padding to occur. This may be
  /// necessary when `sliceOp` is not defined outside of the outermost enclosing
  /// loop we want to hoist above.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// // %source is available for packing here!
  /// scf.for %i
  ///   scf.for %j
  ///     scf.for %k
  ///       %slice = tensor.extract_slice %source [%i, %j]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  void enableHoistPadding(RewriterBase &rewriter);

  /// Common analysis builder to finalize the construction of the analysis once
  /// optional `enableHoistPadding` has run.
  /// `reverseEnclosingLoops.back()` is the loop to hoist above.
  void finalizeHoistPaddingAnalysis();

private:
  /// Encodes whether the analysis is valid and hoisting can proceed.
  std::optional<bool> valid;

  /// The padOp to hoist.
  tensor::PadOp opToHoist;

  /// Immediately enclosing loops considered for hoisting padding.
  /// Innermost first; `.back()` is the outermost candidate.
  SmallVector<scf::ForOp> reverseEnclosingLoops;

  /// Drop any non-index dependencies of `padOp` and `sliceOp` from
  /// `backwardSlice`. The method follows the use-def chains of the index
  /// operands consumed by `padOp` and `sliceOp` and drops the operations
  /// not part of this index computation. Afterwards, the filtered
  /// `backwardSlice` contains only the loops whose induction variable is
  /// used, directly or indirectly, to index the padded tensor. The method
  /// returns failure if the filtered backward slice contains an unexpected
  /// operation.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// scf.for %i
  ///   %unrelated = linalg.fill(%cst, %arg1)    // not used to index
  ///   %source! scf.for %j (%arg2 = %unrelated)
  ///     scf.for %k                             // not used to index
  ///     %source!
  ///       %ubi = affine.min #map(%i)
  ///       %ubj = affine.min #map(%j)
  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  /// dropNonIndexDependencies(%padded_slice, %slice)
  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
  LogicalResult dropNonIndexDependencies();

public:
  /// The outermost loop, determined by `nLevels` above which `padOp` will
  /// be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOp immediately enclosing `padOp` such that:
  ///   1. they are nested under `outermostEnclosingForOp` (inclusive)
  ///   2. whose induction variable is used, directly or indirectly, in the
  ///      computation of `padOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  SmallVector<scf::ForOp> packingLoops;

  /// The ExtractSliceOp that feeds the PadOp we want to hoist.
  tensor::ExtractSliceOp sliceOp;

  /// If non-empty, this is the unique scf::ForOp that consumes the `sliceOp`.
  scf::ForOp padConsumingForOp;
};

} // namespace
249
250HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops)
251 : valid(std::nullopt), opToHoist(padOp) {
252 // Get at most `numLoops` of immediately enclosing loops.
253 getAtMostNEnclosingLoops(padOp: opToHoist, nLevels: numLoops, reverseEnclosingLoops);
254 if (reverseEnclosingLoops.empty()) {
255 LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
256 valid = false;
257 return;
258 }
259 outermostEnclosingForOp = reverseEnclosingLoops.back();
260 sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
261 if (!sliceOp) {
262 LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
263 valid = false;
264 return;
265 }
266}
267
268HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp,
269 scf::ForOp outermostEnclosingForOp)
270 : valid(std::nullopt), opToHoist(padOp) {
271 // Get enclosing loops until outermostEnclosingForOp.
272 getEnclosingLoopsUntil(padOp: opToHoist, untilLoop: outermostEnclosingForOp,
273 reverseEnclosingLoops);
274 if (reverseEnclosingLoops.empty()) {
275 LLVM_DEBUG(DBGS() << "--No immediately enclosing loop -> Skip\n");
276 valid = false;
277 return;
278 }
279 this->outermostEnclosingForOp = reverseEnclosingLoops.back();
280 if (this->outermostEnclosingForOp != outermostEnclosingForOp) {
281 LLVM_DEBUG(DBGS() << "--Unexpected outermost enclosing loop -> Skip\n");
282 valid = false;
283 return;
284 }
285 sliceOp = opToHoist.getSource().getDefiningOp<tensor::ExtractSliceOp>();
286 if (!sliceOp) {
287 LLVM_DEBUG(DBGS() << "--Cannot find the extract slice op -> Skip\n");
288 valid = false;
289 return;
290 }
291}
292
293void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) {
294 if (isInvalid())
295 return;
296 // If the padded data is not yet available before entering the outermost
297 // enclosing loop, try to apply hoisting on this outermost loop.
298 // TODO: we may want finer-grained hoisting of only that particular `sliceOp`.
299 if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(value: sliceOp.getSource())) {
300 outermostEnclosingForOp = cast<scf::ForOp>(
301 Val: hoistLoopInvariantSubsets(rewriter, loopLike: outermostEnclosingForOp));
302 }
303}
304
305void HoistPaddingAnalysis::finalizeHoistPaddingAnalysis() {
306 if (isInvalid())
307 return;
308
309 if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(value: sliceOp.getSource())) {
310 LLVM_DEBUG(DBGS() << "--outermostEnclosingForOp:\n"
311 << outermostEnclosingForOp << "\n"
312 << "--sliceOp: " << sliceOp << "\n"
313 << "--sliceOp.getSource(): " << sliceOp.getSource()
314 << "\n");
315 LLVM_DEBUG(DBGS() << "----Source not defined outside of loops -> Skip\n");
316 valid = false;
317 return;
318 }
319 if (sliceOp->hasOneUse()) {
320 padConsumingForOp = dyn_cast<scf::ForOp>(Val: *(sliceOp->getUsers().begin()));
321 }
322
323 // Check the region of `padOp` depends on a constant only. Adding hoisting
324 // support for arbitrary padding regions would require cloning all
325 // dependencies captured by the padding region.
326 Value paddingValue = opToHoist.getConstantPaddingValue();
327 if (!paddingValue ||
328 !isa_and_nonnull<arith::ConstantOp>(Val: paddingValue.getDefiningOp())) {
329 LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> Skip\n");
330 valid = false;
331 return;
332 }
333
334 computeBackwardSlice(padOp: opToHoist, outermostEnclosingForOp, backwardSlice);
335 if (backwardSlice.size() <= 1) {
336 valid = false;
337 return;
338 }
339
340 debugPrintBackwardSlice(backwardSlice);
341 // Remove all ops in the backward slice that are not used to index
342 // the padded tensor. In particular, keep `padOp`, `sliceOp`, and
343 // the loop and affine operations used for the index computation.
344 if (failed(Result: dropNonIndexDependencies())) {
345 LLVM_DEBUG(DBGS() << "--Cannot dropNonIndexDependencies -> Skip\n");
346 valid = false;
347 return;
348 }
349 debugPrintBackwardSlice(backwardSlice);
350
351 // Add only the loops part of the filtered `backwardSlice` to the
352 // packing loops. All other loops are not used to index the padded
353 // data and consequently access the same data in every loop
354 // iteration. Adding them to the packing loops would increase the
355 // cache footprint of the packed data by storing the same data
356 // multiple times.
357 for (scf::ForOp forOp : llvm::reverse(C&: reverseEnclosingLoops))
358 if (backwardSlice.contains(key: forOp))
359 packingLoops.push_back(Elt: forOp);
360
361 // TODO: for multiple loops we need to track the use to the innermost loop.
362 if (packingLoops.size() > 1 && padConsumingForOp) {
363 LLVM_DEBUG(DBGS() << "--Cannot hoist multiple loops through iter_args -> "
364 "Downgrade to 1 loop\n");
365 packingLoops.resize(N: 1);
366 }
367
368 // Note: at this point, packing loops may be empty but we would still like
369 // to hoist the padding if so specified.
370
371 // The analysis is valid and hoisting can occur.
372 valid = true;
373}
374
375LogicalResult HoistPaddingAnalysis::dropNonIndexDependencies() {
376 // Set of all values used for index computation.
377 SetVector<Value> indexEdges;
378
379 // Add all index operands of `operation` to `indexEdges`. An index operand
380 // is an operand of type index.
381 auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
382 for (Value operand : operation->getOperands())
383 if (operand.getType().isIndex())
384 indexEdges.insert(X: operand);
385 };
386
387 // Check if any operation result is contained in `indexEdges`.
388 auto hasIndexResult = [&](Operation *operation) {
389 return llvm::any_of(Range: operation->getResults(), P: [&](Value result) {
390 return indexEdges.contains(key: result);
391 });
392 };
393
394 // Starting from `opToHoist` and `sliceOp` walk the use-def edges of index
395 // type in `backwardSlice`. Add the index operands of an operation to
396 // `indexEdges` and remove all operations from `backwardSlice` that are not
397 // part of the index computation.
398 //
399 // Example:
400 // ```
401 // %source = linalg.fill(%cst, %arg0)
402 // scf.for %i
403 // %unrelated = linalg.fill(%cst, %arg1) // not used to index %source!
404 // scf.for %j (%arg2 = %unrelated)
405 // scf.for %k // not used to index %source!
406 // %ubi = affine.min #map(%i)
407 // %ubj = affine.min #map(%j)
408 // %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
409 // %padded_slice = tensor.pad %slice
410 // ```
411 // After iterating `backwardSlice` we obtain:
412 // indexEdges = [%i, %j, %ubi, %ubj]
413 // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
414 SetVector<Operation *> operationsToRemove;
415 for (Operation *op : llvm::reverse(C&: backwardSlice)) {
416 // Add the index operands of `opToHoist` and `sliceOp` to start the
417 // exploration of the index computation.
418 if (op == opToHoist || op == sliceOp) {
419 addIndexOperandsToIndexEdges(op);
420 continue;
421 }
422 // Add the index operands of the loop if its induction variable is
423 // used for index computation.
424 if (auto forOp = dyn_cast<scf::ForOp>(Val: op)) {
425 if (!hasIndexResult(op) && indexEdges.contains(key: forOp.getInductionVar())) {
426 addIndexOperandsToIndexEdges(op);
427 continue;
428 }
429 }
430 // Add the index operands of all other operations if at least one result
431 // is used for index computation.
432 if (hasIndexResult(op)) {
433 addIndexOperandsToIndexEdges(op);
434 // Check the operands of the remaining operations all have index type.
435 if (llvm::any_of(Range: op->getOperandTypes(),
436 P: [](Type type) { return !type.isIndex(); })) {
437 LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
438 << op << " -> Skip\n");
439 return failure();
440 }
441 // Check the remaining operations do not have regions or memory effects.
442 auto effectInterface = dyn_cast<MemoryEffectOpInterface>(Val: op);
443 bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
444 if (hasMemoryEffect || op->getNumRegions() != 0) {
445 LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
446 << op << " -> Skip\n");
447 return failure();
448 }
449 continue;
450 }
451 // Remove all other operations not used by the index computation. An
452 // exception are constant operations that may be used by `opToHoist`.
453 if (!isa<arith::ConstantOp>(Val: op))
454 operationsToRemove.insert(X: op);
455 }
456 backwardSlice.set_subtract(operationsToRemove);
457 return success();
458}
459
460SmallVector<Value>
461HoistPaddingAnalysis::getHoistedPackedTensorSizes(RewriterBase &rewriter,
462 Location loc) const {
463 SmallVector<Value> dynamicTensorSizes;
464
465 // Upper bound the packing loop lengths to size the packed tensor. Taking
466 // upper bounds can make the sizes of the packed tensor independent of the
467 // enclosing loops. This independence is a prerequisite for reusing the same
468 // buffer for all enclosing loop iterations and hoisting its allocation out
469 // of the enclosing loops.
470 for (auto forOp : packingLoops) {
471 // Compute an upper bound `ubVal` for the upper bound of `forOp`.
472 FailureOr<OpFoldResult> loopUb = affine::reifyIndexValueBound(
473 b&: rewriter, loc, type: presburger::BoundType::UB, value: forOp.getUpperBound(),
474 /*stopCondition=*/
475 [&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) {
476 if (v == forOp.getUpperBound())
477 return false;
478 // Compute a bound that is independent of any affine op results.
479 Operation *op = v.getDefiningOp();
480 if (!op)
481 return true;
482 return !isa<affine::AffineMinOp, affine::AffineMaxOp,
483 affine::AffineApplyOp>(Val: op);
484 },
485 /*closedUB=*/true);
486 assert(succeeded(loopUb) && "could not get upper bound");
487 Value ubVal = getValueOrCreateConstantIndexOp(b&: rewriter, loc, ofr: *loopUb);
488
489 // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
490 // store the result to `dynamicTensorSizes`.
491 // TODO: instead of using the lower bound of `forOp` directly, implement a
492 // lower bound computation similar to the upper bound computation.
493 AffineExpr lb, ub, step;
494 bindDims(ctx: rewriter.getContext(), exprs&: lb, exprs&: ub);
495 bindSymbols(ctx: rewriter.getContext(), exprs&: step);
496 Value res = rewriter.createOrFold<affine::AffineApplyOp>(
497 location: loc, args: (ub - lb).ceilDiv(other: step),
498 args: ValueRange{forOp.getLowerBound(), ubVal,
499 cast<scf::ForOp>(Val&: forOp).getStep()});
500 dynamicTensorSizes.push_back(Elt: res);
501 }
502
503 return dynamicTensorSizes;
504}
505
506static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
507 return outer.isDefinedOutsideOfLoop(value: v) || matchPattern(value: v, pattern: m_Constant());
508}
509
510//===----------------------------------------------------------------------===//
511// buildPackingLoopNest Implementation.
512//===----------------------------------------------------------------------===//
513
514/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
515/// The returned Value is guaranteed not to depend on any loop comprised in
516/// [`outer`, `forOp`].
517/// Return null if such a loop-independent quantity cannot be computed.
518static Value buildLoopIterationCount(RewriterBase &rewriter, scf::ForOp outer,
519 scf::ForOp forOp) {
520 MLIRContext *ctx = forOp->getContext();
521 AffineExpr iv, lb, step;
522 bindDims(ctx, exprs&: iv, exprs&: lb);
523 bindSymbols(ctx, exprs&: step);
524 if (!isDefinedOutsideOrConstant(outer, v: forOp.getLowerBound()) ||
525 !isDefinedOutsideOrConstant(outer, v: forOp.getStep()))
526 return Value();
527 Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
528 stepVal = forOp.getStep();
529 auto loc = forOp->getLoc();
530 return rewriter.createOrFold<affine::AffineApplyOp>(
531 location: loc, args: (iv - lb).ceilDiv(other: step), args: ValueRange{ivVal, lbVal, stepVal});
532}
533
534// Build a packing loop nest by iteratively traversing the backward slice and
535// clone the operations, iteratively stepping into the loops that we encounter.
536// The implementation proceeds in a stack-like fashion:
537// 1. Iteratively clone and step into the loops, pushing the
538// `hoistedPackedTensor`
539// deeper in the stack.
540// 2. At the innermost loop level, create a GenericOp if `transposeVector` is
541// non-empty.
542// 3. At the innermost loop level, create a InsertSliceOp.
543// 4. Iteratively pop and yield the result of the InsertSliceOp across the
544// cloned loops.
545static FailureOr<PackingResult> buildPackingLoopNestImpl(
546 RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
547 ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
548 tensor::EmptyOp emptyOp, const HoistPaddingAnalysis &analysis) {
549 SmallVector<OpFoldResult> offsets, sizes, strides;
550 SmallVector<Value> clonedLoopIvs, leadingHoistedPackedTensorIndexings;
551
552 scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
553
554 Location loc = opToHoist->getLoc();
555 RankedTensorType paddedTensorType = opToHoist.getResultType();
556 int paddedRank = paddedTensorType.getRank();
557
558 // Step 0. Populate bvm with opToHoist.getSource if relevant.
559 BlockArgument bbArg = dyn_cast<BlockArgument>(Val: opToHoist.getSource());
560 while (bbArg) {
561 auto forOp = dyn_cast<scf::ForOp>(Val: bbArg.getOwner()->getParentOp());
562 if (!forOp)
563 break;
564 if (forOp != outerLoop && !outerLoop->isAncestor(other: forOp))
565 break;
566 OpOperand &operand = *forOp.getTiedLoopInit(bbArg);
567 bvm.map(from: bbArg, to: operand.get());
568 bbArg = dyn_cast<BlockArgument>(Val: operand.get());
569 }
570
571 // Step 1. iteratively clone loops and push `hoistedPackedTensor`.
572 Value hoistedPackedTensor = emptyOp.getResult();
573 OpBuilder::InsertionGuard g(rewriter);
574 for (Operation *op : analysis.backwardSlice) {
575 // Specifically sit out in the extract_slice(hoistedPackedTensor) case: this
576 // is the piece we seek to replace.
577 if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(Val: op)) {
578 if (bvm.lookupOrDefault(from: sliceOp.getSource()) == hoistedPackedTensor) {
579 LLVM_DEBUG(DBGS() << "--Skip: " << sliceOp << "\n");
580 continue;
581 }
582 }
583
584 // Clone all operations except loops which require special handling.
585 auto forOp = dyn_cast<scf::ForOp>(Val: op);
586 if (!forOp) {
587 // We are at the right insertion point within the loop nest.
588 rewriter.clone(op&: *op, mapper&: bvm);
589 continue;
590 }
591
592 // Create a packing loop that takes `hoistedPackedTensor` as iteration
593 // argument.
594 auto clonedForOp = rewriter.create<scf::ForOp>(
595 location: loc, args: bvm.lookupOrDefault(from: forOp.getLowerBound()),
596 args: bvm.lookupOrDefault(from: forOp.getUpperBound()),
597 args: bvm.lookupOrDefault(from: forOp.getStep()), args&: hoistedPackedTensor);
598
599 // Map the induction var, region args and results to the `clonedForOp`.
600 bvm.map(from: forOp.getInductionVar(), to: clonedForOp.getInductionVar());
601 bvm.map(from: forOp.getRegionIterArgs(), to: clonedForOp.getRegionIterArgs());
602 bvm.map(from: forOp.getResults(), to: clonedForOp.getResults());
603 assert(clonedForOp->getNumRegions() == 1);
604 clonedLoopIvs.push_back(Elt: clonedForOp.getInductionVar());
605
606 // Do not insert guard here, we get deeper into the loop nest.
607 rewriter.setInsertionPointToStart(&clonedForOp->getRegion(index: 0).front());
608 Value loopIndependentIterationCount =
609 buildLoopIterationCount(rewriter, outer: outerLoop, forOp: clonedForOp);
610
611 // Assert the loop-independent iteration count can be computed.
612 if (!loopIndependentIterationCount)
613 llvm_unreachable("loop independence prerequisite not met");
614 leadingHoistedPackedTensorIndexings.push_back(
615 Elt: loopIndependentIterationCount);
616 hoistedPackedTensor = clonedForOp.getRegionIterArgs().front();
617 }
618
619 // Step 2. Construct offsets, sizes and strides for the innermost level of the
620 // packing loop.
621 int64_t nPackedLoops = clonedLoopIvs.size();
622 // offsets = [clonedLoopIvs, 0 .. 0].
623 offsets =
624 SmallVector<OpFoldResult>{leadingHoistedPackedTensorIndexings.begin(),
625 leadingHoistedPackedTensorIndexings.end()};
626 offsets.append(NumInputs: paddedRank, Elt: rewriter.getIndexAttr(value: 0));
627 // sizes = [1 .. 1, transposedShape].
628 sizes = SmallVector<OpFoldResult>(nPackedLoops, rewriter.getIndexAttr(value: 1));
629 for (int64_t sz : transposedTensorType.getShape()) {
630 // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
631 if (ShapedType::isDynamic(dValue: sz))
632 return failure();
633 sizes.push_back(Elt: rewriter.getIndexAttr(value: sz));
634 }
635 // strides = [1 .. 1].
636 strides = SmallVector<OpFoldResult>(nPackedLoops + paddedRank,
637 rewriter.getIndexAttr(value: 1));
638
639 // Step 3. Optionally transpose the padded tensor.
640 TransposeOp maybeTransposeOp;
641 Value paddedTensor = bvm.lookup(from: opToHoist.getResult());
642 if (!transposeVector.empty()) {
643 Value outputTensor = rewriter.create<tensor::ExtractSliceOp>(
644 location: loc, args&: transposedTensorType, args&: hoistedPackedTensor, args&: offsets, args&: sizes,
645 args&: strides);
646 maybeTransposeOp = rewriter.create<linalg::TransposeOp>(
647 location: loc, args&: paddedTensor, args&: outputTensor, args&: transposeVector);
648 paddedTensor = maybeTransposeOp.getResult()[0];
649 }
650
651 // Innermost tensor.insert_slice and yields are optional / need loops.
652 if (nPackedLoops > 0) {
653 // Step 4. Create InsertSliceOp at the innermost loop level, inserting an
654 // optionally transposed padded slice into the packed tensor.
655 Value inserted = rewriter.create<tensor::InsertSliceOp>(
656 location: loc, args&: paddedTensor, args&: hoistedPackedTensor, args&: offsets, args&: sizes, args&: strides);
657
658 // Step 5. Iteratively pop the stack and propagate the yield.
659 Value valueToYield = inserted;
660 for (Value iv : llvm::reverse(C&: clonedLoopIvs)) {
661 auto forOp = scf::getForInductionVarOwner(val: iv);
662 rewriter.setInsertionPointToEnd(&forOp.getRegion().front());
663 rewriter.create<scf::YieldOp>(location: loc, args&: valueToYield);
664 valueToYield = forOp.getResult(i: 0);
665 }
666 }
667
668 return PackingResult{
669 .offsets: offsets,
670 .sizes: sizes,
671 .strides: strides,
672 .clonedLoopIvs: clonedLoopIvs,
673 .leadingPackedTensorIndexings: leadingHoistedPackedTensorIndexings,
674 .maybeTransposeOp: maybeTransposeOp,
675 .hoistedPadOp: cast<tensor::PadOp>(Val: bvm.lookup(from: opToHoist.getResult()).getDefiningOp())};
676}
677
678/// Build the packing loop nest required to hoist `opToHoist` above
679/// `outermostEnclosingForOp`.
680/// The loop nest is built just before `outermostEnclosingForOp`.
681static FailureOr<PackingResult> buildPackingLoopNestImpl(
682 RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
683 ArrayRef<int64_t> transposeVector, const HoistPaddingAnalysis &analysis) {
684 // Update actual number of loops, which may be smaller.
685 int nPackedLoops = analysis.packingLoops.size();
686 LLVM_DEBUG(DBGS() << "\n";
687 DBGS() << "Func:\n"
688 << *opToHoist->getParentOfType<func::FuncOp>() << "\n";
689 DBGS() << "Start hoisting above " << nPackedLoops << " loops\n");
690
691 Location loc = opToHoist->getLoc();
692 RankedTensorType paddedTensorType = opToHoist.getResultType();
693
694 // Compute the type of the transposed padded tensor.
695 FailureOr<RankedTensorType> transposedTensorType =
696 tensor::computeTransposedType(rankedTensorType: paddedTensorType, transposeVector);
697 if (failed(Result: transposedTensorType)) {
698 LLVM_DEBUG(DBGS() << "--Could not compute transposed type -> Skip\n");
699 return failure();
700 }
701
702 // Create the packed tensor<?x?x..? x transposedShape>.
703 SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamic);
704 // TODO: go grab dims when needed, atm tensor::PadOp yields a static tensor.
705 llvm::append_range(C&: packedShape, R: transposedTensorType->getShape());
706 auto hoistedPackedTensorType = RankedTensorType::get(
707 shape: packedShape, elementType: transposedTensorType->getElementType());
708
709 // Set the insertion point right before the outer loop and start packing.
710 scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
711 OpBuilder::InsertionGuard g(rewriter);
712 rewriter.setInsertionPoint(outerLoop);
713 SmallVector<Value> dynamicTensorSizes =
714 analysis.getHoistedPackedTensorSizes(rewriter, loc);
715 auto emptyOp = rewriter.create<tensor::EmptyOp>(
716 location: loc, args: hoistedPackedTensorType.getShape(),
717 args: hoistedPackedTensorType.getElementType(), args&: dynamicTensorSizes);
718
719 return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
720 transposedTensorType: *transposedTensorType, emptyOp, analysis);
721}
722
723/// Build the packing loop nest required to hoist `opToHoist` above
724/// `outermostEnclosingForOp`.
725/// The loop nest is built just before `outermostEnclosingForOp`.
726FailureOr<PackingResult> mlir::linalg::detail::buildPackingLoopNest(
727 RewriterBase &rewriter, tensor::PadOp opToHoist,
728 scf::ForOp outermostEnclosingForOp, ArrayRef<int64_t> transposeVector) {
729 HoistPaddingAnalysis analysis(opToHoist, outermostEnclosingForOp);
730 analysis.enableHoistPadding(rewriter);
731 analysis.finalizeHoistPaddingAnalysis();
732 if (!analysis.isValid()) {
733 LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
734 return failure();
735 }
736 IRMapping bvm;
737 return buildPackingLoopNestImpl(rewriter, bvm, opToHoist, transposeVector,
738 analysis);
739}
740
741//===----------------------------------------------------------------------===//
742// hoistPaddingOnTensors Implementation.
743//===----------------------------------------------------------------------===//
744
745/// Return true if we can walk back the use-def chain from `extractSliceOp` to
746/// expectedSource going through DestinationStyleOpInterface inits only.
747/// This is a poor man's analysis that is sufficient to check the extractSliceOp
748/// the matches tensor.pad we want to hoist.
749/// In the future, it will be easier to ensure this with a matching symmetric
750/// tensor.unpad op.
751static bool tracesBackToExpectedValue(tensor::ExtractSliceOp extractSliceOp,
752 Value expectedSource) {
753 LLVM_DEBUG(DBGS() << "Start tracesBackToExpectedValue on: " << extractSliceOp
754 << "\n");
755 LLVM_DEBUG(DBGS() << "--with extractSlice: " << extractSliceOp << "\n");
756 Value source = extractSliceOp.getSource();
757 LLVM_DEBUG(DBGS() << "--with starting source: " << source << "\n");
758 while (source && source != expectedSource) {
759 auto destOp =
760 dyn_cast_or_null<DestinationStyleOpInterface>(Val: source.getDefiningOp());
761 if (!destOp)
762 break;
763 LLVM_DEBUG(DBGS() << "--step dest op: " << destOp << "\n");
764 source = destOp.getDpsInitOperand(i: cast<OpResult>(Val&: source).getResultNumber())
765 ->get();
766 }
767 LLVM_DEBUG(DBGS() << "--final source: " << source << "\n");
768 LLVM_DEBUG(DBGS() << "--expected source: " << expectedSource << "\n");
769 return source == expectedSource;
770}
771
772/// If the original consumer of `outerSliceOp` was a `forOp` (i.e. through an
773/// iter arg), propagate the `hoistedPackedTensor` value through the same iter
774/// arg.
775/// TODO: for multiple loops we need to track the use to the innermost loop.
776///
777/// Match:
778/// ```
779/// %outerSliceOp = tensor.extract_slice ..
780/// %f = scf.for ... iter_args(%arg0 = %outerSliceOp) {
781/// %hoistedPackedTensor = tensor.pad %arg0
782/// %1 = compute %hoistedPackedTensor
783/// %2 = tensor.extract_slice %1
784/// scf.yield %2
785/// }
786/// ```
787///
788/// and rewrite as:
789/// ```
790/// %outerSliceOp = tensor.extract_slice ..
791/// %hoistedPackedTensor = tensor.pad %outerSliceOp
792/// %f = scf.for ... iter_args(%arg0 = %hoistedPackedTensor) {
793/// %1 = compute %arg0
794/// scf.yield %1
795/// }
796/// %2 = tensor.extract_slice %forOp
797/// ```
798///
799/// Return null when no rewrite happened.
800static tensor::ExtractSliceOp
801padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting,
802 Value hoistedPackedTensor,
803 tensor::ExtractSliceOp outerSliceOp, scf::ForOp forOp) {
804 LLVM_DEBUG(DBGS() << "Start padThroughLoopIterArg on: " << forOp << "\n");
805 LLVM_DEBUG(DBGS() << "--paddedValueBeforeHoisting: "
806 << paddedValueBeforeHoisting << "\n");
807 OpOperand *pUse = nullptr;
808 for (OpOperand &use : outerSliceOp->getUses()) {
809 if (use.getOwner() == forOp) {
810 assert(!pUse && "Multiple slice uses in the for loop");
811 pUse = &use;
812 }
813 }
814 assert(pUse && "No slice use in the for loop");
815 OpBuilder::InsertionGuard g(rewriter);
816 rewriter.setInsertionPointAfter(hoistedPackedTensor.getDefiningOp());
817
818 unsigned iterArgNumber = forOp.getTiedLoopResult(opOperand: pUse).getResultNumber();
819 auto yieldingExtractSliceOp = forOp.getYieldedValues()[iterArgNumber]
820 .getDefiningOp<tensor::ExtractSliceOp>();
821 if (!yieldingExtractSliceOp)
822 return tensor::ExtractSliceOp();
823
824 // Poor man's analysis sufficient to ensure extractSlice matches tensor.pad.
825 // In the future, it will be easier to ensure this with a matching symmetric
826 // tensor.unpad op.
827 if (!tracesBackToExpectedValue(extractSliceOp: yieldingExtractSliceOp,
828 expectedSource: paddedValueBeforeHoisting))
829 return tensor::ExtractSliceOp();
830
831 SmallVector<Value> initArgs = forOp.getInitArgs();
832 initArgs[iterArgNumber] = hoistedPackedTensor;
833 SmallVector<Value> yieldOperands = llvm::to_vector(Range: forOp.getYieldedValues());
834 yieldOperands[iterArgNumber] = yieldingExtractSliceOp.getSource();
835
836 int64_t numOriginalForOpResults = initArgs.size();
837 LLVM_DEBUG(DBGS() << "numOriginalForOpResults: " << numOriginalForOpResults
838 << "\n");
839 tensor::ExtractSliceOp extracted;
840 {
841 OpBuilder::InsertionGuard g(rewriter);
842 rewriter.setInsertionPointAfter(forOp);
843 extracted = rewriter.create<tensor::ExtractSliceOp>(
844 location: hoistedPackedTensor.getLoc(), args&: hoistedPackedTensor,
845 args: outerSliceOp.getMixedOffsets(), args: outerSliceOp.getMixedSizes(),
846 args: outerSliceOp.getMixedStrides());
847 rewriter.replaceAllUsesWith(from: forOp.getResult(i: iterArgNumber), to: extracted);
848 }
849 scf::ForOp newForOp = cast<scf::ForOp>(Val: *forOp.replaceWithAdditionalYields(
850 rewriter, newInitOperands: initArgs, /*replaceInitOperandUsesInLoop=*/true,
851 newYieldValuesFn: [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBBArgs) {
852 return yieldOperands;
853 }));
854
855 LLVM_DEBUG(DBGS() << "newForOp results: " << newForOp.getNumResults()
856 << "\n");
857 LLVM_DEBUG(DBGS() << "replace source of: " << extracted << "\n");
858 LLVM_DEBUG(DBGS() << "with result #"
859 << numOriginalForOpResults + iterArgNumber
860 << " of forOp, giving us: " << extracted << "\n");
861 rewriter.startOpModification(op: extracted);
862 extracted.getSourceMutable().assign(
863 value: newForOp.getResult(i: numOriginalForOpResults + iterArgNumber));
864 rewriter.finalizeOpModification(op: extracted);
865
866 LLVM_DEBUG(DBGS() << "replace uses of: " << paddedValueBeforeHoisting
867 << "\n");
868 LLVM_DEBUG(DBGS() << "with region iter arg #"
869 << numOriginalForOpResults + iterArgNumber << "\n");
870 rewriter.replaceAllUsesWith(
871 from: paddedValueBeforeHoisting,
872 to: newForOp.getRegionIterArg(index: numOriginalForOpResults + iterArgNumber));
873
874 return extracted;
875}
876
877/// Produce a tensor extracted from the packingResult. This can be used as a
878/// replacement for `opToHoist` in callers.
879static Value replaceByPackingResult(RewriterBase &rewriter,
880 const IRMapping &bvm,
881 tensor::PadOp opToHoist,
882 RankedTensorType transposedTensorType,
883 const HoistPaddingAnalysis &analysis,
884 const PackingResult &packingResult) {
885 // The replacement occurs under a single insertion point within the original
886 // loop, just before opToHoist.
887 OpBuilder::InsertionGuard g(rewriter);
888 rewriter.setInsertionPoint(opToHoist);
889
890 Location loc = opToHoist->getLoc();
891 RankedTensorType paddedTensorType = opToHoist.getResultType();
892 int paddedRank = paddedTensorType.getRank();
893
894 int64_t nPackedLoops = packingResult.clonedLoopIvs.size();
895 LLVM_DEBUG(DBGS() << "nPackedLoops: " << nPackedLoops << " loops\n");
896
897 scf::ForOp outerLoop = analysis.outermostEnclosingForOp;
898 ArrayRef<scf::ForOp> packingLoops = analysis.packingLoops;
899
900 Value hoistedPackedTensor;
901 SmallVector<Value> loopIterationCounts;
902 SmallVector<OpFoldResult> offsets(nPackedLoops + paddedRank,
903 rewriter.getIndexAttr(value: 0));
904 if (nPackedLoops > 0) {
905 loopIterationCounts =
906 llvm::to_vector<4>(Range: llvm::map_range(C&: packingLoops, F: [&](Operation *loop) {
907 return buildLoopIterationCount(rewriter, outer: outerLoop,
908 forOp: cast<scf::ForOp>(Val: loop));
909 }));
910 // Assert all loop iteration counts can be computed.
911 if (llvm ::any_of(Range&: loopIterationCounts, P: [](Value v) { return !v; }))
912 llvm_unreachable("loop independence prerequisite not met");
913
914 // offsets = [maybe_leading_ivs = originalLoopIvs, 0 .. 0].
915 std::copy(first: loopIterationCounts.begin(), last: loopIterationCounts.end(),
916 result: offsets.begin());
917 hoistedPackedTensor =
918 scf::getForInductionVarOwner(val: packingResult.clonedLoopIvs.front())
919 ->getResult(idx: 0);
920 } else {
921 // If no loops were created, this is just hoisting without packing.
922 hoistedPackedTensor = bvm.lookup(from: opToHoist.getResult());
923 }
924
925 LLVM_DEBUG(DBGS() << "hoistedPackedTensor: " << hoistedPackedTensor << "\n");
926
927 // If the consumer of `padOp` was a `forOp`, propagate through iter args.
928 scf::ForOp forOp = analysis.padConsumingForOp;
929 if (forOp) {
930 return padThroughLoopIterArg(rewriter, paddedValueBeforeHoisting: opToHoist, hoistedPackedTensor,
931 outerSliceOp: analysis.sliceOp, forOp);
932 }
933
934 // offsets = [maybe_leading_ivs, 0 .. 0].
935 // sizes = [1 .. 1, transposedShape] (defined above).
936 // strides = [1 .. 1] (defined above)
937 return rewriter.create<tensor::ExtractSliceOp>(
938 location: loc, args&: transposedTensorType, args&: hoistedPackedTensor, args&: offsets,
939 args: packingResult.sizes, args: packingResult.strides);
940}
941
942FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
943 RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops,
944 ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
945 SmallVectorImpl<TransposeOp> &transposeOps) {
946 LLVM_DEBUG(DBGS() << "\n"; DBGS() << " Try to hoist " << *(opToHoist) << "\n";
947 DBGS() << " by " << numLoops << " loops\n");
948
949 HoistPaddingAnalysis analysis(opToHoist, numLoops);
950 analysis.enableHoistPadding(rewriter);
951 analysis.finalizeHoistPaddingAnalysis();
952 if (!analysis.isValid()) {
953 LLVM_DEBUG(DBGS() << "--Analysis failed -> Skip\n");
954 return failure();
955 }
956
957 /// Construct the packing loop nest.
958 IRMapping bvm;
959 FailureOr<PackingResult> packingResult = buildPackingLoopNestImpl(
960 rewriter, bvm, opToHoist, transposeVector, analysis);
961 if (failed(Result: packingResult)) {
962 LLVM_DEBUG(DBGS() << "--buildPackingLoopNestImpl failed -> Skip\n");
963 return failure();
964 }
965
966 if (!transposeVector.empty())
967 transposeOps.push_back(Elt: packingResult->maybeTransposeOp);
968
969 FailureOr<RankedTensorType> transposedTensorType =
970 tensor::computeTransposedType(rankedTensorType: opToHoist.getResultType(), transposeVector);
971 assert(succeeded(transposedTensorType) && "unexpected failure in type");
972
973 // Now the packed tensor is ready, replace the original padding op by a
974 // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
975 Value newResult =
976 replaceByPackingResult(rewriter, bvm, opToHoist, transposedTensorType: *transposedTensorType,
977 analysis, packingResult: *packingResult);
978
979 Location loc = opToHoist->getLoc();
980 RankedTensorType paddedTensorType = opToHoist.getResultType();
981 if (!transposeVector.empty()) {
982 OpBuilder::InsertionGuard g(rewriter);
983 rewriter.setInsertionPointAfter(newResult.getDefiningOp());
984 // Transpose the packed tensor back to the original storage order.
985 Value emptyTensor = rewriter.create<tensor::EmptyOp>(
986 location: loc, args: paddedTensorType.getShape(), args: paddedTensorType.getElementType());
987 TransposeOp unTransposeOp = rewriter.create<linalg::TransposeOp>(
988 location: loc, args&: newResult, args&: emptyTensor, args&: transposeVector);
989 newResult = unTransposeOp.getResult()[0];
990 transposeOps.push_back(Elt: unTransposeOp);
991 }
992
993 LLVM_DEBUG(DBGS() << "newResult: " << newResult << "\n");
994 LLVM_DEBUG(
995 DBGS() << "After hoisting: "
996 << newResult.getDefiningOp()->getParentOfType<func::FuncOp>()
997 << "\n");
998
999 // Make the newly cloned `opToHoist` available to the caller.
1000 hoistedOp = packingResult->hoistedPadOp;
1001
1002 LLVM_DEBUG(DBGS() << "--SUCCESS\n");
1003 return newResult;
1004}
1005
1006FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
1007 tensor::PadOp opToHoist, int64_t numLoops,
1008 ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
1009 SmallVectorImpl<TransposeOp> &transposeOps) {
1010 IRRewriter rewriter(opToHoist.getContext());
1011 return hoistPaddingOnTensors(rewriter, opToHoist, numLoops, transposeVector,
1012 hoistedOp, transposeOps);
1013}
1014

source code of mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp