1 | //===- Transforms.h - Linalg transformations as patterns --------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H |
10 | #define MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H |
11 | |
12 | #include <utility> |
13 | |
14 | #include "mlir/Conversion/VectorToSCF/VectorToSCF.h" |
15 | #include "mlir/Dialect/Bufferization/IR/Bufferization.h" |
16 | #include "mlir/Dialect/Linalg/Utils/Utils.h" |
17 | #include "mlir/Dialect/MemRef/IR/MemRef.h" |
18 | #include "mlir/Dialect/SCF/Utils/Utils.h" |
19 | #include "mlir/Dialect/Tensor/IR/Tensor.h" |
20 | #include "mlir/Dialect/Utils/StaticValueUtils.h" |
21 | #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" |
22 | #include "mlir/Dialect/X86Vector/Transforms.h" |
23 | #include "mlir/IR/PatternMatch.h" |
24 | #include "mlir/Interfaces/TilingInterface.h" |
25 | #include "mlir/Support/LogicalResult.h" |
26 | #include "mlir/Transforms/DialectConversion.h" |
27 | #include "llvm/ADT/SmallBitVector.h" |
28 | #include "llvm/ADT/SmallSet.h" |
29 | |
30 | namespace mlir { |
31 | namespace bufferization { |
32 | class AllocTensorOp; |
33 | class OneShotAnalysisState; |
34 | } // namespace bufferization |
35 | |
36 | namespace linalg { |
37 | |
38 | class LinalgOp; |
39 | |
40 | //===----------------------------------------------------------------------===// |
41 | // Utils. |
42 | //===----------------------------------------------------------------------===// |
43 | |
/// Return the vector::CombiningKind for the given op if it corresponds to a
/// known combining kind, std::nullopt otherwise.
45 | std::optional<vector::CombiningKind> getCombinerOpKind(Operation *combinerOp); |
46 | |
47 | //===----------------------------------------------------------------------===// |
48 | // Bufferization-related transforms. |
49 | //===----------------------------------------------------------------------===// |
50 | |
51 | struct BufferizeToAllocationOptions { |
52 | enum class AllocOp { MemrefAlloc = 0, MemrefAlloca = 1 }; |
53 | AllocOp allocOp = AllocOp::MemrefAlloc; |
54 | |
55 | enum class MemcpyOp { |
56 | MaterializeInDestination = 0, |
57 | MemrefCopy = 1, |
58 | LinalgCopy = 2 |
59 | }; |
60 | MemcpyOp memcpyOp = MemcpyOp::MaterializeInDestination; |
61 | |
62 | /// If set to "true", only the destination tensor operands are bufferized to |
63 | /// a new allocation (and wrapped in "bufferization.to_tensor"), but not the |
64 | /// targeted op itself. |
65 | bool bufferizeDestinationOnly = false; |
66 | |
67 | /// If set to "true", a `memref.dealloc` operation will be emitted for each |
68 | /// allocated buffer. Otherwise, the memory is leaked, which is useful if |
69 | /// the buffer deallocation pipeline should be run after bufferization is |
70 | /// done. |
71 | bool emitDealloc = false; |
72 | }; |
73 | |
74 | /// Materialize a buffer allocation for the given tensor.pad op and lower the |
75 | /// op to linalg.fill/linalg.generic + bufferization.materialize_in_destination. |
76 | /// E.g.: |
77 | /// |
78 | /// %0 = tensor.pad low[%l] high[%h] %t ... |
79 | /// |
80 | /// is lowered to: |
81 | /// |
82 | /// %alloc = memref.alloc |
83 | /// linalg.fill ... outs(%alloc) |
84 | /// %subview = memref.subview %alloc [%l] [...] [1] |
85 | /// bufferization.materialize_in_destination %t in %subview |
86 | /// %0 = bufferization.to_tensor %alloc restrict writable |
87 | /// |
88 | /// In addition to rewriting the IR as shown above, this function returns the |
89 | /// newly allocated buffer. The `insertionPoint` parameter can be used to |
90 | /// specify a custom insertion point for the buffer allocation. |
91 | Value bufferizeToAllocation(RewriterBase &rewriter, |
92 | const BufferizeToAllocationOptions &options, |
93 | tensor::PadOp padOp, Attribute memorySpace = {}, |
94 | Operation *insertionPoint = nullptr); |
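//
// Illustrative usage sketch (not a prescribed workflow; `rewriter` and `padOp`
// are assumed to be provided by the caller, e.g. from within a transform):
//
//   BufferizeToAllocationOptions options;
//   options.memcpyOp = BufferizeToAllocationOptions::MemcpyOp::LinalgCopy;
//   options.emitDealloc = true;
//   // Returns the newly allocated buffer backing the pad.
//   Value buffer = bufferizeToAllocation(rewriter, options, padOp);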
95 | |
96 | /// Materialize a buffer allocation for the given vector.mask op and bufferize |
97 | /// the op, including its region. E.g.: |
98 | /// |
99 | /// %0 = vector.mask { |
100 | /// vector.transfer_write %v, %t : vector<16xf32>, tensor<?xf32> |
101 | /// } : vector<16xi1> -> tensor<?xf32> |
102 | /// |
103 | /// is lowered to: |
104 | /// |
105 | /// %alloc = memref.alloc |
/// bufferization.materialize_in_destination %t in %alloc
107 | /// vector.mask { |
108 | /// vector.transfer_write %arg0, %alloc : vector<16xf32>, memref<?xf32> |
109 | /// } : vector<16xi1> |
110 | /// %0 = bufferization.to_tensor %alloc restrict writable |
111 | /// |
112 | /// In addition to rewriting the IR as shown above, this function returns the |
113 | /// newly allocated buffer. The `insertionPoint` parameter can be used to |
114 | /// specify a custom insertion point for the buffer allocation. |
115 | Value bufferizeToAllocation(RewriterBase &rewriter, |
116 | const BufferizeToAllocationOptions &options, |
117 | vector::MaskOp maskOp, Attribute memorySpace = {}, |
118 | Operation *insertionPoint = nullptr); |
119 | |
120 | /// Materialize a buffer allocation for the given bufferization.alloc_tensor op |
121 | /// and lower the op to memref.alloc + memref.tensor_store. |
122 | /// |
123 | /// In addition to rewriting the IR, this function returns the newly allocated |
124 | /// buffer. The `insertionPoint` parameter can be used to specify a custom |
125 | /// insertion point for the buffer allocation. |
126 | Value bufferizeToAllocation(RewriterBase &rewriter, |
127 | const BufferizeToAllocationOptions &options, |
128 | bufferization::AllocTensorOp allocTensorOp, |
129 | Attribute memorySpace = {}, |
130 | Operation *insertionPoint = nullptr); |
131 | |
132 | /// Bufferize the given op with tensor semantics and materialize the result in |
133 | /// a newly allocated buffer. |
134 | /// |
135 | /// Only bufferizable ops that bufferize to a memory write or have an |
136 | /// aliasing OpOperand (and do not themselves bufferize to an allocation) are |
137 | /// supported. They are bufferized using their BufferizableOpInterface |
138 | /// implementation. |
139 | /// |
140 | /// Selected ops that bufferize to an allocation (or need special handling) are |
141 | /// also supported: |
142 | /// - tensor.pad |
143 | /// - vector.mask |
144 | /// |
145 | /// This function returns the newly allocated buffer. The `insertionPoint` |
146 | /// parameter can be used to specify a custom insertion point for the buffer |
147 | /// allocation. |
148 | Value bufferizeToAllocation(RewriterBase &rewriter, |
149 | const BufferizeToAllocationOptions &options, |
150 | Operation *op, Attribute memorySpace = {}, |
151 | Operation *insertionPoint = nullptr); |
152 | |
153 | /// Try to eliminate tensor::EmptyOps inside `op` that are anchored on a |
/// LinalgOp. This transform looks for LinalgOps that have an unused output
155 | /// operand and an input operand that is rooted in a tensor::EmptyOp. The |
156 | /// tensor::EmptyOp uses are replaced with the output operand and the two |
157 | /// operands of the LinalgOp are swapped. |
158 | /// |
159 | /// Example: |
160 | /// %0 = tensor.empty() |
161 | /// %1 = linalg.matmul ins(...) outs(%0) |
162 | /// %2 = linalg.generic ins(%1) outs(%dest) { |
163 | /// ^bb0(%in: f32, %out: f32): |
164 | /// // out not used |
165 | /// } |
166 | /// |
167 | /// The IR is transformed as follows: |
168 | /// %0 = tensor.empty() |
169 | /// %1 = linalg.matmul ins(...) outs(%dest) |
170 | /// %2 = linalg.generic ins(%0) outs(%1) { |
171 | /// ^bb0(%in: f32, %out: f32): |
172 | /// // Use %out instead of %in |
173 | /// } |
174 | /// |
175 | /// The "ins" operand has no uses inside the body of the LinalgOp and can be |
176 | /// folded away with existing cleanup patterns. Afterwards, the tensor::EmptyOp |
177 | /// can also fold away. |
178 | LogicalResult linalgOpAnchoredEmptyTensorEliminationStep( |
179 | RewriterBase &rewriter, Operation *op, |
180 | bufferization::OneShotAnalysisState &state); |
181 | |
182 | //===----------------------------------------------------------------------===// |
183 | // Structs that configure the behavior of various transformations. |
184 | //===----------------------------------------------------------------------===// |
185 | |
186 | using TileSizeComputationFunction = |
187 | std::function<SmallVector<Value, 4>(OpBuilder &, Operation *)>; |
188 | |
189 | struct LinalgTilingOptions { |
190 | /// Computation function that returns the tile sizes for each operation. |
/// Constant tile sizes should be constructed lazily so that they can
/// interoperate with folding.
193 | TileSizeComputationFunction tileSizeComputationFunction = nullptr; |
194 | |
195 | LinalgTilingOptions & |
196 | setTileSizeComputationFunction(TileSizeComputationFunction fun) { |
197 | tileSizeComputationFunction = std::move(fun); |
198 | return *this; |
199 | } |
200 | /// Set the `tileSizeComputationFunction` to return the values `ts`. The |
201 | /// values must not fold away when tiling. Otherwise, use a more robust |
202 | /// `tileSizeComputationFunction`. |
203 | LinalgTilingOptions &setTileSizes(const SmallVector<Value, 4> &ts) { |
204 | tileSizeComputationFunction = [=](OpBuilder &, Operation *) { return ts; }; |
205 | return *this; |
206 | } |
207 | /// Convenience function to set the `tileSizeComputationFunction` to a |
208 | /// function that computes tile sizes at the point they are needed. Allows |
209 | /// proper interaction with folding. |
210 | LinalgTilingOptions &setTileSizes(ArrayRef<int64_t> ts); |
211 | |
212 | /// Tile all dynamic dimensions by 1. I.e., scalarize those dimensions. |
213 | /// Note: `scalarizeDynamicDims` and `setTileSizes` cannot be used together. |
214 | LinalgTilingOptions &scalarizeDynamicDims(); |
215 | |
216 | /// The interchange vector to reorder the tiled loops. |
217 | SmallVector<unsigned, 4> interchangeVector = {}; |
218 | |
219 | LinalgTilingOptions &setInterchange(ArrayRef<unsigned> interchange) { |
interchangeVector.assign(interchange.begin(), interchange.end());
221 | return *this; |
222 | } |
223 | |
224 | /// The type of tile loops to generate. |
225 | LinalgTilingLoopType loopType = LinalgTilingLoopType::Loops; |
226 | |
227 | LinalgTilingOptions &setLoopType(LinalgTilingLoopType lt) { |
228 | loopType = lt; |
229 | return *this; |
230 | } |
231 | |
/// When specified, distributes the generated tile loops to processors.
234 | std::optional<LinalgLoopDistributionOptions> distribution; |
235 | |
236 | LinalgTilingOptions & |
237 | setDistributionOptions(LinalgLoopDistributionOptions distributionOptions) { |
238 | distribution = std::move(distributionOptions); |
239 | return *this; |
240 | } |
241 | |
242 | /// Specification markers of how to distribute the `linalg.tiled_loop`. |
243 | SmallVector<StringRef, 2> distributionTypes = {}; |
244 | |
245 | LinalgTilingOptions &setDistributionTypes(ArrayRef<StringRef> types) { |
distributionTypes.assign(types.begin(), types.end());
247 | return *this; |
248 | } |
249 | |
250 | /// Peel the specified loops. |
251 | SmallVector<int64_t> peeledLoops; |
252 | |
253 | LinalgTilingOptions &setPeeledLoops(ArrayRef<int64_t> loops) { |
254 | peeledLoops.clear(); |
peeledLoops.append(loops.begin(), loops.end());
256 | return *this; |
257 | } |
258 | }; |
259 | |
260 | struct LinalgTilingAndFusionOptions { |
261 | /// Tile sizes used to tile the root operation. |
262 | SmallVector<int64_t> tileSizes; |
263 | LinalgTilingAndFusionOptions &setTileSizes(ArrayRef<int64_t> ts) { |
tileSizes.assign(ts.begin(), ts.end());
265 | return *this; |
266 | } |
267 | /// Tile interchange used to permute the tile loops. |
268 | SmallVector<int64_t> tileInterchange; |
/// When specified, distributes the generated tile loops to processors.
271 | std::optional<LinalgLoopDistributionOptions> tileDistribution; |
272 | LinalgTilingAndFusionOptions & |
273 | setDistributionOptions(LinalgLoopDistributionOptions distributionOptions) { |
274 | tileDistribution = std::move(distributionOptions); |
275 | return *this; |
276 | } |
277 | }; |
278 | |
279 | struct LinalgPaddingOptions { |
280 | /// A padding value for every operand. |
281 | SmallVector<Attribute> paddingValues; |
282 | LinalgPaddingOptions &setPaddingValues(ArrayRef<Attribute> pv) { |
paddingValues.assign(pv.begin(), pv.end());
284 | return *this; |
285 | } |
286 | /// A list of iterator dimensions to pad. |
287 | SmallVector<int64_t> paddingDimensions; |
288 | LinalgPaddingOptions &setPaddingDimensions(ArrayRef<int64_t> pd) { |
paddingDimensions.assign(pd.begin(), pd.end());
290 | return *this; |
291 | } |
/// A list of multiples to which each padding dimension should be padded.
293 | std::optional<SmallVector<int64_t>> padToMultipleOf; |
294 | LinalgPaddingOptions &setPadToMultipleOf(ArrayRef<int64_t> m) { |
padToMultipleOf.emplace(m.begin(), m.end());
296 | return *this; |
297 | } |
/// A flag for every operand to mark the PadOp as nofold, which enables
/// packing for statically shaped operands.
300 | SmallVector<bool> packPaddings; |
301 | LinalgPaddingOptions &setPackPaddings(ArrayRef<bool> pp) { |
packPaddings.assign(pp.begin(), pp.end());
303 | return *this; |
304 | } |
/// The number of loops to hoist the PadOp out of, for every operand.
306 | SmallVector<int64_t> hoistPaddings; |
307 | LinalgPaddingOptions &setHoistPaddings(ArrayRef<int64_t> hp) { |
hoistPaddings.assign(hp.begin(), hp.end());
309 | return *this; |
310 | } |
311 | /// A permutation vector for every operand used to transpose the packed |
312 | /// PadOp results. |
313 | SmallVector<SmallVector<int64_t>> transposePaddings; |
314 | LinalgPaddingOptions & |
315 | setTransposePaddings(ArrayRef<SmallVector<int64_t>> tp) { |
transposePaddings.assign(tp.begin(), tp.end());
317 | return *this; |
318 | } |
319 | enum class CopyBackOp : int8_t { |
320 | None = 0, |
321 | BufferizationMaterializeInDestination = 1, |
322 | LinalgCopy = 2 |
323 | }; |
324 | /// The op to be used for copying the padded result to the original |
325 | /// destination tensor. |
326 | CopyBackOp copyBackOp = CopyBackOp::BufferizationMaterializeInDestination; |
327 | LinalgPaddingOptions &setCopyBackOp(CopyBackOp op) { |
328 | copyBackOp = op; |
329 | return *this; |
330 | } |
331 | }; |
332 | |
333 | /// Callback function type used to perform the allocation for the promoted |
/// `subView`. In `boundingSubViewSize` a best attempt is made to find the
/// smallest constant value for the size of the buffer needed for each
/// dimension. If that is not possible, it contains the dynamic size of the
/// subview. The callback should return the buffer to use.
338 | using AllocBufferCallbackFn = std::function<std::optional<Value>( |
339 | OpBuilder &b, memref::SubViewOp subView, |
340 | ArrayRef<Value> boundingSubViewSize, DataLayout &layout)>; |
341 | |
342 | /// Callback function type used to deallocate the buffers used to hold the |
343 | /// promoted subview. |
344 | using DeallocBufferCallbackFn = |
345 | std::function<LogicalResult(OpBuilder &b, Value buffer)>; |
346 | |
/// Callback function type used to insert a copy from the original subview to
/// the subview of the promoted region (for the read operands), or from the
/// subview of the promoted region to the original subview (for the results).
/// The copy has to happen from `src` to `dst`.
351 | using CopyCallbackFn = |
352 | std::function<LogicalResult(OpBuilder &b, Value src, Value dst)>; |
353 | |
354 | struct LinalgPromotionOptions { |
355 | /// Indices of subViews to promote. If `std::nullopt`, try to promote all |
356 | /// operands. |
357 | std::optional<DenseSet<unsigned>> operandsToPromote; |
358 | LinalgPromotionOptions &setOperandsToPromote(ArrayRef<int64_t> operands) { |
359 | operandsToPromote = DenseSet<unsigned>(); |
operandsToPromote->insert(operands.begin(), operands.end());
361 | return *this; |
362 | } |
/// If the ith element of `useFullTiles` is true, the full view should be
/// used for the promoted buffer of the ith operand in `operandsToPromote`.
365 | /// Otherwise the partial view will be used. The decision is defaulted to |
366 | /// `useFullTileBuffersDefault` when `useFullTileBuffers` is std::nullopt and |
367 | /// for operands missing from `useFullTileBuffers`. |
368 | std::optional<llvm::SmallBitVector> useFullTileBuffers; |
369 | LinalgPromotionOptions &setUseFullTileBuffers(ArrayRef<bool> useFullTiles) { |
370 | unsigned size = useFullTiles.size(); |
371 | llvm::SmallBitVector tmp(size, false); |
372 | for (unsigned i = 0; i < size; ++i) |
373 | tmp[i] = useFullTiles[i]; |
374 | useFullTileBuffers = tmp; |
375 | return *this; |
376 | } |
377 | /// If true all operands unspecified by `useFullTileBuffers` will use the |
378 | /// full view, otherwise the partial view. |
379 | bool useFullTileBuffersDefault = false; |
380 | LinalgPromotionOptions &setUseFullTileBuffersByDefault(bool use) { |
381 | useFullTileBuffersDefault = use; |
382 | return *this; |
383 | } |
384 | /// Alignment of promoted buffer. If `std::nullopt` do not specify alignment. |
385 | std::optional<unsigned> alignment; |
386 | LinalgPromotionOptions &setAlignment(unsigned align) { |
387 | alignment = align; |
388 | return *this; |
389 | } |
390 | /// Memory space of promoted buffer. If `std::nullopt` do not specify memory |
391 | /// space. |
392 | std::optional<Attribute> memorySpace; |
393 | LinalgPromotionOptions &setMemorySpace(Attribute memorySpc) { |
394 | memorySpace = memorySpc; |
395 | return *this; |
396 | } |
397 | /// Use alloca with the default allocation scheme. |
398 | bool useAlloca = false; |
399 | LinalgPromotionOptions &setUseAlloca(bool use) { |
400 | useAlloca = use; |
401 | return *this; |
402 | } |
403 | /// Callback function to do the allocation of the promoted buffer. If |
404 | /// std::nullopt, then the default allocation scheme of allocating a |
405 | /// memref<?xi8> buffer followed by a view operation is used. |
406 | std::optional<AllocBufferCallbackFn> allocationFn; |
407 | std::optional<DeallocBufferCallbackFn> deallocationFn; |
408 | LinalgPromotionOptions & |
409 | setAllocationDeallocationFns(AllocBufferCallbackFn const &allocFn, |
410 | DeallocBufferCallbackFn const &deallocFn) { |
411 | allocationFn = allocFn; |
412 | deallocationFn = deallocFn; |
413 | return *this; |
414 | } |
415 | /// Callback function to do the copy of data to and from the promoted |
416 | /// subview. If std::nullopt then a memref.copy is used. |
417 | std::optional<CopyCallbackFn> copyInFn; |
418 | std::optional<CopyCallbackFn> copyOutFn; |
419 | LinalgPromotionOptions &setCopyInOutFns(CopyCallbackFn const ©In, |
420 | CopyCallbackFn const ©Out) { |
421 | copyInFn = copyIn; |
422 | copyOutFn = copyOut; |
423 | return *this; |
424 | } |
425 | }; |
426 | |
427 | /// Split Reduction options. |
428 | struct SplitReductionOptions { |
429 | // Ratio used to split the reduction dimension. If the ratio is <= 1, |
430 | // nothing will be done. |
431 | int64_t ratio = 0; |
432 | // Index where the extra dimension is added to the intermediate tensor |
433 | // shape. |
434 | unsigned index = 0; |
435 | // If the inner dimension after splitting is parallel or reduction. |
436 | bool innerParallel = false; |
437 | }; |
438 | |
439 | /// Function signature to control reduction splitting. This returns |
440 | /// `SplitReductionOptions`. |
441 | // TODO: don't use unsigned unless doing bit manipulation. |
442 | using ControlSplitReductionFn = |
443 | std::function<SplitReductionOptions(LinalgOp op)>; |
444 | |
445 | //===----------------------------------------------------------------------===// |
446 | // Preconditions that ensure the corresponding transformation succeeds and can |
447 | // be applied as a rewrite pattern. |
448 | //===----------------------------------------------------------------------===// |
449 | |
/// Return true if two `linalg.generic` operations with a producer/consumer
/// relationship through `fusedOperand` can be fused using elementwise op
452 | /// fusion. |
453 | bool areElementwiseOpsFusable(OpOperand *fusedOperand); |
454 | |
455 | /// Promote memref.subviews feeding linalg-on-buffers operations. |
456 | LogicalResult promoteSubviewsPrecondition(Operation *op, |
457 | LinalgPromotionOptions options); |
458 | |
459 | /// Return success if the operation can be vectorized. |
460 | LogicalResult vectorizeOpPrecondition(Operation *op, |
461 | ArrayRef<int64_t> inputVectorSizes = {}, |
462 | ArrayRef<bool> inputScalableVecDims = {}, |
463 | bool = false, |
464 | bool flatten1DDepthwiseConv = false); |
465 | |
466 | //===----------------------------------------------------------------------===// |
467 | // Transformations exposed as functional-style API calls. |
468 | //===----------------------------------------------------------------------===// |
469 | |
470 | using LinalgLoops = SmallVector<Operation *, 4>; |
471 | |
472 | /// Transformation to drop unit-extent dimensions from `linalg.generic` |
473 | /// operations. |
474 | struct ControlDropUnitDims { |
475 | enum class RankReductionStrategy { ReassociativeReshape, }; |
476 | |
477 | RankReductionStrategy rankReductionStrategy = |
478 | RankReductionStrategy::ReassociativeReshape; |
479 | |
480 | using ControlFnTy = std::function<SmallVector<unsigned>(Operation *)>; |
481 | ControlFnTy controlFn = [](Operation *op) { |
482 | if (auto genericOp = dyn_cast_or_null<GenericOp>(op)) { |
483 | return llvm::to_vector(llvm::seq<unsigned>(0, genericOp.getNumLoops())); |
484 | } |
485 | if (auto padOp = dyn_cast_or_null<tensor::PadOp>(op)) { |
486 | return llvm::to_vector( |
487 | llvm::seq<unsigned>(0, padOp.getSourceType().getRank())); |
488 | } |
489 | return SmallVector<unsigned>{}; |
490 | }; |
491 | }; |
492 | LogicalResult dropUnitDims(RewriterBase &rewriter, GenericOp genericOp, |
493 | const ControlDropUnitDims &options); |
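//
// Illustrative usage sketch (assumes `rewriter` and `genericOp` from the
// caller); the default `controlFn` considers all loops of the generic op:
//
//   ControlDropUnitDims options;
//   options.rankReductionStrategy =
//       ControlDropUnitDims::RankReductionStrategy::ReassociativeReshape;
//   if (failed(dropUnitDims(rewriter, genericOp, options)))
//     return failure();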
494 | |
495 | /// Fuse two `linalg.generic` operations that have a producer-consumer |
496 | /// relationship captured through `fusedOperand`. The method expects |
497 | /// that `areElementwiseOpsFusable` returns true for the given `fusedOperand`. |
498 | struct ElementwiseOpFusionResult { |
499 | Operation *fusedOp; |
500 | llvm::DenseMap<Value, Value> replacements; |
501 | static llvm::SmallDenseSet<int> |
502 | getPreservedProducerResults(GenericOp producer, GenericOp consumer); |
503 | }; |
504 | FailureOr<ElementwiseOpFusionResult> |
505 | fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand); |
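//
// Illustrative usage sketch (assumes `rewriter` and `fusedOperand`, an
// OpOperand of the consumer generic op whose value is produced by the
// producer generic op):
//
//   if (!areElementwiseOpsFusable(fusedOperand))
//     return failure();
//   FailureOr<ElementwiseOpFusionResult> fused =
//       fuseElementwiseOps(rewriter, fusedOperand);
//   if (failed(fused))
//     return failure();
//   // `fused->replacements` maps original values to results of the fused op.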
506 | |
507 | /// Try to peel and canonicalize loop `op` and return the new result. |
508 | /// Also applies affine_min/max bounds simplification on the fly where relevant. |
509 | // TODO: Add support for scf.parallel and affine.for loops. |
510 | SmallVector<Value> peelLoop(RewriterBase &rewriter, Operation *op); |
511 | |
/// Peel `loops` and apply affine_min/max bounds simplification on the fly
/// where relevant.
514 | void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops); |
515 | |
516 | /// Pad the iterator dimensions `paddingDimensions` of all `opToPad` operands |
517 | /// to a static bounding box. The original `opToPad` is cloned and operates on |
518 | /// the padded tensors. |
519 | /// |
520 | /// * "options.padToMultipleOf" indicates that each padding dimension should be |
521 | /// padded to the specified multiple. |
522 | /// * Use "options.paddingValues" and "options.packPaddings" to set padding |
523 | /// value and nofold attribute of the created tensor::PadOps, respectively. |
524 | /// * The unpadded results (extracted slice of the cloned operation) are |
525 | /// returned via `replacements`. |
526 | /// * The tensor::PadOps are returned via `padOps`. |
527 | /// * "options.copyBackOp" specifies the op type for copying back the unpadded |
528 | /// result to the original destination tensor. |
529 | LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad, |
530 | const LinalgPaddingOptions &options, |
531 | LinalgOp &paddedOp, |
532 | SmallVector<Value> &replacements, |
533 | SmallVector<tensor::PadOp> &padOps); |
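//
// Illustrative usage sketch (assumes `rewriter`, `linalgOp` and a hypothetical
// `zeroAttr` padding value per operand; pads iterator dimensions 0 and 1 to
// multiples of 16):
//
//   LinalgPaddingOptions options;
//   options.setPaddingDimensions({0, 1});
//   options.setPadToMultipleOf({16, 16});
//   options.setPaddingValues({zeroAttr, zeroAttr, zeroAttr});
//   LinalgOp paddedOp;
//   SmallVector<Value> replacements;
//   SmallVector<tensor::PadOp> padOps;
//   if (failed(rewriteAsPaddedOp(rewriter, linalgOp, options, paddedOp,
//                                replacements, padOps)))
//     return failure();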
534 | |
535 | namespace detail { |
536 | |
537 | /// Helper struct to hold the results of building a packing loop nest. |
538 | struct PackingResult { |
539 | SmallVector<OpFoldResult> offsets, sizes, strides; |
540 | SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings; |
541 | GenericOp maybeTransposeOp; |
542 | tensor::PadOp hoistedPadOp; |
543 | }; |
544 | |
545 | /// Build the packing loop nest required to hoist `opToHoist` above |
546 | /// `outermostEnclosingForOp`. |
547 | /// The loop nest is built just before `outermostEnclosingForOp`. |
548 | FailureOr<PackingResult> |
549 | buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist, |
550 | scf::ForOp outermostEnclosingForOp, |
551 | ArrayRef<int64_t> transposeVector); |
552 | |
553 | } // namespace detail |
554 | |
555 | /// Mechanically hoist padding operations on tensors by `numLoops` into a new, |
556 | /// generally larger tensor. This achieves packing of multiple padding ops into |
557 | /// a larger tensor. On success, `opToHoist` is replaced by the cloned version |
558 | /// in the packing loop so the caller can continue reasoning about the padding |
559 | /// operation. If `transposeVector` is non-empty, hoist padding introduces a |
560 | /// GenericOp to transpose the padded tensor before inserting it into the packed |
561 | /// tensor. A `transposeVector` can change the storage order of the padded |
562 | /// tensor but does not change the order of the pack or compute loops. |
563 | /// |
564 | /// TODO: In the future, we should consider rewriting as a tensor.pack after |
565 | /// hoisting since this abstraction is now available. |
566 | /// |
567 | /// Example in pseudo-mlir: |
568 | /// ======================= |
569 | /// |
/// If `hoistPaddingOnTensors` is called with `numLoops` = 2 on the following
/// IR:
571 | /// ``` |
572 | /// scf.for (%i, %j, %k) |
573 | /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32> |
574 | /// %0 = tensor.pad %st0 low[0, 0] high[...] { |
575 | /// ^bb0( ... ): |
576 | /// linalg.yield %pad |
577 | /// } : tensor<?x?xf32> to tensor<4x8xf32> |
578 | /// compute(%0) |
579 | /// ``` |
580 | /// |
581 | /// IR resembling the following is produced: |
582 | /// |
583 | /// ``` |
584 | /// scf.for (%i) { |
585 | /// %packed_init = tensor.empty range(%j) : tensor<?x4x8xf32> |
586 | /// %packed = scf.for (%k) iter_args(%p : %packed_init) { |
587 | /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32> |
588 | /// %0 = tensor.pad %st0 low[0, 0] high[...] { |
589 | /// ^bb0( ... ): |
590 | /// linalg.yield %pad |
591 | /// } : tensor<?x?xf32> to tensor<4x8xf32> |
592 | /// %1 = tensor.insert_slice %0 ... |
593 | /// : tensor<4x8xf32> to tensor<?x4x8xf32> |
594 | /// scf.yield %1: tensor<?x4x8xf32> |
595 | /// } -> tensor<?x4x8xf32> |
596 | /// scf.for (%j, %k) { |
597 | /// %st0 = tensor.extract_slice %packed [%k, 0, 0][1, 4, 8][1, 1, 1] : |
598 | /// tensor<?x4x8xf32> to tensor<4x8xf32> |
599 | /// compute(%st0) |
600 | /// } |
601 | /// } |
602 | /// ``` |
603 | FailureOr<Value> |
604 | hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist, |
605 | int64_t numLoops, ArrayRef<int64_t> transposeVector, |
606 | tensor::PadOp &hoistedOp, |
607 | SmallVectorImpl<GenericOp> &transposeOps); |
608 | /// Calls into `hoistPaddingOnTensors` with a local IRRewriter. |
609 | FailureOr<Value> |
610 | hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops, |
611 | ArrayRef<int64_t> transposeVector, |
612 | tensor::PadOp &hoistedOp, |
613 | SmallVectorImpl<GenericOp> &transposeOps); |
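//
// Illustrative usage sketch (assumes `rewriter` and a `padOp` nested under at
// least two scf.for loops; hoists the pad by 2 loops without transposition):
//
//   tensor::PadOp hoistedOp;
//   SmallVector<GenericOp> transposeOps;
//   FailureOr<Value> newResult = hoistPaddingOnTensors(
//       rewriter, padOp, /*numLoops=*/2, /*transposeVector=*/{}, hoistedOp,
//       transposeOps);
//   if (failed(newResult))
//     return failure();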
614 | |
615 | /// Apply padding and hoisting to `linalgOp` according to the configuration |
616 | /// specified in `options`. |
617 | FailureOr<LinalgOp> padAndHoistLinalgOp(RewriterBase &rewriter, |
618 | LinalgOp linalgOp, |
619 | const LinalgPaddingOptions &options); |
620 | |
621 | /// Split the given `op` into two parts along the given iteration space |
622 | /// `dimension` at the specified `splitPoint`, and return the two parts. |
623 | /// If the second part is statically known to be empty, do not create it |
624 | /// and return nullptr instead. Error state is signalled by returning |
625 | /// a pair of nullptrs. |
626 | /// |
627 | /// For example, the following op: |
628 | /// |
629 | /// linalg.matmul ins(%0, %1 : tensor<128x32xf32>, tensor<32x64xf32>) |
630 | /// outs(%2 : tensor<128x64xf32>) |
631 | /// |
632 | /// split along the first dimension at position 42 will result in: |
633 | /// |
634 | /// %3 = tensor.extract_slice %0[0, 0][42, 32][1, 1] |
635 | /// %4 = tensor.extract_slice %2[0, 0][42, 64][1, 1] |
636 | /// %5 = linalg.matmul ins(%3, %1 : tensor<42x32xf32>, tensor<32x64xf32>) |
/// outs(%4 : tensor<42x64xf32>)
638 | /// %6 = tensor.insert_slice %5 into %2[0, 0][42, 64][1, 1] |
639 | /// |
640 | /// %7 = tensor.extract_slice %0[42, 0][86, 32][1, 1] |
641 | /// %8 = tensor.extract_slice %6[42, 0][86, 64][1, 1] |
642 | /// %9 = linalg.matmul ins(%7, %1 : tensor<86x32xf32>, tensor<32x64xf32>) |
643 | /// outs(%8 : tensor<86x64xf32>) |
/// tensor.insert_slice %9 into %6[42, 0][86, 64][1, 1]
645 | /// |
646 | /// Note that there is no simplification other than constant propagation applied |
647 | /// to slice extraction and insertion. |
648 | std::pair<TilingInterface, TilingInterface> splitOp(RewriterBase &rewriter, |
649 | TilingInterface op, |
650 | unsigned dimension, |
651 | OpFoldResult splitPoint); |
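//
// Illustrative usage sketch (assumes `rewriter` and a `tilingInterfaceOp`;
// splits iteration dimension 0 at static position 42, as in the example
// above):
//
//   auto [firstPart, secondPart] =
//       splitOp(rewriter, tilingInterfaceOp, /*dimension=*/0,
//               /*splitPoint=*/rewriter.getIndexAttr(42));
//   if (!firstPart && !secondPart)
//     return failure(); // A pair of nullptrs signals the error state.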
652 | |
/// Perform standalone tiling of a single LinalgOp by `tileSizes`, and permute
/// the loop nest according to `interchangeVector`. The permutation is
/// expressed as a list of integers that specify the new ordering of the loop
/// nest. The length of `interchangeVector` must be equal to the length of
/// `tileSizes`. An empty vector is interpreted as the identity permutation and
/// the transformation returns early.
660 | /// |
/// Return a struct containing the tiled loops in the specified order and the
/// cloned op if successful, failure otherwise.
663 | /// |
664 | /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed by |
665 | /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be |
666 | /// integers, in the range 0..`tileSizes.size()` without duplications |
667 | /// (i.e. `[1,1,2]` is an invalid permutation). |
668 | struct TiledLinalgOp { |
669 | LinalgOp op; |
670 | SmallVector<Operation *, 8> loops; |
671 | SmallVector<Value, 4> tensorResults; |
672 | }; |
673 | FailureOr<TiledLinalgOp> tileLinalgOp(RewriterBase &b, LinalgOp op, |
674 | const LinalgTilingOptions &options); |
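//
// Illustrative usage sketch (assumes `rewriter` and `linalgOp`; tiles the
// first two loops by 8 and 16 and interchanges them):
//
//   LinalgTilingOptions options;
//   options.setTileSizes({8, 16})
//       .setInterchange({1, 0})
//       .setLoopType(LinalgTilingLoopType::Loops);
//   FailureOr<TiledLinalgOp> tiled = tileLinalgOp(rewriter, linalgOp, options);
//   if (failed(tiled))
//     return failure();
//   // `tiled->loops` are the generated loops, `tiled->op` is the tiled clone.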
675 | |
/// Interchange the `iterator_types` and `indexing_maps` dimensions and adapt
677 | /// the index accesses of `op`. This is an in-place transformation controlled |
678 | /// by `interchangeVector`. An empty vector is interpreted as the identity |
679 | /// permutation and the transformation returns early. |
680 | /// |
681 | /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed with |
682 | /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be |
683 | /// integers, in the range 0..`op.rank` without duplications |
684 | /// (i.e. `[1,1,2]` is an invalid permutation). |
685 | /// |
686 | /// Return failure if the permutation is not valid. |
687 | FailureOr<GenericOp> interchangeGenericOp(RewriterBase &rewriter, |
688 | GenericOp genericOp, |
689 | ArrayRef<unsigned> interchangeVector); |
690 | |
691 | /// Create a GenericOp from the given named operation `namedOp` and replace |
692 | /// namedOp. |
/// Return failure if `namedOp` is a GenericOp or lacks a region builder.
694 | FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter, |
695 | LinalgOp namedOp); |
696 | |
697 | /// Create a namedOp from the given GenericOp and replace the GenericOp. |
698 | /// Currently we can specialize only trivial linalg copy operations. |
699 | FailureOr<LinalgOp> specializeGenericOp(RewriterBase &rewriter, |
700 | GenericOp genericOp); |
701 | |
702 | /// Create a new buffer using the `allocationFn` provided. The size of this |
703 | /// buffer is the smallest constant bounding size along each dimension that |
704 | /// can be computed for the size of the result of `subView`. Returns the |
705 | /// allocated buffer as `fullLocalView` and the view that matches the size of |
706 | /// the result of subview operation as `partialLocalView`. |
707 | struct PromotionInfo { |
708 | Value fullLocalView; |
709 | Value partialLocalView; |
710 | }; |
711 | FailureOr<PromotionInfo> |
712 | promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView, |
713 | const AllocBufferCallbackFn &allocationFn, |
714 | DataLayout &layout); |
715 | |
716 | /// Promote the `subViews` into a new buffer allocated at the insertion point |
717 | /// `b`. Promotion occurs in 3 steps: |
718 | /// 1. Create a new buffer for a full tile (i.e. not clipped at the |
719 | /// boundary). |
720 | /// 2. Take a full view on the buffer. |
721 | /// 3. Take a partial slice of the full view in step 2. and copy into it. |
722 | /// |
723 | /// Return the modified linalg op (the modification happens in place) as well |
724 | /// as all the copy ops created. |
725 | FailureOr<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op, |
726 | const LinalgPromotionOptions &options); |
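//
// Illustrative usage sketch (assumes `builder` and a tiled `linalgOp` whose
// operands are memref.subview results; promotes operands 0 and 1 into aligned
// full-tile buffers):
//
//   LinalgPromotionOptions options;
//   options.setOperandsToPromote({0, 1})
//       .setUseFullTileBuffersByDefault(true)
//       .setAlignment(16);
//   FailureOr<LinalgOp> promoted = promoteSubViews(builder, linalgOp, options);
//   if (failed(promoted))
//     return failure();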
727 | |
728 | /// Allocate the subview in the GPU workgroup memory. |
729 | std::optional<Value> allocateWorkgroupMemory(OpBuilder &builder, |
730 | memref::SubViewOp subview, |
731 | ArrayRef<Value> sizeBounds, |
732 | DataLayout &); |
733 | |
/// In the case of GPU workgroup memory there is no need to deallocate.
735 | LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/); |
736 | |
737 | /// Create Memref copy operations and add gpu barrier guards before and after |
738 | /// the copy operation to ensure data integrity. |
739 | LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst); |
740 | |
741 | /// Allocate the subview in the GPU private memory. |
742 | std::optional<Value> allocateGPUPrivateMemory(OpBuilder &builder, |
743 | memref::SubViewOp subview, |
744 | ArrayRef<Value> sizeBounds, |
745 | DataLayout &); |
746 | |
/// Normal copy between src and dst.
748 | LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst); |
749 | |
750 | /// In case of GPU private memory there is no need to deallocate since the |
751 | /// memory is freed when going outside of the scope. |
752 | LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/); |
753 | |
754 | /// Emit a suitable vector form for an operation. If provided, |
755 | /// `inputVectorSizes` are used to vectorize this operation. `inputVectorSizes` |
756 | /// must match the rank of the iteration space of the operation and the sizes |
/// must be smaller than or equal to their counterpart iteration space sizes,
/// if static. `inputVectorSizes` also allows the vectorization of operations
/// with dynamic shapes.
760 | LogicalResult vectorize(RewriterBase &rewriter, Operation *op, |
761 | ArrayRef<int64_t> inputVectorSizes = {}, |
762 | ArrayRef<bool> inputScalableVecDims = {}, |
763 | bool = false, |
764 | bool flatten1DDepthwiseConv = false); |
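//
// Illustrative usage sketch (assumes `rewriter` and `op`; vectorizes with
// explicit vector sizes, where the second dimension is scalable):
//
//   SmallVector<int64_t> vectorSizes = {8, 16};
//   SmallVector<bool> scalableDims = {false, true};
//   if (failed(vectorizeOpPrecondition(op, vectorSizes, scalableDims)))
//     return failure();
//   if (failed(vectorize(rewriter, op, vectorSizes, scalableDims)))
//     return failure();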
765 | |
766 | /// Emit a suitable vector form for a Copy op with fully static shape. |
767 | LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp); |
768 | |
769 | /// Emit a loop nest of `scf.for` with the proper body for `linalgOp`. |
770 | FailureOr<LinalgLoops> linalgOpToLoops(RewriterBase &rewriter, |
771 | LinalgOp linalgOp); |
772 | |
773 | /// Emit a loop nest of `scf.parallel` with the proper body for `linalgOp`. |
774 | FailureOr<LinalgLoops> linalgOpToParallelLoops(RewriterBase &rewriter, |
775 | LinalgOp linalgOp); |
776 | |
777 | /// Emit a loop nest of `affine.for` with the proper body for `linalgOp`. |
778 | FailureOr<LinalgLoops> linalgOpToAffineLoops(RewriterBase &rewriter, |
779 | LinalgOp linalgOp); |
780 | |
/// Creates a number of ranges equal to the number of non-zero entries in
/// `tileSizes`, one for each loop of the LinalgOp that is tiled. The
/// `tileSizes` argument has one entry per surrounding loop. It uses zero as
/// the convention that a particular loop is not tiled. This convention
/// simplifies implementations by avoiding affine map manipulations. The
/// returned ranges correspond to the loop ranges, in the proper order, that
/// are tiled and for which new loops will be created. The function also
/// returns a map from loop indices of the LinalgOp to the corresponding
/// non-empty range indices of newly created loops.
790 | using LoopIndexToRangeIndexMap = DenseMap<int, int>; |
791 | std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap> |
792 | makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, |
793 | ArrayRef<OpFoldResult> allShapeSizes, |
794 | ArrayRef<OpFoldResult> allTileSizes); |
795 | |
796 | namespace detail { |
797 | template <typename T> |
798 | struct MultiSizeSpecificationBase { |
799 | /// Tile sizes. |
800 | T lowTileSize, highTileSize; |
801 | /// Number of tiles associated with each size. |
802 | T lowTripCount, highTripCount; |
803 | }; |
804 | } // namespace detail |
805 | |
806 | /// A description of a multi-size tiling comprising tile sizes and numbers of |
807 | /// tiles, expressed as Values which may or may not be constant. Multi-size |
808 | /// currently means two-size. |
809 | struct MultiSizeSpecification |
810 | : public detail::MultiSizeSpecificationBase<Value> {}; |
811 | struct StaticMultiSizeSpecification |
812 | : public detail::MultiSizeSpecificationBase<int64_t> {}; |
813 | |
814 | /// Emits the IR computing the multi-sized tiling specification with two tile |
815 | /// sizes not exceeding `targetSize`, each divisible by `sizeDivisor`, such |
816 | /// that there exist numbers of tiles with these sizes that fully cover the |
817 | /// given iteration space `dimension` of the structured `op`. |
818 | /// |
819 | /// The computation is as follows: |
820 | /// |
821 | /// b = originalTripCount floordiv sizeDivisor |
822 | /// t = (targetSize + sizeDivisor - 1) floordiv sizeDivisor |
823 | /// d = (b + t - 1) floordiv t |
824 | /// s = (b floordiv d) * sizeDivisor |
825 | /// v = b % d |
826 | /// u = d - v |
827 | /// |
828 | /// where the tile sizes are `s` and `s` + `sizeDivisor`, and the numbers of |
829 | /// the corresponding tiles are `u` and `v`, respectively. Alternatively, |
830 | /// |
831 | /// s * u + (s + sizeDivisor) * v == original size, |
832 | /// where s mod sizeDivisor = 0. |
833 | /// |
834 | /// Expects all values to be positive. In some cases with the target tile size |
835 | /// sufficiently close to the dimension shape and non-unit divisor, it is |
836 | /// impossible to compute such sizes. If `emitAssertion` is set, also emit the |
837 | /// assertion that size computation succeeded. |
838 | /// |
839 | /// Returns the specification consisting of both tile values and the number of |
840 | /// tiles of each size. |
841 | FailureOr<MultiSizeSpecification> |
842 | computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, |
843 | OpFoldResult targetSize, OpFoldResult divisor, |
844 | bool emitAssertions = true); |
845 | FailureOr<StaticMultiSizeSpecification> |
846 | computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, |
847 | int64_t divisor); |
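//
// Worked example of the computation above (illustrative numbers): for an
// original trip count of 15 along `dimension`, `targetSize` = 8 and
// `divisor` = 1:
//   b = 15, t = 8, d = (15 + 8 - 1) floordiv 8 = 2,
//   s = (15 floordiv 2) * 1 = 7, v = 15 mod 2 = 1, u = 2 - 1 = 1,
// i.e. one tile of size 7 and one tile of size 8; indeed 7 * 1 + 8 * 1 == 15.
//
//   // Assumes `linalgOp` has a static trip count of 15 along dimension 0.
//   FailureOr<StaticMultiSizeSpecification> spec =
//       computeStaticMultiTileSizes(linalgOp, /*dimension=*/0,
//                                   /*targetSize=*/8, /*divisor=*/1);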
848 | |
849 | /// Rewrite a TilingInterface `op` to a tiled `scf.forall`, applying |
850 | /// tiling by `numThreads`. |
851 | /// If non-empty, the `mapping` is added as an attribute to the |
852 | /// resulting `scf.forall`. |
853 | /// Zero tile sizes indicate that the dimension is not tiled, and can be |
854 | /// thought of as tiling by the full size of data. It is the user's |
855 | /// responsibility to ensure that `numThreads` is a valid tiling specification |
/// (i.e. that it only tiles parallel dimensions, e.g. in the Linalg case).
857 | struct ForallTilingResult { |
858 | Operation *tileOp; |
859 | Operation *tiledOp; |
860 | }; |
861 | FailureOr<ForallTilingResult> tileToForallOp(RewriterBase &builder, |
862 | TilingInterface op, |
863 | ArrayRef<OpFoldResult> numThreads, |
864 | std::optional<ArrayAttr> mapping); |
865 | |
/// Same as `tileToForallOp`, but calculates the number of threads required
/// using the given `tileSizes`.
868 | FailureOr<ForallTilingResult> |
869 | tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op, |
870 | ArrayRef<OpFoldResult> tileSizes, |
871 | std::optional<ArrayAttr> mapping); |
872 | |
873 | /// Transformation information returned after reduction tiling. |
874 | struct ForallReductionTilingResult { |
875 | /// The partial reduction tiled op generated. |
876 | Operation *parallelTiledOp; |
877 | /// The final reduction operation merging all the partial reductions. |
878 | Operation *mergeOp; |
879 | /// The op initializing the tensor used for partial reductions. |
880 | Operation *initialOp; |
/// The `scf.forall` operation that iterates over the tiles.
882 | scf::ForallOp loops; |
883 | }; |
884 | |
885 | /// Method to tile a reduction to parallel iterations computing partial |
/// reductions. After the loop, all the partial reductions are merged into a
/// final reduction. For example, the following sequence
888 | /// |
889 | /// ```mlir |
890 | /// %0 = linalg.generic %in ["parallel", "reduction"] |
891 | /// : tensor<7x9xf32> -> tensor<7xf32> |
892 | /// ``` |
893 | /// |
/// is transformed into:
895 | /// |
896 | /// ```mlir |
897 | /// %0 = linalg.fill ... : tensor<7x4xf32> |
898 | /// %1 = scf.forall (%iv) in (%c4) shared_outs(%arg0 = %0) |
899 | /// -> (tensor<7x4xf32>) { |
/// %2 = tensor.extract_slice %arg0 : tensor<7x4xf32> to tensor<7xf32>
/// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
/// %4 = linalg.generic %2, %3 ["parallel", "reduction"]
/// : tensor<7x?xf32> -> tensor<7xf32>
/// %5 = tensor.insert_slice %4 into %arg0[0, %iv] : tensor<7x4xf32>
905 | /// } |
906 | /// %6 = linalg.generic %1 ["parallel", "reduction"] |
907 | /// : tensor<7x4xf32> -> tensor<7xf32> |
908 | /// ``` |
909 | FailureOr<ForallReductionTilingResult> |
910 | tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, |
911 | ArrayRef<OpFoldResult> numThreads, |
912 | ArrayRef<OpFoldResult> tileSizes = {}, |
913 | std::optional<ArrayAttr> mapping = std::nullopt); |
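//
// Illustrative usage sketch (assumes `rewriter` and an `op` implementing
// PartialReductionOpInterface with iterators ["parallel", "reduction"], as in
// the example above; the zero-means-untiled convention mirrors
// `tileToForallOp` and is an assumption of this sketch):
//
//   SmallVector<OpFoldResult> numThreads = {rewriter.getIndexAttr(0),
//                                           rewriter.getIndexAttr(4)};
//   FailureOr<ForallReductionTilingResult> result =
//       tileReductionUsingForall(rewriter, op, numThreads);
//   if (failed(result))
//     return failure();
//   // `result->mergeOp` combines the partial reductions after the forall.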
914 | |
915 | /// All indices returned by IndexOp should be invariant with respect to |
916 | /// tiling. Therefore, if an operation is tiled, we have to transform the |
917 | /// indices accordingly, i.e. offset them by the values of the corresponding |
918 | /// induction variables that are captured implicitly in the body of the op. |
919 | /// |
920 | /// Example. `linalg.generic` before tiling: |
921 | /// |
922 | /// #id_2d = (i, j) -> (i, j) |
923 | /// #pointwise_2d_trait = { |
924 | /// indexing_maps = [#id_2d, #id_2d], |
925 | /// iterator_types = ["parallel", "parallel"] |
926 | /// } |
927 | /// linalg.generic #pointwise_2d_trait %operand, %result { |
928 | /// ^bb0(%operand_in: f32, %result_in: f32): |
929 | /// %i = linalg.index 0 : index |
930 | /// %j = linalg.index 1 : index |
931 | /// <some operations that use %i, %j> |
932 | /// }: memref<50x100xf32>, memref<50x100xf32> |
933 | /// |
/// After the tiling pass with tile sizes 10 and 25:
935 | /// |
936 | /// #strided = (i, j)[s0, s1, s2] -> (i * s1 + s0 + j * s2) |
937 | /// |
938 | /// %c1 = arith.constant 1 : index |
939 | /// %c0 = arith.constant 0 : index |
940 | /// %c25 = arith.constant 25 : index |
941 | /// %c10 = arith.constant 10 : index |
942 | /// operand_dim_0 = dim %operand, 0 : memref<50x100xf32> |
943 | /// operand_dim_1 = dim %operand, 1 : memref<50x100xf32> |
944 | /// scf.for %k = %c0 to operand_dim_0 step %c10 { |
945 | /// scf.for %l = %c0 to operand_dim_1 step %c25 { |
946 | /// %4 = memref.subview %operand[%k, %l][%c10, %c25][%c1, %c1] |
947 | /// : memref<50x100xf32> to memref<?x?xf32, #strided> |
948 | /// %5 = memref.subview %result[%k, %l][%c10, %c25][%c1, %c1] |
949 | /// : memref<50x100xf32> to memref<?x?xf32, #strided> |
950 | /// linalg.generic pointwise_2d_trait %4, %5 { |
951 | /// ^bb0(%operand_in: f32, %result_in: f32): |
952 | /// %i = linalg.index 0 : index |
953 | /// %j = linalg.index 1 : index |
954 | /// // Indices `k` and `l` are implicitly captured in the body. |
/// %transformed_i = arith.addi %i, %k : index // index `i` is offset by %k
/// %transformed_j = arith.addi %j, %l : index // index `j` is offset by %l
/// // Every use of %i, %j is replaced with %transformed_i, %transformed_j
/// <some operations that use %transformed_i, %transformed_j>
961 | /// }: memref<?x?xf32, #strided>, memref<?x?xf32, #strided> |
962 | /// } |
963 | /// } |
964 | /// |
965 | /// TODO: Investigate whether mixing implicit and explicit indices |
966 | /// does not lead to losing information. |
967 | void transformIndexOps(RewriterBase &b, LinalgOp op, |
968 | SmallVectorImpl<Value> &ivs, |
969 | const LoopIndexToRangeIndexMap &loopIndexToRangeIndex); |
970 | |
971 | /// Apply transformation to split the single linalg op reduction into a |
972 | /// parallel and reduction dimension. Then create a new linalg.generic op |
973 | /// doing the rest of the reduction. Return the new linalg op with an extra |
974 | /// parallel dimension or failure if the transformation didn't happen. |
975 | /// |
976 | /// Example: |
977 | /// ``` |
978 | /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, |
979 | /// affine_map<(d0) -> ()>], |
980 | /// iterator_types = ["reduction"]} |
981 | /// ins(%in : tensor<32xf32>) |
982 | /// outs(%out : tensor<f32>) { |
983 | /// ^bb0(%arg1: f32, %arg2: f32): |
984 | /// %y = arith.addf %arg1, %arg2 : f32 |
985 | /// linalg.yield %y : f32 |
986 | /// } -> tensor<f32> |
987 | /// ``` |
988 | /// To: |
989 | /// ``` |
990 | /// %cst = arith.constant 0.000000e+00 : f32 |
/// %0 = tensor.expand_shape %in [[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
/// %1 = tensor.empty [4] : tensor<4xf32>
/// %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32>
/// %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
///                                       affine_map<(d0, d1) -> (d0)>],
///   iterator_types = ["parallel", "reduction"]}
///   ins(%0 : tensor<4x8xf32>) outs(%2 : tensor<4xf32>) {
///  ^bb0(%arg3: f32, %arg4: f32):
///    %5 = arith.addf %arg3, %arg4 : f32
1000 | /// linalg.yield %5 : f32 |
1001 | /// } -> tensor<4xf32> |
1002 | /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, |
1003 | /// affine_map<(d0) -> ()>], |
1004 | /// iterator_types = ["reduction"]} |
1005 | /// ins(%3 : tensor<4xf32>) outs(%out : tensor<f32>) { |
1006 | /// ^bb0(%arg3: f32, %arg4: f32): |
1007 | /// %5 = arith.addf %arg3, %arg4 : f32 |
1008 | /// linalg.yield %5 : f32 |
1009 | /// } -> tensor<f32> |
1010 | /// ``` |
1011 | struct SplitReductionResult { |
1012 | Operation *initOrAlloc; |
1013 | FillOp fillOp; |
1014 | LinalgOp splitLinalgOp; |
1015 | LinalgOp resultCombiningLinalgOp; |
1016 | }; |
1017 | FailureOr<SplitReductionResult> |
1018 | splitReduction(RewriterBase &b, LinalgOp op, |
1019 | const ControlSplitReductionFn &controlSplitReductionFn, |
1020 | bool useAlloc = false); |
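//
// Illustrative usage sketch (assumes `rewriter` and `linalgOp`; splits the
// reduction by a factor of 4 and inserts the extra dimension at index 0):
//
//   ControlSplitReductionFn control = [](LinalgOp op) {
//     return SplitReductionOptions{/*ratio=*/4, /*index=*/0,
//                                  /*innerParallel=*/false};
//   };
//   FailureOr<SplitReductionResult> result =
//       splitReduction(rewriter, linalgOp, control);
//   if (failed(result))
//     return failure();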
1021 | |
1022 | /// Scaling-based implementation of the split reduction transformation. |
1023 | /// Instead of introducing an ExpandShapeOp, this rewrites a reduction |
1024 | /// dimension `k` into `k * scale + kk`. |
1025 | /// |
1026 | /// Example: |
1027 | /// ``` |
1028 | /// %0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>) |
1029 | /// outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32> |
1030 | /// ``` |
1031 | /// |
1032 | /// Is transformed to: |
1033 | /// |
1034 | /// ``` |
1035 | /// #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2 * 4 + d3)> |
1036 | /// #map1 = affine_map<(d0, d1, d2, d3) -> (d2 * 4 + d3, d1)> |
1037 | /// #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)> |
1038 | /// #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> |
1039 | /// #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> |
1040 | /// #map5 = affine_map<(d0, d1, d2) -> (d0, d1)> |
1041 | /// %0 = tensor.empty [16, 32, 64] : tensor<16x32x64xf32> |
1042 | /// %cst = arith.constant 0.000000e+00 : f32 |
1043 | /// %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x32x64xf32>) -> |
1044 | /// tensor<16x32x64xf32> |
1045 | /// %2 = tensor.empty [64, 4] : tensor<64x4xi1> |
1046 | /// |
1047 | /// %3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3], |
1048 | /// iterator_types = ["parallel", "parallel", "parallel", "reduction"]} |
1049 | /// ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>, |
1050 | /// tensor<64x4xi1>) |
1051 | /// outs(%1 : tensor<16x32x64xf32>) { |
1052 | /// ^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32): |
1053 | /// %5 = arith.mulf %arg3, %arg4 : f32 |
1054 | /// %6 = arith.addf %arg6, %5 : f32 |
1055 | /// linalg.yield %6 : f32 |
1056 | /// } -> tensor<16x32x64xf32> |
1057 | /// |
1058 | /// %4 = linalg.generic {indexing_maps = [#map4, #map5], |
1059 | /// iterator_types = ["parallel", "parallel", "reduction"]} |
///  ins(%3 : tensor<16x32x64xf32>)
1061 | /// outs(%C : tensor<16x32xf32>) { |
1062 | /// ^bb0(%arg3: f32, %arg4: f32): |
1063 | /// %5 = arith.addf %arg3, %arg4 : f32 |
1064 | /// linalg.yield %5 : f32 |
1065 | /// } -> tensor<16x32xf32> |
1066 | /// |
1067 | /// return %4 : tensor<16x32xf32> |
1068 | /// ``` |
1069 | FailureOr<SplitReductionResult> |
1070 | splitReductionByScaling(RewriterBase &b, LinalgOp op, |
1071 | const ControlSplitReductionFn &controlSplitReductionFn, |
1072 | bool useAlloc = false); |
1073 | |
/// Return `true` if a given sequence of dimensions is contiguous in the
1075 | /// range of the specified indexing map. |
1076 | bool isDimSequencePreserved(AffineMap map, ReassociationIndicesRef dimSequence); |
1077 | /// Return `true` if all sequences of dimensions specified in `dimSequences` are |
1078 | /// contiguous in all the ranges of the `maps`. |
1079 | bool areDimSequencesPreserved(ArrayRef<AffineMap> maps, |
1080 | ArrayRef<ReassociationIndices> dimSequences); |
1081 | |
1082 | struct CollapseResult { |
1083 | SmallVector<Value> results; |
1084 | LinalgOp collapsedOp; |
1085 | }; |
1086 | |
/// Collapses dimensions of a linalg.generic/linalg.copy operation. A
/// precondition to calling this method is that, for each list in
/// `foldedIterationDims`, the sequence of dimensions is contiguous in the
/// domains of all `indexing_maps` of the `linalgOp`. This can be checked using
/// the `areDimSequencesPreserved` method.
1091 | /// When valid, the method also collapses the operands of the op. Returns |
1092 | /// replacement values of the results of the original `linalgOp` by inserting |
1093 | /// reshapes to get back values of compatible types. |
1094 | FailureOr<CollapseResult> |
1095 | collapseOpIterationDims(LinalgOp op, |
1096 | ArrayRef<ReassociationIndices> foldedIterationDims, |
1097 | RewriterBase &rewriter); |
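//
// Illustrative usage sketch (assumes `rewriter` and a 4-d `linalgOp`;
// collapses loops (0, 1) and (2, 3) after checking the precondition):
//
//   SmallVector<ReassociationIndices> foldedDims = {{0, 1}, {2, 3}};
//   if (!areDimSequencesPreserved(linalgOp.getIndexingMapsArray(), foldedDims))
//     return failure();
//   FailureOr<CollapseResult> collapsed =
//       collapseOpIterationDims(linalgOp, foldedDims, rewriter);
//   if (failed(collapsed))
//     return failure();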
1098 | |
1099 | struct LowerPackResult { |
1100 | tensor::PadOp padOp; |
1101 | tensor::ExpandShapeOp expandShapeOp; |
1102 | linalg::TransposeOp transposeOp; |
1103 | }; |
1104 | |
1105 | /// Rewrite pack as pad + reshape + transpose. |
1106 | FailureOr<LowerPackResult> lowerPack(RewriterBase &rewriter, |
1107 | tensor::PackOp packOp); |
1108 | |
1109 | struct LowerUnPackOpResult { |
1110 | tensor::EmptyOp emptyOp; |
1111 | linalg::TransposeOp transposeOp; |
1112 | tensor::CollapseShapeOp collapseShapeOp; |
tensor::ExtractSliceOp extractSliceOp;
1114 | }; |
1115 | |
/// Rewrite unpack as empty + transpose + reshape + extract_slice.
1117 | FailureOr<LowerUnPackOpResult> lowerUnPack(RewriterBase &rewriter, |
1118 | tensor::UnPackOp unPackOp); |
1119 | |
1120 | /// Struct to hold the result of a `pack` call. |
1121 | struct PackResult { |
1122 | SmallVector<tensor::PackOp> packOps; |
1123 | linalg::LinalgOp packedLinalgOp; |
1124 | SmallVector<tensor::UnPackOp> unPackOps; |
1125 | }; |
1126 | /// Implement packing of a single LinalgOp by `packedSizes`. |
1127 | /// There must be one packedSizes entry per `linalgOp` iterator. |
1128 | /// Return the packed Linalg op on success, failure otherwise. |
1129 | FailureOr<PackResult> pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp, |
1130 | ArrayRef<OpFoldResult> packedSizes); |
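//
// Illustrative usage sketch (assumes `rewriter` and a `linalgOp` with three
// iterators, e.g. a matmul; packs the (m, n, k) iterators by 8, 16 and 32):
//
//   SmallVector<OpFoldResult> packedSizes = {rewriter.getIndexAttr(8),
//                                            rewriter.getIndexAttr(16),
//                                            rewriter.getIndexAttr(32)};
//   FailureOr<PackResult> packed = pack(rewriter, linalgOp, packedSizes);
//   if (failed(packed))
//     return failure();
//   // `packed->packedLinalgOp` consumes the tensor.pack'ed operands.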
1131 | |
1132 | /// Struct to hold the result of a `packTranspose` call. |
1133 | struct PackTransposeResult { |
1134 | tensor::PackOp transposedPackOp; |
1135 | linalg::LinalgOp transposedLinalgOp; |
1136 | tensor::UnPackOp transposedUnPackOp; |
1137 | }; |
1138 | /// Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the |
1139 | /// transposed PackOp -> LinalgOp -> UnPackOp chain after replacements. |
/// Return failure if any of the following is not satisfied:
///   1. the `packOp` has the `linalgOp` as its unique use;
///   2. the `maybeUnPackOp`, if specified, is a consumer of the result tied to
///      the unique `packOp` use;
///   3. `outerPerm` (resp. `innerPerm`) is a valid permutation of
///      `packOp.getOuterDimsPerm` (resp. `packOp.getInnerDimsPerm`) or empty.
1146 | FailureOr<PackTransposeResult> |
1147 | packTranspose(RewriterBase &rewriter, tensor::PackOp packOp, |
1148 | linalg::LinalgOp linalgOp, tensor::UnPackOp maybeUnPackOp, |
1149 | ArrayRef<int64_t> outerPerm, ArrayRef<int64_t> innerPerm); |
1150 | |
/// Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m
/// and n are proper parallel dimensions and k is a proper reduction
/// dimension. Packing occurs by rewriting the op as a linalg.generic and
/// calling `linalg::pack` with `mnkPackedSizes`. The order of the packed
/// dimensions is customizable: the `mnkOrder` is a permutation of {0, 1, 2}
/// to reorder {m, n, k} into one of the 6 possible forms. The outer
/// dimensions of the operands are not permuted at this time; this is left for
/// future work.
1159 | FailureOr<PackResult> |
1160 | packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp, |
1161 | ArrayRef<OpFoldResult> mnkPackedSizes, |
1162 | ArrayRef<int64_t> mnkPaddedSizesNextMultipleOf, |
1163 | ArrayRef<int64_t> mnkOrder); |
1164 | |
1165 | /// Rewrite tensor.from_elements to linalg.generic. |
1166 | FailureOr<Operation *> |
1167 | rewriteInDestinationPassingStyle(RewriterBase &rewriter, |
1168 | tensor::FromElementsOp fromElementsOp); |
1169 | |
1170 | /// Rewrite tensor.generate to linalg.generic. |
1171 | FailureOr<Operation *> |
1172 | rewriteInDestinationPassingStyle(RewriterBase &rewriter, |
1173 | tensor::GenerateOp generateOp); |
1174 | |
1175 | /// Rewrite tensor.pad to linalg.generic + tensor.insert_slice. |
1176 | FailureOr<Operation *> rewriteInDestinationPassingStyle(RewriterBase &rewriter, |
1177 | tensor::PadOp padOp); |
1178 | |
1179 | /// Convert linalg.conv_2d_nhwc_hwcf into linalg.generic (for img2col packing) |
1180 | /// and linalg.matmul. |
1181 | /// |
1182 | /// A convolution operation can be written as a matrix-matrix multiplication by |
/// unfolding the cross-correlation between the input and the filter and
/// explicitly copying the overlapping sliding-window inputs.
1185 | /// |
/// Consider a 2D input X with a single input and output channel, and a 2x2
/// filter W:
/// [x(0, 0)  , x(0, 1)  , ..., x(0, n-1)  ]
/// [x(1, 0)  , x(1, 1)  , ..., x(1, n-1)  ]
/// [ .       ,  .       ,  . ,  .         ] [w(0, 0), w(0, 1)]
/// [ .       ,  .       ,  . ,  .         ] (conv) [w(1, 0), w(1, 1)]
/// [ .       ,  .       ,  . ,  .         ]
/// [x(n-1, 0), x(n-1, 1), ..., x(n-1, n-1)]
1193 | /// |
/// The packed input data (img2col) is a matrix with |rows| = output spatial
/// size and |columns| = filter spatial size. To compute the output Y(i, j) we
/// need to calculate the dot product between the filter window at input
/// X(x, y) and the flattened filter. This looks as follows, where the l.h.s.
/// is the img2col matrix and the r.h.s. is the flattened filter:
///
/// [x(0,0), x(0,1), x(1,0), x(1,1)]
/// [x(0,1), x(0,2), x(1,1), x(1,2)] (matmul) [w(0,0), w(0,1), w(1,0), w(1,1)]
/// [x(0,2), x(0,3), x(1,2), x(1,3)]
/// [  .   ,   .   ,   .   ,   .   ]
1204 | /// |
/// In general, for the 2D case with an (N, H, W, C) input, a (Kh, Kw, C, D)
/// filter and an (N, Ho, Wo, D) output, the convolution is the matrix-matrix
/// multiplication (Ho x Wo, Kh x Kw x C) * (Kh x Kw x C, D) for each of the N
/// inputs. For the case where N > 1 it is a batched matrix-matrix
/// multiplication.
1210 | /// |
1211 | /// On success, return both the operation that produces the img2col tensor and |
1212 | /// the final operation of the sequence that replaces the original convolution. |
1213 | FailureOr<std::pair<Operation *, Operation *>> |
1214 | rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp); |
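
// Illustrative sketch (not upstream documentation): apply the im2col rewrite
// to every NHWC/HWCF convolution in a function; `funcOp` is assumed to be
// provided by the caller.
//
//   SmallVector<linalg::Conv2DNhwcHwcfOp> convOps;
//   funcOp.walk([&](linalg::Conv2DNhwcHwcfOp op) { convOps.push_back(op); });
//   IRRewriter rewriter(funcOp->getContext());
//   for (linalg::Conv2DNhwcHwcfOp convOp : convOps) {
//     rewriter.setInsertionPoint(convOp);
//     // On success, the returned pair is (img2col producer, replacement op).
//     (void)rewriteInIm2Col(rewriter, convOp);
//   }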
1215 | |
/// Same as the above but for the Fhwc channel ordering in the filter. In this
/// case the matrix multiplication is actually a row-wise dot-product rather
/// than a row-column dot-product. This avoids transposing the filter matrix,
/// which would otherwise be required for a regular matrix multiplication to
/// produce the correct output dimensions.
1221 | FailureOr<std::pair<Operation *, Operation *>> |
1222 | rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp); |
1223 | |
/// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp, except that there
/// is no reduction among the input channels, so each convolution can be a
/// matrix-vector product. By transposing both the input and the filter so that
/// the channels are outermost, the computation becomes a batched matrix-vector
/// product.
1228 | FailureOr<std::pair<Operation *, Operation *>> |
1229 | rewriteInIm2Col(RewriterBase &rewriter, |
1230 | linalg::DepthwiseConv2DNhwcHwcOp convOp); |
1231 | |
/// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp, except that,
/// because the channels are to the left of the image shape dimensions, the
/// position of the contraction dimension in the resulting matmul is reversed.
/// This swaps the LHS and RHS of the matmul when compared with nhwc, i.e.
/// (D, C x Kh x Kw) * (C x Kh x Kw, Ho x Wo).
1237 | FailureOr<std::pair<Operation *, Operation *>> |
1238 | rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp); |
1239 | |
1240 | /// Convert linalg.conv_2d_nhwc_fhwc(_q) to linalg.conv_2d_nhwc_hwcf(_q) by |
/// materializing a transpose of the filter.
1242 | FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter, |
1243 | linalg::Conv2DNhwcFhwcOp op); |
1244 | FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter, |
1245 | linalg::Conv2DNhwcFhwcQOp op); |
1246 | |
1247 | /// Convert Linalg matmul ops to transposed variants. |
1248 | FailureOr<Operation *> transposeMatmul(RewriterBase &rewriter, |
1249 | linalg::MatmulOp op, |
1250 | bool transposeLHS = true); |
1251 | FailureOr<Operation *> transposeBatchMatmul(RewriterBase &rewriter, |
1252 | linalg::BatchMatmulOp op, |
1253 | bool transposeLHS = true); |
1254 | |
1255 | //===----------------------------------------------------------------------===// |
1256 | // Rewrite patterns wrapping transformations. |
// TODO: every single such pattern should be a close-to-noop wrapper around a
// functional-style API call.
1259 | //===----------------------------------------------------------------------===// |
1260 | |
1261 | /// Rewrites 2-D convolution ops with size-1 window dimensions into 1-D |
1262 | /// convolution ops. |
1263 | template <typename Conv2DOp, typename Conv1DOp> |
1264 | struct DownscaleSizeOneWindowed2DConvolution final |
1265 | : public OpRewritePattern<Conv2DOp> { |
1266 | using OpRewritePattern<Conv2DOp>::OpRewritePattern; |
1267 | |
1268 | FailureOr<Conv1DOp> returningMatchAndRewrite(Conv2DOp convOp, |
1269 | PatternRewriter &rewriter) const; |
1270 | |
1271 | LogicalResult matchAndRewrite(Conv2DOp convOp, |
1272 | PatternRewriter &rewriter) const override { |
1273 | return returningMatchAndRewrite(convOp, rewriter); |
1274 | } |
1275 | }; |
1276 | |
1277 | extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNhwcHwcfOp, |
1278 | Conv1DNwcWcfOp>; |
1279 | extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNchwFchwOp, |
1280 | Conv1DNcwFcwOp>; |
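
// Illustrative sketch (not upstream documentation): registering the downscale
// patterns and running them through the greedy rewrite driver; `ctx` and `op`
// are assumed to be provided by the caller and the driver header
// (GreedyPatternRewriteDriver.h) is assumed to be included.
//
//   RewritePatternSet patterns(ctx);
//   patterns.add<
//       DownscaleSizeOneWindowed2DConvolution<Conv2DNhwcHwcfOp, Conv1DNwcWcfOp>,
//       DownscaleSizeOneWindowed2DConvolution<Conv2DNchwFchwOp, Conv1DNcwFcwOp>>(
//       ctx);
//   (void)applyPatternsAndFoldGreedily(op, std::move(patterns));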
1281 | |
1282 | /// Rewrites 2-D depthwise convolution ops with size-1 (w, kw) or (h, kh) |
1283 | /// dimensions into 1-D depthwise convolution ops. |
1284 | struct DownscaleDepthwiseConv2DNhwcHwcOp final |
1285 | : public OpRewritePattern<DepthwiseConv2DNhwcHwcOp> { |
1286 | DownscaleDepthwiseConv2DNhwcHwcOp(MLIRContext *context, |
1287 | PatternBenefit benefit = 1) |
1288 | : OpRewritePattern<DepthwiseConv2DNhwcHwcOp>(context, benefit) {} |
1289 | |
1290 | FailureOr<DepthwiseConv1DNwcWcOp> |
1291 | returningMatchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp, |
1292 | PatternRewriter &rewriter) const; |
1293 | |
1294 | LogicalResult matchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp, |
1295 | PatternRewriter &rewriter) const override { |
1296 | return returningMatchAndRewrite(convOp, rewriter); |
1297 | } |
1298 | }; |
1299 | |
1300 | struct DownscaleConv2DOp final : public OpRewritePattern<Conv2DOp> { |
1301 | DownscaleConv2DOp(MLIRContext *context, PatternBenefit benefit = 1) |
1302 | : OpRewritePattern<Conv2DOp>(context, benefit) {} |
1303 | |
1304 | FailureOr<Conv1DOp> returningMatchAndRewrite(Conv2DOp convOp, |
1305 | PatternRewriter &rewriter) const; |
1306 | |
1307 | LogicalResult matchAndRewrite(Conv2DOp convOp, |
1308 | PatternRewriter &rewriter) const override { |
1309 | return returningMatchAndRewrite(convOp, rewriter); |
1310 | } |
1311 | }; |
1312 | |
1313 | /// |
1314 | /// Linalg generalization pattern. |
1315 | /// |
/// Apply the `generalizeNamedOp` transformation as a pattern.
/// See `generalizeNamedOp` for more details.
1318 | // |
1319 | // TODO: Automatic default pattern class that just unwraps a function |
1320 | // returning FailureOr<GenericOp>. |
1321 | struct LinalgGeneralizationPattern |
1322 | : public OpInterfaceRewritePattern<LinalgOp> { |
1323 | using OpInterfaceRewritePattern<LinalgOp>::OpInterfaceRewritePattern; |
1324 | |
1325 | /// `matchAndRewrite` implementation that returns the significant |
1326 | /// transformed pieces of IR. |
1327 | FailureOr<GenericOp> |
1328 | returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const { |
1329 | return generalizeNamedOp(rewriter, op); |
1330 | } |
1331 | |
1332 | LogicalResult matchAndRewrite(LinalgOp op, |
1333 | PatternRewriter &rewriter) const override { |
1334 | return returningMatchAndRewrite(op, rewriter); |
1335 | } |
1336 | }; |
1337 | |
1338 | /// Vectorization pattern for memref::CopyOp. |
1339 | struct CopyVectorizationPattern : public OpRewritePattern<memref::CopyOp> { |
1340 | using OpRewritePattern<memref::CopyOp>::OpRewritePattern; |
1341 | |
1342 | LogicalResult matchAndRewrite(memref::CopyOp copyOp, |
1343 | PatternRewriter &rewriter) const override; |
1344 | }; |
1345 | |
1346 | using OptimizeCopyFn = |
1347 | std::function<LogicalResult(RewriterBase &, tensor::PadOp, Value)>; |
1348 | |
1349 | /// Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and |
1350 | /// InsertSliceOp. For now, only constant padding values are supported. |
/// `OptimizeCopyFn` can be used to customize the optimization of the copy step.
1352 | struct GeneralizePadOpPattern : public OpRewritePattern<tensor::PadOp> { |
1353 | GeneralizePadOpPattern(MLIRContext *context, |
1354 | OptimizeCopyFn optimizeCopyFn = nullptr, |
1355 | PatternBenefit benefit = 1) |
1356 | : OpRewritePattern<tensor::PadOp>(context, benefit), |
1357 | optimizeCopyFn(std::move(optimizeCopyFn)) {} |
1358 | LogicalResult matchAndRewrite(tensor::PadOp padOp, |
1359 | PatternRewriter &rewriter) const override; |
1360 | |
1361 | protected: |
1362 | OptimizeCopyFn optimizeCopyFn; |
1363 | Value createFillOrGenerateOp(RewriterBase &rewriter, tensor::PadOp padOp, |
1364 | Value dest, |
1365 | const SmallVector<Value> &dynSizes) const; |
1366 | }; |
1367 | |
/// Rewrites a tensor::PackOp into a sequence of tensor.pad + linalg.transpose +
/// tensor.insert_slice ops, where the outer dims of the tensor::PackOp are all
/// 1s.
1371 | struct GeneralizeOuterUnitDimsPackOpPattern |
1372 | : public OpRewritePattern<tensor::PackOp> { |
1373 | using OpRewritePattern<tensor::PackOp>::OpRewritePattern; |
1374 | LogicalResult matchAndRewrite(tensor::PackOp packOp, |
1375 | PatternRewriter &rewriter) const override; |
1376 | }; |
1377 | |
/// Rewrites a tensor::UnPackOp into a sequence of rank-reduced extract_slice +
/// transpose + insert_slice ops, where the outer dims of the tensor::UnPackOp
/// are all 1s.
1381 | struct GeneralizeOuterUnitDimsUnPackOpPattern |
1382 | : public OpRewritePattern<tensor::UnPackOp> { |
1383 | using OpRewritePattern<tensor::UnPackOp>::OpRewritePattern; |
1384 | LogicalResult matchAndRewrite(tensor::UnPackOp unpackOp, |
1385 | PatternRewriter &rewriter) const override; |
1386 | }; |
1387 | |
1388 | /// Match and rewrite for the pattern: |
1389 | /// ``` |
1390 | /// %alloc = ... |
1391 | /// [optional] %view = memref.view %alloc ... |
1392 | /// %subView = subview %allocOrView ... |
1393 | /// [optional] linalg.fill(%allocOrView, %cst) ... |
1394 | /// ... |
1395 | /// memref.copy(%in, %subView) ... |
1396 | /// vector.transfer_read %allocOrView[...], %cst ... |
1397 | /// ``` |
1398 | /// into |
1399 | /// ``` |
1400 | /// [unchanged] %alloc = ... |
1401 | /// [unchanged] [optional] %view = memref.view %alloc ... |
/// [unchanged] %subView = subview %allocOrView ...
1403 | /// ... |
1404 | /// vector.transfer_read %in[...], %cst ... |
1405 | /// ``` |
1406 | /// Where there is no interleaved use between memref.copy and transfer_read as |
1407 | /// well as no interleaved use between linalg.fill and memref.copy (if |
1408 | /// linalg.fill is specified). |
1409 | /// This is a custom rewrite to forward partial reads (with optional fills) to |
1410 | /// vector.transfer_read. |
1411 | struct LinalgCopyVTRForwardingPattern |
1412 | : public OpRewritePattern<vector::TransferReadOp> { |
1413 | using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern; |
1414 | |
1415 | LogicalResult matchAndRewrite(vector::TransferReadOp xferOp, |
1416 | PatternRewriter &rewriter) const override; |
1417 | }; |
1418 | |
1419 | /// Match and rewrite for the pattern: |
1420 | /// ``` |
1421 | /// %alloc = ... |
1422 | /// [optional] %view = memref.view %alloc ... |
1423 | /// %subView = subview %allocOrView... |
1424 | /// ... |
1425 | /// vector.transfer_write %..., %allocOrView[...] |
1426 | /// memref.copy(%subView, %out) |
1427 | /// ``` |
1428 | /// into |
1429 | /// ``` |
1430 | /// [unchanged] %alloc = ... |
1431 | /// [unchanged] [optional] %view = memref.view %alloc ... |
1432 | /// [unchanged] %subView = subview %allocOrView... |
1433 | /// ... |
1434 | /// vector.transfer_write %..., %out[...] |
1435 | /// ``` |
1436 | /// Where there is no interleaved use between transfer_write and memref.copy. |
1437 | /// This is a custom rewrite to forward partial writes to |
1438 | /// vector.transfer_write. |
1439 | struct LinalgCopyVTWForwardingPattern |
1440 | : public OpRewritePattern<vector::TransferWriteOp> { |
1441 | using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern; |
1442 | |
1443 | LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp, |
1444 | PatternRewriter &rewriter) const override; |
1445 | }; |
1446 | |
1447 | /// Rewrite extract_slice(tensor.pad(x)) into tensor.pad(extract_slice(x)). |
struct ExtractSliceOfPadTensorSwapPattern
1449 | : public OpRewritePattern<tensor::ExtractSliceOp> { |
1450 | /// A function to control pattern application and rewrite logic. |
1451 | /// |
1452 | /// The function will be given the slice op and should return: |
1453 | /// - std::nullopt: to fail the match and not apply the pattern; |
1454 | /// - true: to apply the pattern with zero slice guard; |
1455 | /// - false: to apply the pattern without zero slice guard. |
1456 | /// |
1457 | /// See the documentation for tensor::bubbleUpPadSlice regarding zero slice |
1458 | /// guard. |
  using ControlFn = std::function<std::optional<bool>(tensor::ExtractSliceOp)>;
1460 | |
  ExtractSliceOfPadTensorSwapPattern(MLIRContext *context,
1462 | ControlFn controlFn = nullptr, |
1463 | PatternBenefit benefit = 1) |
1464 | : OpRewritePattern(context, benefit), controlFn(std::move(controlFn)) {} |
1465 | |
1466 | LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp, |
1467 | PatternRewriter &rewriter) const override; |
1468 | |
1469 | private: |
  ControlFn controlFn;
1471 | }; |
1472 | |
1473 | //===----------------------------------------------------------------------===// |
1474 | // Populate functions. |
1475 | //===----------------------------------------------------------------------===// |
1476 | |
1477 | /// Canonicalization patterns relevant to apply after tiling patterns. These |
1478 | /// are applied automatically by the tiling pass but need to be applied |
1479 | /// manually when tiling is called programmatically. |
1480 | RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx); |
1481 | void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns); |
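
// Illustrative sketch (not upstream documentation): applying these
// canonicalizations after programmatic tiling; `ctx` and `tiledFunc` are
// assumed to be provided by the caller.
//
//   RewritePatternSet patterns = getLinalgTilingCanonicalizationPatterns(ctx);
//   (void)applyPatternsAndFoldGreedily(tiledFunc, std::move(patterns));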
1482 | |
1483 | /// Linalg generalization patterns |
1484 | |
1485 | /// Populates `patterns` with patterns to convert spec-generated named ops to |
1486 | /// linalg.generic ops. |
1487 | void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns); |
1488 | |
1489 | /// Linalg decompose convolutions patterns |
1490 | |
1491 | /// Populates patterns to decompose high-D convolution ops into low-D ones. |
/// This is a step in progressive lowering for convolution ops; afterwards we
1493 | /// can vectorize the low-D convolution ops. |
1494 | void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, |
1495 | PatternBenefit benefit = 1); |
1496 | |
1497 | /// Populates patterns to transform linalg.conv_2d_xxx operations into |
1498 | /// linalg.generic (for img2col packing) and linalg.matmul. |
1499 | /// \see rewriteInIm2Col for more details. |
1500 | void populateConvertConv2DToImg2ColPatterns(RewritePatternSet &patterns); |
1501 | |
1502 | /// Populates `patterns` with patterns that vectorize tensor.pad. |
1503 | /// These patterns are meant to apply in a complementary fashion. Benefits |
1504 | /// are used to encode a certain ordering of pattern application. To avoid |
1505 | /// scattering magic constants throughout the code base, the patterns must be |
1506 | /// added with this function. `baseBenefit` can be used to offset the benefit |
1507 | /// of all tensor::PadOp vectorization patterns by a certain value. |
1508 | void populatePadOpVectorizationPatterns(RewritePatternSet &patterns, |
1509 | PatternBenefit baseBenefit = 1); |
1510 | |
/// Populate patterns for splitting a `LinalgOp` with multiple statements within
/// its payload into multiple `GenericOp` ops that each have a single statement.
/// The option `removeDeadArgsAndResults` adds patterns to remove dead arguments
/// and results from the generated decomposed ops. It defaults to `true` since
/// the core decomposition patterns rely on these clean-up patterns; it is set
/// to `false` only for testing purposes.
1517 | void populateDecomposeLinalgOpsPattern(RewritePatternSet &patterns, |
1518 | bool removeDeadArgsAndResults = true); |
1519 | |
1520 | /// Populate patterns that convert non-destination-style ops to destination |
1521 | /// style ops. |
1522 | void populateConvertToDestinationStylePatterns(RewritePatternSet &patterns); |
1523 | |
1524 | /// Populate patterns for vectorizing low-D convolution ops. This is a step in |
/// progressive lowering for convolution ops; it assumes high-D convolution ops
1526 | /// were decomposed previously. |
1527 | void populateConvolutionVectorizationPatterns(RewritePatternSet &patterns, |
1528 | PatternBenefit benefit = 1); |
1529 | |
1530 | /// Populate patterns that convert `ElementwiseMappable` ops to linalg |
1531 | /// parallel loops. |
1532 | void populateElementwiseToLinalgConversionPatterns(RewritePatternSet &patterns); |
1533 | |
1534 | /// Populate patterns that are only useful in the context of sparse tensors. |
1535 | void populateSparseTensorRewriting(RewritePatternSet &patterns); |
1536 | |
1537 | /// Function type which is used to control when to stop fusion. It is expected |
1538 | /// that OpOperand is not modified in the callback. The OpOperand is not marked |
1539 | /// as const to allow callers to use non-const methods. |
1540 | using ControlFusionFn = std::function<bool(OpOperand *fusedOperand)>; |
1541 | |
/// Patterns for fusing linalg operations on tensors.
1543 | |
1544 | /// Pattern to fuse `linalg.generic` -> `linalg.generic` operations |
1545 | /// when both operations are fusable elementwise operations. |
1546 | void populateElementwiseOpsFusionPatterns( |
1547 | RewritePatternSet &patterns, |
1548 | const ControlFusionFn &controlElementwiseOpFusion); |
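
// Illustrative sketch (not upstream documentation): fuse only when the
// producer of the fused operand has a single use; `ctx` is assumed and the
// policy is an arbitrary example.
//
//   RewritePatternSet patterns(ctx);
//   populateElementwiseOpsFusionPatterns(
//       patterns, [](OpOperand *fusedOperand) {
//         Operation *producer = fusedOperand->get().getDefiningOp();
//         return producer && producer->hasOneUse();
//       });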
1549 | |
1550 | /// Function type which is used to control propagation of tensor.pack/unpack |
1551 | /// ops. |
1552 | using ControlPropagationFn = std::function<bool(Operation *op)>; |
1553 | |
1554 | /// Patterns to bubble up or down data layout ops across other operations. |
1555 | void populateDataLayoutPropagationPatterns( |
1556 | RewritePatternSet &patterns, |
1557 | const ControlPropagationFn &controlPackUnPackPropagation); |
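
// Illustrative sketch (not upstream documentation): an arbitrary propagation
// policy that only allows propagation across linalg.generic ops; `ctx` is
// assumed.
//
//   RewritePatternSet patterns(ctx);
//   populateDataLayoutPropagationPatterns(
//       patterns, [](Operation *op) { return isa<linalg::GenericOp>(op); });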
1558 | |
1559 | /// Pattern to remove dead operands and results of `linalg.generic` operations. |
1560 | /// This is effectively DCE for a linalg op. |
1561 | void populateEraseUnusedOperandsAndResultsPatterns(RewritePatternSet &patterns); |
1562 | |
1563 | /// Patterns to promote inputs to outputs and remove unused inputs of |
1564 | /// `linalg.generic` ops. |
1565 | void populateEraseUnnecessaryInputsPatterns(RewritePatternSet &patterns); |
1566 | |
1567 | /// Function type to control generic op dimension collapsing. It is expected |
1568 | /// to return an array of `ReassociationIndices` representing dimensions that |
1569 | /// should be merged. |
1570 | using GetCollapsableDimensionsFn = |
1571 | std::function<SmallVector<ReassociationIndices>(linalg::LinalgOp)>; |
1572 | |
1573 | /// Pattern to collapse dimensions in a linalg.generic op. This will collapse |
1574 | /// tensor operands when needed and expand back the result tensors. |
1575 | void populateCollapseDimensions( |
1576 | RewritePatternSet &patterns, |
1577 | const GetCollapsableDimensionsFn &controlCollapseDimensions); |
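
// Illustrative sketch (not upstream documentation): collapse the first two
// loop dimensions of ops that have at least two loops; the choice of
// dimensions is an arbitrary example and `ctx` is assumed.
//
//   RewritePatternSet patterns(ctx);
//   populateCollapseDimensions(
//       patterns, [](linalg::LinalgOp op) -> SmallVector<ReassociationIndices> {
//         if (op.getNumLoops() < 2)
//           return {};
//         return {ReassociationIndices{0, 1}};
//       });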
1578 | |
1579 | /// Patterns to fold an expanding (collapsing) tensor_reshape operation with its |
1580 | /// producer (consumer) generic operation by expanding the dimensionality of the |
1581 | /// loop in the generic op. |
1582 | void populateFoldReshapeOpsByExpansionPatterns( |
1583 | RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes); |
1584 | |
1585 | /// Patterns to fold an expanding tensor.expand_shape operation with its |
1586 | /// producer generic operation by collapsing the dimensions of the generic op. |
1587 | void populateFoldReshapeOpsByCollapsingPatterns( |
1588 | RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes); |
1589 | |
1590 | /// Patterns to constant fold Linalg operations. |
1591 | void populateConstantFoldLinalgOperations(RewritePatternSet &patterns, |
1592 | const ControlFusionFn &controlFn); |
1593 | |
1594 | /// Pattern to fuse a `tensor.pad` operation with the producer of its source, |
1595 | /// if the producer is a `linalg` operation with all parallel iterator types. |
1596 | void populateFuseTensorPadWithProducerLinalgOpPatterns( |
1597 | RewritePatternSet &patterns); |
1598 | |
1599 | /// Patterns to convert from one named op to another. These can be seen as |
1600 | /// canonicalizations of named ops into another named op. |
1601 | void populateLinalgNamedOpConversionPatterns(RewritePatternSet &patterns); |
1602 | |
1603 | /// Patterns to fold unit-extent dimensions in operands/results of linalg ops on |
1604 | /// tensors via reassociative reshape ops. |
1605 | void populateFoldUnitExtentDimsPatterns(RewritePatternSet &patterns, |
1606 | ControlDropUnitDims &options); |
1607 | |
1608 | /// A pattern that converts init operands to input operands. |
1609 | void populateMoveInitOperandsToInputPattern(RewritePatternSet &patterns); |
1610 | |
1611 | /// Patterns that are used to inline constant operands into linalg generic ops. |
1612 | void populateInlineConstantOperandsPatterns(RewritePatternSet &patterns); |
1613 | |
/// Patterns that are used to bubble up extract slice ops above their producing
/// linalg ops.
void populateBubbleUpExtractSliceOpPatterns(RewritePatternSet &patterns);
1616 | |
/// Adds patterns that swap tensor.extract_slice(linalg.fill(%cst, %init)) into
/// linalg.fill(%cst, tensor.extract_slice(%init)).
void populateSwapExtractSliceWithFillPatterns(RewritePatternSet &patterns);
1620 | |
/// Patterns to apply the `splitReduction` transformation.
1622 | void populateSplitReductionPattern( |
1623 | RewritePatternSet &patterns, |
1624 | const ControlSplitReductionFn &controlSplitReductionFn, |
1625 | bool useAlloc = false); |
1626 | |
1627 | /// Patterns to convert Linalg matmul ops to transposed variants. |
1628 | void populateTransposeMatmulPatterns(RewritePatternSet &patterns, |
1629 | bool transposeLHS = true); |
1630 | |
1631 | } // namespace linalg |
1632 | } // namespace mlir |
1633 | |
1634 | #endif // MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H |
1635 | |