TileUsingInterface.h source code [mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h]

1	//===- TileUsingInterface.h - Tiling ops using TilingInterface --- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#ifndef MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
10	#define MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
11
12	#include "mlir/Dialect/SCF/IR/SCF.h"
13	#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
14	#include "mlir/IR/PatternMatch.h"
15	#include "mlir/Interfaces/LoopLikeInterface.h"
16	#include "mlir/Interfaces/TilingInterface.h"
17	#include "mlir/Interfaces/ViewLikeInterface.h"
18	#include "mlir/Rewrite/FrozenRewritePatternSet.h"
19
20	#include <deque>
21
22	namespace mlir {
23	class Operation;
24	class RewriterBase;
25	class TilingInterface;
26	} // namespace mlir
27
28	namespace mlir {
29	namespace scf {
30
31	using SCFTileSizeComputationFunction =
32	std::function<SmallVector<OpFoldResult>(OpBuilder &, Operation *)>;
33
34	/// Options to use to control tiling.
35	struct SCFTilingOptions {
36	/// Computation function that returns the tile sizes to use for each loop.
37	/// Returning a tile size of zero implies no tiling for that loop. If the
38	/// size of the returned vector is smaller than the number of loops, the inner
39	/// loops are not tiled. If the size of the returned vector is larger, then
40	/// the vector is truncated to number of loops.
41	SCFTileSizeComputationFunction tileSizeComputationFunction = nullptr;
42
43	SCFTilingOptions &
44	setTileSizeComputationFunction(SCFTileSizeComputationFunction fun) {
45	tileSizeComputationFunction = std::move(fun);
46	return *this;
47	}
48	/// Convenience function to set the `tileSizeComputationFunction` to a
49	/// function that computes tile sizes at the point they are needed. Allows
50	/// proper interaction with folding.
51	SCFTilingOptions &setTileSizes(ArrayRef<OpFoldResult> tileSizes);
52
53	/// Computation function that returns the number of threads to use for
54	/// each loop. Returning a num threads of zero implies no tiling for that
55	/// loop. If the size of the returned vector is smaller than the number of
56	/// loops, the inner loops are not tiled. If the size of the returned vector
57	/// is larger, then the vector is truncated to number of loops. Note: This
58	/// option is only supported with loopType set to `LoopType::ForallOp`. If the
59	/// tile size function is not specified while the num threads computation is,
60	/// then the tile size is determined automatically to map at most one tile per
61	/// thread.
62	SCFTileSizeComputationFunction numThreadsComputationFunction = nullptr;
63
64	SCFTilingOptions &
65	setNumThreadsComputationFunction(SCFTileSizeComputationFunction fun) {
66	numThreadsComputationFunction = std::move(fun);
67	return *this;
68	}
69	/// Convenience function to set the `numThreadsComputationFunction` to a
70	/// function that computes num threads at the point they are needed.
71	SCFTilingOptions &setNumThreads(ArrayRef<OpFoldResult> numThreads);
72
73	/// The interchange vector to reorder the tiled loops.
74	SmallVector<int64_t> interchangeVector = {};
75	SCFTilingOptions &setInterchange(ArrayRef<int64_t> interchange) {
76	interchangeVector = llvm::to_vector(Range&: interchange);
77	return *this;
78	}
79
80	/// Specify which loop construct to use for tile and fuse.
81	enum class LoopType { ForOp, ForallOp };
82	LoopType loopType = LoopType::ForOp;
83	SCFTilingOptions &setLoopType(LoopType type) {
84	loopType = type;
85	return *this;
86	}
87
88	/// Specify how reduction dimensions should be tiled.
89	///
90	/// Tiling can be thought of as splitting a dimension into 2 and materializing
91	/// the outer dimension as a loop:
92	///
93	/// op[original] -> op[original / x, x] -> loop[original] { op[x] }
94	///
95	/// For parallel dimensions, the split can only happen in one way, with both
96	/// dimensions being parallel. For reduction dimensions however, there is a
97	/// choice in how we split the reduction dimension. This enum exposes this
98	/// choice.
99	enum class ReductionTilingStrategy {
100	// [reduction] -> [reduction1, reduction2]
101	// -> loop[reduction1] { [reduction2] }
102	FullReduction,
103	// [reduction] -> [reduction1, parallel2]
104	// -> loop[reduction1] { [parallel2] }; merge[reduction1]
105	PartialReductionOuterReduction,
106	// [reduction] -> [parallel1, reduction2]
107	// -> loop[parallel1] { [reduction2] }; merge[parallel1]
108	PartialReductionOuterParallel
109	};
110	ReductionTilingStrategy reductionStrategy =
111	ReductionTilingStrategy::FullReduction;
112	SCFTilingOptions &
113	setReductionTilingStrategy(ReductionTilingStrategy strategy) {
114	reductionStrategy = strategy;
115	return *this;
116	}
117
118	/// Specify mapping of loops to devices. This is only respected when the loop
119	/// constructs support such a mapping (like `scf.forall`). Will be ignored
120	/// when using loop constructs that dont support such a mapping (like
121	/// `scf.for`)
122	SmallVector<Attribute> mappingVector = {};
123	SCFTilingOptions &setMapping(ArrayRef<Attribute> mapping) {
124	mappingVector = llvm::to_vector(Range&: mapping);
125	return *this;
126	}
127	};
128
129	/// Transformation information returned after tiling.
130	struct SCFTilingResult {
131	/// Tiled operations that are generated during tiling. The order does not
132	/// matter except the last op. The replacements are expected to be the results
133	/// of the last op.
134	SmallVector<Operation *> tiledOps;
135	/// The initial destination values passed to the tiled operations.
136	SmallVector<Value> initialValues;
137	/// The `scf.for` operations that iterate over the tiles.
138	SmallVector<LoopLikeOpInterface> loops;
139	/// Values to use as replacements for the untiled op. Is the same size as the
140	/// number of results of the untiled op.
141	SmallVector<Value> replacements;
142	/// Slices generated after tiling that can be used for fusing with the tiled
143	/// producer.
144	SmallVector<Operation *> generatedSlices;
145	/// In cases where there as an additional merge step after tiling
146	/// return the merged ops after tiling. This list is empty when reduction
147	/// tiling strategy is
148	/// `scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction.
149	SmallVector<Operation *> mergeOps;
150	};
151
152	/// Method to tile an op that implements the `TilingInterface` using
153	/// `scf.for` for iterating over the tiles.
154	FailureOr<SCFTilingResult> tileUsingSCF(RewriterBase &rewriter,
155	TilingInterface op,
156	const SCFTilingOptions &options);
157
158	/// Options used to control tile + fuse.
159	struct SCFTileAndFuseOptions {
160	/// The tiling options used to control the tiling of the consumer.
161	SCFTilingOptions tilingOptions;
162	SCFTileAndFuseOptions &setTilingOptions(SCFTilingOptions options) {
163	tilingOptions = options;
164	return *this;
165	}
166
167	/// Control function to check if a slice needs to be fused or not,
168	/// The control function receives
169	/// 1) the slice along which fusion is to be done,
170	/// 2) the producer value that is to be fused
171	/// 3) a boolean value set to `true` if the fusion is from
172	/// a destination operand.
173	/// The control function returns an `std::optiona<ControlFnResult>`.
174	/// If the return value is `std::nullopt`, that implies no fusion
175	/// is to be performed along that slice.
176	struct ControlFnResult {
177	/// Set to true if the loop nest has to return a replacement value
178	/// for the fused producer.
179	bool yieldProducerReplacement = false;
180	};
181	using ControlFnTy = std::function<std::optional<ControlFnResult>(
182	tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
183	bool isDestinationOperand)>;
184	/// The default control function implements greedy fusion without yielding
185	/// a replacement for any of the fused results.
186	ControlFnTy fusionControlFn = [](tensor::ExtractSliceOp, OpResult,
187	bool) -> std::optional<ControlFnResult> {
188	return ControlFnResult{};
189	};
190	SCFTileAndFuseOptions &setFusionControlFn(ControlFnTy controlFn) {
191	fusionControlFn = controlFn;
192	return *this;
193	}
194
195	/// An optional set of rewrite patterns to apply to the results of tiling
196	/// before fusion. This will track deleted and newly inserted
197	/// `tensor.extract_slice` ops and update the worklist.
198	std::optional<FrozenRewritePatternSet> cleanupPatterns = std::nullopt;
199	};
200
201	/// Fuse the producer of the source of `candidateSliceOp` by computing the
202	/// required slice of the producer in-place. Note that the method
203	/// replaces the uses of `candidateSliceOp` with the tiled and fused producer
204	/// value but does not delete the slice operation.
205	struct SCFFuseProducerOfSliceResult {
206	OpResult origProducer; // Original untiled producer.
207	Value tiledAndFusedProducer; // Tile and fused producer value.
208	SmallVector<Operation *> tiledOps;
209	SmallVector<Operation *> generatedSlices;
210	};
211	std::optional<SCFFuseProducerOfSliceResult>
212	tileAndFuseProducerOfSlice(RewriterBase &rewriter,
213	tensor::ExtractSliceOp candidateSliceOp,
214	MutableArrayRef<LoopLikeOpInterface> loops);
215
216	/// Reconstruct the fused producer from within the tiled-and-fused code. Based
217	/// on the slice of the producer computed in place it is possible that within
218	/// the loop nest same slice of the producer is computed multiple times. It is
219	/// in general not possible to recompute the value of the fused producer from
220	/// the tiled loop code in such cases. For the cases where no slice of the
221	/// producer is computed in a redundant fashion it is possible to reconstruct
222	/// the value of the original producer from within the tiled loop. It is upto
223	/// the caller to ensure that the producer is not computed redundantly within
224	/// the tiled loop nest. For example, consider
225	///
226	/// ```mlir
227	/// %0 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
228	/// %1 = linalg.matmul ins(%0, ..) outs(...) -> tensor<?x?x?f32>
229	/// ```
230	///
231	/// If `%1` is tiled in a 2D fashion and `%0` is fused with it, the resulting IR
232	/// is,
233	///
234	/// ```mlir
235	/// %t1_0 = scf.for .... iter_args(%arg0 = ...) {
236	/// %t1_1 = scf.for ... iter_args(%arg1 = %arg0) {
237	/// ...
238	/// %t1_2 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
239	/// %t1_3 = linalg.matmul ins(%t1_2, ...)
240	/// %t1_4 = tensor.insert_slice %t1_3 into %arg1 ...
241	/// scf.yield %t1_4
242	/// }
243	/// scf.yield %t1_1
244	/// }
245	/// ```
246	///
247	/// Here `%t1_2` is the same for all iterations of the inner `scf.for`. Instead
248	/// if `%1` were tiled only along the rows, the resultant code would be
249	///
250	/// ```mlir
251	/// %t2_0 = scf.for .... iter_args(%arg0 = ...) {
252	/// ...
253	/// %t2_1 = linalg.matmul ins(...) outs(...) -> tensor<?x?xf32>
254	/// %t2_2 = linalg.matmul ins(%t2_1, ...)
255	/// %t2_3 = tensor.insert_slice %t2_2 into %arg0 ...
256	/// scf.yield %t2_3
257	/// }
258	/// ```
259	///
260	/// Here there is no intersection in the different slices of `%t2_1` computed
261	/// across iterations of the `scf.for`. In such cases, the value of the original
262	/// `%0` can be reconstructed from within the loop body. This is useful in cases
263	/// where `%0` had other uses as well. If not reconstructed from within the loop
264	/// body, uses of `%0` could not be replaced, making it still live and the
265	/// fusion immaterial.
266	///
267	/// The @param `yieldResultNumber` decides which result would be yield. If not
268	/// given, yield all `opResult` of fused producer.
269	///
270	/// The method returns the list of new slices added during the process (which
271	/// can be used to fuse along).
272	FailureOr<SmallVector<Operation *>> yieldReplacementForFusedProducer(
273	RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp,
274	scf::SCFFuseProducerOfSliceResult fusedProducerInfo,
275	MutableArrayRef<LoopLikeOpInterface> loops,
276	ArrayRef<unsigned> yieldResultNumber = ArrayRef<unsigned>{});
277
278	/// Transformation information returned after tile and fuse.
279	struct SCFTileAndFuseResult {
280	/// List of untiled operations that were fused with the tiled consumer.
281	llvm::SetVector<Operation *> fusedProducers;
282	/// List of tiled and fused operations generated. The first one in this list
283	/// is guaranteed to be the tiled operations generated during tiling of the
284	/// generated operation.
285	llvm::SetVector<Operation *> tiledAndFusedOps;
286	/// The `scf.for` operations that iterate over the tiles.
287	SmallVector<LoopLikeOpInterface> loops;
288	/// The replacement values to use for the tiled and fused operations.
289	llvm::DenseMap<Value, Value> replacements;
290	};
291
292	/// Method to tile and fuse a sequence of operations, by tiling the consumer
293	/// and fusing its producers. Note that this assumes that it is valid to
294	/// tile+fuse the producer into the innermost tiled loop. Its up to the caller
295	/// to ensure that the tile sizes provided make this fusion valid.
296	///
297	/// For example, for the following sequence
298	///
299	/// ```mlir
300	/// %0 =
301	/// %1 = linalg.fill ... outs(%0 : ... )
302	/// %2 = linalg.matmul ... outs(%1 : ...) ...
303	/// ```
304	///
305	/// it is legal to fuse the fill with the matmul only if the matmul is tiled
306	/// along the parallel dimensions and not the reduction dimension, i.e. the tile
307	/// size for the reduction dimension should be 0. The resulting fused
308	/// transformation is
309	///
310	/// ```mlir
311	/// %1 = scf.for ... iter_args(%arg0 = %0)
312	/// %2 = tensor.extract_slice %arg0
313	/// %3 = linalg.fill .. outs(%2 : ... )
314	/// %4 = linalg.matmul .. outs(%3 : ...)
315	/// }
316	/// ```
317	FailureOr<SCFTileAndFuseResult>
318	tileConsumerAndFuseProducersUsingSCF(RewriterBase &rewriter,
319	TilingInterface consumer,
320	const SCFTileAndFuseOptions &options);
321
322	/// Fuse the consumer of the source of `candidateSliceOp` by computing the
323	/// required slice of the consumer in-place. Note that the method
324	/// replaces the uses of `candidateSliceOp` with the tiled and fused consumer
325	/// value but does not delete the slice operation.
326	struct SCFFuseConsumerOfSliceResult {
327	OpOperand origConsumerOperand; // Original untiled consumer's operand.*
328	OpOperand
329	tiledAndFusedConsumerOperand; // Tiled and fused consumer's operand.*
330	SmallVector<Operation *> tiledOps;
331	};
332	FailureOr<scf::SCFFuseConsumerOfSliceResult>
333	tileAndFuseConsumerOfSlice(RewriterBase &rewriter, Operation *candidateSliceOp,
334	MutableArrayRef<LoopLikeOpInterface> loops);
335
336	/// Method to lower an `op` that implements the `TilingInterface` to
337	/// loops/scalars.
338	FailureOr<SmallVector<scf::ForOp>>
339	lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op);
340
341	/// Method to tile a reduction and generate a parallel op within a serial loop.
342	/// Each of the partial reductions are calculated in parallel. Then after the
343	/// loop all the partial reduction are merged into a final reduction.
344	/// For example for the following sequence
345	///
346	/// ```mlir
347	/// %0 = linalg.generic %in ["parallel", "reduction"]
348	/// : tensor<7x9xf32> -> tensor<7xf32>
349	/// ```
350	///
351	/// into:
352	///
353	/// ```mlir
354	/// %0 = linalg.fill ... : tensor<7x4xf32>
355	/// %1 = scf.for ... iter_args(%arg0 = %0)
356	/// %2 = tensor.extract_slice %arg0 : tensor<7x4xf32> -> tensor<7x?xf32>
357	/// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
358	/// %4 = linalg.generic %2, %3 ["parallel", "parallel"]
359	/// : tensor<7x?xf32> -> tensor<7x?xf32>
360	/// %5 = tensor.insert_slice %3, %0[0, 0] : tensor<7x4xf32>
361	/// }
362	/// %6 = linalg.generic %1 ["parallel", "reduction"]
363	/// : tensor<7x4xf32> -> tensor<7xf32>
364	/// ```
365	FailureOr<scf::SCFTilingResult>
366	tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op,
367	ArrayRef<OpFoldResult> tileSizes);
368
369	} // namespace scf
370	} // namespace mlir
371
372	#endif // MLIR_DIALECT_SCF_TRANSFORMS_TILEUSINGINTERFACE_H
373

Provided by KDAB

Definitions

SCFTilingOptions
setTileSizeComputationFunction
setNumThreadsComputationFunction
setInterchange
LoopType
setLoopType
ReductionTilingStrategy
setReductionTilingStrategy
setMapping
SCFTilingResult
SCFTileAndFuseOptions
setTilingOptions
ControlFnResult
setFusionControlFn
SCFFuseProducerOfSliceResult
SCFTileAndFuseResult

Learn to use CMake with our Intro Training

Find out more

Definitions

source code of mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h