SuperVectorize.cpp source code [mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp]

1	//===- SuperVectorize.cpp - Vectorize Pass Impl ---------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements vectorization of loops, operations and data types to
10	// a target-independent, n-D super-vector abstraction.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "mlir/Dialect/Affine/Passes.h"
15
16	#include "mlir/Analysis/SliceAnalysis.h"
17	#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
18	#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
19	#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
20	#include "mlir/Dialect/Affine/IR/AffineOps.h"
21	#include "mlir/Dialect/Affine/Utils.h"
22	#include "mlir/Dialect/Arith/IR/Arith.h"
23	#include "mlir/Dialect/Func/IR/FuncOps.h"
24	#include "mlir/Dialect/Vector/IR/VectorOps.h"
25	#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
26	#include "mlir/IR/IRMapping.h"
27	#include "mlir/Pass/Pass.h"
28	#include "mlir/Support/LLVM.h"
29	#include "llvm/ADT/STLExtras.h"
30	#include "llvm/Support/Debug.h"
31	#include <optional>
32
33	namespace mlir {
34	namespace affine {
35	#define GEN_PASS_DEF_AFFINEVECTORIZE
36	#include "mlir/Dialect/Affine/Passes.h.inc"
37	} // namespace affine
38	} // namespace mlir
39
40	using namespace mlir;
41	using namespace affine;
42	using namespace vector;
43
44	///
45	/// Implements a high-level vectorization strategy on a Function.
46	/// The abstraction used is that of super-vectors, which provide a single,
47	/// compact, representation in the vector types, information that is expected
48	/// to reduce the impact of the phase ordering problem
49	///
50	/// Vector granularity:
51	/// ===================
52	/// This pass is designed to perform vectorization at a super-vector
53	/// granularity. A super-vector is loosely defined as a vector type that is a
54	/// multiple of a "good" vector size so the HW can efficiently implement a set
55	/// of high-level primitives. Multiple is understood along any dimension; e.g.
56	/// both vector<16xf32> and vector<2x8xf32> are valid super-vectors for a
57	/// vector<8xf32> HW vector. Note that a "good vector size so the HW can
58	/// efficiently implement a set of high-level primitives" is not necessarily an
59	/// integer multiple of actual hardware registers. We leave details of this
60	/// distinction unspecified for now.
61	///
62	/// Some may prefer the terminology a "tile of HW vectors". In this case, one
63	/// should note that super-vectors implement an "always full tile" abstraction.
64	/// They guarantee no partial-tile separation is necessary by relying on a
65	/// high-level copy-reshape abstraction that we call vector.transfer. This
66	/// copy-reshape operations is also responsible for performing layout
67	/// transposition if necessary. In the general case this will require a scoped
68	/// allocation in some notional local memory.
69	///
70	/// Whatever the mental model one prefers to use for this abstraction, the key
71	/// point is that we burn into a single, compact, representation in the vector
72	/// types, information that is expected to reduce the impact of the phase
73	/// ordering problem. Indeed, a vector type conveys information that:
74	/// 1. the associated loops have dependency semantics that do not prevent
75	/// vectorization;
76	/// 2. the associate loops have been sliced in chunks of static sizes that are
77	/// compatible with vector sizes (i.e. similar to unroll-and-jam);
78	/// 3. the inner loops, in the unroll-and-jam analogy of 2, are captured by
79	/// the
80	/// vector type and no vectorization hampering transformations can be
81	/// applied to them anymore;
82	/// 4. the underlying memrefs are accessed in some notional contiguous way
83	/// that allows loading into vectors with some amount of spatial locality;
84	/// In other words, super-vectorization provides a level of separation of
85	/// concern by way of opacity to subsequent passes. This has the effect of
86	/// encapsulating and propagating vectorization constraints down the list of
87	/// passes until we are ready to lower further.
88	///
89	/// For a particular target, a notion of minimal n-d vector size will be
90	/// specified and vectorization targets a multiple of those. In the following
91	/// paragraph, let "k ." represent "a multiple of", to be understood as a
92	/// multiple in the same dimension (e.g. vector<16 x k . 128> summarizes
93	/// vector<16 x 128>, vector<16 x 256>, vector<16 x 1024>, etc).
94	///
95	/// Some non-exhaustive notable super-vector sizes of interest include:
96	/// - CPU: vector<k . HW_vector_size>,
97	/// vector<k' . core_count x k . HW_vector_size>,
98	/// vector<socket_count x k' . core_count x k . HW_vector_size>;
99	/// - GPU: vector<k . warp_size>,
100	/// vector<k . warp_size x float2>,
101	/// vector<k . warp_size x float4>,
102	/// vector<k . warp_size x 4 x 4x 4> (for tensor_core sizes).
103	///
104	/// Loops and operations are emitted that operate on those super-vector shapes.
105	/// Subsequent lowering passes will materialize to actual HW vector sizes. These
106	/// passes are expected to be (gradually) more target-specific.
107	///
108	/// At a high level, a vectorized load in a loop will resemble:
109	/// ```mlir
110	/// affine.for %i = ? to ? step ? {
111	/// %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32>
112	/// }
113	/// ```
114	/// It is the responsibility of the implementation of vector.transfer_read to
115	/// materialize vector registers from the original scalar memrefs. A later (more
116	/// target-dependent) lowering pass will materialize to actual HW vector sizes.
117	/// This lowering may be occur at different times:
118	/// 1. at the MLIR level into a combination of loops, unrolling, DmaStartOp +
119	/// DmaWaitOp + vectorized operations for data transformations and shuffle;
120	/// thus opening opportunities for unrolling and pipelining. This is an
121	/// instance of library call "whiteboxing"; or
122	/// 2. later in the a target-specific lowering pass or hand-written library
123	/// call; achieving full separation of concerns. This is an instance of
124	/// library call; or
125	/// 3. a mix of both, e.g. based on a model.
126	/// In the future, these operations will expose a contract to constrain the
127	/// search on vectorization patterns and sizes.
128	///
129	/// Occurrence of super-vectorization in the compiler flow:
130	/// =======================================================
131	/// This is an active area of investigation. We start with 2 remarks to position
132	/// super-vectorization in the context of existing ongoing work: LLVM VPLAN
133	/// and LLVM SLP Vectorizer.
134	///
135	/// LLVM VPLAN:
136	/// -----------
137	/// The astute reader may have noticed that in the limit, super-vectorization
138	/// can be applied at a similar time and with similar objectives than VPLAN.
139	/// For instance, in the case of a traditional, polyhedral compilation-flow (for
140	/// instance, the PPCG project uses ISL to provide dependence analysis,
141	/// multi-level(scheduling + tiling), lifting footprint to fast memory,
142	/// communication synthesis, mapping, register optimizations) and before
143	/// unrolling. When vectorization is applied at this late* level in a typical*
144	/// polyhedral flow, and is instantiated with actual hardware vector sizes,
145	/// super-vectorization is expected to match (or subsume) the type of patterns
146	/// that LLVM's VPLAN aims at targeting. The main difference here is that MLIR
147	/// is higher level and our implementation should be significantly simpler. Also
148	/// note that in this mode, recursive patterns are probably a bit of an overkill
149	/// although it is reasonable to expect that mixing a bit of outer loop and
150	/// inner loop vectorization + unrolling will provide interesting choices to
151	/// MLIR.
152	///
153	/// LLVM SLP Vectorizer:
154	/// --------------------
155	/// Super-vectorization however is not meant to be usable in a similar fashion
156	/// to the SLP vectorizer. The main difference lies in the information that
157	/// both vectorizers use: super-vectorization examines contiguity of memory
158	/// references along fastest varying dimensions and loops with recursive nested
159	/// patterns capturing imperfectly-nested loop nests; the SLP vectorizer, on
160	/// the other hand, performs flat pattern matching inside a single unrolled loop
161	/// body and stitches together pieces of load and store operations into full
162	/// 1-D vectors. We envision that the SLP vectorizer is a good way to capture
163	/// innermost loop, control-flow dependent patterns that super-vectorization may
164	/// not be able to capture easily. In other words, super-vectorization does not
165	/// aim at replacing the SLP vectorizer and the two solutions are complementary.
166	///
167	/// Ongoing investigations:
168	/// -----------------------
169	/// We discuss the following early* places where super-vectorization is*
170	/// applicable and touch on the expected benefits and risks . We list the
171	/// opportunities in the context of the traditional polyhedral compiler flow
172	/// described in PPCG. There are essentially 6 places in the MLIR pass pipeline
173	/// we expect to experiment with super-vectorization:
174	/// 1. Right after language lowering to MLIR: this is the earliest time where
175	/// super-vectorization is expected to be applied. At this level, all the
176	/// language/user/library-level annotations are available and can be fully
177	/// exploited. Examples include loop-type annotations (such as parallel,
178	/// reduction, scan, dependence distance vector, vectorizable) as well as
179	/// memory access annotations (such as non-aliasing writes guaranteed,
180	/// indirect accesses that are permutations by construction) accesses or
181	/// that a particular operation is prescribed atomic by the user. At this
182	/// level, anything that enriches what dependence analysis can do should be
183	/// aggressively exploited. At this level we are close to having explicit
184	/// vector types in the language, except we do not impose that burden on the
185	/// programmer/library: we derive information from scalar code + annotations.
186	/// 2. After dependence analysis and before polyhedral scheduling: the
187	/// information that supports vectorization does not need to be supplied by a
188	/// higher level of abstraction. Traditional dependence analysis is available
189	/// in MLIR and will be used to drive vectorization and cost models.
190	///
191	/// Let's pause here and remark that applying super-vectorization as described
192	/// in 1. and 2. presents clear opportunities and risks:
193	/// - the opportunity is that vectorization is burned in the type system and
194	/// is protected from the adverse effect of loop scheduling, tiling, loop
195	/// interchange and all passes downstream. Provided that subsequent passes are
196	/// able to operate on vector types; the vector shapes, associated loop
197	/// iterator properties, alignment, and contiguity of fastest varying
198	/// dimensions are preserved until we lower the super-vector types. We expect
199	/// this to significantly rein in on the adverse effects of phase ordering.
200	/// - the risks are that a. all passes after super-vectorization have to work
201	/// on elemental vector types (not that this is always true, wherever
202	/// vectorization is applied) and b. that imposing vectorization constraints
203	/// too early may be overall detrimental to loop fusion, tiling and other
204	/// transformations because the dependence distances are coarsened when
205	/// operating on elemental vector types. For this reason, the pattern
206	/// profitability analysis should include a component that also captures the
207	/// maximal amount of fusion available under a particular pattern. This is
208	/// still at the stage of rough ideas but in this context, search is our
209	/// friend as the Tensor Comprehensions and auto-TVM contributions
210	/// demonstrated previously.
211	/// Bottom-line is we do not yet have good answers for the above but aim at
212	/// making it easy to answer such questions.
213	///
214	/// Back to our listing, the last places where early super-vectorization makes
215	/// sense are:
216	/// 3. right after polyhedral-style scheduling: PLUTO-style algorithms are known
217	/// to improve locality, parallelism and be configurable (e.g. max-fuse,
218	/// smart-fuse etc). They can also have adverse effects on contiguity
219	/// properties that are required for vectorization but the vector.transfer
220	/// copy-reshape-pad-transpose abstraction is expected to help recapture
221	/// these properties.
222	/// 4. right after polyhedral-style scheduling+tiling;
223	/// 5. right after scheduling+tiling+rescheduling: points 4 and 5 represent
224	/// probably the most promising places because applying tiling achieves a
225	/// separation of concerns that allows rescheduling to worry less about
226	/// locality and more about parallelism and distribution (e.g. min-fuse).
227	///
228	/// At these levels the risk-reward looks different: on one hand we probably
229	/// lost a good deal of language/user/library-level annotation; on the other
230	/// hand we gained parallelism and locality through scheduling and tiling.
231	/// However we probably want to ensure tiling is compatible with the
232	/// full-tile-only abstraction used in super-vectorization or suffer the
233	/// consequences. It is too early to place bets on what will win but we expect
234	/// super-vectorization to be the right abstraction to allow exploring at all
235	/// these levels. And again, search is our friend.
236	///
237	/// Lastly, we mention it again here:
238	/// 6. as a MLIR-based alternative to VPLAN.
239	///
240	/// Lowering, unrolling, pipelining:
241	/// ================================
242	/// TODO: point to the proper places.
243	///
244	/// Algorithm:
245	/// ==========
246	/// The algorithm proceeds in a few steps:
247	/// 1. defining super-vectorization patterns and matching them on the tree of
248	/// AffineForOp. A super-vectorization pattern is defined as a recursive
249	/// data structures that matches and captures nested, imperfectly-nested
250	/// loops that have a. conformable loop annotations attached (e.g. parallel,
251	/// reduction, vectorizable, ...) as well as b. all contiguous load/store
252	/// operations along a specified minor dimension (not necessarily the
253	/// fastest varying) ;
254	/// 2. analyzing those patterns for profitability (TODO: and
255	/// interference);
256	/// 3. then, for each pattern in order:
257	/// a. applying iterative rewriting of the loops and all their nested
258	/// operations in topological order. Rewriting is implemented by
259	/// coarsening the loops and converting operations and operands to their
260	/// vector forms. Processing operations in topological order is relatively
261	/// simple due to the structured nature of the control-flow
262	/// representation. This order ensures that all the operands of a given
263	/// operation have been vectorized before the operation itself in a single
264	/// traversal, except for operands defined outside of the loop nest. The
265	/// algorithm can convert the following operations to their vector form:
266	/// Affine load and store operations are converted to opaque vector*
267	/// transfer read and write operations.
268	/// Scalar constant operations/operands are converted to vector*
269	/// constant operations (splat).
270	/// Uniform operands (only induction variables of loops not mapped to*
271	/// a vector dimension, or operands defined outside of the loop nest
272	/// for now) are broadcasted to a vector.
273	/// TODO: Support more uniform cases.
274	/// Affine for operations with 'iter_args' are vectorized by*
275	/// vectorizing their 'iter_args' operands and results.
276	/// TODO: Support more complex loops with divergent lbs and/or ubs.
277	/// The remaining operations in the loop nest are vectorized by*
278	/// widening their scalar types to vector types.
279	/// b. if everything under the root AffineForOp in the current pattern
280	/// is vectorized properly, we commit that loop to the IR and remove the
281	/// scalar loop. Otherwise, we discard the vectorized loop and keep the
282	/// original scalar loop.
283	/// c. vectorization is applied on the next pattern in the list. Because
284	/// pattern interference avoidance is not yet implemented and that we do
285	/// not support further vectorizing an already vector load we need to
286	/// re-verify that the pattern is still vectorizable. This is expected to
287	/// make cost models more difficult to write and is subject to improvement
288	/// in the future.
289	///
290	/// Choice of loop transformation to support the algorithm:
291	/// =======================================================
292	/// The choice of loop transformation to apply for coarsening vectorized loops
293	/// is still subject to exploratory tradeoffs. In particular, say we want to
294	/// vectorize by a factor 128, we want to transform the following input:
295	/// ```mlir
296	/// affine.for %i = %M to %N {
297	/// %a = affine.load %A[%i] : memref<?xf32>
298	/// }
299	/// ```
300	///
301	/// Traditionally, one would vectorize late (after scheduling, tiling,
302	/// memory promotion etc) say after stripmining (and potentially unrolling in
303	/// the case of LLVM's SLP vectorizer):
304	/// ```mlir
305	/// affine.for %i = floor(%M, 128) to ceil(%N, 128) {
306	/// affine.for %ii = max(%M, 128 %i) to min(%N, 128%i + 127) {
307	/// %a = affine.load %A[%ii] : memref<?xf32>
308	/// }
309	/// }
310	/// ```
311	///
312	/// Instead, we seek to vectorize early and freeze vector types before
313	/// scheduling, so we want to generate a pattern that resembles:
314	/// ```mlir
315	/// affine.for %i = ? to ? step ? {
316	/// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
317	/// }
318	/// ```
319	///
320	/// i. simply dividing the lower / upper bounds by 128 creates issues
321	/// when representing expressions such as ii + 1 because now we only
322	/// have access to original values that have been divided. Additional
323	/// information is needed to specify accesses at below-128 granularity;
324	/// ii. another alternative is to coarsen the loop step but this may have
325	/// consequences on dependence analysis and fusability of loops: fusable
326	/// loops probably need to have the same step (because we don't want to
327	/// stripmine/unroll to enable fusion).
328	/// As a consequence, we choose to represent the coarsening using the loop
329	/// step for now and reevaluate in the future. Note that we can renormalize
330	/// loop steps later if/when we have evidence that they are problematic.
331	///
332	/// For the simple strawman example above, vectorizing for a 1-D vector
333	/// abstraction of size 128 returns code similar to:
334	/// ```mlir
335	/// affine.for %i = %M to %N step 128 {
336	/// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
337	/// }
338	/// ```
339	///
340	/// Unsupported cases, extensions, and work in progress (help welcome :-) ):
341	/// ========================================================================
342	/// 1. lowering to concrete vector types for various HW;
343	/// 2. reduction support for n-D vectorization and non-unit steps;
344	/// 3. non-effecting padding during vector.transfer_read and filter during
345	/// vector.transfer_write;
346	/// 4. misalignment support vector.transfer_read / vector.transfer_write
347	/// (hopefully without read-modify-writes);
348	/// 5. control-flow support;
349	/// 6. cost-models, heuristics and search;
350	/// 7. Op implementation, extensions and implication on memref views;
351	/// 8. many TODOs left around.
352	///
353	/// Examples:
354	/// =========
355	/// Consider the following Function:
356	/// ```mlir
357	/// func @vector_add_2d(%M : index, %N : index) -> f32 {
358	/// %A = alloc (%M, %N) : memref<?x?xf32, 0>
359	/// %B = alloc (%M, %N) : memref<?x?xf32, 0>
360	/// %C = alloc (%M, %N) : memref<?x?xf32, 0>
361	/// %f1 = arith.constant 1.0 : f32
362	/// %f2 = arith.constant 2.0 : f32
363	/// affine.for %i0 = 0 to %M {
364	/// affine.for %i1 = 0 to %N {
365	/// // non-scoped %f1
366	/// affine.store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
367	/// }
368	/// }
369	/// affine.for %i2 = 0 to %M {
370	/// affine.for %i3 = 0 to %N {
371	/// // non-scoped %f2
372	/// affine.store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
373	/// }
374	/// }
375	/// affine.for %i4 = 0 to %M {
376	/// affine.for %i5 = 0 to %N {
377	/// %a5 = affine.load %A[%i4, %i5] : memref<?x?xf32, 0>
378	/// %b5 = affine.load %B[%i4, %i5] : memref<?x?xf32, 0>
379	/// %s5 = arith.addf %a5, %b5 : f32
380	/// // non-scoped %f1
381	/// %s6 = arith.addf %s5, %f1 : f32
382	/// // non-scoped %f2
383	/// %s7 = arith.addf %s5, %f2 : f32
384	/// // diamond dependency.
385	/// %s8 = arith.addf %s7, %s6 : f32
386	/// affine.store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
387	/// }
388	/// }
389	/// %c7 = arith.constant 7 : index
390	/// %c42 = arith.constant 42 : index
391	/// %res = load %C[%c7, %c42] : memref<?x?xf32, 0>
392	/// return %res : f32
393	/// }
394	/// ```
395	///
396	/// The -affine-super-vectorize pass with the following arguments:
397	/// ```
398	/// -affine-super-vectorize="virtual-vector-size=256 test-fastest-varying=0"
399	/// ```
400	///
401	/// produces this standard innermost-loop vectorized code:
402	/// ```mlir
403	/// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
404	/// %0 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
405	/// %1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
406	/// %2 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
407	/// %cst = arith.constant 1.0 : f32
408	/// %cst_0 = arith.constant 2.0 : f32
409	/// affine.for %i0 = 0 to %arg0 {
410	/// affine.for %i1 = 0 to %arg1 step 256 {
411	/// %cst_1 = arith.constant dense<vector<256xf32>, 1.0> :
412	/// vector<256xf32>
413	/// vector.transfer_write %cst_1, %0[%i0, %i1] :
414	/// vector<256xf32>, memref<?x?xf32>
415	/// }
416	/// }
417	/// affine.for %i2 = 0 to %arg0 {
418	/// affine.for %i3 = 0 to %arg1 step 256 {
419	/// %cst_2 = arith.constant dense<vector<256xf32>, 2.0> :
420	/// vector<256xf32>
421	/// vector.transfer_write %cst_2, %1[%i2, %i3] :
422	/// vector<256xf32>, memref<?x?xf32>
423	/// }
424	/// }
425	/// affine.for %i4 = 0 to %arg0 {
426	/// affine.for %i5 = 0 to %arg1 step 256 {
427	/// %3 = vector.transfer_read %0[%i4, %i5] :
428	/// memref<?x?xf32>, vector<256xf32>
429	/// %4 = vector.transfer_read %1[%i4, %i5] :
430	/// memref<?x?xf32>, vector<256xf32>
431	/// %5 = arith.addf %3, %4 : vector<256xf32>
432	/// %cst_3 = arith.constant dense<vector<256xf32>, 1.0> :
433	/// vector<256xf32>
434	/// %6 = arith.addf %5, %cst_3 : vector<256xf32>
435	/// %cst_4 = arith.constant dense<vector<256xf32>, 2.0> :
436	/// vector<256xf32>
437	/// %7 = arith.addf %5, %cst_4 : vector<256xf32>
438	/// %8 = arith.addf %7, %6 : vector<256xf32>
439	/// vector.transfer_write %8, %2[%i4, %i5] :
440	/// vector<256xf32>, memref<?x?xf32>
441	/// }
442	/// }
443	/// %c7 = arith.constant 7 : index
444	/// %c42 = arith.constant 42 : index
445	/// %9 = load %2[%c7, %c42] : memref<?x?xf32>
446	/// return %9 : f32
447	/// }
448	/// ```
449	///
450	/// The -affine-super-vectorize pass with the following arguments:
451	/// ```
452	/// -affine-super-vectorize="virtual-vector-size=32,256 \
453	/// test-fastest-varying=1,0"
454	/// ```
455	///
456	/// produces this more interesting mixed outer-innermost-loop vectorized code:
457	/// ```mlir
458	/// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
459	/// %0 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
460	/// %1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
461	/// %2 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
462	/// %cst = arith.constant 1.0 : f32
463	/// %cst_0 = arith.constant 2.0 : f32
464	/// affine.for %i0 = 0 to %arg0 step 32 {
465	/// affine.for %i1 = 0 to %arg1 step 256 {
466	/// %cst_1 = arith.constant dense<vector<32x256xf32>, 1.0> :
467	/// vector<32x256xf32>
468	/// vector.transfer_write %cst_1, %0[%i0, %i1] :
469	/// vector<32x256xf32>, memref<?x?xf32>
470	/// }
471	/// }
472	/// affine.for %i2 = 0 to %arg0 step 32 {
473	/// affine.for %i3 = 0 to %arg1 step 256 {
474	/// %cst_2 = arith.constant dense<vector<32x256xf32>, 2.0> :
475	/// vector<32x256xf32>
476	/// vector.transfer_write %cst_2, %1[%i2, %i3] :
477	/// vector<32x256xf32>, memref<?x?xf32>
478	/// }
479	/// }
480	/// affine.for %i4 = 0 to %arg0 step 32 {
481	/// affine.for %i5 = 0 to %arg1 step 256 {
482	/// %3 = vector.transfer_read %0[%i4, %i5] :
483	/// memref<?x?xf32> vector<32x256xf32>
484	/// %4 = vector.transfer_read %1[%i4, %i5] :
485	/// memref<?x?xf32>, vector<32x256xf32>
486	/// %5 = arith.addf %3, %4 : vector<32x256xf32>
487	/// %cst_3 = arith.constant dense<vector<32x256xf32>, 1.0> :
488	/// vector<32x256xf32>
489	/// %6 = arith.addf %5, %cst_3 : vector<32x256xf32>
490	/// %cst_4 = arith.constant dense<vector<32x256xf32>, 2.0> :
491	/// vector<32x256xf32>
492	/// %7 = arith.addf %5, %cst_4 : vector<32x256xf32>
493	/// %8 = arith.addf %7, %6 : vector<32x256xf32>
494	/// vector.transfer_write %8, %2[%i4, %i5] :
495	/// vector<32x256xf32>, memref<?x?xf32>
496	/// }
497	/// }
498	/// %c7 = arith.constant 7 : index
499	/// %c42 = arith.constant 42 : index
500	/// %9 = load %2[%c7, %c42] : memref<?x?xf32>
501	/// return %9 : f32
502	/// }
503	/// ```
504	///
505	/// Of course, much more intricate n-D imperfectly-nested patterns can be
506	/// vectorized too and specified in a fully declarative fashion.
507	///
508	/// Reduction:
509	/// ==========
510	/// Vectorizing reduction loops along the reduction dimension is supported if:
511	/// - the reduction kind is supported,
512	/// - the vectorization is 1-D, and
513	/// - the step size of the loop equals to one.
514	///
515	/// Comparing to the non-vector-dimension case, two additional things are done
516	/// during vectorization of such loops:
517	/// - The resulting vector returned from the loop is reduced to a scalar using
518	/// `vector.reduce`.
519	/// - In some cases a mask is applied to the vector yielded at the end of the
520	/// loop to prevent garbage values from being written to the accumulator.
521	///
522	/// Reduction vectorization is switched off by default, it can be enabled by
523	/// passing a map from loops to reductions to utility functions, or by passing
524	/// `vectorize-reductions=true` to the vectorization pass.
525	///
526	/// Consider the following example:
527	/// ```mlir
528	/// func @vecred(%in: memref<512xf32>) -> f32 {
529	/// %cst = arith.constant 0.000000e+00 : f32
530	/// %sum = affine.for %i = 0 to 500 iter_args(%part_sum = %cst) -> (f32) {
531	/// %ld = affine.load %in[%i] : memref<512xf32>
532	/// %cos = math.cos %ld : f32
533	/// %add = arith.addf %part_sum, %cos : f32
534	/// affine.yield %add : f32
535	/// }
536	/// return %sum : f32
537	/// }
538	/// ```
539	///
540	/// The -affine-super-vectorize pass with the following arguments:
541	/// ```
542	/// -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 \
543	/// vectorize-reductions=true"
544	/// ```
545	/// produces the following output:
546	/// ```mlir
547	/// #map = affine_map<(d0) -> (-d0 + 500)>
548	/// func @vecred(%arg0: memref<512xf32>) -> f32 {
549	/// %cst = arith.constant 0.000000e+00 : f32
550	/// %cst_0 = arith.constant dense<0.000000e+00> : vector<128xf32>
551	/// %0 = affine.for %arg1 = 0 to 500 step 128 iter_args(%arg2 = %cst_0)
552	/// -> (vector<128xf32>) {
553	/// // %2 is the number of iterations left in the original loop.
554	/// %2 = affine.apply #map(%arg1)
555	/// %3 = vector.create_mask %2 : vector<128xi1>
556	/// %cst_1 = arith.constant 0.000000e+00 : f32
557	/// %4 = vector.transfer_read %arg0[%arg1], %cst_1 :
558	/// memref<512xf32>, vector<128xf32>
559	/// %5 = math.cos %4 : vector<128xf32>
560	/// %6 = arith.addf %arg2, %5 : vector<128xf32>
561	/// // We filter out the effect of last 12 elements using the mask.
562	/// %7 = select %3, %6, %arg2 : vector<128xi1>, vector<128xf32>
563	/// affine.yield %7 : vector<128xf32>
564	/// }
565	/// %1 = vector.reduction <add>, %0 : vector<128xf32> into f32
566	/// return %1 : f32
567	/// }
568	/// ```
569	///
570	/// Note that because of loop misalignment we needed to apply a mask to prevent
571	/// last 12 elements from affecting the final result. The mask is full of ones
572	/// in every iteration except for the last one, in which it has the form
573	/// `11...100...0` with 116 ones and 12 zeros.
574
575	#define DEBUG_TYPE "early-vect"
576
577	using llvm::dbgs;
578
579	/// Forward declaration.
580	static FilterFunctionType
581	isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
582	int fastestVaryingMemRefDimension);
583
584	/// Creates a vectorization pattern from the command line arguments.
585	/// Up to 3-D patterns are supported.
586	/// If the command line argument requests a pattern of higher order, returns an
587	/// empty pattern list which will conservatively result in no vectorization.
588	static std::optional<NestedPattern>
589	makePattern(const DenseSet<Operation > &parallelLoops, int* vectorRank,
590	ArrayRef<int64_t> fastestVaryingPattern) {
591	using affine::matcher::For;
592	int64_t d0 = fastestVaryingPattern.empty() ? -`1` : fastestVaryingPattern [`0`];
593	int64_t d1 = fastestVaryingPattern.size() < `2` ? -`1` : fastestVaryingPattern [`1`];
594	int64_t d2 = fastestVaryingPattern.size() < `3` ? -`1` : fastestVaryingPattern [`2`];
595	switch (vectorRank) {
596	case `1`:
597	return For(filter: isVectorizableLoopPtrFactory(parallelLoops, fastestVaryingMemRefDimension: d0));
598	case `2`:
599	return For(filter: isVectorizableLoopPtrFactory(parallelLoops, fastestVaryingMemRefDimension: d0),
600	child: For(filter: isVectorizableLoopPtrFactory(parallelLoops, fastestVaryingMemRefDimension: d1)));
601	case `3`:
602	return For(filter: isVectorizableLoopPtrFactory(parallelLoops, fastestVaryingMemRefDimension: d0),
603	child: For(filter: isVectorizableLoopPtrFactory(parallelLoops, fastestVaryingMemRefDimension: d1),
604	child: For(filter: isVectorizableLoopPtrFactory(parallelLoops, fastestVaryingMemRefDimension: d2))));
605	default: {
606	return std::nullopt;
607	}
608	}
609	}
610
611	static NestedPattern &vectorTransferPattern() {
612	static auto pattern = affine::matcher::Op(
613	filter: llvm::IsaPred<vector::TransferReadOp, vector::TransferWriteOp>);
614	return pattern;
615	}
616
617	namespace {
618
619	/// Base state for the vectorize pass.
620	/// Command line arguments are preempted by non-empty pass arguments.
621	struct Vectorize : public affine::impl::AffineVectorizeBase<Vectorize> {
622	using Base::Base;
623
624	void runOnOperation() override;
625	};
626
627	} // namespace
628
629	static void vectorizeLoopIfProfitable(Operation loop, unsigned* depthInPattern,
630	unsigned patternDepth,
631	VectorizationStrategy *strategy) {
632	assert(patternDepth > depthInPattern &&
633	"patternDepth is greater than depthInPattern");
634	if (patternDepth - depthInPattern > strategy->vectorSizes.size()) {
635	// Don't vectorize this loop
636	return;
637	}
638	strategy->loopToVectorDim [loop] =
639	strategy->vectorSizes.size() - (patternDepth - depthInPattern);
640	}
641
642	/// Implements a simple strawman strategy for vectorization.
643	/// Given a matched pattern `matches` of depth `patternDepth`, this strategy
644	/// greedily assigns the fastest varying dimension * of the vector ** to the*
645	/// innermost loop in the pattern.
646	/// When coupled with a pattern that looks for the fastest varying dimension in
647	/// load/store MemRefs, this creates a generic vectorization strategy that works
648	/// for any loop in a hierarchy (outermost, innermost or intermediate).
649	///
650	/// TODO: In the future we should additionally increase the power of the
651	/// profitability analysis along 3 directions:
652	/// 1. account for loop extents (both static and parametric + annotations);
653	/// 2. account for data layout permutations;
654	/// 3. account for impact of vectorization on maximal loop fusion.
655	/// Then we can quantify the above to build a cost model and search over
656	/// strategies.
657	static LogicalResult analyzeProfitability(ArrayRef<NestedMatch> matches,
658	unsigned depthInPattern,
659	unsigned patternDepth,
660	VectorizationStrategy *strategy) {
661	for (auto m : matches) {
662	if (failed(result: analyzeProfitability(matches: m.getMatchedChildren(), depthInPattern: depthInPattern + `1`,
663	patternDepth, strategy))) {
664	return failure();
665	}
666	vectorizeLoopIfProfitable(loop: m.getMatchedOperation(), depthInPattern,
667	patternDepth, strategy);
668	}
669	return success();
670	}
671
672	///// end TODO: Hoist to a VectorizationStrategy.cpp when appropriate /////
673
674	namespace {
675
676	struct VectorizationState {
677
678	VectorizationState(MLIRContext *context) : builder (context) {}
679
680	/// Registers the vector replacement of a scalar operation and its result
681	/// values. Both operations must have the same number of results.
682	///
683	/// This utility is used to register the replacement for the vast majority of
684	/// the vectorized operations.
685	///
686	/// Example:
687	/// 'replaced': %0 = arith.addf %1, %2 : f32*
688	/// 'replacement': %0 = arith.addf %1, %2 : vector<128xf32>*
689	void registerOpVectorReplacement(Operation replaced, Operation replacement);
690
691	/// Registers the vector replacement of a scalar value. The replacement
692	/// operation should have a single result, which replaces the scalar value.
693	///
694	/// This utility is used to register the vector replacement of block arguments
695	/// and operation results which are not directly vectorized (i.e., their
696	/// scalar version still exists after vectorization), like uniforms.
697	///
698	/// Example:
699	/// 'replaced': block argument or operation outside of the vectorized*
700	/// loop.
701	/// 'replacement': %0 = vector.broadcast %1 : f32 to vector<128xf32>*
702	void registerValueVectorReplacement(Value replaced, Operation *replacement);
703
704	/// Registers the vector replacement of a block argument (e.g., iter_args).
705	///
706	/// Example:
707	/// 'replaced': 'iter_arg' block argument.*
708	/// 'replacement': vectorized 'iter_arg' block argument.*
709	void registerBlockArgVectorReplacement(BlockArgument replaced,
710	BlockArgument replacement);
711
712	/// Registers the scalar replacement of a scalar value. 'replacement' must be
713	/// scalar.
714	///
715	/// This utility is used to register the replacement of block arguments
716	/// or affine.apply results that are within the loop be vectorized and will
717	/// continue being scalar within the vector loop.
718	///
719	/// Example:
720	/// 'replaced': induction variable of a loop to be vectorized.*
721	/// 'replacement': new induction variable in the new vector loop.*
722	void registerValueScalarReplacement(Value replaced, Value replacement);
723
724	/// Registers the scalar replacement of a scalar result returned from a
725	/// reduction loop. 'replacement' must be scalar.
726	///
727	/// This utility is used to register the replacement for scalar results of
728	/// vectorized reduction loops with iter_args.
729	///
730	/// Example 2:
731	/// 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)*
732	/// 'replacement': %1 = vector.reduction <add>, %0 : vector<4xf32> into*
733	/// f32
734	void registerLoopResultScalarReplacement(Value replaced, Value replacement);
735
736	/// Returns in 'replacedVals' the scalar replacement for values in
737	/// 'inputVals'.
738	void getScalarValueReplacementsFor(ValueRange inputVals,
739	SmallVectorImpl<Value> &replacedVals);
740
741	/// Erases the scalar loop nest after its successful vectorization.
742	void finishVectorizationPattern(AffineForOp rootLoop);
743
744	// Used to build and insert all the new operations created. The insertion
745	// point is preserved and updated along the vectorization process.
746	OpBuilder builder;
747
748	// Maps input scalar operations to their vector counterparts.
749	DenseMap<Operation , Operation > opVectorReplacement;
750	// Maps input scalar values to their vector counterparts.
751	IRMapping valueVectorReplacement;
752	// Maps input scalar values to their new scalar counterparts in the vector
753	// loop nest.
754	IRMapping valueScalarReplacement;
755	// Maps results of reduction loops to their new scalar counterparts.
756	DenseMap<Value, Value> loopResultScalarReplacement;
757
758	// Maps the newly created vector loops to their vector dimension.
759	DenseMap<Operation , unsigned*> vecLoopToVecDim;
760
761	// Maps the new vectorized loops to the corresponding vector masks if it is
762	// required.
763	DenseMap<Operation *, Value> vecLoopToMask;
764
765	// The strategy drives which loop to vectorize by which amount.
766	const VectorizationStrategy strategy = nullptr*;
767
768	private:
769	/// Internal implementation to map input scalar values to new vector or scalar
770	/// values.
771	void registerValueVectorReplacementImpl(Value replaced, Value replacement);
772	};
773
774	} // namespace
775
776	/// Registers the vector replacement of a scalar operation and its result
777	/// values. Both operations must have the same number of results.
778	///
779	/// This utility is used to register the replacement for the vast majority of
780	/// the vectorized operations.
781	///
782	/// Example:
783	/// 'replaced': %0 = arith.addf %1, %2 : f32*
784	/// 'replacement': %0 = arith.addf %1, %2 : vector<128xf32>*
785	void VectorizationState::registerOpVectorReplacement(Operation *replaced,
786	Operation *replacement) {
787	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op:\n");
788	LLVM_DEBUG(dbgs() << *replaced << "\n");
789	LLVM_DEBUG(dbgs() << "into\n");
790	LLVM_DEBUG(dbgs() << *replacement << "\n");
791
792	assert(replaced->getNumResults() == replacement->getNumResults() &&
793	"Unexpected replaced and replacement results");
794	assert(opVectorReplacement.count(replaced) == `0` && "already registered");
795	opVectorReplacement [replaced] = replacement;
796
797	for (auto resultTuple :
798	llvm::zip(t: replaced->getResults(), u: replacement->getResults()))
799	registerValueVectorReplacementImpl(replaced: std::get<`0`>(t&: resultTuple),
800	replacement: std::get<`1`>(t&: resultTuple));
801	}
802
803	/// Registers the vector replacement of a scalar value. The replacement
804	/// operation should have a single result, which replaces the scalar value.
805	///
806	/// This utility is used to register the vector replacement of block arguments
807	/// and operation results which are not directly vectorized (i.e., their
808	/// scalar version still exists after vectorization), like uniforms.
809	///
810	/// Example:
811	/// 'replaced': block argument or operation outside of the vectorized loop.*
812	/// 'replacement': %0 = vector.broadcast %1 : f32 to vector<128xf32>*
813	void VectorizationState::registerValueVectorReplacement(
814	Value replaced, Operation *replacement) {
815	assert(replacement->getNumResults() == `1` &&
816	"Expected single-result replacement");
817	if (Operation *defOp = replaced.getDefiningOp())
818	registerOpVectorReplacement(replaced: defOp, replacement);
819	else
820	registerValueVectorReplacementImpl(replaced, replacement: replacement->getResult(idx: `0`));
821	}
822
823	/// Registers the vector replacement of a block argument (e.g., iter_args).
824	///
825	/// Example:
826	/// 'replaced': 'iter_arg' block argument.*
827	/// 'replacement': vectorized 'iter_arg' block argument.*
828	void VectorizationState::registerBlockArgVectorReplacement(
829	BlockArgument replaced, BlockArgument replacement) {
830	registerValueVectorReplacementImpl(replaced, replacement);
831	}
832
833	void VectorizationState::registerValueVectorReplacementImpl(Value replaced,
834	Value replacement) {
835	assert(!valueVectorReplacement.contains(replaced) &&
836	"Vector replacement already registered");
837	assert(isa<VectorType>(replacement.getType()) &&
838	"Expected vector type in vector replacement");
839	valueVectorReplacement.map(from: replaced, to: replacement);
840	}
841
842	/// Registers the scalar replacement of a scalar value. 'replacement' must be
843	/// scalar.
844	///
845	/// This utility is used to register the replacement of block arguments
846	/// or affine.apply results that are within the loop be vectorized and will
847	/// continue being scalar within the vector loop.
848	///
849	/// Example:
850	/// 'replaced': induction variable of a loop to be vectorized.*
851	/// 'replacement': new induction variable in the new vector loop.*
852	void VectorizationState::registerValueScalarReplacement(Value replaced,
853	Value replacement) {
854	assert(!valueScalarReplacement.contains(replaced) &&
855	"Scalar value replacement already registered");
856	assert(!isa<VectorType>(replacement.getType()) &&
857	"Expected scalar type in scalar replacement");
858	valueScalarReplacement.map(from: replaced, to: replacement);
859	}
860
861	/// Registers the scalar replacement of a scalar result returned from a
862	/// reduction loop. 'replacement' must be scalar.
863	///
864	/// This utility is used to register the replacement for scalar results of
865	/// vectorized reduction loops with iter_args.
866	///
867	/// Example 2:
868	/// 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)*
869	/// 'replacement': %1 = vector.reduction <add>, %0 : vector<4xf32> into f32*
870	void VectorizationState::registerLoopResultScalarReplacement(
871	Value replaced, Value replacement) {
872	assert(isa<AffineForOp>(replaced.getDefiningOp()));
873	assert(loopResultScalarReplacement.count(replaced) == `0` &&
874	"already registered");
875	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ will replace a result of the loop "
876	"with scalar: "
877	<< replacement);
878	loopResultScalarReplacement [replaced] = replacement;
879	}
880
881	/// Returns in 'replacedVals' the scalar replacement for values in 'inputVals'.
882	void VectorizationState::getScalarValueReplacementsFor(
883	ValueRange inputVals, SmallVectorImpl<Value> &replacedVals) {
884	for (Value inputVal : inputVals)
885	replacedVals.push_back(Elt: valueScalarReplacement.lookupOrDefault(from: inputVal));
886	}
887
888	/// Erases a loop nest, including all its nested operations.
889	static void eraseLoopNest(AffineForOp forOp) {
890	LLVM_DEBUG(dbgs() << "[early-vect]+++++ erasing:\n" << forOp << "\n");
891	forOp.erase();
892	}
893
894	/// Erases the scalar loop nest after its successful vectorization.
895	void VectorizationState::finishVectorizationPattern(AffineForOp rootLoop) {
896	LLVM_DEBUG(dbgs() << "\n[early-vect] Finalizing vectorization\n");
897	eraseLoopNest(rootLoop);
898	}
899
900	// Apply 'map' with 'mapOperands' returning resulting values in 'results'.
901	static void computeMemoryOpIndices(Operation *op, AffineMap map,
902	ValueRange mapOperands,
903	VectorizationState &state,
904	SmallVectorImpl<Value> &results) {
905	for (auto resultExpr : map.getResults()) {
906	auto singleResMap =
907	AffineMap::get(dimCount: map.getNumDims(), symbolCount: map.getNumSymbols(), result: resultExpr);
908	auto afOp = state.builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
909	mapOperands);
910	results.push_back(Elt: afOp);
911	}
912	}
913
914	/// Returns a FilterFunctionType that can be used in NestedPattern to match a
915	/// loop whose underlying load/store accesses are either invariant or all
916	// varying along the `fastestVaryingMemRefDimension`.
917	static FilterFunctionType
918	isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
919	int fastestVaryingMemRefDimension) {
920	return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
921	auto loop = cast<AffineForOp>(forOp);
922	if (!parallelLoops.contains(V: loop))
923	return false;
924	int memRefDim = -`1`;
925	auto vectorizableBody =
926	isVectorizableLoopBody(loop, &memRefDim, vectorTransferPattern());
927	if (!vectorizableBody)
928	return false;
929	return memRefDim == -`1` \|\| fastestVaryingMemRefDimension == -`1` \|\|
930	memRefDim == fastestVaryingMemRefDimension;
931	};
932	}
933
934	/// Returns the vector type resulting from applying the provided vectorization
935	/// strategy on the scalar type.
936	static VectorType getVectorType(Type scalarTy,
937	const VectorizationStrategy *strategy) {
938	assert(!isa<VectorType>(scalarTy) && "Expected scalar type");
939	return VectorType::get(strategy->vectorSizes, scalarTy);
940	}
941
942	/// Tries to transform a scalar constant into a vector constant. Returns the
943	/// vector constant if the scalar type is valid vector element type. Returns
944	/// nullptr, otherwise.
945	static arith::ConstantOp vectorizeConstant(arith::ConstantOp constOp,
946	VectorizationState &state) {
947	Type scalarTy = constOp.getType();
948	if (!VectorType::isValidElementType(scalarTy))
949	return nullptr;
950
951	auto vecTy = getVectorType(scalarTy, state.strategy);
952	auto vecAttr = DenseElementsAttr::get(vecTy, constOp.getValue());
953
954	OpBuilder::InsertionGuard guard(state.builder);
955	Operation *parentOp = state.builder.getInsertionBlock()->getParentOp();
956	// Find the innermost vectorized ancestor loop to insert the vector constant.
957	while (parentOp && !state.vecLoopToVecDim.count(Val: parentOp))
958	parentOp = parentOp->getParentOp();
959	assert(parentOp && state.vecLoopToVecDim.count(parentOp) &&
960	isa<AffineForOp>(parentOp) && "Expected a vectorized for op");
961	auto vecForOp = cast<AffineForOp>(parentOp);
962	state.builder.setInsertionPointToStart(vecForOp.getBody());
963	auto newConstOp =
964	state.builder.create<arith::ConstantOp>(constOp.getLoc(), vecAttr);
965
966	// Register vector replacement for future uses in the scope.
967	state.registerOpVectorReplacement(replaced: constOp, replacement: newConstOp);
968	return newConstOp;
969	}
970
971	/// We have no need to vectorize affine.apply. However, we still need to
972	/// generate it and replace the operands with values in valueScalarReplacement.
973	static Operation *vectorizeAffineApplyOp(AffineApplyOp applyOp,
974	VectorizationState &state) {
975	SmallVector<Value, `8`> updatedOperands;
976	for (Value operand : applyOp.getOperands()) {
977	if (state.valueVectorReplacement.contains(operand)) {
978	LLVM_DEBUG(
979	dbgs() << "\n[early-vect]+++++ affine.apply on vector operand\n");
980	return nullptr;
981	} else {
982	Value updatedOperand = state.valueScalarReplacement.lookupOrNull(operand);
983	if (!updatedOperand)
984	updatedOperand = operand;
985	updatedOperands.push_back(updatedOperand);
986	}
987	}
988
989	auto newApplyOp = state.builder.create<AffineApplyOp>(
990	applyOp.getLoc(), applyOp.getAffineMap(), updatedOperands);
991
992	// Register the new affine.apply result.
993	state.registerValueScalarReplacement(replaced: applyOp.getResult(),
994	replacement: newApplyOp.getResult());
995	return newApplyOp;
996	}
997
998	/// Creates a constant vector filled with the neutral elements of the given
999	/// reduction. The scalar type of vector elements will be taken from
1000	/// `oldOperand`.
1001	static arith::ConstantOp createInitialVector(arith::AtomicRMWKind reductionKind,
1002	Value oldOperand,
1003	VectorizationState &state) {
1004	Type scalarTy = oldOperand.getType();
1005	if (!VectorType::isValidElementType(scalarTy))
1006	return nullptr;
1007
1008	Attribute valueAttr = getIdentityValueAttr(
1009	reductionKind, scalarTy, state.builder, oldOperand.getLoc());
1010	auto vecTy = getVectorType(scalarTy, state.strategy);
1011	auto vecAttr = DenseElementsAttr::get(vecTy, valueAttr);
1012	auto newConstOp =
1013	state.builder.create<arith::ConstantOp>(oldOperand.getLoc(), vecAttr);
1014
1015	return newConstOp;
1016	}
1017
1018	/// Creates a mask used to filter out garbage elements in the last iteration
1019	/// of unaligned loops. If a mask is not required then `nullptr` is returned.
1020	/// The mask will be a vector of booleans representing meaningful vector
1021	/// elements in the current iteration. It is filled with ones for each iteration
1022	/// except for the last one, where it has the form `11...100...0` with the
1023	/// number of ones equal to the number of meaningful elements (i.e. the number
1024	/// of iterations that would be left in the original loop).
1025	static Value createMask(AffineForOp vecForOp, VectorizationState &state) {
1026	assert(state.strategy->vectorSizes.size() == `1` &&
1027	"Creating a mask non-1-D vectors is not supported.");
1028	assert(vecForOp.getStep() == state.strategy->vectorSizes[`0`] &&
1029	"Creating a mask for loops with non-unit original step size is not "
1030	"supported.");
1031
1032	// Check if we have already created the mask.
1033	if (Value mask = state.vecLoopToMask.lookup(Val: vecForOp))
1034	return mask;
1035
1036	// If the loop has constant bounds and the original number of iterations is
1037	// divisable by the vector size then we don't need a mask.
1038	if (vecForOp.hasConstantBounds()) {
1039	int64_t originalTripCount =
1040	vecForOp.getConstantUpperBound() - vecForOp.getConstantLowerBound();
1041	if (originalTripCount % vecForOp.getStepAsInt() == `0`)
1042	return nullptr;
1043	}
1044
1045	OpBuilder::InsertionGuard guard(state.builder);
1046	state.builder.setInsertionPointToStart(vecForOp.getBody());
1047
1048	// We generate the mask using the `vector.create_mask` operation which accepts
1049	// the number of meaningful elements (i.e. the length of the prefix of 1s).
1050	// To compute the number of meaningful elements we subtract the current value
1051	// of the iteration variable from the upper bound of the loop. Example:
1052	//
1053	// // 500 is the upper bound of the loop
1054	// #map = affine_map<(d0) -> (500 - d0)>
1055	// %elems_left = affine.apply #map(%iv)
1056	// %mask = vector.create_mask %elems_left : vector<128xi1>
1057
1058	Location loc = vecForOp.getLoc();
1059
1060	// First we get the upper bound of the loop using `affine.apply` or
1061	// `affine.min`.
1062	AffineMap ubMap = vecForOp.getUpperBoundMap();
1063	Value ub;
1064	if (ubMap.getNumResults() == `1`)
1065	ub = state.builder.create<AffineApplyOp>(loc, vecForOp.getUpperBoundMap(),
1066	vecForOp.getUpperBoundOperands());
1067	else
1068	ub = state.builder.create<AffineMinOp>(loc, vecForOp.getUpperBoundMap(),
1069	vecForOp.getUpperBoundOperands());
1070	// Then we compute the number of (original) iterations left in the loop.
1071	AffineExpr subExpr =
1072	state.builder.getAffineDimExpr(position: `0`) - state.builder.getAffineDimExpr(position: `1`);
1073	Value itersLeft =
1074	makeComposedAffineApply(state.builder, loc, AffineMap::get(dimCount: `2`, symbolCount: `0`, result: subExpr),
1075	{ub, vecForOp.getInductionVar()});
1076	// If the affine maps were successfully composed then `ub` is unneeded.
1077	if (ub.use_empty())
1078	ub.getDefiningOp()->erase();
1079	// Finally we create the mask.
1080	Type maskTy = VectorType::get(state.strategy->vectorSizes,
1081	state.builder.getIntegerType(`1`));
1082	Value mask =
1083	state.builder.create<vector::CreateMaskOp>(loc, maskTy, itersLeft);
1084
1085	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a mask:\n"
1086	<< itersLeft << "\n"
1087	<< mask << "\n");
1088
1089	state.vecLoopToMask[vecForOp] = mask;
1090	return mask;
1091	}
1092
1093	/// Returns true if the provided value is vector uniform given the vectorization
1094	/// strategy.
1095	// TODO: For now, only values that are induction variables of loops not in
1096	// `loopToVectorDim` or invariants to all the loops in the vectorization
1097	// strategy are considered vector uniforms.
1098	static bool isUniformDefinition(Value value,
1099	const VectorizationStrategy *strategy) {
1100	AffineForOp forOp = getForInductionVarOwner(value);
1101	if (forOp && strategy->loopToVectorDim.count(Val: forOp) == `0`)
1102	return true;
1103
1104	for (auto loopToDim : strategy->loopToVectorDim) {
1105	auto loop = cast<AffineForOp>(loopToDim.first);
1106	if (!loop.isDefinedOutsideOfLoop(value))
1107	return false;
1108	}
1109	return true;
1110	}
1111
1112	/// Generates a broadcast op for the provided uniform value using the
1113	/// vectorization strategy in 'state'.
1114	static Operation *vectorizeUniform(Value uniformVal,
1115	VectorizationState &state) {
1116	OpBuilder::InsertionGuard guard(state.builder);
1117	Value uniformScalarRepl =
1118	state.valueScalarReplacement.lookupOrDefault(from: uniformVal);
1119	state.builder.setInsertionPointAfterValue(uniformScalarRepl);
1120
1121	auto vectorTy = getVectorType(uniformVal.getType(), state.strategy);
1122	auto bcastOp = state.builder.create<BroadcastOp>(uniformVal.getLoc(),
1123	vectorTy, uniformScalarRepl);
1124	state.registerValueVectorReplacement(replaced: uniformVal, replacement: bcastOp);
1125	return bcastOp;
1126	}
1127
1128	/// Tries to vectorize a given `operand` by applying the following logic:
1129	/// 1. if the defining operation has been already vectorized, `operand` is
1130	/// already in the proper vector form;
1131	/// 2. if the `operand` is a constant, returns the vectorized form of the
1132	/// constant;
1133	/// 3. if the `operand` is uniform, returns a vector broadcast of the `op`;
1134	/// 4. otherwise, the vectorization of `operand` is not supported.
1135	/// Newly created vector operations are registered in `state` as replacement
1136	/// for their scalar counterparts.
1137	/// In particular this logic captures some of the use cases where definitions
1138	/// that are not scoped under the current pattern are needed to vectorize.
1139	/// One such example is top level function constants that need to be splatted.
1140	///
1141	/// Returns an operand that has been vectorized to match `state`'s strategy if
1142	/// vectorization is possible with the above logic. Returns nullptr otherwise.
1143	///
1144	/// TODO: handle more complex cases.
1145	static Value vectorizeOperand(Value operand, VectorizationState &state) {
1146	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorize operand: " << operand);
1147	// If this value is already vectorized, we are done.
1148	if (Value vecRepl = state.valueVectorReplacement.lookupOrNull(from: operand)) {
1149	LLVM_DEBUG(dbgs() << " -> already vectorized: " << vecRepl);
1150	return vecRepl;
1151	}
1152
1153	// An vector operand that is not in the replacement map should never reach
1154	// this point. Reaching this point could mean that the code was already
1155	// vectorized and we shouldn't try to vectorize already vectorized code.
1156	assert(!isa<VectorType>(operand.getType()) &&
1157	"Vector op not found in replacement map");
1158
1159	// Vectorize constant.
1160	if (auto constOp = operand.getDefiningOp<arith::ConstantOp>()) {
1161	auto vecConstant = vectorizeConstant(constOp, state);
1162	LLVM_DEBUG(dbgs() << "-> constant: " << vecConstant);
1163	return vecConstant.getResult();
1164	}
1165
1166	// Vectorize uniform values.
1167	if (isUniformDefinition(value: operand, strategy: state.strategy)) {
1168	Operation *vecUniform = vectorizeUniform(uniformVal: operand, state);
1169	LLVM_DEBUG(dbgs() << "-> uniform: " << *vecUniform);
1170	return vecUniform->getResult(idx: `0`);
1171	}
1172
1173	// Check for unsupported block argument scenarios. A supported block argument
1174	// should have been vectorized already.
1175	if (!operand.getDefiningOp())
1176	LLVM_DEBUG(dbgs() << "-> unsupported block argument\n");
1177	else
1178	// Generic unsupported case.
1179	LLVM_DEBUG(dbgs() << "-> non-vectorizable\n");
1180
1181	return nullptr;
1182	}
1183
1184	/// Vectorizes an affine load with the vectorization strategy in 'state' by
1185	/// generating a 'vector.transfer_read' op with the proper permutation map
1186	/// inferred from the indices of the load. The new 'vector.transfer_read' is
1187	/// registered as replacement of the scalar load. Returns the newly created
1188	/// 'vector.transfer_read' if vectorization was successful. Returns nullptr,
1189	/// otherwise.
1190	static Operation *vectorizeAffineLoad(AffineLoadOp loadOp,
1191	VectorizationState &state) {
1192	MemRefType memRefType = loadOp.getMemRefType();
1193	Type elementType = memRefType.getElementType();
1194	auto vectorType = VectorType::get(state.strategy->vectorSizes, elementType);
1195
1196	// Replace map operands with operands from the vector loop nest.
1197	SmallVector<Value, `8`> mapOperands;
1198	state.getScalarValueReplacementsFor(inputVals: loadOp.getMapOperands(), replacedVals&: mapOperands);
1199
1200	// Compute indices for the transfer op. AffineApplyOp's may be generated.
1201	SmallVector<Value, `8`> indices;
1202	indices.reserve(N: memRefType.getRank());
1203	if (loadOp.getAffineMap() !=
1204	state.builder.getMultiDimIdentityMap(rank: memRefType.getRank())) {
1205	// Check the operand in loadOp affine map does not come from AffineApplyOp.
1206	for (auto op : mapOperands) {
1207	if (op.getDefiningOp<AffineApplyOp>())
1208	return nullptr;
1209	}
1210	computeMemoryOpIndices(loadOp, loadOp.getAffineMap(), mapOperands, state,
1211	indices);
1212	} else {
1213	indices.append(in_start: mapOperands.begin(), in_end: mapOperands.end());
1214	}
1215
1216	// Compute permutation map using the information of new vector loops.
1217	auto permutationMap = makePermutationMap(insertPoint: state.builder.getInsertionBlock(),
1218	indices, loopToVectorDim: state.vecLoopToVecDim);
1219	if (!permutationMap) {
1220	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ can't compute permutationMap\n");
1221	return nullptr;
1222	}
1223	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
1224	LLVM_DEBUG(permutationMap.print(dbgs()));
1225
1226	auto transfer = state.builder.create<vector::TransferReadOp>(
1227	loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap);
1228
1229	// Register replacement for future uses in the scope.
1230	state.registerOpVectorReplacement(replaced: loadOp, replacement: transfer);
1231	return transfer;
1232	}
1233
1234	/// Vectorizes an affine store with the vectorization strategy in 'state' by
1235	/// generating a 'vector.transfer_write' op with the proper permutation map
1236	/// inferred from the indices of the store. The new 'vector.transfer_store' is
1237	/// registered as replacement of the scalar load. Returns the newly created
1238	/// 'vector.transfer_write' if vectorization was successful. Returns nullptr,
1239	/// otherwise.
1240	static Operation *vectorizeAffineStore(AffineStoreOp storeOp,
1241	VectorizationState &state) {
1242	MemRefType memRefType = storeOp.getMemRefType();
1243	Value vectorValue = vectorizeOperand(storeOp.getValueToStore(), state);
1244	if (!vectorValue)
1245	return nullptr;
1246
1247	// Replace map operands with operands from the vector loop nest.
1248	SmallVector<Value, `8`> mapOperands;
1249	state.getScalarValueReplacementsFor(inputVals: storeOp.getMapOperands(), replacedVals&: mapOperands);
1250
1251	// Compute indices for the transfer op. AffineApplyOp's may be generated.
1252	SmallVector<Value, `8`> indices;
1253	indices.reserve(N: memRefType.getRank());
1254	if (storeOp.getAffineMap() !=
1255	state.builder.getMultiDimIdentityMap(rank: memRefType.getRank()))
1256	computeMemoryOpIndices(storeOp, storeOp.getAffineMap(), mapOperands, state,
1257	indices);
1258	else
1259	indices.append(in_start: mapOperands.begin(), in_end: mapOperands.end());
1260
1261	// Compute permutation map using the information of new vector loops.
1262	auto permutationMap = makePermutationMap(insertPoint: state.builder.getInsertionBlock(),
1263	indices, loopToVectorDim: state.vecLoopToVecDim);
1264	if (!permutationMap)
1265	return nullptr;
1266	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
1267	LLVM_DEBUG(permutationMap.print(dbgs()));
1268
1269	auto transfer = state.builder.create<vector::TransferWriteOp>(
1270	storeOp.getLoc(), vectorValue, storeOp.getMemRef(), indices,
1271	permutationMap);
1272	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer);
1273
1274	// Register replacement for future uses in the scope.
1275	state.registerOpVectorReplacement(replaced: storeOp, replacement: transfer);
1276	return transfer;
1277	}
1278
1279	/// Returns true if `value` is a constant equal to the neutral element of the
1280	/// given vectorizable reduction.
1281	static bool isNeutralElementConst(arith::AtomicRMWKind reductionKind,
1282	Value value, VectorizationState &state) {
1283	Type scalarTy = value.getType();
1284	if (!VectorType::isValidElementType(scalarTy))
1285	return false;
1286	Attribute valueAttr = getIdentityValueAttr(reductionKind, scalarTy,
1287	state.builder, value.getLoc());
1288	if (auto constOp = dyn_cast_or_null<arith::ConstantOp>(value.getDefiningOp()))
1289	return constOp.getValue() == valueAttr;
1290	return false;
1291	}
1292
1293	/// Vectorizes a loop with the vectorization strategy in 'state'. A new loop is
1294	/// created and registered as replacement for the scalar loop. The builder's
1295	/// insertion point is set to the new loop's body so that subsequent vectorized
1296	/// operations are inserted into the new loop. If the loop is a vector
1297	/// dimension, the step of the newly created loop will reflect the vectorization
1298	/// factor used to vectorized that dimension.
1299	static Operation *vectorizeAffineForOp(AffineForOp forOp,
1300	VectorizationState &state) {
1301	const VectorizationStrategy &strategy = *state.strategy;
1302	auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);
1303	bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();
1304
1305	// TODO: Vectorization of reduction loops is not supported for non-unit steps.
1306	if (isLoopVecDim && forOp.getNumIterOperands() > `0` && forOp.getStep() != `1`) {
1307	LLVM_DEBUG(
1308	dbgs()
1309	<< "\n[early-vect]+++++ unsupported step size for reduction loop: "
1310	<< forOp.getStep() << "\n");
1311	return nullptr;
1312	}
1313
1314	// If we are vectorizing a vector dimension, compute a new step for the new
1315	// vectorized loop using the vectorization factor for the vector dimension.
1316	// Otherwise, propagate the step of the scalar loop.
1317	unsigned newStep;
1318	if (isLoopVecDim) {
1319	unsigned vectorDim = loopToVecDimIt->second;
1320	assert(vectorDim < strategy.vectorSizes.size() && "vector dim overflow");
1321	int64_t forOpVecFactor = strategy.vectorSizes [vectorDim];
1322	newStep = forOp.getStepAsInt() * forOpVecFactor;
1323	} else {
1324	newStep = forOp.getStepAsInt();
1325	}
1326
1327	// Get information about reduction kinds.
1328	ArrayRef<LoopReduction> reductions;
1329	if (isLoopVecDim && forOp.getNumIterOperands() > `0`) {
1330	auto it = strategy.reductionLoops.find(forOp);
1331	assert(it != strategy.reductionLoops.end() &&
1332	"Reduction descriptors not found when vectorizing a reduction loop");
1333	reductions = it->second;
1334	assert(reductions.size() == forOp.getNumIterOperands() &&
1335	"The size of reductions array must match the number of iter_args");
1336	}
1337
1338	// Vectorize 'iter_args'.
1339	SmallVector<Value, `8`> vecIterOperands;
1340	if (!isLoopVecDim) {
1341	for (auto operand : forOp.getInits())
1342	vecIterOperands.push_back(vectorizeOperand(operand, state));
1343	} else {
1344	// For reduction loops we need to pass a vector of neutral elements as an
1345	// initial value of the accumulator. We will add the original initial value
1346	// later.
1347	for (auto redAndOperand : llvm::zip(reductions, forOp.getInits())) {
1348	vecIterOperands.push_back(createInitialVector(
1349	std::get<`0`>(redAndOperand).kind, std::get<`1`>(redAndOperand), state));
1350	}
1351	}
1352
1353	auto vecForOp = state.builder.create<AffineForOp>(
1354	forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),
1355	forOp.getUpperBoundOperands(), forOp.getUpperBoundMap(), newStep,
1356	vecIterOperands,
1357	/bodyBuilder=/[](OpBuilder &, Location, Value, ValueRange) {
1358	// Make sure we don't create a default terminator in the loop body as
1359	// the proper terminator will be added during vectorization.
1360	});
1361
1362	// Register loop-related replacements:
1363	// 1) The new vectorized loop is registered as vector replacement of the
1364	// scalar loop.
1365	// 2) The new iv of the vectorized loop is registered as scalar replacement
1366	// since a scalar copy of the iv will prevail in the vectorized loop.
1367	// TODO: A vector replacement will also be added in the future when
1368	// vectorization of linear ops is supported.
1369	// 3) The new 'iter_args' region arguments are registered as vector
1370	// replacements since they have been vectorized.
1371	// 4) If the loop performs a reduction along the vector dimension, a
1372	// `vector.reduction` or similar op is inserted for each resulting value
1373	// of the loop and its scalar value replaces the corresponding scalar
1374	// result of the loop.
1375	state.registerOpVectorReplacement(replaced: forOp, replacement: vecForOp);
1376	state.registerValueScalarReplacement(replaced: forOp.getInductionVar(),
1377	replacement: vecForOp.getInductionVar());
1378	for (auto iterTuple :
1379	llvm ::zip(forOp.getRegionIterArgs(), vecForOp.getRegionIterArgs()))
1380	state.registerBlockArgVectorReplacement(std::get<`0`>(iterTuple),
1381	std::get<`1`>(iterTuple));
1382
1383	if (isLoopVecDim) {
1384	for (unsigned i = `0`; i < vecForOp.getNumIterOperands(); ++i) {
1385	// First, we reduce the vector returned from the loop into a scalar.
1386	Value reducedRes =
1387	getVectorReductionOp(reductions [i].kind, state.builder,
1388	vecForOp.getLoc(), vecForOp.getResult(i));
1389	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a vector reduction: "
1390	<< reducedRes);
1391	// Then we combine it with the original (scalar) initial value unless it
1392	// is equal to the neutral element of the reduction.
1393	Value origInit = forOp.getOperand(forOp.getNumControlOperands() + i);
1394	Value finalRes = reducedRes;
1395	if (!isNeutralElementConst(reductions [i].kind, origInit, state))
1396	finalRes =
1397	arith::getReductionOp(reductions [i].kind, state.builder,
1398	reducedRes.getLoc(), reducedRes, origInit);
1399	state.registerLoopResultScalarReplacement(replaced: forOp.getResult(i), replacement: finalRes);
1400	}
1401	}
1402
1403	if (isLoopVecDim)
1404	state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;
1405
1406	// Change insertion point so that upcoming vectorized instructions are
1407	// inserted into the vectorized loop's body.
1408	state.builder.setInsertionPointToStart(vecForOp.getBody());
1409
1410	// If this is a reduction loop then we may need to create a mask to filter out
1411	// garbage in the last iteration.
1412	if (isLoopVecDim && forOp.getNumIterOperands() > `0`)
1413	createMask(vecForOp, state);
1414
1415	return vecForOp;
1416	}
1417
1418	/// Vectorizes arbitrary operation by plain widening. We apply generic type
1419	/// widening of all its results and retrieve the vector counterparts for all its
1420	/// operands.
1421	static Operation widenOp(Operation op, VectorizationState &state) {
1422	SmallVector<Type, `8`> vectorTypes;
1423	for (Value result : op->getResults())
1424	vectorTypes.push_back(
1425	VectorType::get(state.strategy->vectorSizes, result.getType()));
1426
1427	SmallVector<Value, `8`> vectorOperands;
1428	for (Value operand : op->getOperands()) {
1429	Value vecOperand = vectorizeOperand(operand, state);
1430	if (!vecOperand) {
1431	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize\n");
1432	return nullptr;
1433	}
1434	vectorOperands.push_back(Elt: vecOperand);
1435	}
1436
1437	// Create a clone of the op with the proper operands and return types.
1438	// TODO: The following assumes there is always an op with a fixed
1439	// name that works both in scalar mode and vector mode.
1440	// TODO: Is it worth considering an Operation.clone operation which
1441	// changes the type so we can promote an Operation with less boilerplate?
1442	Operation *vecOp =
1443	state.builder.create(op->getLoc(), op->getName().getIdentifier(),
1444	vectorOperands, vectorTypes, op->getAttrs());
1445	state.registerOpVectorReplacement(replaced: op, replacement: vecOp);
1446	return vecOp;
1447	}
1448
1449	/// Vectorizes a yield operation by widening its types. The builder's insertion
1450	/// point is set after the vectorized parent op to continue vectorizing the
1451	/// operations after the parent op. When vectorizing a reduction loop a mask may
1452	/// be used to prevent adding garbage values to the accumulator.
1453	static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp,
1454	VectorizationState &state) {
1455	Operation *newYieldOp = widenOp(yieldOp, state);
1456	Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();
1457
1458	// If there is a mask for this loop then we must prevent garbage values from
1459	// being added to the accumulator by inserting `select` operations, for
1460	// example:
1461	//
1462	// %val_masked = select %mask, %val, %neutralCst : vector<128xi1>,
1463	// vector<128xf32>
1464	// %res = arith.addf %acc, %val_masked : vector<128xf32>
1465	// affine.yield %res : vector<128xf32>
1466	//
1467	if (Value mask = state.vecLoopToMask.lookup(Val: newParentOp)) {
1468	state.builder.setInsertionPoint(newYieldOp);
1469	for (unsigned i = `0`; i < newYieldOp->getNumOperands(); ++i) {
1470	SmallVector<Operation *> combinerOps;
1471	Value reducedVal = matchReduction(
1472	cast<AffineForOp>(newParentOp).getRegionIterArgs(), i, combinerOps);
1473	assert(reducedVal && "expect non-null value for parallel reduction loop");
1474	assert(combinerOps.size() == `1` && "expect only one combiner op");
1475	// IterOperands are neutral element vectors.
1476	Value neutralVal = cast<AffineForOp>(newParentOp).getInits()[i];
1477	state.builder.setInsertionPoint(combinerOps.back());
1478	Value maskedReducedVal = state.builder.create<arith::SelectOp>(
1479	reducedVal.getLoc(), mask, reducedVal, neutralVal);
1480	LLVM_DEBUG(
1481	dbgs() << "\n[early-vect]+++++ masking an input to a binary op that"
1482	"produces value for a yield Op: "
1483	<< maskedReducedVal);
1484	combinerOps.back()->replaceUsesOfWith(from: reducedVal, to: maskedReducedVal);
1485	}
1486	}
1487
1488	state.builder.setInsertionPointAfter(newParentOp);
1489	return newYieldOp;
1490	}
1491
1492	/// Encodes Operation-specific behavior for vectorization. In general we
1493	/// assume that all operands of an op must be vectorized but this is not
1494	/// always true. In the future, it would be nice to have a trait that
1495	/// describes how a particular operation vectorizes. For now we implement the
1496	/// case distinction here. Returns a vectorized form of an operation or
1497	/// nullptr if vectorization fails.
1498	// TODO: consider adding a trait to Op to describe how it gets vectorized.
1499	// Maybe some Ops are not vectorizable or require some tricky logic, we cannot
1500	// do one-off logic here; ideally it would be TableGen'd.
1501	static Operation vectorizeOneOperation(Operation op,
1502	VectorizationState &state) {
1503	// Sanity checks.
1504	assert(!isa<vector::TransferReadOp>(op) &&
1505	"vector.transfer_read cannot be further vectorized");
1506	assert(!isa<vector::TransferWriteOp>(op) &&
1507	"vector.transfer_write cannot be further vectorized");
1508
1509	if (auto loadOp = dyn_cast<AffineLoadOp>(op))
1510	return vectorizeAffineLoad(loadOp, state);
1511	if (auto storeOp = dyn_cast<AffineStoreOp>(op))
1512	return vectorizeAffineStore(storeOp, state);
1513	if (auto forOp = dyn_cast<AffineForOp>(op))
1514	return vectorizeAffineForOp(forOp, state);
1515	if (auto yieldOp = dyn_cast<AffineYieldOp>(op))
1516	return vectorizeAffineYieldOp(yieldOp, state);
1517	if (auto constant = dyn_cast<arith::ConstantOp>(op))
1518	return vectorizeConstant(constant, state);
1519	if (auto applyOp = dyn_cast<AffineApplyOp>(op))
1520	return vectorizeAffineApplyOp(applyOp, state);
1521
1522	// Other ops with regions are not supported.
1523	if (op->getNumRegions() != `0`)
1524	return nullptr;
1525
1526	return widenOp(op, state);
1527	}
1528
1529	/// Recursive implementation to convert all the nested loops in 'match' to a 2D
1530	/// vector container that preserves the relative nesting level of each loop with
1531	/// respect to the others in 'match'. 'currentLevel' is the nesting level that
1532	/// will be assigned to the loop in the current 'match'.
1533	static void
1534	getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel,
1535	std::vector<SmallVector<AffineForOp, `2`>> &loops) {
1536	// Add a new empty level to the output if it doesn't exist already.
1537	assert(currentLevel <= loops.size() && "Unexpected currentLevel");
1538	if (currentLevel == loops.size())
1539	loops.emplace_back();
1540
1541	// Add current match and recursively visit its children.
1542	loops [currentLevel].push_back(cast<AffineForOp>(match.getMatchedOperation()));
1543	for (auto childMatch : match.getMatchedChildren()) {
1544	getMatchedAffineLoopsRec(match: childMatch, currentLevel: currentLevel + `1`, loops);
1545	}
1546	}
1547
1548	/// Converts all the nested loops in 'match' to a 2D vector container that
1549	/// preserves the relative nesting level of each loop with respect to the others
1550	/// in 'match'. This means that every loop in 'loops[i]' will have a parent loop
1551	/// in 'loops[i-1]'. A loop in 'loops[i]' may or may not have a child loop in
1552	/// 'loops[i+1]'.
1553	static void
1554	getMatchedAffineLoops(NestedMatch match,
1555	std::vector<SmallVector<AffineForOp, `2`>> &loops) {
1556	getMatchedAffineLoopsRec(match, /currLoopDepth=/currentLevel: `0`, loops);
1557	}
1558
1559	/// Internal implementation to vectorize affine loops from a single loop nest
1560	/// using an n-D vectorization strategy.
1561	static LogicalResult
1562	vectorizeLoopNest(std::vector<SmallVector<AffineForOp, `2`>> &loops,
1563	const VectorizationStrategy &strategy) {
1564	assert(loops[`0`].size() == `1` && "Expected single root loop");
1565	AffineForOp rootLoop = loops[`0`][`0`];
1566	VectorizationState state(rootLoop.getContext());
1567	state.builder.setInsertionPointAfter(rootLoop);
1568	state.strategy = &strategy;
1569
1570	// Since patterns are recursive, they can very well intersect.
1571	// Since we do not want a fully greedy strategy in general, we decouple
1572	// pattern matching, from profitability analysis, from application.
1573	// As a consequence we must check that each root pattern is still
1574	// vectorizable. If a pattern is not vectorizable anymore, we just skip it.
1575	// TODO: implement a non-greedy profitability analysis that keeps only
1576	// non-intersecting patterns.
1577	if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) {
1578	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
1579	return failure();
1580	}
1581
1582	//////////////////////////////////////////////////////////////////////////////
1583	// Vectorize the scalar loop nest following a topological order. A new vector
1584	// loop nest with the vectorized operations is created along the process. If
1585	// vectorization succeeds, the scalar loop nest is erased. If vectorization
1586	// fails, the vector loop nest is erased and the scalar loop nest is not
1587	// modified.
1588	//////////////////////////////////////////////////////////////////////////////
1589
1590	auto opVecResult = rootLoop.walk<WalkOrder::PreOrder>([&](Operation *op) {
1591	LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);
1592	Operation *vectorOp = vectorizeOneOperation(op, state);
1593	if (!vectorOp) {
1594	LLVM_DEBUG(
1595	dbgs() << "[early-vect]+++++ failed vectorizing the operation: "
1596	<< *op << "\n");
1597	return WalkResult::interrupt();
1598	}
1599
1600	return WalkResult::advance();
1601	});
1602
1603	if (opVecResult.wasInterrupted()) {
1604	LLVM_DEBUG(dbgs() << "[early-vect]+++++ failed vectorization for: "
1605	<< rootLoop << "\n");
1606	// Erase vector loop nest if it was created.
1607	auto vecRootLoopIt = state.opVectorReplacement.find(rootLoop);
1608	if (vecRootLoopIt != state.opVectorReplacement.end())
1609	eraseLoopNest(cast<AffineForOp>(vecRootLoopIt->second));
1610
1611	return failure();
1612	}
1613
1614	// Replace results of reduction loops with the scalar values computed using
1615	// `vector.reduce` or similar ops.
1616	for (auto resPair : state.loopResultScalarReplacement)
1617	resPair.first.replaceAllUsesWith(resPair.second);
1618
1619	assert(state.opVectorReplacement.count(rootLoop) == `1` &&
1620	"Expected vector replacement for loop nest");
1621	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
1622	LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorization result:\n"
1623	<< *state.opVectorReplacement[rootLoop]);
1624
1625	// Finish this vectorization pattern.
1626	state.finishVectorizationPattern(rootLoop: rootLoop);
1627	return success();
1628	}
1629
1630	/// Extracts the matched loops and vectorizes them following a topological
1631	/// order. A new vector loop nest will be created if vectorization succeeds. The
1632	/// original loop nest won't be modified in any case.
1633	static LogicalResult vectorizeRootMatch(NestedMatch m,
1634	const VectorizationStrategy &strategy) {
1635	std::vector<SmallVector<AffineForOp, `2`>> loopsToVectorize;
1636	getMatchedAffineLoops(match: m, loops&: loopsToVectorize);
1637	return vectorizeLoopNest(loops&: loopsToVectorize, strategy);
1638	}
1639
1640	/// Traverses all the loop matches and classifies them into intersection
1641	/// buckets. Two matches intersect if any of them encloses the other one. A
1642	/// match intersects with a bucket if the match intersects with the root
1643	/// (outermost) loop in that bucket.
1644	static void computeIntersectionBuckets(
1645	ArrayRef<NestedMatch> matches,
1646	std::vector<SmallVector<NestedMatch, `8`>> &intersectionBuckets) {
1647	assert(intersectionBuckets.empty() && "Expected empty output");
1648	// Keeps track of the root (outermost) loop of each bucket.
1649	SmallVector<AffineForOp, `8`> bucketRoots;
1650
1651	for (const NestedMatch &match : matches) {
1652	AffineForOp matchRoot = cast<AffineForOp>(match.getMatchedOperation());
1653	bool intersects = false;
1654	for (int i = `0`, end = intersectionBuckets.size(); i < end; ++i) {
1655	AffineForOp bucketRoot = bucketRoots[i];
1656	// Add match to the bucket if the bucket root encloses the match root.
1657	if (bucketRoot->isAncestor(matchRoot)) {
1658	intersectionBuckets [i].push_back(Elt: match);
1659	intersects = true;
1660	break;
1661	}
1662	// Add match to the bucket if the match root encloses the bucket root. The
1663	// match root becomes the new bucket root.
1664	if (matchRoot->isAncestor(bucketRoot)) {
1665	bucketRoots[i] = matchRoot;
1666	intersectionBuckets [i].push_back(Elt: match);
1667	intersects = true;
1668	break;
1669	}
1670	}
1671
1672	// Match doesn't intersect with any existing bucket. Create a new bucket for
1673	// it.
1674	if (!intersects) {
1675	bucketRoots.push_back(matchRoot);
1676	intersectionBuckets.emplace_back();
1677	intersectionBuckets.back().push_back(Elt: match);
1678	}
1679	}
1680	}
1681
1682	/// Internal implementation to vectorize affine loops in 'loops' using the n-D
1683	/// vectorization factors in 'vectorSizes'. By default, each vectorization
1684	/// factor is applied inner-to-outer to the loops of each loop nest.
1685	/// 'fastestVaryingPattern' can be optionally used to provide a different loop
1686	/// vectorization order. `reductionLoops` can be provided to specify loops which
1687	/// can be vectorized along the reduction dimension.
1688	static void vectorizeLoops(Operation parentOp, DenseSet<Operation > &loops,
1689	ArrayRef<int64_t> vectorSizes,
1690	ArrayRef<int64_t> fastestVaryingPattern,
1691	const ReductionLoopMap &reductionLoops) {
1692	assert((reductionLoops.empty() \|\| vectorSizes.size() == `1`) &&
1693	"Vectorizing reductions is supported only for 1-D vectors");
1694
1695	// Compute 1-D, 2-D or 3-D loop pattern to be matched on the target loops.
1696	std::optional<NestedPattern> pattern =
1697	makePattern(loops, vectorSizes.size(), fastestVaryingPattern);
1698	if (!pattern) {
1699	LLVM_DEBUG(dbgs() << "\n[early-vect] pattern couldn't be computed\n");
1700	return;
1701	}
1702
1703	LLVM_DEBUG(dbgs() << "\n******************************************");
1704	LLVM_DEBUG(dbgs() << "\n******************************************");
1705	LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on parent op\n");
1706	LLVM_DEBUG(dbgs() << *parentOp << "\n");
1707
1708	unsigned patternDepth = pattern ->getDepth();
1709
1710	// Compute all the pattern matches and classify them into buckets of
1711	// intersecting matches.
1712	SmallVector<NestedMatch, `32`> allMatches;
1713	pattern->match(parentOp, &allMatches);
1714	std::vector<SmallVector<NestedMatch, `8`>> intersectionBuckets;
1715	computeIntersectionBuckets(matches: allMatches, intersectionBuckets);
1716
1717	// Iterate over all buckets and vectorize the matches eagerly. We can only
1718	// vectorize one match from each bucket since all the matches within a bucket
1719	// intersect.
1720	for (auto &intersectingMatches : intersectionBuckets) {
1721	for (NestedMatch &match : intersectingMatches) {
1722	VectorizationStrategy strategy;
1723	// TODO: depending on profitability, elect to reduce the vector size.
1724	strategy.vectorSizes.assign(in_start: vectorSizes.begin(), in_end: vectorSizes.end());
1725	strategy.reductionLoops = reductionLoops;
1726	if (failed(result: analyzeProfitability(matches: match.getMatchedChildren(), depthInPattern: `1`,
1727	patternDepth, strategy: &strategy))) {
1728	continue;
1729	}
1730	vectorizeLoopIfProfitable(loop: match.getMatchedOperation(), depthInPattern: `0`, patternDepth,
1731	strategy: &strategy);
1732	// Vectorize match. Skip the rest of intersecting matches in the bucket if
1733	// vectorization succeeded.
1734	// TODO: if pattern does not apply, report it; alter the cost/benefit.
1735	// TODO: some diagnostics if failure to vectorize occurs.
1736	if (succeeded(result: vectorizeRootMatch(m: match, strategy)))
1737	break;
1738	}
1739	}
1740
1741	LLVM_DEBUG(dbgs() << "\n");
1742	}
1743
1744	/// Applies vectorization to the current function by searching over a bunch of
1745	/// predetermined patterns.
1746	void Vectorize::runOnOperation() {
1747	func::FuncOp f = getOperation();
1748	if (!fastestVaryingPattern.empty() &&
1749	fastestVaryingPattern.size() != vectorSizes.size()) {
1750	f.emitRemark("Fastest varying pattern specified with different size than "
1751	"the vector size.");
1752	return signalPassFailure();
1753	}
1754
1755	if (vectorizeReductions && vectorSizes.size() != `1`) {
1756	f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
1757	return signalPassFailure();
1758	}
1759
1760	if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= `0`; })) {
1761	f.emitError("Vectorization factor must be greater than zero.");
1762	return signalPassFailure();
1763	}
1764
1765	DenseSet<Operation *> parallelLoops;
1766	ReductionLoopMap reductionLoops;
1767
1768	// If 'vectorize-reduction=true' is provided, we also populate the
1769	// `reductionLoops` map.
1770	if (vectorizeReductions) {
1771	f.walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
1772	SmallVector<LoopReduction, `2`> reductions;
1773	if (isLoopParallel(loop, &reductions)) {
1774	parallelLoops.insert(loop);
1775	// If it's not a reduction loop, adding it to the map is not necessary.
1776	if (!reductions.empty())
1777	reductionLoops[loop] = reductions;
1778	}
1779	});
1780	} else {
1781	f.walk([&parallelLoops](AffineForOp loop) {
1782	if (isLoopParallel(loop))
1783	parallelLoops.insert(loop);
1784	});
1785	}
1786
1787	// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
1788	NestedPatternContext mlContext;
1789	vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
1790	reductionLoops);
1791	}
1792
1793	/// Verify that affine loops in 'loops' meet the nesting criteria expected by
1794	/// SuperVectorizer:
1795	/// There must be at least one loop.*
1796	/// There must be a single root loop (nesting level 0).*
1797	/// Each loop at a given nesting level must be nested in a loop from a*
1798	/// previous nesting level.
1799	static LogicalResult
1800	verifyLoopNesting(const std::vector<SmallVector<AffineForOp, `2`>> &loops) {
1801	// Expected at least one loop.
1802	if (loops.empty())
1803	return failure();
1804
1805	// Expected only one root loop.
1806	if (loops [`0`].size() != `1`)
1807	return failure();
1808
1809	// Traverse loops outer-to-inner to check some invariants.
1810	for (int i = `1`, end = loops.size(); i < end; ++i) {
1811	for (AffineForOp loop : loops[i]) {
1812	// Check that each loop at this level is nested in one of the loops from
1813	// the previous level.
1814	if (none_of(loops[i - `1`], [&](AffineForOp maybeParent) {
1815	return maybeParent->isProperAncestor(loop);
1816	}))
1817	return failure();
1818
1819	// Check that each loop at this level is not nested in another loop from
1820	// this level.
1821	for (AffineForOp sibling : loops[i]) {
1822	if (sibling->isProperAncestor(loop))
1823	return failure();
1824	}
1825	}
1826	}
1827
1828	return success();
1829	}
1830
1831
1832	/// External utility to vectorize affine loops in 'loops' using the n-D
1833	/// vectorization factors in 'vectorSizes'. By default, each vectorization
1834	/// factor is applied inner-to-outer to the loops of each loop nest.
1835	/// 'fastestVaryingPattern' can be optionally used to provide a different loop
1836	/// vectorization order.
1837	/// If `reductionLoops` is not empty, the given reduction loops may be
1838	/// vectorized along the reduction dimension.
1839	/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
1840	void mlir::affine::vectorizeAffineLoops(
1841	Operation parentOp, DenseSet<Operation > &loops,
1842	ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern,
1843	const ReductionLoopMap &reductionLoops) {
1844	// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
1845	NestedPatternContext mlContext;
1846	vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern,
1847	reductionLoops);
1848	}
1849
1850	/// External utility to vectorize affine loops from a single loop nest using an
1851	/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
1852	/// Loops are provided in a 2D vector container. The first dimension represents
1853	/// the nesting level relative to the loops to be vectorized. The second
1854	/// dimension contains the loops. This means that:
1855	/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
1856	/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
1857	///
1858	/// For example, for the following loop nest:
1859	///
1860	/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
1861	/// %out0: memref<64x128x512xf32>,
1862	/// %out1: memref<64x128x128xf32>) {
1863	/// affine.for %i0 = 0 to 64 {
1864	/// affine.for %i1 = 0 to 128 {
1865	/// affine.for %i2 = 0 to 512 {
1866	/// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
1867	/// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
1868	/// }
1869	/// affine.for %i3 = 0 to 128 {
1870	/// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
1871	/// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
1872	/// }
1873	/// }
1874	/// }
1875	/// return
1876	/// }
1877	///
1878	/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
1879	/// innermost loops;
1880	/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
1881	/// loops;
1882	/// loops = {{%i2}}, to vectorize only the first innermost loop;
1883	/// loops = {{%i3}}, to vectorize only the second innermost loop;
1884	/// loops = {{%i1}}, to vectorize only the middle loop.
1885	LogicalResult mlir::affine::vectorizeAffineLoopNest(
1886	std::vector<SmallVector<AffineForOp, `2`>> &loops,
1887	const VectorizationStrategy &strategy) {
1888	// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
1889	NestedPatternContext mlContext;
1890	if (failed(result: verifyLoopNesting(loops)))
1891	return failure();
1892	return vectorizeLoopNest(loops, strategy);
1893	}
1894

source code of mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp