1 | //===- SuperVectorize.cpp - Vectorize Pass Impl ---------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements vectorization of loops, operations and data types to |
10 | // a target-independent, n-D super-vector abstraction. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "mlir/Dialect/Affine/Passes.h" |
15 | |
16 | #include "mlir/Analysis/SliceAnalysis.h" |
17 | #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" |
18 | #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" |
19 | #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h" |
20 | #include "mlir/Dialect/Affine/IR/AffineOps.h" |
21 | #include "mlir/Dialect/Affine/Utils.h" |
22 | #include "mlir/Dialect/Arith/IR/Arith.h" |
23 | #include "mlir/Dialect/Func/IR/FuncOps.h" |
24 | #include "mlir/Dialect/Vector/IR/VectorOps.h" |
25 | #include "mlir/Dialect/Vector/Utils/VectorUtils.h" |
26 | #include "mlir/IR/IRMapping.h" |
27 | #include "mlir/Pass/Pass.h" |
28 | #include "mlir/Support/LLVM.h" |
29 | #include "llvm/ADT/STLExtras.h" |
30 | #include "llvm/Support/Debug.h" |
31 | #include <optional> |
32 | |
33 | namespace mlir { |
34 | namespace affine { |
35 | #define GEN_PASS_DEF_AFFINEVECTORIZE |
36 | #include "mlir/Dialect/Affine/Passes.h.inc" |
37 | } // namespace affine |
38 | } // namespace mlir |
39 | |
40 | using namespace mlir; |
41 | using namespace affine; |
42 | using namespace vector; |
43 | |
44 | /// |
45 | /// Implements a high-level vectorization strategy on a Function. |
46 | /// The abstraction used is that of super-vectors, which provide a single, |
/// compact representation, in the vector types, of information that is
/// expected to reduce the impact of the phase ordering problem.
49 | /// |
50 | /// Vector granularity: |
51 | /// =================== |
52 | /// This pass is designed to perform vectorization at a super-vector |
53 | /// granularity. A super-vector is loosely defined as a vector type that is a |
54 | /// multiple of a "good" vector size so the HW can efficiently implement a set |
55 | /// of high-level primitives. Multiple is understood along any dimension; e.g. |
56 | /// both vector<16xf32> and vector<2x8xf32> are valid super-vectors for a |
57 | /// vector<8xf32> HW vector. Note that a "good vector size so the HW can |
58 | /// efficiently implement a set of high-level primitives" is not necessarily an |
59 | /// integer multiple of actual hardware registers. We leave details of this |
60 | /// distinction unspecified for now. |
61 | /// |
/// Some may prefer the terminology of a "tile of HW vectors". In this case, one
63 | /// should note that super-vectors implement an "always full tile" abstraction. |
64 | /// They guarantee no partial-tile separation is necessary by relying on a |
65 | /// high-level copy-reshape abstraction that we call vector.transfer. This |
/// copy-reshape operation is also responsible for performing layout
67 | /// transposition if necessary. In the general case this will require a scoped |
68 | /// allocation in some notional local memory. |
69 | /// |
70 | /// Whatever the mental model one prefers to use for this abstraction, the key |
71 | /// point is that we burn into a single, compact, representation in the vector |
72 | /// types, information that is expected to reduce the impact of the phase |
73 | /// ordering problem. Indeed, a vector type conveys information that: |
74 | /// 1. the associated loops have dependency semantics that do not prevent |
75 | /// vectorization; |
/// 2. the associated loops have been sliced in chunks of static sizes that are
/// compatible with vector sizes (i.e. similar to unroll-and-jam);
/// 3. the inner loops, in the unroll-and-jam analogy of 2, are captured by the
/// vector type and no vectorization-hampering transformations can be
/// applied to them anymore;
82 | /// 4. the underlying memrefs are accessed in some notional contiguous way |
83 | /// that allows loading into vectors with some amount of spatial locality; |
84 | /// In other words, super-vectorization provides a level of separation of |
85 | /// concern by way of opacity to subsequent passes. This has the effect of |
86 | /// encapsulating and propagating vectorization constraints down the list of |
87 | /// passes until we are ready to lower further. |
88 | /// |
89 | /// For a particular target, a notion of minimal n-d vector size will be |
90 | /// specified and vectorization targets a multiple of those. In the following |
91 | /// paragraph, let "k ." represent "a multiple of", to be understood as a |
92 | /// multiple in the same dimension (e.g. vector<16 x k . 128> summarizes |
93 | /// vector<16 x 128>, vector<16 x 256>, vector<16 x 1024>, etc). |
94 | /// |
95 | /// Some non-exhaustive notable super-vector sizes of interest include: |
96 | /// - CPU: vector<k . HW_vector_size>, |
97 | /// vector<k' . core_count x k . HW_vector_size>, |
98 | /// vector<socket_count x k' . core_count x k . HW_vector_size>; |
99 | /// - GPU: vector<k . warp_size>, |
100 | /// vector<k . warp_size x float2>, |
101 | /// vector<k . warp_size x float4>, |
/// vector<k . warp_size x 4 x 4 x 4> (for tensor_core sizes).
103 | /// |
104 | /// Loops and operations are emitted that operate on those super-vector shapes. |
105 | /// Subsequent lowering passes will materialize to actual HW vector sizes. These |
106 | /// passes are expected to be (gradually) more target-specific. |
107 | /// |
108 | /// At a high level, a vectorized load in a loop will resemble: |
109 | /// ```mlir |
110 | /// affine.for %i = ? to ? step ? { |
111 | /// %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32> |
112 | /// } |
113 | /// ``` |
114 | /// It is the responsibility of the implementation of vector.transfer_read to |
115 | /// materialize vector registers from the original scalar memrefs. A later (more |
116 | /// target-dependent) lowering pass will materialize to actual HW vector sizes. |
/// This lowering may occur at different times:
118 | /// 1. at the MLIR level into a combination of loops, unrolling, DmaStartOp + |
119 | /// DmaWaitOp + vectorized operations for data transformations and shuffle; |
120 | /// thus opening opportunities for unrolling and pipelining. This is an |
121 | /// instance of library call "whiteboxing"; or |
/// 2. later in a target-specific lowering pass or hand-written library
123 | /// call; achieving full separation of concerns. This is an instance of |
124 | /// library call; or |
125 | /// 3. a mix of both, e.g. based on a model. |
126 | /// In the future, these operations will expose a contract to constrain the |
127 | /// search on vectorization patterns and sizes. |
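///
/// As a purely illustrative sketch of option 1 above (the memref, the bounds,
/// and the 8-wide HW size are assumptions, not what any particular lowering
/// pass emits), the 128-wide read could be materialized as an explicit loop
/// over HW-sized transfers:
/// ```mlir
/// affine.for %i = 0 to %N step 128 {
///   affine.for %ii = 0 to 128 step 8 {
///     %idx = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%i, %ii)
///     %v_hw = vector.transfer_read A[%idx] : memref<?xf32>, vector<8xf32>
///     // ... compute on %v_hw; unrolling/pipelining can now be applied ...
///   }
/// }
/// ```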
128 | /// |
129 | /// Occurrence of super-vectorization in the compiler flow: |
130 | /// ======================================================= |
131 | /// This is an active area of investigation. We start with 2 remarks to position |
132 | /// super-vectorization in the context of existing ongoing work: LLVM VPLAN |
133 | /// and LLVM SLP Vectorizer. |
134 | /// |
135 | /// LLVM VPLAN: |
136 | /// ----------- |
/// The astute reader may have noticed that in the limit, super-vectorization
/// can be applied at a similar time and with similar objectives as VPLAN: for
/// instance, at the tail of a traditional polyhedral compilation flow (e.g.
/// the PPCG project uses ISL to provide dependence analysis, multi-level
/// scheduling + tiling, lifting footprint to fast memory, communication
/// synthesis, mapping, register optimizations) and before unrolling. When
/// vectorization is applied at this *late* level in a typical
144 | /// polyhedral flow, and is instantiated with actual hardware vector sizes, |
145 | /// super-vectorization is expected to match (or subsume) the type of patterns |
146 | /// that LLVM's VPLAN aims at targeting. The main difference here is that MLIR |
147 | /// is higher level and our implementation should be significantly simpler. Also |
148 | /// note that in this mode, recursive patterns are probably a bit of an overkill |
149 | /// although it is reasonable to expect that mixing a bit of outer loop and |
150 | /// inner loop vectorization + unrolling will provide interesting choices to |
151 | /// MLIR. |
152 | /// |
153 | /// LLVM SLP Vectorizer: |
154 | /// -------------------- |
155 | /// Super-vectorization however is not meant to be usable in a similar fashion |
156 | /// to the SLP vectorizer. The main difference lies in the information that |
157 | /// both vectorizers use: super-vectorization examines contiguity of memory |
158 | /// references along fastest varying dimensions and loops with recursive nested |
159 | /// patterns capturing imperfectly-nested loop nests; the SLP vectorizer, on |
160 | /// the other hand, performs flat pattern matching inside a single unrolled loop |
161 | /// body and stitches together pieces of load and store operations into full |
162 | /// 1-D vectors. We envision that the SLP vectorizer is a good way to capture |
163 | /// innermost loop, control-flow dependent patterns that super-vectorization may |
164 | /// not be able to capture easily. In other words, super-vectorization does not |
165 | /// aim at replacing the SLP vectorizer and the two solutions are complementary. |
166 | /// |
167 | /// Ongoing investigations: |
168 | /// ----------------------- |
169 | /// We discuss the following *early* places where super-vectorization is |
/// applicable and touch on the expected benefits and risks. We list the
171 | /// opportunities in the context of the traditional polyhedral compiler flow |
172 | /// described in PPCG. There are essentially 6 places in the MLIR pass pipeline |
173 | /// we expect to experiment with super-vectorization: |
174 | /// 1. Right after language lowering to MLIR: this is the earliest time where |
175 | /// super-vectorization is expected to be applied. At this level, all the |
176 | /// language/user/library-level annotations are available and can be fully |
177 | /// exploited. Examples include loop-type annotations (such as parallel, |
178 | /// reduction, scan, dependence distance vector, vectorizable) as well as |
179 | /// memory access annotations (such as non-aliasing writes guaranteed, |
/// indirect accesses that are permutations by construction), or the fact
181 | /// that a particular operation is prescribed atomic by the user. At this |
182 | /// level, anything that enriches what dependence analysis can do should be |
183 | /// aggressively exploited. At this level we are close to having explicit |
184 | /// vector types in the language, except we do not impose that burden on the |
185 | /// programmer/library: we derive information from scalar code + annotations. |
186 | /// 2. After dependence analysis and before polyhedral scheduling: the |
187 | /// information that supports vectorization does not need to be supplied by a |
188 | /// higher level of abstraction. Traditional dependence analysis is available |
189 | /// in MLIR and will be used to drive vectorization and cost models. |
190 | /// |
191 | /// Let's pause here and remark that applying super-vectorization as described |
192 | /// in 1. and 2. presents clear opportunities and risks: |
193 | /// - the opportunity is that vectorization is burned in the type system and |
194 | /// is protected from the adverse effect of loop scheduling, tiling, loop |
195 | /// interchange and all passes downstream. Provided that subsequent passes are |
/// able to operate on vector types, the vector shapes, associated loop
197 | /// iterator properties, alignment, and contiguity of fastest varying |
198 | /// dimensions are preserved until we lower the super-vector types. We expect |
/// this to significantly rein in the adverse effects of phase ordering.
200 | /// - the risks are that a. all passes after super-vectorization have to work |
/// on elemental vector types (note that this is always true, wherever
202 | /// vectorization is applied) and b. that imposing vectorization constraints |
203 | /// too early may be overall detrimental to loop fusion, tiling and other |
204 | /// transformations because the dependence distances are coarsened when |
205 | /// operating on elemental vector types. For this reason, the pattern |
206 | /// profitability analysis should include a component that also captures the |
207 | /// maximal amount of fusion available under a particular pattern. This is |
208 | /// still at the stage of rough ideas but in this context, search is our |
209 | /// friend as the Tensor Comprehensions and auto-TVM contributions |
210 | /// demonstrated previously. |
/// The bottom line is that we do not yet have good answers for the above but
/// we aim at making it easy to answer such questions.
213 | /// |
214 | /// Back to our listing, the last places where early super-vectorization makes |
215 | /// sense are: |
216 | /// 3. right after polyhedral-style scheduling: PLUTO-style algorithms are known |
217 | /// to improve locality, parallelism and be configurable (e.g. max-fuse, |
218 | /// smart-fuse etc). They can also have adverse effects on contiguity |
219 | /// properties that are required for vectorization but the vector.transfer |
220 | /// copy-reshape-pad-transpose abstraction is expected to help recapture |
221 | /// these properties. |
222 | /// 4. right after polyhedral-style scheduling+tiling; |
223 | /// 5. right after scheduling+tiling+rescheduling: points 4 and 5 represent |
224 | /// probably the most promising places because applying tiling achieves a |
225 | /// separation of concerns that allows rescheduling to worry less about |
226 | /// locality and more about parallelism and distribution (e.g. min-fuse). |
227 | /// |
228 | /// At these levels the risk-reward looks different: on one hand we probably |
229 | /// lost a good deal of language/user/library-level annotation; on the other |
230 | /// hand we gained parallelism and locality through scheduling and tiling. |
231 | /// However we probably want to ensure tiling is compatible with the |
232 | /// full-tile-only abstraction used in super-vectorization or suffer the |
233 | /// consequences. It is too early to place bets on what will win but we expect |
234 | /// super-vectorization to be the right abstraction to allow exploring at all |
235 | /// these levels. And again, search is our friend. |
236 | /// |
237 | /// Lastly, we mention it again here: |
/// 6. as an MLIR-based alternative to VPLAN.
239 | /// |
240 | /// Lowering, unrolling, pipelining: |
241 | /// ================================ |
242 | /// TODO: point to the proper places. |
243 | /// |
244 | /// Algorithm: |
245 | /// ========== |
246 | /// The algorithm proceeds in a few steps: |
247 | /// 1. defining super-vectorization patterns and matching them on the tree of |
248 | /// AffineForOp. A super-vectorization pattern is defined as a recursive |
/// data structure that matches and captures nested, imperfectly-nested
250 | /// loops that have a. conformable loop annotations attached (e.g. parallel, |
251 | /// reduction, vectorizable, ...) as well as b. all contiguous load/store |
252 | /// operations along a specified minor dimension (not necessarily the |
/// fastest varying);
254 | /// 2. analyzing those patterns for profitability (TODO: and |
255 | /// interference); |
256 | /// 3. then, for each pattern in order: |
257 | /// a. applying iterative rewriting of the loops and all their nested |
258 | /// operations in topological order. Rewriting is implemented by |
259 | /// coarsening the loops and converting operations and operands to their |
260 | /// vector forms. Processing operations in topological order is relatively |
261 | /// simple due to the structured nature of the control-flow |
262 | /// representation. This order ensures that all the operands of a given |
263 | /// operation have been vectorized before the operation itself in a single |
264 | /// traversal, except for operands defined outside of the loop nest. The |
/// algorithm can convert the following operations to their vector form
/// (illustrated in the sketch after step 3):
266 | /// * Affine load and store operations are converted to opaque vector |
267 | /// transfer read and write operations. |
268 | /// * Scalar constant operations/operands are converted to vector |
269 | /// constant operations (splat). |
270 | /// * Uniform operands (only induction variables of loops not mapped to |
271 | /// a vector dimension, or operands defined outside of the loop nest |
272 | /// for now) are broadcasted to a vector. |
273 | /// TODO: Support more uniform cases. |
274 | /// * Affine for operations with 'iter_args' are vectorized by |
275 | /// vectorizing their 'iter_args' operands and results. |
276 | /// TODO: Support more complex loops with divergent lbs and/or ubs. |
277 | /// * The remaining operations in the loop nest are vectorized by |
278 | /// widening their scalar types to vector types. |
279 | /// b. if everything under the root AffineForOp in the current pattern |
280 | /// is vectorized properly, we commit that loop to the IR and remove the |
281 | /// scalar loop. Otherwise, we discard the vectorized loop and keep the |
282 | /// original scalar loop. |
283 | /// c. vectorization is applied on the next pattern in the list. Because |
/// pattern interference avoidance is not yet implemented and we do
/// not support further vectorizing an already vectorized load, we need to
286 | /// re-verify that the pattern is still vectorizable. This is expected to |
287 | /// make cost models more difficult to write and is subject to improvement |
288 | /// in the future. |
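///
/// As a purely illustrative sketch of the conversions in step 3.a (the memref
/// shapes, bounds and the 128 vector width below are assumptions, not the
/// result of any particular strategy), a scalar loop such as:
/// ```mlir
/// // %s is defined above the loop nest and is therefore uniform.
/// affine.for %i = 0 to 1024 {
///   %a = affine.load %A[%i] : memref<1024xf32>
///   %b = arith.mulf %a, %s : f32
///   affine.store %b, %B[%i] : memref<1024xf32>
/// }
/// ```
/// would be rewritten along the lines of:
/// ```mlir
/// %vs = vector.broadcast %s : f32 to vector<128xf32>
/// affine.for %i = 0 to 1024 step 128 {
///   %va = vector.transfer_read %A[%i] : memref<1024xf32>, vector<128xf32>
///   %vb = arith.mulf %va, %vs : vector<128xf32>
///   vector.transfer_write %vb, %B[%i] : vector<128xf32>, memref<1024xf32>
/// }
/// ```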
289 | /// |
290 | /// Choice of loop transformation to support the algorithm: |
291 | /// ======================================================= |
292 | /// The choice of loop transformation to apply for coarsening vectorized loops |
293 | /// is still subject to exploratory tradeoffs. In particular, say we want to |
294 | /// vectorize by a factor 128, we want to transform the following input: |
295 | /// ```mlir |
296 | /// affine.for %i = %M to %N { |
297 | /// %a = affine.load %A[%i] : memref<?xf32> |
298 | /// } |
299 | /// ``` |
300 | /// |
301 | /// Traditionally, one would vectorize late (after scheduling, tiling, |
302 | /// memory promotion etc) say after stripmining (and potentially unrolling in |
303 | /// the case of LLVM's SLP vectorizer): |
304 | /// ```mlir |
305 | /// affine.for %i = floor(%M, 128) to ceil(%N, 128) { |
306 | /// affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) { |
307 | /// %a = affine.load %A[%ii] : memref<?xf32> |
308 | /// } |
309 | /// } |
310 | /// ``` |
311 | /// |
312 | /// Instead, we seek to vectorize early and freeze vector types before |
313 | /// scheduling, so we want to generate a pattern that resembles: |
314 | /// ```mlir |
315 | /// affine.for %i = ? to ? step ? { |
316 | /// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32> |
317 | /// } |
318 | /// ``` |
319 | /// |
/// Two ways of obtaining this form come to mind:
/// i. simply dividing the lower / upper bounds by 128 creates issues
321 | /// when representing expressions such as ii + 1 because now we only |
322 | /// have access to original values that have been divided. Additional |
323 | /// information is needed to specify accesses at below-128 granularity; |
324 | /// ii. another alternative is to coarsen the loop step but this may have |
325 | /// consequences on dependence analysis and fusability of loops: fusable |
326 | /// loops probably need to have the same step (because we don't want to |
327 | /// stripmine/unroll to enable fusion). |
328 | /// As a consequence, we choose to represent the coarsening using the loop |
329 | /// step for now and reevaluate in the future. Note that we can renormalize |
330 | /// loop steps later if/when we have evidence that they are problematic. |
331 | /// |
332 | /// For the simple strawman example above, vectorizing for a 1-D vector |
333 | /// abstraction of size 128 returns code similar to: |
334 | /// ```mlir |
335 | /// affine.for %i = %M to %N step 128 { |
336 | /// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32> |
337 | /// } |
338 | /// ``` |
339 | /// |
340 | /// Unsupported cases, extensions, and work in progress (help welcome :-) ): |
341 | /// ======================================================================== |
342 | /// 1. lowering to concrete vector types for various HW; |
343 | /// 2. reduction support for n-D vectorization and non-unit steps; |
344 | /// 3. non-effecting padding during vector.transfer_read and filter during |
345 | /// vector.transfer_write; |
/// 4. misalignment support for vector.transfer_read / vector.transfer_write
347 | /// (hopefully without read-modify-writes); |
348 | /// 5. control-flow support; |
349 | /// 6. cost-models, heuristics and search; |
350 | /// 7. Op implementation, extensions and implication on memref views; |
351 | /// 8. many TODOs left around. |
352 | /// |
353 | /// Examples: |
354 | /// ========= |
355 | /// Consider the following Function: |
356 | /// ```mlir |
357 | /// func @vector_add_2d(%M : index, %N : index) -> f32 { |
358 | /// %A = alloc (%M, %N) : memref<?x?xf32, 0> |
359 | /// %B = alloc (%M, %N) : memref<?x?xf32, 0> |
360 | /// %C = alloc (%M, %N) : memref<?x?xf32, 0> |
361 | /// %f1 = arith.constant 1.0 : f32 |
362 | /// %f2 = arith.constant 2.0 : f32 |
363 | /// affine.for %i0 = 0 to %M { |
364 | /// affine.for %i1 = 0 to %N { |
365 | /// // non-scoped %f1 |
366 | /// affine.store %f1, %A[%i0, %i1] : memref<?x?xf32, 0> |
367 | /// } |
368 | /// } |
369 | /// affine.for %i2 = 0 to %M { |
370 | /// affine.for %i3 = 0 to %N { |
371 | /// // non-scoped %f2 |
372 | /// affine.store %f2, %B[%i2, %i3] : memref<?x?xf32, 0> |
373 | /// } |
374 | /// } |
375 | /// affine.for %i4 = 0 to %M { |
376 | /// affine.for %i5 = 0 to %N { |
377 | /// %a5 = affine.load %A[%i4, %i5] : memref<?x?xf32, 0> |
378 | /// %b5 = affine.load %B[%i4, %i5] : memref<?x?xf32, 0> |
379 | /// %s5 = arith.addf %a5, %b5 : f32 |
380 | /// // non-scoped %f1 |
381 | /// %s6 = arith.addf %s5, %f1 : f32 |
382 | /// // non-scoped %f2 |
383 | /// %s7 = arith.addf %s5, %f2 : f32 |
384 | /// // diamond dependency. |
385 | /// %s8 = arith.addf %s7, %s6 : f32 |
386 | /// affine.store %s8, %C[%i4, %i5] : memref<?x?xf32, 0> |
387 | /// } |
388 | /// } |
389 | /// %c7 = arith.constant 7 : index |
390 | /// %c42 = arith.constant 42 : index |
391 | /// %res = load %C[%c7, %c42] : memref<?x?xf32, 0> |
392 | /// return %res : f32 |
393 | /// } |
394 | /// ``` |
395 | /// |
396 | /// The -affine-super-vectorize pass with the following arguments: |
397 | /// ``` |
398 | /// -affine-super-vectorize="virtual-vector-size=256 test-fastest-varying=0" |
399 | /// ``` |
400 | /// |
401 | /// produces this standard innermost-loop vectorized code: |
402 | /// ```mlir |
403 | /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { |
404 | /// %0 = memref.alloc(%arg0, %arg1) : memref<?x?xf32> |
405 | /// %1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32> |
406 | /// %2 = memref.alloc(%arg0, %arg1) : memref<?x?xf32> |
407 | /// %cst = arith.constant 1.0 : f32 |
408 | /// %cst_0 = arith.constant 2.0 : f32 |
409 | /// affine.for %i0 = 0 to %arg0 { |
410 | /// affine.for %i1 = 0 to %arg1 step 256 { |
/// %cst_1 = arith.constant dense<1.0> : vector<256xf32>
413 | /// vector.transfer_write %cst_1, %0[%i0, %i1] : |
414 | /// vector<256xf32>, memref<?x?xf32> |
415 | /// } |
416 | /// } |
417 | /// affine.for %i2 = 0 to %arg0 { |
418 | /// affine.for %i3 = 0 to %arg1 step 256 { |
/// %cst_2 = arith.constant dense<2.0> : vector<256xf32>
421 | /// vector.transfer_write %cst_2, %1[%i2, %i3] : |
422 | /// vector<256xf32>, memref<?x?xf32> |
423 | /// } |
424 | /// } |
425 | /// affine.for %i4 = 0 to %arg0 { |
426 | /// affine.for %i5 = 0 to %arg1 step 256 { |
427 | /// %3 = vector.transfer_read %0[%i4, %i5] : |
428 | /// memref<?x?xf32>, vector<256xf32> |
429 | /// %4 = vector.transfer_read %1[%i4, %i5] : |
430 | /// memref<?x?xf32>, vector<256xf32> |
431 | /// %5 = arith.addf %3, %4 : vector<256xf32> |
/// %cst_3 = arith.constant dense<1.0> : vector<256xf32>
434 | /// %6 = arith.addf %5, %cst_3 : vector<256xf32> |
/// %cst_4 = arith.constant dense<2.0> : vector<256xf32>
437 | /// %7 = arith.addf %5, %cst_4 : vector<256xf32> |
438 | /// %8 = arith.addf %7, %6 : vector<256xf32> |
439 | /// vector.transfer_write %8, %2[%i4, %i5] : |
440 | /// vector<256xf32>, memref<?x?xf32> |
441 | /// } |
442 | /// } |
443 | /// %c7 = arith.constant 7 : index |
444 | /// %c42 = arith.constant 42 : index |
445 | /// %9 = load %2[%c7, %c42] : memref<?x?xf32> |
446 | /// return %9 : f32 |
447 | /// } |
448 | /// ``` |
449 | /// |
450 | /// The -affine-super-vectorize pass with the following arguments: |
451 | /// ``` |
452 | /// -affine-super-vectorize="virtual-vector-size=32,256 \ |
453 | /// test-fastest-varying=1,0" |
454 | /// ``` |
455 | /// |
456 | /// produces this more interesting mixed outer-innermost-loop vectorized code: |
457 | /// ```mlir |
458 | /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { |
459 | /// %0 = memref.alloc(%arg0, %arg1) : memref<?x?xf32> |
460 | /// %1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32> |
461 | /// %2 = memref.alloc(%arg0, %arg1) : memref<?x?xf32> |
462 | /// %cst = arith.constant 1.0 : f32 |
463 | /// %cst_0 = arith.constant 2.0 : f32 |
464 | /// affine.for %i0 = 0 to %arg0 step 32 { |
465 | /// affine.for %i1 = 0 to %arg1 step 256 { |
/// %cst_1 = arith.constant dense<1.0> : vector<32x256xf32>
468 | /// vector.transfer_write %cst_1, %0[%i0, %i1] : |
469 | /// vector<32x256xf32>, memref<?x?xf32> |
470 | /// } |
471 | /// } |
472 | /// affine.for %i2 = 0 to %arg0 step 32 { |
473 | /// affine.for %i3 = 0 to %arg1 step 256 { |
/// %cst_2 = arith.constant dense<2.0> : vector<32x256xf32>
476 | /// vector.transfer_write %cst_2, %1[%i2, %i3] : |
477 | /// vector<32x256xf32>, memref<?x?xf32> |
478 | /// } |
479 | /// } |
480 | /// affine.for %i4 = 0 to %arg0 step 32 { |
481 | /// affine.for %i5 = 0 to %arg1 step 256 { |
482 | /// %3 = vector.transfer_read %0[%i4, %i5] : |
/// memref<?x?xf32>, vector<32x256xf32>
484 | /// %4 = vector.transfer_read %1[%i4, %i5] : |
485 | /// memref<?x?xf32>, vector<32x256xf32> |
486 | /// %5 = arith.addf %3, %4 : vector<32x256xf32> |
/// %cst_3 = arith.constant dense<1.0> : vector<32x256xf32>
489 | /// %6 = arith.addf %5, %cst_3 : vector<32x256xf32> |
/// %cst_4 = arith.constant dense<2.0> : vector<32x256xf32>
492 | /// %7 = arith.addf %5, %cst_4 : vector<32x256xf32> |
493 | /// %8 = arith.addf %7, %6 : vector<32x256xf32> |
494 | /// vector.transfer_write %8, %2[%i4, %i5] : |
495 | /// vector<32x256xf32>, memref<?x?xf32> |
496 | /// } |
497 | /// } |
498 | /// %c7 = arith.constant 7 : index |
499 | /// %c42 = arith.constant 42 : index |
500 | /// %9 = load %2[%c7, %c42] : memref<?x?xf32> |
501 | /// return %9 : f32 |
502 | /// } |
503 | /// ``` |
504 | /// |
505 | /// Of course, much more intricate n-D imperfectly-nested patterns can be |
506 | /// vectorized too and specified in a fully declarative fashion. |
507 | /// |
508 | /// Reduction: |
509 | /// ========== |
510 | /// Vectorizing reduction loops along the reduction dimension is supported if: |
511 | /// - the reduction kind is supported, |
512 | /// - the vectorization is 1-D, and |
/// - the step size of the loop is equal to one.
514 | /// |
/// Compared to the non-vector-dimension case, two additional things are done
516 | /// during vectorization of such loops: |
517 | /// - The resulting vector returned from the loop is reduced to a scalar using |
/// `vector.reduction`.
519 | /// - In some cases a mask is applied to the vector yielded at the end of the |
520 | /// loop to prevent garbage values from being written to the accumulator. |
521 | /// |
/// Reduction vectorization is switched off by default; it can be enabled by
523 | /// passing a map from loops to reductions to utility functions, or by passing |
524 | /// `vectorize-reductions=true` to the vectorization pass. |
525 | /// |
526 | /// Consider the following example: |
527 | /// ```mlir |
528 | /// func @vecred(%in: memref<512xf32>) -> f32 { |
529 | /// %cst = arith.constant 0.000000e+00 : f32 |
530 | /// %sum = affine.for %i = 0 to 500 iter_args(%part_sum = %cst) -> (f32) { |
531 | /// %ld = affine.load %in[%i] : memref<512xf32> |
532 | /// %cos = math.cos %ld : f32 |
533 | /// %add = arith.addf %part_sum, %cos : f32 |
534 | /// affine.yield %add : f32 |
535 | /// } |
536 | /// return %sum : f32 |
537 | /// } |
538 | /// ``` |
539 | /// |
540 | /// The -affine-super-vectorize pass with the following arguments: |
541 | /// ``` |
542 | /// -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 \ |
543 | /// vectorize-reductions=true" |
544 | /// ``` |
545 | /// produces the following output: |
546 | /// ```mlir |
547 | /// #map = affine_map<(d0) -> (-d0 + 500)> |
548 | /// func @vecred(%arg0: memref<512xf32>) -> f32 { |
549 | /// %cst = arith.constant 0.000000e+00 : f32 |
550 | /// %cst_0 = arith.constant dense<0.000000e+00> : vector<128xf32> |
551 | /// %0 = affine.for %arg1 = 0 to 500 step 128 iter_args(%arg2 = %cst_0) |
552 | /// -> (vector<128xf32>) { |
553 | /// // %2 is the number of iterations left in the original loop. |
554 | /// %2 = affine.apply #map(%arg1) |
555 | /// %3 = vector.create_mask %2 : vector<128xi1> |
556 | /// %cst_1 = arith.constant 0.000000e+00 : f32 |
557 | /// %4 = vector.transfer_read %arg0[%arg1], %cst_1 : |
558 | /// memref<512xf32>, vector<128xf32> |
559 | /// %5 = math.cos %4 : vector<128xf32> |
560 | /// %6 = arith.addf %arg2, %5 : vector<128xf32> |
561 | /// // We filter out the effect of last 12 elements using the mask. |
562 | /// %7 = select %3, %6, %arg2 : vector<128xi1>, vector<128xf32> |
563 | /// affine.yield %7 : vector<128xf32> |
564 | /// } |
565 | /// %1 = vector.reduction <add>, %0 : vector<128xf32> into f32 |
566 | /// return %1 : f32 |
567 | /// } |
568 | /// ``` |
569 | /// |
/// Note that because of loop misalignment we needed to apply a mask to prevent
/// the last 12 elements from affecting the final result. The mask is all ones
572 | /// in every iteration except for the last one, in which it has the form |
573 | /// `11...100...0` with 116 ones and 12 zeros. |
574 | |
575 | #define DEBUG_TYPE "early-vect" |
576 | |
577 | using llvm::dbgs; |
578 | |
579 | /// Forward declaration. |
580 | static FilterFunctionType |
581 | isVectorizableLoopPtrFactory(const DenseSet<Operation *> ¶llelLoops, |
582 | int fastestVaryingMemRefDimension); |
583 | |
584 | /// Creates a vectorization pattern from the command line arguments. |
585 | /// Up to 3-D patterns are supported. |
586 | /// If the command line argument requests a pattern of higher order, returns an |
/// empty pattern (std::nullopt), which conservatively results in no
/// vectorization.
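///
/// For illustration (the memref and bounds are assumptions), a 2-D pattern
/// built with `fastestVaryingPattern = {1, 0}` is intended to match nests of
/// the following shape, where the outer loop indexes the second
/// fastest-varying memref dimension and the inner loop the fastest-varying
/// one:
/// ```mlir
/// affine.for %i = 0 to %M {
///   affine.for %j = 0 to %N {
///     %0 = affine.load %A[%i, %j] : memref<?x?xf32>
///   }
/// }
/// ```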
588 | static std::optional<NestedPattern> |
589 | makePattern(const DenseSet<Operation *> ¶llelLoops, int vectorRank, |
590 | ArrayRef<int64_t> fastestVaryingPattern) { |
591 | using affine::matcher::For; |
592 | int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0]; |
593 | int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1]; |
594 | int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2]; |
595 | switch (vectorRank) { |
596 | case 1: |
return For(isVectorizableLoopPtrFactory(parallelLoops, d0));
case 2:
return For(isVectorizableLoopPtrFactory(parallelLoops, d0),
For(isVectorizableLoopPtrFactory(parallelLoops, d1)));
case 3:
return For(isVectorizableLoopPtrFactory(parallelLoops, d0),
For(isVectorizableLoopPtrFactory(parallelLoops, d1),
For(isVectorizableLoopPtrFactory(parallelLoops, d2))));
605 | default: { |
606 | return std::nullopt; |
607 | } |
608 | } |
609 | } |
610 | |
611 | static NestedPattern &vectorTransferPattern() { |
612 | static auto pattern = affine::matcher::Op( |
llvm::IsaPred<vector::TransferReadOp, vector::TransferWriteOp>);
614 | return pattern; |
615 | } |
616 | |
617 | namespace { |
618 | |
619 | /// Base state for the vectorize pass. |
620 | /// Command line arguments are preempted by non-empty pass arguments. |
621 | struct Vectorize : public affine::impl::AffineVectorizeBase<Vectorize> { |
622 | using Base::Base; |
623 | |
624 | void runOnOperation() override; |
625 | }; |
626 | |
627 | } // namespace |
628 | |
629 | static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern, |
630 | unsigned patternDepth, |
631 | VectorizationStrategy *strategy) { |
632 | assert(patternDepth > depthInPattern && |
"patternDepth is greater than depthInPattern");
634 | if (patternDepth - depthInPattern > strategy->vectorSizes.size()) { |
635 | // Don't vectorize this loop |
636 | return; |
637 | } |
638 | strategy->loopToVectorDim[loop] = |
639 | strategy->vectorSizes.size() - (patternDepth - depthInPattern); |
640 | } |
641 | |
642 | /// Implements a simple strawman strategy for vectorization. |
643 | /// Given a matched pattern `matches` of depth `patternDepth`, this strategy |
644 | /// greedily assigns the fastest varying dimension ** of the vector ** to the |
645 | /// innermost loop in the pattern. |
646 | /// When coupled with a pattern that looks for the fastest varying dimension in |
647 | /// load/store MemRefs, this creates a generic vectorization strategy that works |
648 | /// for any loop in a hierarchy (outermost, innermost or intermediate). |
649 | /// |
650 | /// TODO: In the future we should additionally increase the power of the |
651 | /// profitability analysis along 3 directions: |
652 | /// 1. account for loop extents (both static and parametric + annotations); |
653 | /// 2. account for data layout permutations; |
654 | /// 3. account for impact of vectorization on maximal loop fusion. |
655 | /// Then we can quantify the above to build a cost model and search over |
656 | /// strategies. |
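///
/// For example (the sizes are illustrative only), with a 2-D strategy
/// `vectorSizes = {32, 256}` and a matched pattern of depth 3, the innermost
/// loop is assigned vector dimension 1 (size 256), its parent loop vector
/// dimension 0 (size 32), and the outermost loop is left unvectorized because
/// the pattern is deeper than the number of vector dimensions.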
657 | static LogicalResult analyzeProfitability(ArrayRef<NestedMatch> matches, |
658 | unsigned depthInPattern, |
659 | unsigned patternDepth, |
660 | VectorizationStrategy *strategy) { |
661 | for (auto m : matches) { |
if (failed(analyzeProfitability(m.getMatchedChildren(), depthInPattern + 1,
663 | patternDepth, strategy))) { |
664 | return failure(); |
665 | } |
vectorizeLoopIfProfitable(m.getMatchedOperation(), depthInPattern,
667 | patternDepth, strategy); |
668 | } |
669 | return success(); |
670 | } |
671 | |
672 | ///// end TODO: Hoist to a VectorizationStrategy.cpp when appropriate ///// |
673 | |
674 | namespace { |
675 | |
676 | struct VectorizationState { |
677 | |
678 | VectorizationState(MLIRContext *context) : builder(context) {} |
679 | |
680 | /// Registers the vector replacement of a scalar operation and its result |
681 | /// values. Both operations must have the same number of results. |
682 | /// |
683 | /// This utility is used to register the replacement for the vast majority of |
684 | /// the vectorized operations. |
685 | /// |
686 | /// Example: |
687 | /// * 'replaced': %0 = arith.addf %1, %2 : f32 |
688 | /// * 'replacement': %0 = arith.addf %1, %2 : vector<128xf32> |
689 | void registerOpVectorReplacement(Operation *replaced, Operation *replacement); |
690 | |
691 | /// Registers the vector replacement of a scalar value. The replacement |
692 | /// operation should have a single result, which replaces the scalar value. |
693 | /// |
694 | /// This utility is used to register the vector replacement of block arguments |
695 | /// and operation results which are not directly vectorized (i.e., their |
696 | /// scalar version still exists after vectorization), like uniforms. |
697 | /// |
698 | /// Example: |
699 | /// * 'replaced': block argument or operation outside of the vectorized |
700 | /// loop. |
701 | /// * 'replacement': %0 = vector.broadcast %1 : f32 to vector<128xf32> |
702 | void registerValueVectorReplacement(Value replaced, Operation *replacement); |
703 | |
704 | /// Registers the vector replacement of a block argument (e.g., iter_args). |
705 | /// |
706 | /// Example: |
707 | /// * 'replaced': 'iter_arg' block argument. |
708 | /// * 'replacement': vectorized 'iter_arg' block argument. |
709 | void registerBlockArgVectorReplacement(BlockArgument replaced, |
710 | BlockArgument replacement); |
711 | |
712 | /// Registers the scalar replacement of a scalar value. 'replacement' must be |
713 | /// scalar. |
714 | /// |
715 | /// This utility is used to register the replacement of block arguments |
/// or affine.apply results that are within the loop to be vectorized and will
717 | /// continue being scalar within the vector loop. |
718 | /// |
719 | /// Example: |
720 | /// * 'replaced': induction variable of a loop to be vectorized. |
721 | /// * 'replacement': new induction variable in the new vector loop. |
722 | void registerValueScalarReplacement(Value replaced, Value replacement); |
723 | |
724 | /// Registers the scalar replacement of a scalar result returned from a |
725 | /// reduction loop. 'replacement' must be scalar. |
726 | /// |
727 | /// This utility is used to register the replacement for scalar results of |
728 | /// vectorized reduction loops with iter_args. |
729 | /// |
/// Example:
731 | /// * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32) |
732 | /// * 'replacement': %1 = vector.reduction <add>, %0 : vector<4xf32> into |
733 | /// f32 |
734 | void registerLoopResultScalarReplacement(Value replaced, Value replacement); |
735 | |
736 | /// Returns in 'replacedVals' the scalar replacement for values in |
737 | /// 'inputVals'. |
738 | void getScalarValueReplacementsFor(ValueRange inputVals, |
739 | SmallVectorImpl<Value> &replacedVals); |
740 | |
741 | /// Erases the scalar loop nest after its successful vectorization. |
742 | void finishVectorizationPattern(AffineForOp rootLoop); |
743 | |
744 | // Used to build and insert all the new operations created. The insertion |
745 | // point is preserved and updated along the vectorization process. |
746 | OpBuilder builder; |
747 | |
748 | // Maps input scalar operations to their vector counterparts. |
749 | DenseMap<Operation *, Operation *> opVectorReplacement; |
750 | // Maps input scalar values to their vector counterparts. |
751 | IRMapping valueVectorReplacement; |
752 | // Maps input scalar values to their new scalar counterparts in the vector |
753 | // loop nest. |
754 | IRMapping valueScalarReplacement; |
755 | // Maps results of reduction loops to their new scalar counterparts. |
756 | DenseMap<Value, Value> loopResultScalarReplacement; |
757 | |
758 | // Maps the newly created vector loops to their vector dimension. |
759 | DenseMap<Operation *, unsigned> vecLoopToVecDim; |
760 | |
761 | // Maps the new vectorized loops to the corresponding vector masks if it is |
762 | // required. |
763 | DenseMap<Operation *, Value> vecLoopToMask; |
764 | |
765 | // The strategy drives which loop to vectorize by which amount. |
766 | const VectorizationStrategy *strategy = nullptr; |
767 | |
768 | private: |
769 | /// Internal implementation to map input scalar values to new vector or scalar |
770 | /// values. |
771 | void registerValueVectorReplacementImpl(Value replaced, Value replacement); |
772 | }; |
773 | |
774 | } // namespace |
775 | |
776 | /// Registers the vector replacement of a scalar operation and its result |
777 | /// values. Both operations must have the same number of results. |
778 | /// |
779 | /// This utility is used to register the replacement for the vast majority of |
780 | /// the vectorized operations. |
781 | /// |
782 | /// Example: |
783 | /// * 'replaced': %0 = arith.addf %1, %2 : f32 |
784 | /// * 'replacement': %0 = arith.addf %1, %2 : vector<128xf32> |
785 | void VectorizationState::registerOpVectorReplacement(Operation *replaced, |
786 | Operation *replacement) { |
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op:\n");
LLVM_DEBUG(dbgs() << *replaced << "\n");
LLVM_DEBUG(dbgs() << "into\n");
LLVM_DEBUG(dbgs() << *replacement << "\n");

assert(replaced->getNumResults() == replacement->getNumResults() &&
"Unexpected replaced and replacement results");
assert(opVectorReplacement.count(replaced) == 0 && "already registered");
795 | opVectorReplacement[replaced] = replacement; |
796 | |
797 | for (auto resultTuple : |
llvm::zip(replaced->getResults(), replacement->getResults()))
registerValueVectorReplacementImpl(std::get<0>(resultTuple),
std::get<1>(resultTuple));
801 | } |
802 | |
803 | /// Registers the vector replacement of a scalar value. The replacement |
804 | /// operation should have a single result, which replaces the scalar value. |
805 | /// |
806 | /// This utility is used to register the vector replacement of block arguments |
807 | /// and operation results which are not directly vectorized (i.e., their |
808 | /// scalar version still exists after vectorization), like uniforms. |
809 | /// |
810 | /// Example: |
811 | /// * 'replaced': block argument or operation outside of the vectorized loop. |
812 | /// * 'replacement': %0 = vector.broadcast %1 : f32 to vector<128xf32> |
813 | void VectorizationState::registerValueVectorReplacement( |
814 | Value replaced, Operation *replacement) { |
assert(replacement->getNumResults() == 1 &&
"Expected single-result replacement");
if (Operation *defOp = replaced.getDefiningOp())
registerOpVectorReplacement(defOp, replacement);
else
registerValueVectorReplacementImpl(replaced, replacement->getResult(0));
821 | } |
822 | |
823 | /// Registers the vector replacement of a block argument (e.g., iter_args). |
824 | /// |
825 | /// Example: |
826 | /// * 'replaced': 'iter_arg' block argument. |
827 | /// * 'replacement': vectorized 'iter_arg' block argument. |
828 | void VectorizationState::registerBlockArgVectorReplacement( |
829 | BlockArgument replaced, BlockArgument replacement) { |
830 | registerValueVectorReplacementImpl(replaced, replacement); |
831 | } |
832 | |
833 | void VectorizationState::registerValueVectorReplacementImpl(Value replaced, |
834 | Value replacement) { |
assert(!valueVectorReplacement.contains(replaced) &&
"Vector replacement already registered");
assert(isa<VectorType>(replacement.getType()) &&
"Expected vector type in vector replacement");
valueVectorReplacement.map(replaced, replacement);
840 | } |
841 | |
842 | /// Registers the scalar replacement of a scalar value. 'replacement' must be |
843 | /// scalar. |
844 | /// |
845 | /// This utility is used to register the replacement of block arguments |
/// or affine.apply results that are within the loop to be vectorized and will
847 | /// continue being scalar within the vector loop. |
848 | /// |
849 | /// Example: |
850 | /// * 'replaced': induction variable of a loop to be vectorized. |
851 | /// * 'replacement': new induction variable in the new vector loop. |
852 | void VectorizationState::registerValueScalarReplacement(Value replaced, |
853 | Value replacement) { |
assert(!valueScalarReplacement.contains(replaced) &&
"Scalar value replacement already registered");
assert(!isa<VectorType>(replacement.getType()) &&
"Expected scalar type in scalar replacement");
valueScalarReplacement.map(replaced, replacement);
859 | } |
860 | |
861 | /// Registers the scalar replacement of a scalar result returned from a |
862 | /// reduction loop. 'replacement' must be scalar. |
863 | /// |
864 | /// This utility is used to register the replacement for scalar results of |
865 | /// vectorized reduction loops with iter_args. |
866 | /// |
/// Example:
868 | /// * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32) |
869 | /// * 'replacement': %1 = vector.reduction <add>, %0 : vector<4xf32> into f32 |
870 | void VectorizationState::registerLoopResultScalarReplacement( |
871 | Value replaced, Value replacement) { |
872 | assert(isa<AffineForOp>(replaced.getDefiningOp())); |
assert(loopResultScalarReplacement.count(replaced) == 0 &&
"already registered");
875 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ will replace a result of the loop " |
876 | "with scalar: " |
877 | << replacement); |
878 | loopResultScalarReplacement[replaced] = replacement; |
879 | } |
880 | |
881 | /// Returns in 'replacedVals' the scalar replacement for values in 'inputVals'. |
882 | void VectorizationState::getScalarValueReplacementsFor( |
883 | ValueRange inputVals, SmallVectorImpl<Value> &replacedVals) { |
884 | for (Value inputVal : inputVals) |
replacedVals.push_back(valueScalarReplacement.lookupOrDefault(inputVal));
886 | } |
887 | |
888 | /// Erases a loop nest, including all its nested operations. |
889 | static void eraseLoopNest(AffineForOp forOp) { |
LLVM_DEBUG(dbgs() << "[early-vect]+++++ erasing:\n" << forOp << "\n");
891 | forOp.erase(); |
892 | } |
893 | |
894 | /// Erases the scalar loop nest after its successful vectorization. |
895 | void VectorizationState::finishVectorizationPattern(AffineForOp rootLoop) { |
LLVM_DEBUG(dbgs() << "\n[early-vect] Finalizing vectorization\n");
897 | eraseLoopNest(rootLoop); |
898 | } |
899 | |
900 | // Apply 'map' with 'mapOperands' returning resulting values in 'results'. |
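// For example (an illustrative map), with map = (d0, d1) -> (d0 + 4, d1) this
// emits one single-result affine.apply per result expression, e.g.:
//   %r0 = affine.apply affine_map<(d0, d1) -> (d0 + 4)>(%ii, %jj)
//   %r1 = affine.apply affine_map<(d0, d1) -> (d1)>(%ii, %jj)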
901 | static void computeMemoryOpIndices(Operation *op, AffineMap map, |
902 | ValueRange mapOperands, |
903 | VectorizationState &state, |
904 | SmallVectorImpl<Value> &results) { |
905 | for (auto resultExpr : map.getResults()) { |
906 | auto singleResMap = |
AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
auto afOp = state.builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
mapOperands);
results.push_back(afOp);
911 | } |
912 | } |
913 | |
914 | /// Returns a FilterFunctionType that can be used in NestedPattern to match a |
915 | /// loop whose underlying load/store accesses are either invariant or all |
/// varying along the `fastestVaryingMemRefDimension`.
917 | static FilterFunctionType |
918 | isVectorizableLoopPtrFactory(const DenseSet<Operation *> ¶llelLoops, |
919 | int fastestVaryingMemRefDimension) { |
920 | return [¶llelLoops, fastestVaryingMemRefDimension](Operation &forOp) { |
921 | auto loop = cast<AffineForOp>(forOp); |
if (!parallelLoops.contains(loop))
923 | return false; |
924 | int memRefDim = -1; |
925 | auto vectorizableBody = |
926 | isVectorizableLoopBody(loop, &memRefDim, vectorTransferPattern()); |
927 | if (!vectorizableBody) |
928 | return false; |
929 | return memRefDim == -1 || fastestVaryingMemRefDimension == -1 || |
930 | memRefDim == fastestVaryingMemRefDimension; |
931 | }; |
932 | } |
933 | |
934 | /// Returns the vector type resulting from applying the provided vectorization |
935 | /// strategy on the scalar type. |
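///
/// For example, for an f32 scalar type and an illustrative strategy with
/// `vectorSizes = {32, 256}`, this returns `vector<32x256xf32>`.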
936 | static VectorType getVectorType(Type scalarTy, |
937 | const VectorizationStrategy *strategy) { |
assert(!isa<VectorType>(scalarTy) && "Expected scalar type");
939 | return VectorType::get(strategy->vectorSizes, scalarTy); |
940 | } |
941 | |
942 | /// Tries to transform a scalar constant into a vector constant. Returns the |
/// vector constant if the scalar type is a valid vector element type. Returns
/// nullptr otherwise.
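///
/// For example (assuming an illustrative 1-D strategy of size 128),
/// `arith.constant 7.0 : f32` becomes `arith.constant dense<7.0> :
/// vector<128xf32>`, inserted at the start of the body of the innermost
/// vectorized loop enclosing the insertion point.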
945 | static arith::ConstantOp vectorizeConstant(arith::ConstantOp constOp, |
946 | VectorizationState &state) { |
947 | Type scalarTy = constOp.getType(); |
948 | if (!VectorType::isValidElementType(scalarTy)) |
949 | return nullptr; |
950 | |
951 | auto vecTy = getVectorType(scalarTy, state.strategy); |
952 | auto vecAttr = DenseElementsAttr::get(vecTy, constOp.getValue()); |
953 | |
954 | OpBuilder::InsertionGuard guard(state.builder); |
955 | Operation *parentOp = state.builder.getInsertionBlock()->getParentOp(); |
956 | // Find the innermost vectorized ancestor loop to insert the vector constant. |
while (parentOp && !state.vecLoopToVecDim.count(parentOp))
parentOp = parentOp->getParentOp();
assert(parentOp && state.vecLoopToVecDim.count(parentOp) &&
isa<AffineForOp>(parentOp) && "Expected a vectorized for op");
961 | auto vecForOp = cast<AffineForOp>(parentOp); |
962 | state.builder.setInsertionPointToStart(vecForOp.getBody()); |
963 | auto newConstOp = |
964 | state.builder.create<arith::ConstantOp>(constOp.getLoc(), vecAttr); |
965 | |
966 | // Register vector replacement for future uses in the scope. |
state.registerOpVectorReplacement(constOp, newConstOp);
968 | return newConstOp; |
969 | } |
970 | |
971 | /// We have no need to vectorize affine.apply. However, we still need to |
972 | /// generate it and replace the operands with values in valueScalarReplacement. |
973 | static Operation *vectorizeAffineApplyOp(AffineApplyOp applyOp, |
974 | VectorizationState &state) { |
975 | SmallVector<Value, 8> updatedOperands; |
976 | for (Value operand : applyOp.getOperands()) { |
977 | if (state.valueVectorReplacement.contains(operand)) { |
978 | LLVM_DEBUG( |
dbgs() << "\n[early-vect]+++++ affine.apply on vector operand\n");
980 | return nullptr; |
981 | } else { |
982 | Value updatedOperand = state.valueScalarReplacement.lookupOrNull(operand); |
983 | if (!updatedOperand) |
984 | updatedOperand = operand; |
985 | updatedOperands.push_back(updatedOperand); |
986 | } |
987 | } |
988 | |
989 | auto newApplyOp = state.builder.create<AffineApplyOp>( |
990 | applyOp.getLoc(), applyOp.getAffineMap(), updatedOperands); |
991 | |
992 | // Register the new affine.apply result. |
state.registerValueScalarReplacement(applyOp.getResult(),
newApplyOp.getResult());
995 | return newApplyOp; |
996 | } |
997 | |
998 | /// Creates a constant vector filled with the neutral elements of the given |
999 | /// reduction. The scalar type of vector elements will be taken from |
1000 | /// `oldOperand`. |
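///
/// For example, for an `addf` reduction over f32 and an illustrative 1-D
/// strategy of size 128, this creates `arith.constant dense<0.0> :
/// vector<128xf32>`; for a `mulf` reduction the identity would be `dense<1.0>`.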
1001 | static arith::ConstantOp createInitialVector(arith::AtomicRMWKind reductionKind, |
1002 | Value oldOperand, |
1003 | VectorizationState &state) { |
1004 | Type scalarTy = oldOperand.getType(); |
1005 | if (!VectorType::isValidElementType(scalarTy)) |
1006 | return nullptr; |
1007 | |
1008 | Attribute valueAttr = getIdentityValueAttr( |
1009 | reductionKind, scalarTy, state.builder, oldOperand.getLoc()); |
1010 | auto vecTy = getVectorType(scalarTy, state.strategy); |
1011 | auto vecAttr = DenseElementsAttr::get(vecTy, valueAttr); |
1012 | auto newConstOp = |
1013 | state.builder.create<arith::ConstantOp>(oldOperand.getLoc(), vecAttr); |
1014 | |
1015 | return newConstOp; |
1016 | } |
1017 | |
1018 | /// Creates a mask used to filter out garbage elements in the last iteration |
1019 | /// of unaligned loops. If a mask is not required then `nullptr` is returned. |
1020 | /// The mask will be a vector of booleans representing meaningful vector |
1021 | /// elements in the current iteration. It is filled with ones for each iteration |
1022 | /// except for the last one, where it has the form `11...100...0` with the |
1023 | /// number of ones equal to the number of meaningful elements (i.e. the number |
1024 | /// of iterations that would be left in the original loop). |
1025 | static Value createMask(AffineForOp vecForOp, VectorizationState &state) { |
assert(state.strategy->vectorSizes.size() == 1 &&
"Creating a mask for non-1-D vectors is not supported.");
assert(vecForOp.getStep() == state.strategy->vectorSizes[0] &&
"Creating a mask for loops with non-unit original step size is not "
"supported.");
1031 | |
1032 | // Check if we have already created the mask. |
if (Value mask = state.vecLoopToMask.lookup(vecForOp))
1034 | return mask; |
1035 | |
1036 | // If the loop has constant bounds and the original number of iterations is |
// divisible by the vector size then we don't need a mask.
1038 | if (vecForOp.hasConstantBounds()) { |
1039 | int64_t originalTripCount = |
1040 | vecForOp.getConstantUpperBound() - vecForOp.getConstantLowerBound(); |
1041 | if (originalTripCount % vecForOp.getStepAsInt() == 0) |
1042 | return nullptr; |
1043 | } |
1044 | |
1045 | OpBuilder::InsertionGuard guard(state.builder); |
1046 | state.builder.setInsertionPointToStart(vecForOp.getBody()); |
1047 | |
1048 | // We generate the mask using the `vector.create_mask` operation which accepts |
1049 | // the number of meaningful elements (i.e. the length of the prefix of 1s). |
1050 | // To compute the number of meaningful elements we subtract the current value |
1051 | // of the iteration variable from the upper bound of the loop. Example: |
1052 | // |
1053 | // // 500 is the upper bound of the loop |
1054 | // #map = affine_map<(d0) -> (500 - d0)> |
1055 | // %elems_left = affine.apply #map(%iv) |
1056 | // %mask = vector.create_mask %elems_left : vector<128xi1> |
1057 | |
1058 | Location loc = vecForOp.getLoc(); |
1059 | |
1060 | // First we get the upper bound of the loop using `affine.apply` or |
1061 | // `affine.min`. |
1062 | AffineMap ubMap = vecForOp.getUpperBoundMap(); |
1063 | Value ub; |
1064 | if (ubMap.getNumResults() == 1) |
1065 | ub = state.builder.create<AffineApplyOp>(loc, vecForOp.getUpperBoundMap(), |
1066 | vecForOp.getUpperBoundOperands()); |
1067 | else |
1068 | ub = state.builder.create<AffineMinOp>(loc, vecForOp.getUpperBoundMap(), |
1069 | vecForOp.getUpperBoundOperands()); |
1070 | // Then we compute the number of (original) iterations left in the loop. |
1071 | AffineExpr subExpr = |
state.builder.getAffineDimExpr(0) - state.builder.getAffineDimExpr(1);
Value itersLeft =
makeComposedAffineApply(state.builder, loc, AffineMap::get(2, 0, subExpr),
1075 | {ub, vecForOp.getInductionVar()}); |
1076 | // If the affine maps were successfully composed then `ub` is unneeded. |
1077 | if (ub.use_empty()) |
1078 | ub.getDefiningOp()->erase(); |
1079 | // Finally we create the mask. |
1080 | Type maskTy = VectorType::get(state.strategy->vectorSizes, |
1081 | state.builder.getIntegerType(1)); |
1082 | Value mask = |
1083 | state.builder.create<vector::CreateMaskOp>(loc, maskTy, itersLeft); |
1084 | |
1085 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a mask:\n" |
1086 | << itersLeft << "\n" |
<< mask << "\n");
1088 | |
1089 | state.vecLoopToMask[vecForOp] = mask; |
1090 | return mask; |
1091 | } |
1092 | |
1093 | /// Returns true if the provided value is vector uniform given the vectorization |
1094 | /// strategy. |
1095 | // TODO: For now, only values that are induction variables of loops not in |
1096 | // `loopToVectorDim` or invariants to all the loops in the vectorization |
1097 | // strategy are considered vector uniforms. |
1098 | static bool isUniformDefinition(Value value, |
1099 | const VectorizationStrategy *strategy) { |
1100 | AffineForOp forOp = getForInductionVarOwner(value); |
if (forOp && strategy->loopToVectorDim.count(forOp) == 0)
1102 | return true; |
1103 | |
1104 | for (auto loopToDim : strategy->loopToVectorDim) { |
1105 | auto loop = cast<AffineForOp>(loopToDim.first); |
1106 | if (!loop.isDefinedOutsideOfLoop(value)) |
1107 | return false; |
1108 | } |
1109 | return true; |
1110 | } |
1111 | |
1112 | /// Generates a broadcast op for the provided uniform value using the |
1113 | /// vectorization strategy in 'state'. |
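///
/// For example, assuming a uniform f32 value %s and a vector<128xf32>
/// strategy (an illustrative sketch, not verbatim pass output):
///
///   %vs = vector.broadcast %s : f32 to vector<128xf32>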
1114 | static Operation *vectorizeUniform(Value uniformVal, |
1115 | VectorizationState &state) { |
1116 | OpBuilder::InsertionGuard guard(state.builder); |
1117 | Value uniformScalarRepl = |
state.valueScalarReplacement.lookupOrDefault(uniformVal);
1119 | state.builder.setInsertionPointAfterValue(uniformScalarRepl); |
1120 | |
1121 | auto vectorTy = getVectorType(uniformVal.getType(), state.strategy); |
1122 | auto bcastOp = state.builder.create<BroadcastOp>(uniformVal.getLoc(), |
1123 | vectorTy, uniformScalarRepl); |
state.registerValueVectorReplacement(uniformVal, bcastOp);
1125 | return bcastOp; |
1126 | } |
1127 | |
1128 | /// Tries to vectorize a given `operand` by applying the following logic: |
1129 | /// 1. if the defining operation has been already vectorized, `operand` is |
1130 | /// already in the proper vector form; |
1131 | /// 2. if the `operand` is a constant, returns the vectorized form of the |
/// constant (see the example below);
1133 | /// 3. if the `operand` is uniform, returns a vector broadcast of the `op`; |
1134 | /// 4. otherwise, the vectorization of `operand` is not supported. |
1135 | /// Newly created vector operations are registered in `state` as replacement |
1136 | /// for their scalar counterparts. |
1137 | /// In particular this logic captures some of the use cases where definitions |
1138 | /// that are not scoped under the current pattern are needed to vectorize. |
1139 | /// One such example is top level function constants that need to be splatted. |
1140 | /// |
1141 | /// Returns an operand that has been vectorized to match `state`'s strategy if |
1142 | /// vectorization is possible with the above logic. Returns nullptr otherwise. |
1143 | /// |
1144 | /// TODO: handle more complex cases. |
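///
/// As an illustration of case 2 above, with a vector<128xf32> strategy
/// (a conceptual sketch):
///
///   %cst = arith.constant 1.0 : f32
///   =>
///   %vcst = arith.constant dense<1.0> : vector<128xf32>
///
/// Case 3 similarly materializes a `vector.broadcast` of the uniform value.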
1145 | static Value vectorizeOperand(Value operand, VectorizationState &state) { |
1146 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorize operand: " << operand); |
1147 | // If this value is already vectorized, we are done. |
if (Value vecRepl = state.valueVectorReplacement.lookupOrNull(operand)) {
1149 | LLVM_DEBUG(dbgs() << " -> already vectorized: " << vecRepl); |
1150 | return vecRepl; |
1151 | } |
1152 | |
// A vector operand that is not in the replacement map should never reach
1154 | // this point. Reaching this point could mean that the code was already |
1155 | // vectorized and we shouldn't try to vectorize already vectorized code. |
1156 | assert(!isa<VectorType>(operand.getType()) && |
1157 | "Vector op not found in replacement map" ); |
1158 | |
1159 | // Vectorize constant. |
1160 | if (auto constOp = operand.getDefiningOp<arith::ConstantOp>()) { |
1161 | auto vecConstant = vectorizeConstant(constOp, state); |
1162 | LLVM_DEBUG(dbgs() << "-> constant: " << vecConstant); |
1163 | return vecConstant.getResult(); |
1164 | } |
1165 | |
1166 | // Vectorize uniform values. |
if (isUniformDefinition(operand, state.strategy)) {
Operation *vecUniform = vectorizeUniform(operand, state);
LLVM_DEBUG(dbgs() << "-> uniform: " << *vecUniform);
return vecUniform->getResult(0);
1171 | } |
1172 | |
1173 | // Check for unsupported block argument scenarios. A supported block argument |
1174 | // should have been vectorized already. |
1175 | if (!operand.getDefiningOp()) |
1176 | LLVM_DEBUG(dbgs() << "-> unsupported block argument\n" ); |
1177 | else |
1178 | // Generic unsupported case. |
1179 | LLVM_DEBUG(dbgs() << "-> non-vectorizable\n" ); |
1180 | |
1181 | return nullptr; |
1182 | } |
1183 | |
1184 | /// Vectorizes an affine load with the vectorization strategy in 'state' by |
1185 | /// generating a 'vector.transfer_read' op with the proper permutation map |
1186 | /// inferred from the indices of the load. The new 'vector.transfer_read' is |
1187 | /// registered as replacement of the scalar load. Returns the newly created |
1188 | /// 'vector.transfer_read' if vectorization was successful. Returns nullptr, |
1189 | /// otherwise. |
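///
/// For example, assuming the innermost index %j is the one being vectorized
/// by a factor of 128 (a conceptual sketch; the padding value and the
/// permutation map attribute are elided):
///
///   %0 = affine.load %A[%i, %j] : memref<256x256xf32>
///   =>
///   %v = vector.transfer_read %A[%i, %j], %pad
///          : memref<256x256xf32>, vector<128xf32>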
1190 | static Operation *vectorizeAffineLoad(AffineLoadOp loadOp, |
1191 | VectorizationState &state) { |
1192 | MemRefType memRefType = loadOp.getMemRefType(); |
1193 | Type elementType = memRefType.getElementType(); |
1194 | auto vectorType = VectorType::get(state.strategy->vectorSizes, elementType); |
1195 | |
1196 | // Replace map operands with operands from the vector loop nest. |
1197 | SmallVector<Value, 8> mapOperands; |
state.getScalarValueReplacementsFor(loadOp.getMapOperands(), mapOperands);
1199 | |
1200 | // Compute indices for the transfer op. AffineApplyOp's may be generated. |
1201 | SmallVector<Value, 8> indices; |
indices.reserve(memRefType.getRank());
if (loadOp.getAffineMap() !=
state.builder.getMultiDimIdentityMap(memRefType.getRank())) {
1205 | // Check the operand in loadOp affine map does not come from AffineApplyOp. |
1206 | for (auto op : mapOperands) { |
1207 | if (op.getDefiningOp<AffineApplyOp>()) |
1208 | return nullptr; |
1209 | } |
1210 | computeMemoryOpIndices(loadOp, loadOp.getAffineMap(), mapOperands, state, |
1211 | indices); |
1212 | } else { |
indices.append(mapOperands.begin(), mapOperands.end());
1214 | } |
1215 | |
1216 | // Compute permutation map using the information of new vector loops. |
auto permutationMap = makePermutationMap(state.builder.getInsertionBlock(),
indices, state.vecLoopToVecDim);
1219 | if (!permutationMap) { |
1220 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ can't compute permutationMap\n" ); |
1221 | return nullptr; |
1222 | } |
1223 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: " ); |
1224 | LLVM_DEBUG(permutationMap.print(dbgs())); |
1225 | |
1226 | auto transfer = state.builder.create<vector::TransferReadOp>( |
1227 | loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap); |
1228 | |
1229 | // Register replacement for future uses in the scope. |
state.registerOpVectorReplacement(loadOp, transfer);
1231 | return transfer; |
1232 | } |
1233 | |
1234 | /// Vectorizes an affine store with the vectorization strategy in 'state' by |
1235 | /// generating a 'vector.transfer_write' op with the proper permutation map |
/// inferred from the indices of the store. The new 'vector.transfer_write' is
/// registered as replacement of the scalar store. Returns the newly created
1238 | /// 'vector.transfer_write' if vectorization was successful. Returns nullptr, |
1239 | /// otherwise. |
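///
/// For example, with the stored value already vectorized to vector<128xf32>
/// (a conceptual sketch, permutation map elided):
///
///   affine.store %val, %A[%i, %j] : memref<256x256xf32>
///   =>
///   vector.transfer_write %vval, %A[%i, %j]
///     : vector<128xf32>, memref<256x256xf32>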
1240 | static Operation *vectorizeAffineStore(AffineStoreOp storeOp, |
1241 | VectorizationState &state) { |
1242 | MemRefType memRefType = storeOp.getMemRefType(); |
1243 | Value vectorValue = vectorizeOperand(storeOp.getValueToStore(), state); |
1244 | if (!vectorValue) |
1245 | return nullptr; |
1246 | |
1247 | // Replace map operands with operands from the vector loop nest. |
1248 | SmallVector<Value, 8> mapOperands; |
state.getScalarValueReplacementsFor(storeOp.getMapOperands(), mapOperands);
1250 | |
1251 | // Compute indices for the transfer op. AffineApplyOp's may be generated. |
1252 | SmallVector<Value, 8> indices; |
indices.reserve(memRefType.getRank());
if (storeOp.getAffineMap() !=
state.builder.getMultiDimIdentityMap(memRefType.getRank()))
1256 | computeMemoryOpIndices(storeOp, storeOp.getAffineMap(), mapOperands, state, |
1257 | indices); |
1258 | else |
indices.append(mapOperands.begin(), mapOperands.end());
1260 | |
1261 | // Compute permutation map using the information of new vector loops. |
auto permutationMap = makePermutationMap(state.builder.getInsertionBlock(),
indices, state.vecLoopToVecDim);
1264 | if (!permutationMap) |
1265 | return nullptr; |
1266 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: " ); |
1267 | LLVM_DEBUG(permutationMap.print(dbgs())); |
1268 | |
1269 | auto transfer = state.builder.create<vector::TransferWriteOp>( |
1270 | storeOp.getLoc(), vectorValue, storeOp.getMemRef(), indices, |
1271 | permutationMap); |
1272 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer); |
1273 | |
1274 | // Register replacement for future uses in the scope. |
state.registerOpVectorReplacement(storeOp, transfer);
1276 | return transfer; |
1277 | } |
1278 | |
1279 | /// Returns true if `value` is a constant equal to the neutral element of the |
1280 | /// given vectorizable reduction. |
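///
/// For example, for an `addf` reduction the neutral element is 0.0, so a
/// value defined by `arith.constant 0.0 : f32` satisfies this predicate; the
/// same constant would not be neutral for a `mulf` reduction (whose neutral
/// element is 1.0).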
1281 | static bool isNeutralElementConst(arith::AtomicRMWKind reductionKind, |
1282 | Value value, VectorizationState &state) { |
1283 | Type scalarTy = value.getType(); |
1284 | if (!VectorType::isValidElementType(scalarTy)) |
1285 | return false; |
1286 | Attribute valueAttr = getIdentityValueAttr(reductionKind, scalarTy, |
1287 | state.builder, value.getLoc()); |
1288 | if (auto constOp = dyn_cast_or_null<arith::ConstantOp>(value.getDefiningOp())) |
1289 | return constOp.getValue() == valueAttr; |
1290 | return false; |
1291 | } |
1292 | |
1293 | /// Vectorizes a loop with the vectorization strategy in 'state'. A new loop is |
1294 | /// created and registered as replacement for the scalar loop. The builder's |
1295 | /// insertion point is set to the new loop's body so that subsequent vectorized |
1296 | /// operations are inserted into the new loop. If the loop is a vector |
1297 | /// dimension, the step of the newly created loop will reflect the vectorization |
/// factor used to vectorize that dimension.
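///
/// For example, vectorizing the following loop along its only dimension with
/// a vectorization factor of 128 yields, roughly:
///
///   affine.for %i = 0 to 1024 step 2 { ... }
///   =>
///   affine.for %i = 0 to 1024 step 256 { ... }   // 2 * 128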
1299 | static Operation *vectorizeAffineForOp(AffineForOp forOp, |
1300 | VectorizationState &state) { |
1301 | const VectorizationStrategy &strategy = *state.strategy; |
1302 | auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp); |
1303 | bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end(); |
1304 | |
1305 | // TODO: Vectorization of reduction loops is not supported for non-unit steps. |
if (isLoopVecDim && forOp.getNumIterOperands() > 0 &&
forOp.getStepAsInt() != 1) {
LLVM_DEBUG(
dbgs()
<< "\n[early-vect]+++++ unsupported step size for reduction loop: "
<< forOp.getStepAsInt() << "\n");
1311 | return nullptr; |
1312 | } |
1313 | |
1314 | // If we are vectorizing a vector dimension, compute a new step for the new |
1315 | // vectorized loop using the vectorization factor for the vector dimension. |
1316 | // Otherwise, propagate the step of the scalar loop. |
1317 | unsigned newStep; |
1318 | if (isLoopVecDim) { |
1319 | unsigned vectorDim = loopToVecDimIt->second; |
1320 | assert(vectorDim < strategy.vectorSizes.size() && "vector dim overflow" ); |
1321 | int64_t forOpVecFactor = strategy.vectorSizes[vectorDim]; |
1322 | newStep = forOp.getStepAsInt() * forOpVecFactor; |
1323 | } else { |
1324 | newStep = forOp.getStepAsInt(); |
1325 | } |
1326 | |
1327 | // Get information about reduction kinds. |
1328 | ArrayRef<LoopReduction> reductions; |
1329 | if (isLoopVecDim && forOp.getNumIterOperands() > 0) { |
1330 | auto it = strategy.reductionLoops.find(forOp); |
1331 | assert(it != strategy.reductionLoops.end() && |
1332 | "Reduction descriptors not found when vectorizing a reduction loop" ); |
1333 | reductions = it->second; |
1334 | assert(reductions.size() == forOp.getNumIterOperands() && |
1335 | "The size of reductions array must match the number of iter_args" ); |
1336 | } |
1337 | |
1338 | // Vectorize 'iter_args'. |
1339 | SmallVector<Value, 8> vecIterOperands; |
1340 | if (!isLoopVecDim) { |
1341 | for (auto operand : forOp.getInits()) |
1342 | vecIterOperands.push_back(vectorizeOperand(operand, state)); |
1343 | } else { |
1344 | // For reduction loops we need to pass a vector of neutral elements as an |
1345 | // initial value of the accumulator. We will add the original initial value |
1346 | // later. |
1347 | for (auto redAndOperand : llvm::zip(reductions, forOp.getInits())) { |
1348 | vecIterOperands.push_back(createInitialVector( |
1349 | std::get<0>(redAndOperand).kind, std::get<1>(redAndOperand), state)); |
1350 | } |
1351 | } |
1352 | |
1353 | auto vecForOp = state.builder.create<AffineForOp>( |
1354 | forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(), |
1355 | forOp.getUpperBoundOperands(), forOp.getUpperBoundMap(), newStep, |
1356 | vecIterOperands, |
1357 | /*bodyBuilder=*/[](OpBuilder &, Location, Value, ValueRange) { |
1358 | // Make sure we don't create a default terminator in the loop body as |
1359 | // the proper terminator will be added during vectorization. |
1360 | }); |
1361 | |
1362 | // Register loop-related replacements: |
1363 | // 1) The new vectorized loop is registered as vector replacement of the |
1364 | // scalar loop. |
1365 | // 2) The new iv of the vectorized loop is registered as scalar replacement |
1366 | // since a scalar copy of the iv will prevail in the vectorized loop. |
1367 | // TODO: A vector replacement will also be added in the future when |
1368 | // vectorization of linear ops is supported. |
1369 | // 3) The new 'iter_args' region arguments are registered as vector |
1370 | // replacements since they have been vectorized. |
1371 | // 4) If the loop performs a reduction along the vector dimension, a |
1372 | // `vector.reduction` or similar op is inserted for each resulting value |
1373 | // of the loop and its scalar value replaces the corresponding scalar |
1374 | // result of the loop. |
state.registerOpVectorReplacement(forOp, vecForOp);
state.registerValueScalarReplacement(forOp.getInductionVar(),
vecForOp.getInductionVar());
for (auto iterTuple :
llvm::zip(forOp.getRegionIterArgs(), vecForOp.getRegionIterArgs()))
1380 | state.registerBlockArgVectorReplacement(std::get<0>(iterTuple), |
1381 | std::get<1>(iterTuple)); |
1382 | |
1383 | if (isLoopVecDim) { |
1384 | for (unsigned i = 0; i < vecForOp.getNumIterOperands(); ++i) { |
1385 | // First, we reduce the vector returned from the loop into a scalar. |
1386 | Value reducedRes = |
1387 | getVectorReductionOp(reductions[i].kind, state.builder, |
1388 | vecForOp.getLoc(), vecForOp.getResult(i)); |
1389 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a vector reduction: " |
1390 | << reducedRes); |
1391 | // Then we combine it with the original (scalar) initial value unless it |
1392 | // is equal to the neutral element of the reduction. |
1393 | Value origInit = forOp.getOperand(forOp.getNumControlOperands() + i); |
1394 | Value finalRes = reducedRes; |
1395 | if (!isNeutralElementConst(reductions[i].kind, origInit, state)) |
1396 | finalRes = |
1397 | arith::getReductionOp(reductions[i].kind, state.builder, |
1398 | reducedRes.getLoc(), reducedRes, origInit); |
state.registerLoopResultScalarReplacement(forOp.getResult(i), finalRes);
1400 | } |
1401 | } |
1402 | |
1403 | if (isLoopVecDim) |
1404 | state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second; |
1405 | |
1406 | // Change insertion point so that upcoming vectorized instructions are |
1407 | // inserted into the vectorized loop's body. |
1408 | state.builder.setInsertionPointToStart(vecForOp.getBody()); |
1409 | |
1410 | // If this is a reduction loop then we may need to create a mask to filter out |
1411 | // garbage in the last iteration. |
1412 | if (isLoopVecDim && forOp.getNumIterOperands() > 0) |
1413 | createMask(vecForOp, state); |
1414 | |
1415 | return vecForOp; |
1416 | } |
1417 | |
1418 | /// Vectorizes arbitrary operation by plain widening. We apply generic type |
1419 | /// widening of all its results and retrieve the vector counterparts for all its |
1420 | /// operands. |
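///
/// For example, with a vector<128xf32> strategy and operands already
/// vectorized to %va and %vb (illustrative):
///
///   %r = arith.mulf %a, %b : f32
///   =>
///   %vr = arith.mulf %va, %vb : vector<128xf32>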
1421 | static Operation *widenOp(Operation *op, VectorizationState &state) { |
1422 | SmallVector<Type, 8> vectorTypes; |
1423 | for (Value result : op->getResults()) |
1424 | vectorTypes.push_back( |
1425 | VectorType::get(state.strategy->vectorSizes, result.getType())); |
1426 | |
1427 | SmallVector<Value, 8> vectorOperands; |
1428 | for (Value operand : op->getOperands()) { |
1429 | Value vecOperand = vectorizeOperand(operand, state); |
1430 | if (!vecOperand) { |
1431 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize\n" ); |
1432 | return nullptr; |
1433 | } |
vectorOperands.push_back(vecOperand);
1435 | } |
1436 | |
1437 | // Create a clone of the op with the proper operands and return types. |
1438 | // TODO: The following assumes there is always an op with a fixed |
1439 | // name that works both in scalar mode and vector mode. |
1440 | // TODO: Is it worth considering an Operation.clone operation which |
1441 | // changes the type so we can promote an Operation with less boilerplate? |
1442 | Operation *vecOp = |
1443 | state.builder.create(op->getLoc(), op->getName().getIdentifier(), |
1444 | vectorOperands, vectorTypes, op->getAttrs()); |
state.registerOpVectorReplacement(op, vecOp);
1446 | return vecOp; |
1447 | } |
1448 | |
1449 | /// Vectorizes a yield operation by widening its types. The builder's insertion |
1450 | /// point is set after the vectorized parent op to continue vectorizing the |
1451 | /// operations after the parent op. When vectorizing a reduction loop a mask may |
1452 | /// be used to prevent adding garbage values to the accumulator. |
1453 | static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp, |
1454 | VectorizationState &state) { |
1455 | Operation *newYieldOp = widenOp(yieldOp, state); |
1456 | Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp(); |
1457 | |
1458 | // If there is a mask for this loop then we must prevent garbage values from |
1459 | // being added to the accumulator by inserting `select` operations, for |
1460 | // example: |
1461 | // |
1462 | // %val_masked = select %mask, %val, %neutralCst : vector<128xi1>, |
1463 | // vector<128xf32> |
1464 | // %res = arith.addf %acc, %val_masked : vector<128xf32> |
1465 | // affine.yield %res : vector<128xf32> |
1466 | // |
if (Value mask = state.vecLoopToMask.lookup(newParentOp)) {
1468 | state.builder.setInsertionPoint(newYieldOp); |
1469 | for (unsigned i = 0; i < newYieldOp->getNumOperands(); ++i) { |
1470 | SmallVector<Operation *> combinerOps; |
1471 | Value reducedVal = matchReduction( |
1472 | cast<AffineForOp>(newParentOp).getRegionIterArgs(), i, combinerOps); |
1473 | assert(reducedVal && "expect non-null value for parallel reduction loop" ); |
1474 | assert(combinerOps.size() == 1 && "expect only one combiner op" ); |
1475 | // IterOperands are neutral element vectors. |
1476 | Value neutralVal = cast<AffineForOp>(newParentOp).getInits()[i]; |
1477 | state.builder.setInsertionPoint(combinerOps.back()); |
1478 | Value maskedReducedVal = state.builder.create<arith::SelectOp>( |
1479 | reducedVal.getLoc(), mask, reducedVal, neutralVal); |
1480 | LLVM_DEBUG( |
1481 | dbgs() << "\n[early-vect]+++++ masking an input to a binary op that" |
1482 | "produces value for a yield Op: " |
1483 | << maskedReducedVal); |
1484 | combinerOps.back()->replaceUsesOfWith(from: reducedVal, to: maskedReducedVal); |
1485 | } |
1486 | } |
1487 | |
1488 | state.builder.setInsertionPointAfter(newParentOp); |
1489 | return newYieldOp; |
1490 | } |
1491 | |
1492 | /// Encodes Operation-specific behavior for vectorization. In general we |
1493 | /// assume that all operands of an op must be vectorized but this is not |
1494 | /// always true. In the future, it would be nice to have a trait that |
1495 | /// describes how a particular operation vectorizes. For now we implement the |
1496 | /// case distinction here. Returns a vectorized form of an operation or |
1497 | /// nullptr if vectorization fails. |
1498 | // TODO: consider adding a trait to Op to describe how it gets vectorized. |
1499 | // Maybe some Ops are not vectorizable or require some tricky logic, we cannot |
1500 | // do one-off logic here; ideally it would be TableGen'd. |
1501 | static Operation *vectorizeOneOperation(Operation *op, |
1502 | VectorizationState &state) { |
1503 | // Sanity checks. |
1504 | assert(!isa<vector::TransferReadOp>(op) && |
1505 | "vector.transfer_read cannot be further vectorized" ); |
1506 | assert(!isa<vector::TransferWriteOp>(op) && |
1507 | "vector.transfer_write cannot be further vectorized" ); |
1508 | |
1509 | if (auto loadOp = dyn_cast<AffineLoadOp>(op)) |
1510 | return vectorizeAffineLoad(loadOp, state); |
1511 | if (auto storeOp = dyn_cast<AffineStoreOp>(op)) |
1512 | return vectorizeAffineStore(storeOp, state); |
1513 | if (auto forOp = dyn_cast<AffineForOp>(op)) |
1514 | return vectorizeAffineForOp(forOp, state); |
1515 | if (auto yieldOp = dyn_cast<AffineYieldOp>(op)) |
1516 | return vectorizeAffineYieldOp(yieldOp, state); |
1517 | if (auto constant = dyn_cast<arith::ConstantOp>(op)) |
1518 | return vectorizeConstant(constant, state); |
1519 | if (auto applyOp = dyn_cast<AffineApplyOp>(op)) |
1520 | return vectorizeAffineApplyOp(applyOp, state); |
1521 | |
1522 | // Other ops with regions are not supported. |
1523 | if (op->getNumRegions() != 0) |
1524 | return nullptr; |
1525 | |
1526 | return widenOp(op, state); |
1527 | } |
1528 | |
1529 | /// Recursive implementation to convert all the nested loops in 'match' to a 2D |
1530 | /// vector container that preserves the relative nesting level of each loop with |
1531 | /// respect to the others in 'match'. 'currentLevel' is the nesting level that |
1532 | /// will be assigned to the loop in the current 'match'. |
1533 | static void |
1534 | getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel, |
1535 | std::vector<SmallVector<AffineForOp, 2>> &loops) { |
1536 | // Add a new empty level to the output if it doesn't exist already. |
1537 | assert(currentLevel <= loops.size() && "Unexpected currentLevel" ); |
1538 | if (currentLevel == loops.size()) |
1539 | loops.emplace_back(); |
1540 | |
1541 | // Add current match and recursively visit its children. |
1542 | loops[currentLevel].push_back(cast<AffineForOp>(match.getMatchedOperation())); |
1543 | for (auto childMatch : match.getMatchedChildren()) { |
getMatchedAffineLoopsRec(childMatch, currentLevel + 1, loops);
1545 | } |
1546 | } |
1547 | |
1548 | /// Converts all the nested loops in 'match' to a 2D vector container that |
1549 | /// preserves the relative nesting level of each loop with respect to the others |
1550 | /// in 'match'. This means that every loop in 'loops[i]' will have a parent loop |
1551 | /// in 'loops[i-1]'. A loop in 'loops[i]' may or may not have a child loop in |
1552 | /// 'loops[i+1]'. |
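///
/// For example, a match rooted at %i0 that contains two sibling loops %i1 and
/// %i2 nested under it yields (illustrative):
///
///   loops = {{%i0}, {%i1, %i2}}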
1553 | static void |
1554 | getMatchedAffineLoops(NestedMatch match, |
1555 | std::vector<SmallVector<AffineForOp, 2>> &loops) { |
getMatchedAffineLoopsRec(match, /*currentLevel=*/0, loops);
1557 | } |
1558 | |
1559 | /// Internal implementation to vectorize affine loops from a single loop nest |
1560 | /// using an n-D vectorization strategy. |
1561 | static LogicalResult |
1562 | vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops, |
1563 | const VectorizationStrategy &strategy) { |
1564 | assert(loops[0].size() == 1 && "Expected single root loop" ); |
1565 | AffineForOp rootLoop = loops[0][0]; |
1566 | VectorizationState state(rootLoop.getContext()); |
1567 | state.builder.setInsertionPointAfter(rootLoop); |
1568 | state.strategy = &strategy; |
1569 | |
1570 | // Since patterns are recursive, they can very well intersect. |
1571 | // Since we do not want a fully greedy strategy in general, we decouple |
1572 | // pattern matching, from profitability analysis, from application. |
1573 | // As a consequence we must check that each root pattern is still |
1574 | // vectorizable. If a pattern is not vectorizable anymore, we just skip it. |
1575 | // TODO: implement a non-greedy profitability analysis that keeps only |
1576 | // non-intersecting patterns. |
1577 | if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) { |
1578 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable" ); |
1579 | return failure(); |
1580 | } |
1581 | |
1582 | ////////////////////////////////////////////////////////////////////////////// |
1583 | // Vectorize the scalar loop nest following a topological order. A new vector |
1584 | // loop nest with the vectorized operations is created along the process. If |
1585 | // vectorization succeeds, the scalar loop nest is erased. If vectorization |
1586 | // fails, the vector loop nest is erased and the scalar loop nest is not |
1587 | // modified. |
1588 | ////////////////////////////////////////////////////////////////////////////// |
1589 | |
1590 | auto opVecResult = rootLoop.walk<WalkOrder::PreOrder>([&](Operation *op) { |
1591 | LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op); |
1592 | Operation *vectorOp = vectorizeOneOperation(op, state); |
1593 | if (!vectorOp) { |
1594 | LLVM_DEBUG( |
1595 | dbgs() << "[early-vect]+++++ failed vectorizing the operation: " |
1596 | << *op << "\n" ); |
1597 | return WalkResult::interrupt(); |
1598 | } |
1599 | |
1600 | return WalkResult::advance(); |
1601 | }); |
1602 | |
1603 | if (opVecResult.wasInterrupted()) { |
1604 | LLVM_DEBUG(dbgs() << "[early-vect]+++++ failed vectorization for: " |
1605 | << rootLoop << "\n" ); |
1606 | // Erase vector loop nest if it was created. |
1607 | auto vecRootLoopIt = state.opVectorReplacement.find(rootLoop); |
1608 | if (vecRootLoopIt != state.opVectorReplacement.end()) |
1609 | eraseLoopNest(cast<AffineForOp>(vecRootLoopIt->second)); |
1610 | |
1611 | return failure(); |
1612 | } |
1613 | |
1614 | // Replace results of reduction loops with the scalar values computed using |
1615 | // `vector.reduce` or similar ops. |
1616 | for (auto resPair : state.loopResultScalarReplacement) |
1617 | resPair.first.replaceAllUsesWith(resPair.second); |
1618 | |
1619 | assert(state.opVectorReplacement.count(rootLoop) == 1 && |
1620 | "Expected vector replacement for loop nest" ); |
1621 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern" ); |
1622 | LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorization result:\n" |
1623 | << *state.opVectorReplacement[rootLoop]); |
1624 | |
1625 | // Finish this vectorization pattern. |
state.finishVectorizationPattern(rootLoop);
1627 | return success(); |
1628 | } |
1629 | |
1630 | /// Extracts the matched loops and vectorizes them following a topological |
1631 | /// order. A new vector loop nest will be created if vectorization succeeds. The |
1632 | /// original loop nest won't be modified in any case. |
1633 | static LogicalResult vectorizeRootMatch(NestedMatch m, |
1634 | const VectorizationStrategy &strategy) { |
1635 | std::vector<SmallVector<AffineForOp, 2>> loopsToVectorize; |
getMatchedAffineLoops(m, loopsToVectorize);
return vectorizeLoopNest(loopsToVectorize, strategy);
1638 | } |
1639 | |
1640 | /// Traverses all the loop matches and classifies them into intersection |
1641 | /// buckets. Two matches intersect if any of them encloses the other one. A |
1642 | /// match intersects with a bucket if the match intersects with the root |
1643 | /// (outermost) loop in that bucket. |
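///
/// For example, given matches rooted at %i0, at %i1 (nested inside %i0) and at
/// %k0 (a disjoint nest), the first two land in the same bucket, rooted at
/// %i0, while %k0 starts a bucket of its own (names are illustrative).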
1644 | static void computeIntersectionBuckets( |
1645 | ArrayRef<NestedMatch> matches, |
1646 | std::vector<SmallVector<NestedMatch, 8>> &intersectionBuckets) { |
1647 | assert(intersectionBuckets.empty() && "Expected empty output" ); |
1648 | // Keeps track of the root (outermost) loop of each bucket. |
1649 | SmallVector<AffineForOp, 8> bucketRoots; |
1650 | |
1651 | for (const NestedMatch &match : matches) { |
1652 | AffineForOp matchRoot = cast<AffineForOp>(match.getMatchedOperation()); |
1653 | bool intersects = false; |
1654 | for (int i = 0, end = intersectionBuckets.size(); i < end; ++i) { |
1655 | AffineForOp bucketRoot = bucketRoots[i]; |
1656 | // Add match to the bucket if the bucket root encloses the match root. |
1657 | if (bucketRoot->isAncestor(matchRoot)) { |
intersectionBuckets[i].push_back(match);
1659 | intersects = true; |
1660 | break; |
1661 | } |
1662 | // Add match to the bucket if the match root encloses the bucket root. The |
1663 | // match root becomes the new bucket root. |
1664 | if (matchRoot->isAncestor(bucketRoot)) { |
1665 | bucketRoots[i] = matchRoot; |
intersectionBuckets[i].push_back(match);
1667 | intersects = true; |
1668 | break; |
1669 | } |
1670 | } |
1671 | |
1672 | // Match doesn't intersect with any existing bucket. Create a new bucket for |
1673 | // it. |
1674 | if (!intersects) { |
1675 | bucketRoots.push_back(matchRoot); |
1676 | intersectionBuckets.emplace_back(); |
intersectionBuckets.back().push_back(match);
1678 | } |
1679 | } |
1680 | } |
1681 | |
1682 | /// Internal implementation to vectorize affine loops in 'loops' using the n-D |
1683 | /// vectorization factors in 'vectorSizes'. By default, each vectorization |
1684 | /// factor is applied inner-to-outer to the loops of each loop nest. |
1685 | /// 'fastestVaryingPattern' can be optionally used to provide a different loop |
1686 | /// vectorization order. `reductionLoops` can be provided to specify loops which |
1687 | /// can be vectorized along the reduction dimension. |
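///
/// For example, with `vectorSizes = {32, 256}` and no explicit
/// 'fastestVaryingPattern', 256 is applied to the innermost matched loop and
/// 32 to its parent, producing super-vectors such as vector<32x256xf32> for
/// f32 accesses (an illustrative pairing).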
1688 | static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops, |
1689 | ArrayRef<int64_t> vectorSizes, |
1690 | ArrayRef<int64_t> fastestVaryingPattern, |
1691 | const ReductionLoopMap &reductionLoops) { |
1692 | assert((reductionLoops.empty() || vectorSizes.size() == 1) && |
1693 | "Vectorizing reductions is supported only for 1-D vectors" ); |
1694 | |
1695 | // Compute 1-D, 2-D or 3-D loop pattern to be matched on the target loops. |
1696 | std::optional<NestedPattern> pattern = |
1697 | makePattern(loops, vectorSizes.size(), fastestVaryingPattern); |
1698 | if (!pattern) { |
1699 | LLVM_DEBUG(dbgs() << "\n[early-vect] pattern couldn't be computed\n" ); |
1700 | return; |
1701 | } |
1702 | |
1703 | LLVM_DEBUG(dbgs() << "\n******************************************" ); |
1704 | LLVM_DEBUG(dbgs() << "\n******************************************" ); |
1705 | LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on parent op\n" ); |
1706 | LLVM_DEBUG(dbgs() << *parentOp << "\n" ); |
1707 | |
1708 | unsigned patternDepth = pattern->getDepth(); |
1709 | |
1710 | // Compute all the pattern matches and classify them into buckets of |
1711 | // intersecting matches. |
1712 | SmallVector<NestedMatch, 32> allMatches; |
1713 | pattern->match(parentOp, &allMatches); |
1714 | std::vector<SmallVector<NestedMatch, 8>> intersectionBuckets; |
computeIntersectionBuckets(allMatches, intersectionBuckets);
1716 | |
1717 | // Iterate over all buckets and vectorize the matches eagerly. We can only |
1718 | // vectorize one match from each bucket since all the matches within a bucket |
1719 | // intersect. |
1720 | for (auto &intersectingMatches : intersectionBuckets) { |
1721 | for (NestedMatch &match : intersectingMatches) { |
1722 | VectorizationStrategy strategy; |
1723 | // TODO: depending on profitability, elect to reduce the vector size. |
strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
strategy.reductionLoops = reductionLoops;
if (failed(analyzeProfitability(match.getMatchedChildren(), 1, patternDepth,
&strategy))) {
continue;
}
vectorizeLoopIfProfitable(match.getMatchedOperation(), 0, patternDepth,
&strategy);
1732 | // Vectorize match. Skip the rest of intersecting matches in the bucket if |
1733 | // vectorization succeeded. |
1734 | // TODO: if pattern does not apply, report it; alter the cost/benefit. |
1735 | // TODO: some diagnostics if failure to vectorize occurs. |
if (succeeded(vectorizeRootMatch(match, strategy)))
1737 | break; |
1738 | } |
1739 | } |
1740 | |
1741 | LLVM_DEBUG(dbgs() << "\n" ); |
1742 | } |
1743 | |
1744 | /// Applies vectorization to the current function by searching over a bunch of |
1745 | /// predetermined patterns. |
1746 | void Vectorize::runOnOperation() { |
1747 | func::FuncOp f = getOperation(); |
1748 | if (!fastestVaryingPattern.empty() && |
1749 | fastestVaryingPattern.size() != vectorSizes.size()) { |
1750 | f.emitRemark("Fastest varying pattern specified with different size than " |
1751 | "the vector size." ); |
1752 | return signalPassFailure(); |
1753 | } |
1754 | |
1755 | if (vectorizeReductions && vectorSizes.size() != 1) { |
1756 | f.emitError("Vectorizing reductions is supported only for 1-D vectors." ); |
1757 | return signalPassFailure(); |
1758 | } |
1759 | |
1760 | if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) { |
1761 | f.emitError("Vectorization factor must be greater than zero." ); |
1762 | return signalPassFailure(); |
1763 | } |
1764 | |
1765 | DenseSet<Operation *> parallelLoops; |
1766 | ReductionLoopMap reductionLoops; |
1767 | |
1768 | // If 'vectorize-reduction=true' is provided, we also populate the |
1769 | // `reductionLoops` map. |
1770 | if (vectorizeReductions) { |
1771 | f.walk([¶llelLoops, &reductionLoops](AffineForOp loop) { |
1772 | SmallVector<LoopReduction, 2> reductions; |
1773 | if (isLoopParallel(loop, &reductions)) { |
1774 | parallelLoops.insert(loop); |
1775 | // If it's not a reduction loop, adding it to the map is not necessary. |
1776 | if (!reductions.empty()) |
1777 | reductionLoops[loop] = reductions; |
1778 | } |
1779 | }); |
1780 | } else { |
1781 | f.walk([¶llelLoops](AffineForOp loop) { |
1782 | if (isLoopParallel(loop)) |
1783 | parallelLoops.insert(loop); |
1784 | }); |
1785 | } |
1786 | |
1787 | // Thread-safe RAII local context, BumpPtrAllocator freed on exit. |
1788 | NestedPatternContext mlContext; |
1789 | vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern, |
1790 | reductionLoops); |
1791 | } |
1792 | |
1793 | /// Verify that affine loops in 'loops' meet the nesting criteria expected by |
1794 | /// SuperVectorizer: |
1795 | /// * There must be at least one loop. |
1796 | /// * There must be a single root loop (nesting level 0). |
1797 | /// * Each loop at a given nesting level must be nested in a loop from a |
1798 | /// previous nesting level. |
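///
/// For example, loops = {{%i0}, {%i1, %i2}} passes the check only if both %i1
/// and %i2 are properly nested under %i0 and neither is nested inside the
/// other.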
1799 | static LogicalResult |
1800 | verifyLoopNesting(const std::vector<SmallVector<AffineForOp, 2>> &loops) { |
1801 | // Expected at least one loop. |
1802 | if (loops.empty()) |
1803 | return failure(); |
1804 | |
1805 | // Expected only one root loop. |
1806 | if (loops[0].size() != 1) |
1807 | return failure(); |
1808 | |
1809 | // Traverse loops outer-to-inner to check some invariants. |
1810 | for (int i = 1, end = loops.size(); i < end; ++i) { |
1811 | for (AffineForOp loop : loops[i]) { |
1812 | // Check that each loop at this level is nested in one of the loops from |
1813 | // the previous level. |
1814 | if (none_of(loops[i - 1], [&](AffineForOp maybeParent) { |
1815 | return maybeParent->isProperAncestor(loop); |
1816 | })) |
1817 | return failure(); |
1818 | |
1819 | // Check that each loop at this level is not nested in another loop from |
1820 | // this level. |
1821 | for (AffineForOp sibling : loops[i]) { |
1822 | if (sibling->isProperAncestor(loop)) |
1823 | return failure(); |
1824 | } |
1825 | } |
1826 | } |
1827 | |
1828 | return success(); |
1829 | } |
1830 | |
1831 | |
1832 | /// External utility to vectorize affine loops in 'loops' using the n-D |
1833 | /// vectorization factors in 'vectorSizes'. By default, each vectorization |
1834 | /// factor is applied inner-to-outer to the loops of each loop nest. |
1835 | /// 'fastestVaryingPattern' can be optionally used to provide a different loop |
1836 | /// vectorization order. |
1837 | /// If `reductionLoops` is not empty, the given reduction loops may be |
1838 | /// vectorized along the reduction dimension. |
1839 | /// TODO: Vectorizing reductions is supported only for 1-D vectorization. |
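///
/// A rough usage sketch (the way candidate loops are collected here is
/// illustrative; see the pass driver above for the actual logic):
///
///   DenseSet<Operation *> loops;
///   parentOp->walk([&](AffineForOp forOp) {
///     if (isLoopParallel(forOp))
///       loops.insert(forOp);
///   });
///   vectorizeAffineLoops(parentOp, loops, /*vectorSizes=*/{256},
///                        /*fastestVaryingPattern=*/{},
///                        /*reductionLoops=*/ReductionLoopMap());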
1840 | void mlir::affine::vectorizeAffineLoops( |
1841 | Operation *parentOp, DenseSet<Operation *> &loops, |
1842 | ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern, |
1843 | const ReductionLoopMap &reductionLoops) { |
1844 | // Thread-safe RAII local context, BumpPtrAllocator freed on exit. |
1845 | NestedPatternContext mlContext; |
1846 | vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern, |
1847 | reductionLoops); |
1848 | } |
1849 | |
1850 | /// External utility to vectorize affine loops from a single loop nest using an |
1851 | /// n-D vectorization strategy (see doc in VectorizationStrategy definition). |
1852 | /// Loops are provided in a 2D vector container. The first dimension represents |
1853 | /// the nesting level relative to the loops to be vectorized. The second |
1854 | /// dimension contains the loops. This means that: |
1855 | /// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]', |
1856 | /// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'. |
1857 | /// |
1858 | /// For example, for the following loop nest: |
1859 | /// |
1860 | /// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>, |
1861 | /// %out0: memref<64x128x512xf32>, |
1862 | /// %out1: memref<64x128x128xf32>) { |
1863 | /// affine.for %i0 = 0 to 64 { |
1864 | /// affine.for %i1 = 0 to 128 { |
1865 | /// affine.for %i2 = 0 to 512 { |
1866 | /// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32> |
1867 | /// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32> |
1868 | /// } |
1869 | /// affine.for %i3 = 0 to 128 { |
1870 | /// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32> |
1871 | /// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32> |
1872 | /// } |
1873 | /// } |
1874 | /// } |
1875 | /// return |
1876 | /// } |
1877 | /// |
1878 | /// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two |
1879 | /// innermost loops; |
1880 | /// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost |
1881 | /// loops; |
1882 | /// loops = {{%i2}}, to vectorize only the first innermost loop; |
1883 | /// loops = {{%i3}}, to vectorize only the second innermost loop; |
1884 | /// loops = {{%i1}}, to vectorize only the middle loop. |
1885 | LogicalResult mlir::affine::vectorizeAffineLoopNest( |
1886 | std::vector<SmallVector<AffineForOp, 2>> &loops, |
1887 | const VectorizationStrategy &strategy) { |
1888 | // Thread-safe RAII local context, BumpPtrAllocator freed on exit. |
1889 | NestedPatternContext mlContext; |
if (failed(verifyLoopNesting(loops)))
1891 | return failure(); |
1892 | return vectorizeLoopNest(loops, strategy); |
1893 | } |
1894 | |