1 | /* Loop Vectorization |
2 | Copyright (C) 2003-2023 Free Software Foundation, Inc. |
3 | Contributed by Dorit Naishlos <dorit@il.ibm.com> and |
4 | Ira Rosen <irar@il.ibm.com> |
5 | |
6 | This file is part of GCC. |
7 | |
8 | GCC is free software; you can redistribute it and/or modify it under |
9 | the terms of the GNU General Public License as published by the Free |
10 | Software Foundation; either version 3, or (at your option) any later |
11 | version. |
12 | |
13 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or |
15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
16 | for more details. |
17 | |
18 | You should have received a copy of the GNU General Public License |
19 | along with GCC; see the file COPYING3. If not see |
20 | <http://www.gnu.org/licenses/>. */ |
21 | |
22 | #define INCLUDE_ALGORITHM |
23 | #include "config.h" |
24 | #include "system.h" |
25 | #include "coretypes.h" |
26 | #include "backend.h" |
27 | #include "target.h" |
28 | #include "rtl.h" |
29 | #include "tree.h" |
30 | #include "gimple.h" |
31 | #include "cfghooks.h" |
32 | #include "tree-pass.h" |
33 | #include "ssa.h" |
34 | #include "optabs-tree.h" |
35 | #include "memmodel.h" |
36 | #include "optabs.h" |
37 | #include "diagnostic-core.h" |
38 | #include "fold-const.h" |
39 | #include "stor-layout.h" |
40 | #include "cfganal.h" |
41 | #include "gimplify.h" |
42 | #include "gimple-iterator.h" |
43 | #include "gimplify-me.h" |
44 | #include "tree-ssa-loop-ivopts.h" |
45 | #include "tree-ssa-loop-manip.h" |
46 | #include "tree-ssa-loop-niter.h" |
47 | #include "tree-ssa-loop.h" |
48 | #include "cfgloop.h" |
49 | #include "tree-scalar-evolution.h" |
50 | #include "tree-vectorizer.h" |
51 | #include "gimple-fold.h" |
52 | #include "cgraph.h" |
53 | #include "tree-cfg.h" |
54 | #include "tree-if-conv.h" |
55 | #include "internal-fn.h" |
56 | #include "tree-vector-builder.h" |
57 | #include "vec-perm-indices.h" |
58 | #include "tree-eh.h" |
59 | #include "case-cfn-macros.h" |
60 | #include "langhooks.h" |
61 | |
62 | /* Loop Vectorization Pass. |
63 | |
64 | This pass tries to vectorize loops. |
65 | |
66 | For example, the vectorizer transforms the following simple loop: |
67 | |
68 | short a[N]; short b[N]; short c[N]; int i; |
69 | |
70 | for (i=0; i<N; i++){ |
71 | a[i] = b[i] + c[i]; |
72 | } |
73 | |
74 | as if it had been manually vectorized by rewriting the source code into: |
75 | |
76 | typedef int __attribute__((mode(V8HI))) v8hi; |
77 | short a[N]; short b[N]; short c[N]; int i; |
78 | v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c; |
79 | v8hi va, vb, vc; |
80 | |
81 | for (i=0; i<N/8; i++){ |
82 | vb = pb[i]; |
83 | vc = pc[i]; |
84 | va = vb + vc; |
85 | pa[i] = va; |
86 | } |
87 | |
88 | The main entry to this pass is vectorize_loops(), in which |
89 | the vectorizer applies a set of analyses on a given set of loops, |
90 | followed by the actual vectorization transformation for the loops that |
91 | had successfully passed the analysis phase. |
92 | Throughout this pass we make a distinction between two types of |
93 | data: scalars (which are represented by SSA_NAMES), and memory references |
94 | ("data-refs"). These two types of data require different handling both |
95 | during analysis and transformation. The types of data-refs that the |
96 | vectorizer currently supports are ARRAY_REFS whose base is an array DECL |
97 | (not a pointer), and INDIRECT_REFS through pointers; both array and pointer |
98 | accesses are required to have a simple (consecutive) access pattern. |
99 | |
100 | Analysis phase: |
101 | =============== |
102 | The driver for the analysis phase is vect_analyze_loop(). |
103 | It applies a set of analyses, some of which rely on the scalar evolution |
104 | analyzer (scev) developed by Sebastian Pop. |
105 | |
106 | During the analysis phase the vectorizer records some information |
107 | per stmt in a "stmt_vec_info" struct which is attached to each stmt in the |
108 | loop, as well as general information about the loop as a whole, which is |
109 | recorded in a "loop_vec_info" struct attached to each loop. |
110 | |
111 | Transformation phase: |
112 | ===================== |
113 | The loop transformation phase scans all the stmts in the loop, and |
114 | creates a vector stmt (or a sequence of stmts) for each scalar stmt S in |
115 | the loop that needs to be vectorized. It inserts the vector code sequence |
116 | just before the scalar stmt S, and records a pointer to the vector code |
117 | in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct |
118 | attached to S). This pointer will be used for the vectorization of following |
119 | stmts which use the def of stmt S. Stmt S is removed if it writes to memory; |
120 | otherwise, we rely on dead code elimination for removing it. |
121 | |
122 | For example, say stmt S1 was vectorized into stmt VS1: |
123 | |
124 | VS1: vb = px[i]; |
125 | S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 |
126 | S2: a = b; |
127 | |
128 | To vectorize stmt S2, the vectorizer first finds the stmt that defines |
129 | the operand 'b' (S1), and gets the relevant vector def 'vb' from the |
130 | vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The |
131 | resulting sequence would be: |
132 | |
133 | VS1: vb = px[i]; |
134 | S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 |
135 | VS2: va = vb; |
136 | S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2 |
137 | |
138 | Operands that are not SSA_NAMEs are data-refs that appear in |
139 | load/store operations (like 'x[i]' in S1), and are handled differently. |
140 | |
141 | Target modeling: |
142 | ================= |
143 | Currently the only target specific information that is used is the |
144 | size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". |
145 | Targets that can support different sizes of vectors, for now will need |
146 | to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More |
147 | flexibility will be added in the future. |
148 | |
149 | Since we only vectorize operations whose vector form can be |
150 | expressed using existing tree codes, to verify that an operation is |
151 | supported, the vectorizer checks the relevant optab at the relevant |
152 | machine_mode (e.g., optab_handler (add_optab, V8HImode)). If |
153 | the value found is CODE_FOR_nothing, then there's no target support, and |
154 | we can't vectorize the stmt. |
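
     For example, the kind of query made for the "va = vb + vc" addition
     in the example above is roughly (an illustrative sketch only):

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         ... no target support, so the stmt is not vectorized ...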
155 | |
156 | For additional information on this project see: |
157 | http://gcc.gnu.org/projects/tree-ssa/vectorization.html |
158 | */ |
159 | |
160 | static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *, |
161 | unsigned *); |
162 | static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info, |
163 | bool *, bool *, bool); |
164 | |
165 | /* Subroutine of vect_determine_vf_for_stmt that handles only one |
166 | statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE |
167 | may already be set for general statements (not just data refs). */ |
168 | |
169 | static opt_result |
170 | vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info, |
171 | bool vectype_maybe_set_p, |
172 | poly_uint64 *vf) |
173 | { |
174 | gimple *stmt = stmt_info->stmt; |
175 | |
176 | if ((!STMT_VINFO_RELEVANT_P (stmt_info) |
177 | && !STMT_VINFO_LIVE_P (stmt_info)) |
178 | || gimple_clobber_p (stmt)) |
179 | { |
180 | if (dump_enabled_p ()) |
181 | dump_printf_loc (MSG_NOTE, vect_location, "skip.\n" ); |
182 | return opt_result::success (); |
183 | } |
184 | |
185 | tree stmt_vectype, nunits_vectype; |
186 | opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info, |
187 | &stmt_vectype, |
188 | &nunits_vectype); |
189 | if (!res) |
190 | return res; |
191 | |
192 | if (stmt_vectype) |
193 | { |
194 | if (STMT_VINFO_VECTYPE (stmt_info)) |
195 | /* The only case when a vectype had been already set is for stmts |
196 | that contain a data ref, or for "pattern-stmts" (stmts generated |
197 | by the vectorizer to represent/replace a certain idiom). */ |
198 | gcc_assert ((STMT_VINFO_DATA_REF (stmt_info) |
199 | || vectype_maybe_set_p) |
200 | && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype); |
201 | else |
202 | STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype; |
203 | } |
204 | |
205 | if (nunits_vectype) |
206 | vect_update_max_nunits (vf, nunits_vectype); |
207 | |
208 | return opt_result::success (); |
209 | } |
210 | |
211 | /* Subroutine of vect_determine_vectorization_factor. Set the vector |
212 | types of STMT_INFO and all attached pattern statements and update |
213 | the vectorization factor VF accordingly. Return true on success |
214 | or false if something prevented vectorization. */ |
215 | |
216 | static opt_result |
217 | vect_determine_vf_for_stmt (vec_info *vinfo, |
218 | stmt_vec_info stmt_info, poly_uint64 *vf) |
219 | { |
220 | if (dump_enabled_p ()) |
221 | dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G" , |
222 | stmt_info->stmt); |
223 | opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf); |
224 | if (!res) |
225 | return res; |
226 | |
227 | if (STMT_VINFO_IN_PATTERN_P (stmt_info) |
228 | && STMT_VINFO_RELATED_STMT (stmt_info)) |
229 | { |
230 | gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); |
231 | stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); |
232 | |
233 | /* If a pattern statement has def stmts, analyze them too. */ |
234 | for (gimple_stmt_iterator si = gsi_start (pattern_def_seq); |
235 | !gsi_end_p (si); gsi_next (&si)) |
236 | { |
237 | stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si)); |
238 | if (dump_enabled_p ()) |
239 | dump_printf_loc (MSG_NOTE, vect_location, |
240 | "==> examining pattern def stmt: %G" , |
241 | def_stmt_info->stmt); |
242 | res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf); |
243 | if (!res) |
244 | return res; |
245 | } |
246 | |
247 | if (dump_enabled_p ()) |
248 | dump_printf_loc (MSG_NOTE, vect_location, |
249 | "==> examining pattern statement: %G" , |
250 | stmt_info->stmt); |
251 | res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf); |
252 | if (!res) |
253 | return res; |
254 | } |
255 | |
256 | return opt_result::success (); |
257 | } |
258 | |
259 | /* Function vect_determine_vectorization_factor |
260 | |
261 | Determine the vectorization factor (VF). VF is the number of data elements |
262 | that are operated upon in parallel in a single iteration of the vectorized |
263 | loop. For example, when vectorizing a loop that operates on 4byte elements, |
264 | on a target with vector size (VS) 16byte, the VF is set to 4, since 4 |
265 | elements can fit in a single vector register. |
266 | |
267 | We currently support vectorization of loops in which all types operated upon |
268 | are of the same size. Therefore this function currently sets VF according to |
269 | the size of the types operated upon, and fails if there are multiple sizes |
270 | in the loop. |
271 | |
272 | VF is also the factor by which the loop iterations are strip-mined, e.g.: |
273 | original loop: |
274 | for (i=0; i<N; i++){ |
275 | a[i] = b[i] + c[i]; |
276 | } |
277 | |
278 | vectorized loop: |
279 | for (i=0; i<N; i+=VF){ |
280 | a[i:VF] = b[i:VF] + c[i:VF]; |
281 | } |
282 | */ |
283 | |
284 | static opt_result |
285 | vect_determine_vectorization_factor (loop_vec_info loop_vinfo) |
286 | { |
287 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
288 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
289 | unsigned nbbs = loop->num_nodes; |
290 | poly_uint64 vectorization_factor = 1; |
291 | tree scalar_type = NULL_TREE; |
292 | gphi *phi; |
293 | tree vectype; |
294 | stmt_vec_info stmt_info; |
295 | unsigned i; |
296 | |
297 | DUMP_VECT_SCOPE ("vect_determine_vectorization_factor" ); |
298 | |
299 | for (i = 0; i < nbbs; i++) |
300 | { |
301 | basic_block bb = bbs[i]; |
302 | |
303 | for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); |
304 | gsi_next (&si)) |
305 | { |
306 | phi = si.phi (); |
307 | stmt_info = loop_vinfo->lookup_stmt (phi); |
308 | if (dump_enabled_p ()) |
309 | dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G" , |
310 | (gimple *) phi); |
311 | |
312 | gcc_assert (stmt_info); |
313 | |
314 | if (STMT_VINFO_RELEVANT_P (stmt_info) |
315 | || STMT_VINFO_LIVE_P (stmt_info)) |
316 | { |
317 | gcc_assert (!STMT_VINFO_VECTYPE (stmt_info)); |
318 | scalar_type = TREE_TYPE (PHI_RESULT (phi)); |
319 | |
320 | if (dump_enabled_p ()) |
321 | dump_printf_loc (MSG_NOTE, vect_location, |
322 | "get vectype for scalar type: %T\n" , |
323 | scalar_type); |
324 | |
325 | vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); |
326 | if (!vectype) |
327 | return opt_result::failure_at (phi, |
328 | "not vectorized: unsupported " |
329 | "data-type %T\n" , |
330 | scalar_type); |
331 | STMT_VINFO_VECTYPE (stmt_info) = vectype; |
332 | |
333 | if (dump_enabled_p ()) |
334 | dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n" , |
335 | vectype); |
336 | |
337 | if (dump_enabled_p ()) |
338 | { |
339 | dump_printf_loc (MSG_NOTE, vect_location, "nunits = " ); |
340 | dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype)); |
341 | dump_printf (MSG_NOTE, "\n" ); |
342 | } |
343 | |
344 | vect_update_max_nunits (&vectorization_factor, vectype); |
345 | } |
346 | } |
347 | |
348 | for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); |
349 | gsi_next (&si)) |
350 | { |
351 | if (is_gimple_debug (gsi_stmt (si))) |
352 | continue; |
353 | stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); |
354 | opt_result res |
355 | = vect_determine_vf_for_stmt (loop_vinfo, |
356 | stmt_info, &vectorization_factor); |
357 | if (!res) |
358 | return res; |
359 | } |
360 | } |
361 | |
362 | /* TODO: Analyze cost. Decide if worth while to vectorize. */ |
363 | if (dump_enabled_p ()) |
364 | { |
365 | dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = " ); |
366 | dump_dec (MSG_NOTE, vectorization_factor); |
367 | dump_printf (MSG_NOTE, "\n" ); |
368 | } |
369 | |
370 | if (known_le (vectorization_factor, 1U)) |
371 | return opt_result::failure_at (vect_location, |
372 | "not vectorized: unsupported data-type\n"); |
373 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; |
374 | return opt_result::success (); |
375 | } |
376 | |
377 | |
378 | /* Function vect_is_simple_iv_evolution. |
379 | |
380 | FORNOW: A simple evolution of an induction variable in the loop is |
381 | considered a polynomial evolution. */ |
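/* For example (illustrative), for a counter incremented by 1 in each
   iteration:

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 1;

   the access function computed by scev is the chrec {0, +, 1}_loop, for
   which *INIT is set to 0 and *STEP to 1.  */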
382 | |
383 | static bool |
384 | vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, |
385 | tree * step) |
386 | { |
387 | tree init_expr; |
388 | tree step_expr; |
389 | tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb); |
390 | basic_block bb; |
391 | |
392 | /* When there is no evolution in this loop, the evolution function |
393 | is not "simple". */ |
394 | if (evolution_part == NULL_TREE) |
395 | return false; |
396 | |
397 | /* When the evolution is a polynomial of degree >= 2 |
398 | the evolution function is not "simple". */ |
399 | if (tree_is_chrec (evolution_part)) |
400 | return false; |
401 | |
402 | step_expr = evolution_part; |
403 | init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb)); |
404 | |
405 | if (dump_enabled_p ()) |
406 | dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n" , |
407 | step_expr, init_expr); |
408 | |
409 | *init = init_expr; |
410 | *step = step_expr; |
411 | |
412 | if (TREE_CODE (step_expr) != INTEGER_CST |
413 | && (TREE_CODE (step_expr) != SSA_NAME |
414 | || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr))) |
415 | && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb)) |
416 | || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr)) |
417 | && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)) |
418 | || !flag_associative_math))) |
419 | && (TREE_CODE (step_expr) != REAL_CST |
420 | || !flag_associative_math)) |
421 | { |
422 | if (dump_enabled_p ()) |
423 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
424 | "step unknown.\n" ); |
425 | return false; |
426 | } |
427 | |
428 | return true; |
429 | } |
430 | |
431 | /* Function vect_is_nonlinear_iv_evolution |
432 | |
433 | Nonlinear induction is supported only for integer types, in the forms |
434 | 1. neg |
435 | 2. mul by constant |
436 | 3. lshift/rshift by constant. |
437 | |
438 | For neg induction, return a fake step as integer -1. */ |
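/* For example (illustrative scalar sources only), loops that update x as

     x = -x;          x = x * 3;          x = x >> 1;

   give rise to the vect_step_op_neg, vect_step_op_mul and vect_step_op_shr
   evolutions respectively; for the neg case the recorded step is the
   constant -1, as noted above.  */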
439 | static bool |
440 | vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info, |
441 | gphi* loop_phi_node, tree *init, tree *step) |
442 | { |
443 | tree init_expr, ev_expr, result, op1, op2; |
444 | gimple* def; |
445 | |
446 | if (gimple_phi_num_args (loop_phi_node) != 2) |
447 | return false; |
448 | |
449 | init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop)); |
450 | ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop)); |
451 | |
452 | /* Support nonlinear induction only for integer type. */ |
453 | if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr))) |
454 | return false; |
455 | |
456 | *init = init_expr; |
457 | result = PHI_RESULT (loop_phi_node); |
458 | |
459 | if (TREE_CODE (ev_expr) != SSA_NAME |
460 | || ((def = SSA_NAME_DEF_STMT (ev_expr)), false) |
461 | || !is_gimple_assign (def)) |
462 | return false; |
463 | |
464 | enum tree_code t_code = gimple_assign_rhs_code (def); |
465 | switch (t_code) |
466 | { |
467 | case NEGATE_EXPR: |
468 | if (gimple_assign_rhs1 (def) != result) |
469 | return false; |
470 | *step = build_int_cst (TREE_TYPE (init_expr), -1); |
471 | STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg; |
472 | break; |
473 | |
474 | case RSHIFT_EXPR: |
475 | case LSHIFT_EXPR: |
476 | case MULT_EXPR: |
477 | op1 = gimple_assign_rhs1 (def); |
478 | op2 = gimple_assign_rhs2 (def); |
479 | if (TREE_CODE (op2) != INTEGER_CST |
480 | || op1 != result) |
481 | return false; |
482 | *step = op2; |
483 | if (t_code == LSHIFT_EXPR) |
484 | STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl; |
485 | else if (t_code == RSHIFT_EXPR) |
486 | STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr; |
487 | /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */ |
488 | else |
489 | STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul; |
490 | break; |
491 | |
492 | default: |
493 | return false; |
494 | } |
495 | |
496 | STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init; |
497 | STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step; |
498 | |
499 | return true; |
500 | } |
501 | |
502 | /* Return true if PHI, described by STMT_INFO, is the inner PHI in |
503 | what we are assuming is a double reduction. For example, given |
504 | a structure like this: |
505 | |
506 | outer1: |
507 | x_1 = PHI <x_4(outer2), ...>; |
508 | ... |
509 | |
510 | inner: |
511 | x_2 = PHI <x_1(outer1), ...>; |
512 | ... |
513 | x_3 = ...; |
514 | ... |
515 | |
516 | outer2: |
517 | x_4 = PHI <x_3(inner)>; |
518 | ... |
519 | |
520 | outer loop analysis would treat x_1 as a double reduction phi and |
521 | this function would then return true for x_2. */ |
522 | |
523 | static bool |
524 | vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi) |
525 | { |
526 | use_operand_p use_p; |
527 | ssa_op_iter op_iter; |
528 | FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE) |
529 | if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p))) |
530 | if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def) |
531 | return true; |
532 | return false; |
533 | } |
534 | |
535 | /* Returns true if Phi is a first-order recurrence. A first-order |
536 | recurrence is a non-reduction recurrence relation in which the value of |
537 | the recurrence in the current loop iteration equals a value defined in |
538 | the previous iteration. */ |
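/* For example (an illustrative scalar loop, not taken from this file):

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;
         t = a[i];
       }

   the loop-header PHI for t is a first-order recurrence: the value of t
   used in iteration i is the value that was stored in iteration i-1.  */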
539 | |
540 | static bool |
541 | vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop, |
542 | gphi *phi) |
543 | { |
544 | /* A nested cycle isn't vectorizable as first order recurrence. */ |
545 | if (LOOP_VINFO_LOOP (loop_vinfo) != loop) |
546 | return false; |
547 | |
548 | /* Ensure the loop latch definition is from within the loop. */ |
549 | edge latch = loop_latch_edge (loop); |
550 | tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch); |
551 | if (TREE_CODE (ldef) != SSA_NAME |
552 | || SSA_NAME_IS_DEFAULT_DEF (ldef) |
553 | || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef)) |
554 | || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef)))) |
555 | return false; |
556 | |
557 | tree def = gimple_phi_result (phi); |
558 | |
559 | /* Ensure every use_stmt of the phi node is dominated by the latch |
560 | definition. */ |
561 | imm_use_iterator imm_iter; |
562 | use_operand_p use_p; |
563 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def) |
564 | if (!is_gimple_debug (USE_STMT (use_p)) |
565 | && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p) |
566 | || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef), |
567 | USE_STMT (use_p)))) |
568 | return false; |
569 | |
570 | /* First-order recurrence autovectorization needs shuffle vector. */ |
571 | tree scalar_type = TREE_TYPE (def); |
572 | tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); |
573 | if (!vectype) |
574 | return false; |
575 | |
576 | return true; |
577 | } |
578 | |
579 | /* Function vect_analyze_scalar_cycles_1. |
580 | |
581 | Examine the cross iteration def-use cycles of scalar variables |
582 | in LOOP. LOOP_VINFO represents the loop that is now being |
583 | considered for vectorization (can be LOOP, or an outer-loop |
584 | enclosing LOOP). SLP indicates whether there will be subsequent |
585 | SLP analyses. */ |
586 | |
587 | static void |
588 | vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop, |
589 | bool slp) |
590 | { |
591 | basic_block bb = loop->header; |
592 | tree init, step; |
593 | auto_vec<stmt_vec_info, 64> worklist; |
594 | gphi_iterator gsi; |
595 | bool double_reduc, reduc_chain; |
596 | |
597 | DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles" ); |
598 | |
599 | /* First - identify all inductions. Reduction detection assumes that all the |
600 | inductions have been identified, therefore, this order must not be |
601 | changed. */ |
602 | for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) |
603 | { |
604 | gphi *phi = gsi.phi (); |
605 | tree access_fn = NULL; |
606 | tree def = PHI_RESULT (phi); |
607 | stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi); |
608 | |
609 | if (dump_enabled_p ()) |
610 | dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G" , |
611 | (gimple *) phi); |
612 | |
613 | /* Skip virtual phi's. The data dependences that are associated with |
614 | virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */ |
615 | if (virtual_operand_p (def)) |
616 | continue; |
617 | |
618 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; |
619 | |
620 | /* Analyze the evolution function. */ |
621 | access_fn = analyze_scalar_evolution (loop, def); |
622 | if (access_fn) |
623 | { |
624 | STRIP_NOPS (access_fn); |
625 | if (dump_enabled_p ()) |
626 | dump_printf_loc (MSG_NOTE, vect_location, |
627 | "Access function of PHI: %T\n" , access_fn); |
628 | STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) |
629 | = initial_condition_in_loop_num (access_fn, loop->num); |
630 | STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) |
631 | = evolution_part_in_loop_num (access_fn, loop->num); |
632 | } |
633 | |
634 | if ((!access_fn |
635 | || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) |
636 | || !vect_is_simple_iv_evolution (loop->num, access_fn, |
637 | &init, &step) |
638 | || (LOOP_VINFO_LOOP (loop_vinfo) != loop |
639 | && TREE_CODE (step) != INTEGER_CST)) |
640 | /* Only handle nonlinear iv for same loop. */ |
641 | && (LOOP_VINFO_LOOP (loop_vinfo) != loop |
642 | || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, |
643 | phi, &init, &step))) |
644 | { |
645 | worklist.safe_push (stmt_vinfo); |
646 | continue; |
647 | } |
648 | |
649 | gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo) |
650 | != NULL_TREE); |
651 | gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE); |
652 | |
653 | if (dump_enabled_p ()) |
654 | dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n" ); |
655 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def; |
656 | } |
657 | |
658 | |
659 | /* Second - identify all reductions and nested cycles. */ |
660 | while (worklist.length () > 0) |
661 | { |
662 | stmt_vec_info stmt_vinfo = worklist.pop (); |
663 | gphi *phi = as_a <gphi *> (stmt_vinfo->stmt); |
664 | tree def = PHI_RESULT (phi); |
665 | |
666 | if (dump_enabled_p ()) |
667 | dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G" , |
668 | (gimple *) phi); |
669 | |
670 | gcc_assert (!virtual_operand_p (def) |
671 | && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); |
672 | |
673 | stmt_vec_info reduc_stmt_info |
674 | = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc, |
675 | &reduc_chain, slp); |
676 | if (reduc_stmt_info) |
677 | { |
678 | STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info; |
679 | STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo; |
680 | if (double_reduc) |
681 | { |
682 | if (dump_enabled_p ()) |
683 | dump_printf_loc (MSG_NOTE, vect_location, |
684 | "Detected double reduction.\n" ); |
685 | |
686 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; |
687 | STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def; |
688 | } |
689 | else |
690 | { |
691 | if (loop != LOOP_VINFO_LOOP (loop_vinfo)) |
692 | { |
693 | if (dump_enabled_p ()) |
694 | dump_printf_loc (MSG_NOTE, vect_location, |
695 | "Detected vectorizable nested cycle.\n" ); |
696 | |
697 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; |
698 | } |
699 | else |
700 | { |
701 | if (dump_enabled_p ()) |
702 | dump_printf_loc (MSG_NOTE, vect_location, |
703 | "Detected reduction.\n" ); |
704 | |
705 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; |
706 | STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def; |
707 | /* Store the reduction cycles for possible vectorization in |
708 | loop-aware SLP if it was not detected as reduction |
709 | chain. */ |
710 | if (! reduc_chain) |
711 | LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push |
712 | (reduc_stmt_info); |
713 | } |
714 | } |
715 | } |
716 | else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi)) |
717 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence; |
718 | else |
719 | if (dump_enabled_p ()) |
720 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
721 | "Unknown def-use cycle pattern.\n" ); |
722 | } |
723 | } |
724 | |
725 | |
726 | /* Function vect_analyze_scalar_cycles. |
727 | |
728 | Examine the cross iteration def-use cycles of scalar variables, by |
729 | analyzing the loop-header PHIs of scalar variables. Classify each |
730 | cycle as one of the following: invariant, induction, reduction, unknown. |
731 | We do that for the loop represented by LOOP_VINFO, and also for its |
732 | inner-loop, if it exists. |
733 | Examples for scalar cycles: |
734 | |
735 | Example1: reduction: |
736 | |
737 | loop1: |
738 | for (i=0; i<N; i++) |
739 | sum += a[i]; |
740 | |
741 | Example2: induction: |
742 | |
743 | loop2: |
744 | for (i=0; i<N; i++) |
745 | a[i] = i; */ |
746 | |
747 | static void |
748 | vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp) |
749 | { |
750 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
751 | |
752 | vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp); |
753 | |
754 | /* When vectorizing an outer-loop, the inner-loop is executed sequentially. |
755 | Reductions in such inner-loop therefore have different properties than |
756 | the reductions in the nest that gets vectorized: |
757 | 1. When vectorized, they are executed in the same order as in the original |
758 | scalar loop, so we can't change the order of computation when |
759 | vectorizing them. |
760 | 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the |
761 | current checks are too strict. */ |
762 | |
763 | if (loop->inner) |
764 | vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp); |
765 | } |
766 | |
767 | /* Transfer group and reduction information from STMT_INFO to its |
768 | pattern stmt. */ |
769 | |
770 | static void |
771 | vect_fixup_reduc_chain (stmt_vec_info stmt_info) |
772 | { |
773 | stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info); |
774 | stmt_vec_info stmtp; |
775 | gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp) |
776 | && REDUC_GROUP_FIRST_ELEMENT (stmt_info)); |
777 | REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info); |
778 | do |
779 | { |
780 | stmtp = STMT_VINFO_RELATED_STMT (stmt_info); |
781 | gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp) |
782 | == STMT_VINFO_DEF_TYPE (stmt_info)); |
783 | REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp; |
784 | stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info); |
785 | if (stmt_info) |
786 | REDUC_GROUP_NEXT_ELEMENT (stmtp) |
787 | = STMT_VINFO_RELATED_STMT (stmt_info); |
788 | } |
789 | while (stmt_info); |
790 | } |
791 | |
792 | /* Fixup scalar cycles that now have their stmts detected as patterns. */ |
793 | |
794 | static void |
795 | vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) |
796 | { |
797 | stmt_vec_info first; |
798 | unsigned i; |
799 | |
800 | FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first) |
801 | { |
802 | stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); |
803 | while (next) |
804 | { |
805 | if ((STMT_VINFO_IN_PATTERN_P (next) |
806 | != STMT_VINFO_IN_PATTERN_P (first)) |
807 | || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1) |
808 | break; |
809 | next = REDUC_GROUP_NEXT_ELEMENT (next); |
810 | } |
811 | /* If all reduction chain members are well-formed patterns adjust |
812 | the group to group the pattern stmts instead. */ |
813 | if (! next |
814 | && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1) |
815 | { |
816 | if (STMT_VINFO_IN_PATTERN_P (first)) |
817 | { |
818 | vect_fixup_reduc_chain (first); |
819 | LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] |
820 | = STMT_VINFO_RELATED_STMT (first); |
821 | } |
822 | } |
823 | /* If not all stmt in the chain are patterns or if we failed |
824 | to update STMT_VINFO_REDUC_IDX dissolve the chain and handle |
825 | it as regular reduction instead. */ |
826 | else |
827 | { |
828 | stmt_vec_info vinfo = first; |
829 | stmt_vec_info last = NULL; |
830 | while (vinfo) |
831 | { |
832 | next = REDUC_GROUP_NEXT_ELEMENT (vinfo); |
833 | REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL; |
834 | REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL; |
835 | last = vinfo; |
836 | vinfo = next; |
837 | } |
838 | STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first)) |
839 | = vect_internal_def; |
840 | loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last)); |
841 | LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i); |
842 | --i; |
843 | } |
844 | } |
845 | } |
846 | |
847 | /* Function vect_get_loop_niters. |
848 | |
849 | Determine how many iterations the loop executes and place it |
850 | in NUMBER_OF_ITERATIONS. Place the number of latch iterations |
851 | in NUMBER_OF_ITERATIONSM1. Place the condition under which the |
852 | niter information holds in ASSUMPTIONS. |
853 | |
854 | Return the loop exit conditions. */ |
855 | |
856 | |
857 | static vec<gcond *> |
858 | vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions, |
859 | tree *number_of_iterations, tree *number_of_iterationsm1) |
860 | { |
861 | auto_vec<edge> exits = get_loop_exit_edges (loop); |
862 | vec<gcond *> conds; |
863 | conds.create (exits.length ()); |
864 | class tree_niter_desc niter_desc; |
865 | tree niter_assumptions, niter, may_be_zero; |
866 | |
867 | *assumptions = boolean_true_node; |
868 | *number_of_iterationsm1 = chrec_dont_know; |
869 | *number_of_iterations = chrec_dont_know; |
870 | |
871 | DUMP_VECT_SCOPE ("get_loop_niters" ); |
872 | |
873 | if (exits.is_empty ()) |
874 | return conds; |
875 | |
876 | if (dump_enabled_p ()) |
877 | dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n" , |
878 | exits.length ()); |
879 | |
880 | edge exit; |
881 | unsigned int i; |
882 | FOR_EACH_VEC_ELT (exits, i, exit) |
883 | { |
884 | gcond *cond = get_loop_exit_condition (exit); |
885 | if (cond) |
886 | conds.safe_push (cond); |
887 | |
888 | if (dump_enabled_p ()) |
889 | dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n" , i); |
890 | |
891 | if (exit != main_exit) |
892 | continue; |
893 | |
894 | may_be_zero = NULL_TREE; |
895 | if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) |
896 | || chrec_contains_undetermined (niter_desc.niter)) |
897 | continue; |
898 | |
899 | niter_assumptions = niter_desc.assumptions; |
900 | may_be_zero = niter_desc.may_be_zero; |
901 | niter = niter_desc.niter; |
902 | |
903 | if (may_be_zero && integer_zerop (may_be_zero)) |
904 | may_be_zero = NULL_TREE; |
905 | |
906 | if (may_be_zero) |
907 | { |
908 | if (COMPARISON_CLASS_P (may_be_zero)) |
909 | { |
910 | /* Try to combine may_be_zero with assumptions, this can simplify |
911 | computation of niter expression. */ |
912 | if (niter_assumptions && !integer_nonzerop (niter_assumptions)) |
913 | niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, |
914 | niter_assumptions, |
915 | fold_build1 (TRUTH_NOT_EXPR, |
916 | boolean_type_node, |
917 | may_be_zero)); |
918 | else |
919 | niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero, |
920 | build_int_cst (TREE_TYPE (niter), 0), |
921 | rewrite_to_non_trapping_overflow (niter)); |
922 | |
923 | may_be_zero = NULL_TREE; |
924 | } |
925 | else if (integer_nonzerop (may_be_zero)) |
926 | { |
927 | *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0); |
928 | *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1); |
929 | continue; |
930 | } |
931 | else |
932 | continue; |
933 | } |
934 | |
935 | /* Loop assumptions are based off the normal exit. */ |
936 | *assumptions = niter_assumptions; |
937 | *number_of_iterationsm1 = niter; |
938 | |
939 | /* We want the number of loop header executions which is the number |
940 | of latch executions plus one. |
941 | ??? For UINT_MAX latch executions this number overflows to zero |
942 | for loops like do { n++; } while (n != 0); */ |
943 | if (niter && !chrec_contains_undetermined (niter)) |
944 | niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), |
945 | unshare_expr (niter), |
946 | build_int_cst (TREE_TYPE (niter), 1)); |
947 | *number_of_iterations = niter; |
948 | } |
949 | |
950 | if (dump_enabled_p ()) |
951 | dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n" ); |
952 | |
953 | return conds; |
954 | } |
955 | |
956 | /* Determine the main loop exit for the vectorizer. */ |
957 | |
958 | edge |
959 | vec_init_loop_exit_info (class loop *loop) |
960 | { |
961 | /* Before we begin we must first determine which exit is the main one and |
962 | which are auxiliary exits. */ |
963 | auto_vec<edge> exits = get_loop_exit_edges (loop); |
964 | if (exits.length () == 1) |
965 | return exits[0]; |
966 | |
967 | /* If we have multiple exits we only support counting IVs at the moment. |
968 | Analyze all exits and return one. */ |
969 | class tree_niter_desc niter_desc; |
970 | edge candidate = NULL; |
971 | for (edge exit : exits) |
972 | { |
973 | if (!get_loop_exit_condition (exit)) |
974 | continue; |
975 | |
976 | if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) |
977 | && !chrec_contains_undetermined (niter_desc.niter)) |
978 | { |
979 | if (!niter_desc.may_be_zero || !candidate) |
980 | candidate = exit; |
981 | } |
982 | } |
983 | |
984 | return candidate; |
985 | } |
986 | |
987 | /* Function bb_in_loop_p |
988 | |
989 | Used as predicate for dfs order traversal of the loop bbs. */ |
990 | |
991 | static bool |
992 | bb_in_loop_p (const_basic_block bb, const void *data) |
993 | { |
994 | const class loop *const loop = (const class loop *)data; |
995 | if (flow_bb_inside_loop_p (loop, bb)) |
996 | return true; |
997 | return false; |
998 | } |
999 | |
1000 | |
1001 | /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as |
1002 | stmt_vec_info structs for all the stmts in LOOP_IN. */ |
1003 | |
1004 | _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) |
1005 | : vec_info (vec_info::loop, shared), |
1006 | loop (loop_in), |
1007 | bbs (XCNEWVEC (basic_block, loop->num_nodes)), |
1008 | num_itersm1 (NULL_TREE), |
1009 | num_iters (NULL_TREE), |
1010 | num_iters_unchanged (NULL_TREE), |
1011 | num_iters_assumptions (NULL_TREE), |
1012 | vector_costs (nullptr), |
1013 | scalar_costs (nullptr), |
1014 | th (0), |
1015 | versioning_threshold (0), |
1016 | vectorization_factor (0), |
1017 | main_loop_edge (nullptr), |
1018 | skip_main_loop_edge (nullptr), |
1019 | skip_this_loop_edge (nullptr), |
1020 | reusable_accumulators (), |
1021 | suggested_unroll_factor (1), |
1022 | max_vectorization_factor (0), |
1023 | mask_skip_niters (NULL_TREE), |
1024 | rgroup_compare_type (NULL_TREE), |
1025 | simd_if_cond (NULL_TREE), |
1026 | partial_vector_style (vect_partial_vectors_none), |
1027 | unaligned_dr (NULL), |
1028 | peeling_for_alignment (0), |
1029 | ptr_mask (0), |
1030 | ivexpr_map (NULL), |
1031 | scan_map (NULL), |
1032 | slp_unrolling_factor (1), |
1033 | inner_loop_cost_factor (param_vect_inner_loop_cost_factor), |
1034 | vectorizable (false), |
1035 | can_use_partial_vectors_p (param_vect_partial_vector_usage != 0), |
1036 | using_partial_vectors_p (false), |
1037 | using_decrementing_iv_p (false), |
1038 | using_select_vl_p (false), |
1039 | epil_using_partial_vectors_p (false), |
1040 | partial_load_store_bias (0), |
1041 | peeling_for_gaps (false), |
1042 | peeling_for_niter (false), |
1043 | no_data_dependencies (false), |
1044 | has_mask_store (false), |
1045 | scalar_loop_scaling (profile_probability::uninitialized ()), |
1046 | scalar_loop (NULL), |
1047 | orig_loop_info (NULL), |
1048 | vec_loop_iv_exit (NULL), |
1049 | vec_epilogue_loop_iv_exit (NULL), |
1050 | scalar_loop_iv_exit (NULL) |
1051 | { |
1052 | /* CHECKME: We want to visit all BBs before their successors (except for |
1053 | latch blocks, for which this assertion wouldn't hold). In the simple |
1054 | case of the loop forms we allow, a dfs order of the BBs would be the same |
1055 | as reversed postorder traversal, so we are safe. */ |
1056 | |
1057 | unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, |
1058 | bbs, loop->num_nodes, loop); |
1059 | gcc_assert (nbbs == loop->num_nodes); |
1060 | |
1061 | for (unsigned int i = 0; i < nbbs; i++) |
1062 | { |
1063 | basic_block bb = bbs[i]; |
1064 | gimple_stmt_iterator si; |
1065 | |
1066 | for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) |
1067 | { |
1068 | gimple *phi = gsi_stmt (si); |
1069 | gimple_set_uid (phi, 0); |
1070 | add_stmt (phi); |
1071 | } |
1072 | |
1073 | for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) |
1074 | { |
1075 | gimple *stmt = gsi_stmt (si); |
1076 | gimple_set_uid (stmt, 0); |
1077 | if (is_gimple_debug (stmt)) |
1078 | continue; |
1079 | add_stmt (stmt); |
1080 | /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the |
1081 | third argument is the #pragma omp simd if (x) condition, when 0, |
1082 | loop shouldn't be vectorized, when non-zero constant, it should |
1083 | be vectorized normally, otherwise versioned with vectorized loop |
1084 | done if the condition is non-zero at runtime. */ |
1085 | if (loop_in->simduid |
1086 | && is_gimple_call (stmt) |
1087 | && gimple_call_internal_p (stmt) |
1088 | && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE |
1089 | && gimple_call_num_args (stmt) >= 3 |
1090 | && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME |
1091 | && (loop_in->simduid |
1092 | == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))) |
1093 | { |
1094 | tree arg = gimple_call_arg (stmt, 2); |
1095 | if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME) |
1096 | simd_if_cond = arg; |
1097 | else |
1098 | gcc_assert (integer_nonzerop (arg)); |
1099 | } |
1100 | } |
1101 | } |
1102 | |
1103 | epilogue_vinfos.create (6); |
1104 | } |
1105 | |
1106 | /* Free all levels of rgroup CONTROLS. */ |
1107 | |
1108 | void |
1109 | release_vec_loop_controls (vec<rgroup_controls> *controls) |
1110 | { |
1111 | rgroup_controls *rgc; |
1112 | unsigned int i; |
1113 | FOR_EACH_VEC_ELT (*controls, i, rgc) |
1114 | rgc->controls.release (); |
1115 | controls->release (); |
1116 | } |
1117 | |
1118 | /* Free all memory used by the _loop_vec_info, as well as all the |
1119 | stmt_vec_info structs of all the stmts in the loop. */ |
1120 | |
1121 | _loop_vec_info::~_loop_vec_info () |
1122 | { |
1123 | free (bbs); |
1124 | |
1125 | release_vec_loop_controls (&masks.rgc_vec); |
1126 | release_vec_loop_controls (&lens); |
1127 | delete ivexpr_map; |
1128 | delete scan_map; |
1129 | epilogue_vinfos.release (); |
1130 | delete scalar_costs; |
1131 | delete vector_costs; |
1132 | |
1133 | /* When we release an epilogue vinfo that we do not intend to use |
1134 | avoid clearing AUX of the main loop which should continue to |
1135 | point to the main loop vinfo since otherwise we'll leak that. */ |
1136 | if (loop->aux == this) |
1137 | loop->aux = NULL; |
1138 | } |
1139 | |
1140 | /* Return an invariant or register for EXPR and emit necessary |
1141 | computations in the LOOP_VINFO loop preheader. */ |
1142 | |
1143 | tree |
1144 | cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr) |
1145 | { |
1146 | if (is_gimple_reg (expr) |
1147 | || is_gimple_min_invariant (expr)) |
1148 | return expr; |
1149 | |
1150 | if (! loop_vinfo->ivexpr_map) |
1151 | loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>; |
1152 | tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr); |
1153 | if (! cached) |
1154 | { |
1155 | gimple_seq stmts = NULL; |
1156 | cached = force_gimple_operand (unshare_expr (expr), |
1157 | &stmts, true, NULL_TREE); |
1158 | if (stmts) |
1159 | { |
1160 | edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); |
1161 | gsi_insert_seq_on_edge_immediate (e, stmts); |
1162 | } |
1163 | } |
1164 | return cached; |
1165 | } |
1166 | |
1167 | /* Return true if we can use CMP_TYPE as the comparison type to produce |
1168 | all masks required to mask LOOP_VINFO. */ |
1169 | |
1170 | static bool |
1171 | can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) |
1172 | { |
1173 | rgroup_controls *rgm; |
1174 | unsigned int i; |
1175 | FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm) |
1176 | if (rgm->type != NULL_TREE |
1177 | && !direct_internal_fn_supported_p (IFN_WHILE_ULT, |
1178 | cmp_type, rgm->type, |
1179 | OPTIMIZE_FOR_SPEED)) |
1180 | return false; |
1181 | return true; |
1182 | } |
1183 | |
1184 | /* Calculate the maximum number of scalars per iteration for every |
1185 | rgroup in LOOP_VINFO. */ |
1186 | |
1187 | static unsigned int |
1188 | vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo) |
1189 | { |
1190 | unsigned int res = 1; |
1191 | unsigned int i; |
1192 | rgroup_controls *rgm; |
1193 | FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm) |
1194 | res = MAX (res, rgm->max_nscalars_per_iter); |
1195 | return res; |
1196 | } |
1197 | |
1198 | /* Calculate the minimum precision necessary to represent: |
1199 | |
1200 | MAX_NITERS * FACTOR |
1201 | |
1202 | as an unsigned integer, where MAX_NITERS is the maximum number of |
1203 | loop header iterations for the original scalar form of LOOP_VINFO. */ |
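/* For example (illustrative numbers), if MAX_NITERS is known to be at
   most 1000 and FACTOR is 4, the product 4000 fits in 12 bits, so the
   function would return 12.  */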
1204 | |
1205 | static unsigned |
1206 | vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor) |
1207 | { |
1208 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1209 | |
1210 | /* Get the maximum number of iterations that is representable |
1211 | in the counter type. */ |
1212 | tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo)); |
1213 | widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1; |
1214 | |
1215 | /* Get a more refined estimate for the number of iterations. */ |
1216 | widest_int max_back_edges; |
1217 | if (max_loop_iterations (loop, &max_back_edges)) |
1218 | max_ni = wi::smin (max_ni, max_back_edges + 1); |
1219 | |
1220 | /* Work out how many bits we need to represent the limit. */ |
1221 | return wi::min_precision (max_ni * factor, UNSIGNED); |
1222 | } |
1223 | |
1224 | /* True if the loop needs peeling or partial vectors when vectorized. */ |
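/* For example (illustrative numbers), a loop with 100 scalar iterations
   vectorized with VF 16 covers 96 iterations in six full vector
   iterations and needs peeling or partial vectors for the remaining
   four.  */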
1225 | |
1226 | static bool |
1227 | vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo) |
1228 | { |
1229 | unsigned HOST_WIDE_INT const_vf; |
1230 | HOST_WIDE_INT max_niter |
1231 | = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); |
1232 | |
1233 | unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); |
1234 | if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) |
1235 | th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO |
1236 | (loop_vinfo)); |
1237 | |
1238 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
1239 | && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) |
1240 | { |
1241 | /* Work out the (constant) number of iterations that need to be |
1242 | peeled for reasons other than niters. */ |
1243 | unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
1244 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) |
1245 | peel_niter += 1; |
1246 | if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, |
1247 | LOOP_VINFO_VECT_FACTOR (loop_vinfo))) |
1248 | return true; |
1249 | } |
1250 | else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) |
1251 | /* ??? When peeling for gaps but not alignment, we could |
1252 | try to check whether the (variable) niters is known to be |
1253 | VF * N + 1. That's something of a niche case though. */ |
1254 | || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) |
1255 | || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) |
1256 | || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) |
1257 | < (unsigned) exact_log2 (const_vf)) |
1258 | /* In case of versioning, check if the maximum number of |
1259 | iterations is greater than th. If they are identical, |
1260 | the epilogue is unnecessary. */ |
1261 | && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) |
1262 | || ((unsigned HOST_WIDE_INT) max_niter |
1263 | > (th / const_vf) * const_vf)))) |
1264 | return true; |
1265 | |
1266 | return false; |
1267 | } |
1268 | |
1269 | /* Each statement in LOOP_VINFO can be masked where necessary. Check |
1270 | whether we can actually generate the masks required. Return true if so, |
1271 | storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */ |
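/* For example (illustrative), with a single rgroup the mask used by a
   vector iteration starting at scalar iteration I is conceptually

     mask = .WHILE_ULT (I, NITERS)

   where lane J of the mask is active iff I + J < NITERS, so the final,
   partial iteration simply has fewer active lanes.  */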
1272 | |
1273 | static bool |
1274 | vect_verify_full_masking (loop_vec_info loop_vinfo) |
1275 | { |
1276 | unsigned int min_ni_width; |
1277 | |
1278 | /* Use a normal loop if there are no statements that need masking. |
1279 | This only happens in rare degenerate cases: it means that the loop |
1280 | has no loads, no stores, and no live-out values. */ |
1281 | if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) |
1282 | return false; |
1283 | |
1284 | /* Produce the rgroup controls. */ |
1285 | for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set) |
1286 | { |
1287 | vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); |
1288 | tree vectype = mask.first; |
1289 | unsigned nvectors = mask.second; |
1290 | |
1291 | if (masks->rgc_vec.length () < nvectors) |
1292 | masks->rgc_vec.safe_grow_cleared (nvectors, true); |
1293 | rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1]; |
1294 | /* The number of scalars per iteration and the number of vectors are |
1295 | both compile-time constants. */ |
1296 | unsigned int nscalars_per_iter |
1297 | = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), |
1298 | LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); |
1299 | |
1300 | if (rgm->max_nscalars_per_iter < nscalars_per_iter) |
1301 | { |
1302 | rgm->max_nscalars_per_iter = nscalars_per_iter; |
1303 | rgm->type = truth_type_for (vectype); |
1304 | rgm->factor = 1; |
1305 | } |
1306 | } |
1307 | |
1308 | unsigned int max_nscalars_per_iter |
1309 | = vect_get_max_nscalars_per_iter (loop_vinfo); |
1310 | |
1311 | /* Work out how many bits we need to represent the limit. */ |
1312 | min_ni_width |
1313 | = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter); |
1314 | |
1315 | /* Find a scalar mode for which WHILE_ULT is supported. */ |
1316 | opt_scalar_int_mode cmp_mode_iter; |
1317 | tree cmp_type = NULL_TREE; |
1318 | tree iv_type = NULL_TREE; |
1319 | widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo); |
1320 | unsigned int iv_precision = UINT_MAX; |
1321 | |
1322 | if (iv_limit != -1) |
1323 | iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter, |
1324 | UNSIGNED); |
1325 | |
1326 | FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) |
1327 | { |
1328 | unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); |
1329 | if (cmp_bits >= min_ni_width |
1330 | && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) |
1331 | { |
1332 | tree this_type = build_nonstandard_integer_type (cmp_bits, true); |
1333 | if (this_type |
1334 | && can_produce_all_loop_masks_p (loop_vinfo, this_type)) |
1335 | { |
1336 | /* Although we could stop as soon as we find a valid mode, |
1337 | there are at least two reasons why that's not always the |
1338 | best choice: |
1339 | |
1340 | - An IV that's Pmode or wider is more likely to be reusable |
1341 | in address calculations than an IV that's narrower than |
1342 | Pmode. |
1343 | |
1344 | - Doing the comparison in IV_PRECISION or wider allows |
1345 | a natural 0-based IV, whereas using a narrower comparison |
1346 | type requires mitigations against wrap-around. |
1347 | |
1348 | Conversely, if the IV limit is variable, doing the comparison |
1349 | in a wider type than the original type can introduce |
1350 | unnecessary extensions, so picking the widest valid mode |
1351 | is not always a good choice either. |
1352 | |
1353 | Here we prefer the first IV type that's Pmode or wider, |
1354 | and the first comparison type that's IV_PRECISION or wider. |
1355 | (The comparison type must be no wider than the IV type, |
1356 | to avoid extensions in the vector loop.) |
1357 | |
1358 | ??? We might want to try continuing beyond Pmode for ILP32 |
1359 | targets if CMP_BITS < IV_PRECISION. */ |
1360 | iv_type = this_type; |
1361 | if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type)) |
1362 | cmp_type = this_type; |
1363 | if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) |
1364 | break; |
1365 | } |
1366 | } |
1367 | } |
1368 | |
1369 | if (!cmp_type) |
1370 | { |
1371 | LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release (); |
1372 | return false; |
1373 | } |
1374 | |
1375 | LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type; |
1376 | LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type; |
1377 | LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult; |
1378 | return true; |
1379 | } |
1380 | |
1381 | /* Each statement in LOOP_VINFO can be masked where necessary. Check |
1382 | whether we can actually generate AVX512 style masks. Return true if so, |
1383 | storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */ |
1384 | |
1385 | static bool |
1386 | vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo) |
1387 | { |
1388 | /* Produce a differently organized rgc_vec and check in a different |
1389 | way whether we can produce the masks. |
1390 | |
1391 | /* Use a normal loop if there are no statements that need masking. |
1392 | This only happens in rare degenerate cases: it means that the loop |
1393 | has no loads, no stores, and no live-out values. */ |
1394 | if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) |
1395 | return false; |
1396 | |
1397 | /* For the decrementing IV we need to represent all values in |
1398 | [0, niter + niter_skip] where niter_skip is the elements we |
1399 | skip in the first iteration for prologue peeling. */ |
1400 | tree iv_type = NULL_TREE; |
1401 | widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo); |
1402 | unsigned int iv_precision = UINT_MAX; |
1403 | if (iv_limit != -1) |
1404 | iv_precision = wi::min_precision (iv_limit, UNSIGNED); |
1405 | |
1406 | /* First compute the type for the IV we use to track the remaining |
1407 | scalar iterations. */ |
1408 | opt_scalar_int_mode cmp_mode_iter; |
1409 | FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) |
1410 | { |
1411 | unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); |
1412 | if (cmp_bits >= iv_precision |
1413 | && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) |
1414 | { |
1415 | iv_type = build_nonstandard_integer_type (cmp_bits, true); |
1416 | if (iv_type) |
1417 | break; |
1418 | } |
1419 | } |
1420 | if (!iv_type) |
1421 | return false; |
1422 | |
1423 | /* Produce the rgroup controls. */ |
1424 | for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set) |
1425 | { |
1426 | vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); |
1427 | tree vectype = mask.first; |
1428 | unsigned nvectors = mask.second; |
1429 | |
1430 | /* The number of scalars per iteration and the number of vectors are |
1431 | both compile-time constants. */ |
1432 | unsigned int nscalars_per_iter |
1433 | = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), |
1434 | LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); |
1435 | |
1436 | /* We index the rgroup_controls vector with nscalars_per_iter |
1437 | which we keep constant and instead have a varying nvectors, |
1438 | remembering the vector mask with the fewest nV. */ |
1439 | if (masks->rgc_vec.length () < nscalars_per_iter) |
1440 | masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true); |
1441 | rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1]; |
1442 | |
1443 | if (!rgm->type || rgm->factor > nvectors) |
1444 | { |
1445 | rgm->type = truth_type_for (vectype); |
1446 | rgm->compare_type = NULL_TREE; |
1447 | rgm->max_nscalars_per_iter = nscalars_per_iter; |
1448 | rgm->factor = nvectors; |
1449 | rgm->bias_adjusted_ctrl = NULL_TREE; |
1450 | } |
1451 | } |
1452 | |
1453 | /* There is no fixed compare type we are going to use but we have to |
1454 | be able to get at one for each mask group. */ |
1455 | unsigned int min_ni_width |
1456 | = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED); |
1457 | |
1458 | bool ok = true; |
1459 | for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec) |
1460 | { |
1461 | tree mask_type = rgc.type; |
1462 | if (!mask_type) |
1463 | continue; |
1464 | |
1465 | /* For now vect_get_loop_mask only supports integer mode masks |
1466 | when we need to split it. */ |
1467 | if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT |
1468 | || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1) |
1469 | { |
1470 | ok = false; |
1471 | break; |
1472 | } |
1473 | |
1474 | /* If iv_type is usable as compare type use that - we can elide the |
1475 | saturation in that case. */ |
1476 | if (TYPE_PRECISION (iv_type) >= min_ni_width) |
1477 | { |
1478 | tree cmp_vectype |
1479 | = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type)); |
1480 | if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR)) |
1481 | rgc.compare_type = cmp_vectype; |
1482 | } |
1483 | if (!rgc.compare_type) |
1484 | FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) |
1485 | { |
1486 | unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); |
1487 | if (cmp_bits >= min_ni_width |
1488 | && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) |
1489 | { |
1490 | tree cmp_type = build_nonstandard_integer_type (cmp_bits, true); |
1491 | if (!cmp_type) |
1492 | continue; |
1493 | |
1494 | /* Check whether we can produce the mask with cmp_type. */ |
1495 | tree cmp_vectype |
= build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1497 | if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR)) |
1498 | { |
1499 | rgc.compare_type = cmp_vectype; |
1500 | break; |
1501 | } |
1502 | } |
1503 | } |
1504 | if (!rgc.compare_type) |
1505 | { |
1506 | ok = false; |
1507 | break; |
1508 | } |
1509 | } |
1510 | if (!ok) |
1511 | { |
release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1513 | return false; |
1514 | } |
1515 | |
1516 | LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node; |
1517 | LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type; |
1518 | LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512; |
1519 | return true; |
1520 | } |
1521 | |
/* Check whether we can use vector access with length based on precision
comparison.  So far, to keep it simple, we only allow the case that the
precision of the target-supported length is larger than the precision
required by the loop niters.  */
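/* As a rough illustration (not the actual IL we generate), length-based
partial vectors conceptually turn

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   with VF = 4 into

     for (i = 0; i < n; i += 4)
       {
         len = MIN (n - i, 4);
         va = .LEN_LOAD (&b[i], align, len, bias)
              + .LEN_LOAD (&c[i], align, len, bias);
         .LEN_STORE (&a[i], align, len, bias, va);
       }

   so the final iteration can process fewer than VF elements.  */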
1526 | |
1527 | static bool |
1528 | vect_verify_loop_lens (loop_vec_info loop_vinfo) |
1529 | { |
1530 | if (LOOP_VINFO_LENS (loop_vinfo).is_empty ()) |
1531 | return false; |
1532 | |
1533 | machine_mode len_load_mode, len_store_mode; |
1534 | if (!get_len_load_store_mode (loop_vinfo->vector_mode, true) |
.exists (&len_load_mode))
1536 | return false; |
1537 | if (!get_len_load_store_mode (loop_vinfo->vector_mode, false) |
.exists (&len_store_mode))
1539 | return false; |
1540 | |
1541 | signed char partial_load_bias = internal_len_load_store_bias |
(IFN_LEN_LOAD, len_load_mode);
1543 | |
1544 | signed char partial_store_bias = internal_len_load_store_bias |
(IFN_LEN_STORE, len_store_mode);
1546 | |
1547 | gcc_assert (partial_load_bias == partial_store_bias); |
1548 | |
1549 | if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED) |
1550 | return false; |
1551 | |
1552 | /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit |
1553 | len_loads with a length of zero. In order to avoid that we prohibit |
1554 | more than one loop length here. */ |
1555 | if (partial_load_bias == -1 |
1556 | && LOOP_VINFO_LENS (loop_vinfo).length () > 1) |
1557 | return false; |
1558 | |
1559 | LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias; |
1560 | |
1561 | unsigned int max_nitems_per_iter = 1; |
1562 | unsigned int i; |
1563 | rgroup_controls *rgl; |
1564 | /* Find the maximum number of items per iteration for every rgroup. */ |
1565 | FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl) |
1566 | { |
1567 | unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor; |
1568 | max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter); |
1569 | } |
1570 | |
1571 | /* Work out how many bits we need to represent the length limit. */ |
1572 | unsigned int min_ni_prec |
= vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1574 | |
/* Now use the maximum of the precisions below to pick one suitable IV type:
- the IV's natural precision
- the precision needed to hold the maximum number of scalar
iterations multiplied by the scale factor (min_ni_prec above)
- the Pmode precision

If min_ni_prec is less than the precision of the current niters,
we prefer to still use the niters type.  Prefer to use Pmode and
a wider IV to avoid narrow conversions.  */
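/* Worked example (hypothetical 64-bit target): with max_nitems_per_iter = 2
and at most 1000 scalar iterations, min_ni_prec starts at 11 bits (2000
items); a 32-bit niters type raises it to 32 and a 64-bit Pmode raises it
to 64, so a 64-bit IV type is picked below.  */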
1584 | |
1585 | unsigned int ni_prec |
1586 | = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo))); |
1587 | min_ni_prec = MAX (min_ni_prec, ni_prec); |
1588 | min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode)); |
1589 | |
1590 | tree iv_type = NULL_TREE; |
1591 | opt_scalar_int_mode tmode_iter; |
1592 | FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT) |
1593 | { |
1594 | scalar_mode tmode = tmode_iter.require (); |
unsigned int tbits = GET_MODE_BITSIZE (tmode);
1596 | |
1597 | /* ??? Do we really want to construct one IV whose precision exceeds |
1598 | BITS_PER_WORD? */ |
1599 | if (tbits > BITS_PER_WORD) |
1600 | break; |
1601 | |
1602 | /* Find the first available standard integral type. */ |
1603 | if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode)) |
1604 | { |
1605 | iv_type = build_nonstandard_integer_type (tbits, true); |
1606 | break; |
1607 | } |
1608 | } |
1609 | |
1610 | if (!iv_type) |
1611 | { |
1612 | if (dump_enabled_p ()) |
1613 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1614 | "can't vectorize with length-based partial vectors" |
1615 | " because there is no suitable iv type.\n" ); |
1616 | return false; |
1617 | } |
1618 | |
1619 | LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type; |
1620 | LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type; |
1621 | LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len; |
1622 | |
1623 | return true; |
1624 | } |
1625 | |
1626 | /* Calculate the cost of one scalar iteration of the loop. */ |
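/* For instance, a scalar body consisting of a load, an add and a store
contributes one scalar_load, one scalar_stmt and one scalar_store per
iteration; statements in an inner loop are additionally scaled by
LOOP_VINFO_INNER_LOOP_COST_FACTOR.  */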
1627 | static void |
1628 | vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) |
1629 | { |
1630 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1631 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
1632 | int nbbs = loop->num_nodes, factor; |
1633 | int innerloop_iters, i; |
1634 | |
1635 | DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost" ); |
1636 | |
1637 | /* Gather costs for statements in the scalar loop. */ |
1638 | |
1639 | /* FORNOW. */ |
1640 | innerloop_iters = 1; |
1641 | if (loop->inner) |
1642 | innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); |
1643 | |
1644 | for (i = 0; i < nbbs; i++) |
1645 | { |
1646 | gimple_stmt_iterator si; |
1647 | basic_block bb = bbs[i]; |
1648 | |
1649 | if (bb->loop_father == loop->inner) |
1650 | factor = innerloop_iters; |
1651 | else |
1652 | factor = 1; |
1653 | |
for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
{
gimple *stmt = gsi_stmt (si);
stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1660 | continue; |
1661 | |
1662 | /* Skip stmts that are not vectorized inside the loop. */ |
1663 | stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info); |
1664 | if (!STMT_VINFO_RELEVANT_P (vstmt_info) |
1665 | && (!STMT_VINFO_LIVE_P (vstmt_info) |
1666 | || !VECTORIZABLE_CYCLE_DEF |
1667 | (STMT_VINFO_DEF_TYPE (vstmt_info)))) |
1668 | continue; |
1669 | |
1670 | vect_cost_for_stmt kind; |
1671 | if (STMT_VINFO_DATA_REF (stmt_info)) |
1672 | { |
1673 | if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
1674 | kind = scalar_load; |
1675 | else |
1676 | kind = scalar_store; |
1677 | } |
1678 | else if (vect_nop_conversion_p (stmt_info)) |
1679 | continue; |
1680 | else |
1681 | kind = scalar_stmt; |
1682 | |
1683 | /* We are using vect_prologue here to avoid scaling twice |
1684 | by the inner loop factor. */ |
record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
factor, kind, stmt_info, 0, vect_prologue);
1687 | } |
1688 | } |
1689 | |
1690 | /* Now accumulate cost. */ |
loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
add_stmt_costs (loop_vinfo->scalar_costs,
&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
loop_vinfo->scalar_costs->finish_cost (nullptr);
1695 | } |
1696 | |
1697 | |
1698 | /* Function vect_analyze_loop_form. |
1699 | |
1700 | Verify that certain CFG restrictions hold, including: |
1701 | - the loop has a pre-header |
1702 | - the loop has a single entry and exit |
1703 | - the loop exit condition is simple enough |
- the number of iterations can be analyzed, i.e., the loop is countable.
The niter may be analyzable only under some assumptions.  */
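/* For example, a loop whose latch contains statements, whose header has
extra incoming edges, or an inner-most loop with control flow in its
body (more than the two expected basic blocks) is rejected by the
checks below.  */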
1706 | |
1707 | opt_result |
1708 | vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info) |
1709 | { |
1710 | DUMP_VECT_SCOPE ("vect_analyze_loop_form" ); |
1711 | |
1712 | edge exit_e = vec_init_loop_exit_info (loop); |
1713 | if (!exit_e) |
1714 | return opt_result::failure_at (loc: vect_location, |
1715 | fmt: "not vectorized:" |
1716 | " could not determine main exit from" |
1717 | " loop with multiple exits.\n" ); |
1718 | info->loop_exit = exit_e; |
1719 | if (dump_enabled_p ()) |
1720 | dump_printf_loc (MSG_NOTE, vect_location, |
1721 | "using as main loop exit: %d -> %d [AUX: %p]\n" , |
1722 | exit_e->src->index, exit_e->dest->index, exit_e->aux); |
1723 | |
1724 | /* Different restrictions apply when we are considering an inner-most loop, |
1725 | vs. an outer (nested) loop. |
1726 | (FORNOW. May want to relax some of these restrictions in the future). */ |
1727 | |
1728 | info->inner_loop_cond = NULL; |
1729 | if (!loop->inner) |
1730 | { |
1731 | /* Inner-most loop. We currently require that the number of BBs is |
1732 | exactly 2 (the header and latch). Vectorizable inner-most loops |
1733 | look like this: |
1734 | |
1735 | (pre-header) |
1736 | | |
1737 | header <--------+ |
1738 | | | | |
1739 | | +--> latch --+ |
1740 | | |
1741 | (exit-bb) */ |
1742 | |
1743 | if (loop->num_nodes != 2) |
1744 | return opt_result::failure_at (loc: vect_location, |
1745 | fmt: "not vectorized:" |
1746 | " control flow in loop.\n" ); |
1747 | |
1748 | if (empty_block_p (loop->header)) |
1749 | return opt_result::failure_at (loc: vect_location, |
1750 | fmt: "not vectorized: empty loop.\n" ); |
1751 | } |
1752 | else |
1753 | { |
1754 | class loop *innerloop = loop->inner; |
1755 | edge entryedge; |
1756 | |
1757 | /* Nested loop. We currently require that the loop is doubly-nested, |
1758 | contains a single inner loop, and the number of BBs is exactly 5. |
1759 | Vectorizable outer-loops look like this: |
1760 | |
1761 | (pre-header) |
1762 | | |
1763 | header <---+ |
1764 | | | |
1765 | inner-loop | |
1766 | | | |
1767 | tail ------+ |
1768 | | |
1769 | (exit-bb) |
1770 | |
1771 | The inner-loop has the properties expected of inner-most loops |
1772 | as described above. */ |
1773 | |
1774 | if ((loop->inner)->inner || (loop->inner)->next) |
1775 | return opt_result::failure_at (loc: vect_location, |
1776 | fmt: "not vectorized:" |
1777 | " multiple nested loops.\n" ); |
1778 | |
1779 | if (loop->num_nodes != 5) |
1780 | return opt_result::failure_at (loc: vect_location, |
1781 | fmt: "not vectorized:" |
1782 | " control flow in loop.\n" ); |
1783 | |
1784 | entryedge = loop_preheader_edge (innerloop); |
1785 | if (entryedge->src != loop->header |
1786 | || !single_exit (innerloop) |
1787 | || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src) |
1788 | return opt_result::failure_at (loc: vect_location, |
1789 | fmt: "not vectorized:" |
1790 | " unsupported outerloop form.\n" ); |
1791 | |
1792 | /* Analyze the inner-loop. */ |
1793 | vect_loop_form_info inner; |
opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1795 | if (!res) |
1796 | { |
1797 | if (dump_enabled_p ()) |
1798 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1799 | "not vectorized: Bad inner loop.\n" ); |
1800 | return res; |
1801 | } |
1802 | |
1803 | /* Don't support analyzing niter under assumptions for inner |
1804 | loop. */ |
1805 | if (!integer_onep (inner.assumptions)) |
1806 | return opt_result::failure_at (loc: vect_location, |
1807 | fmt: "not vectorized: Bad inner loop.\n" ); |
1808 | |
1809 | if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations)) |
1810 | return opt_result::failure_at (loc: vect_location, |
1811 | fmt: "not vectorized: inner-loop count not" |
1812 | " invariant.\n" ); |
1813 | |
1814 | if (dump_enabled_p ()) |
1815 | dump_printf_loc (MSG_NOTE, vect_location, |
1816 | "Considering outer-loop vectorization.\n" ); |
1817 | info->inner_loop_cond = inner.conds[0]; |
1818 | } |
1819 | |
1820 | if (!single_exit (loop)) |
1821 | return opt_result::failure_at (loc: vect_location, |
1822 | fmt: "not vectorized: multiple exits.\n" ); |
1823 | if (EDGE_COUNT (loop->header->preds) != 2) |
1824 | return opt_result::failure_at (loc: vect_location, |
1825 | fmt: "not vectorized:" |
1826 | " too many incoming edges.\n" ); |
1827 | |
/* We assume that the loop exit condition is at the end of the loop, i.e.,
that the loop is represented as a do-while (with a proper if-guard
before the loop if needed), where the loop header contains all the
executable statements, and the latch is empty.  */
if (!empty_block_p (loop->latch)
|| !gimple_seq_empty_p (phi_nodes (loop->latch)))
return opt_result::failure_at (vect_location,
"not vectorized: latch block not empty.\n");
1836 | |
1837 | /* Make sure the exit is not abnormal. */ |
1838 | if (exit_e->flags & EDGE_ABNORMAL) |
1839 | return opt_result::failure_at (loc: vect_location, |
1840 | fmt: "not vectorized:" |
1841 | " abnormal loop exit edge.\n" ); |
1842 | |
info->conds
= vect_get_loop_niters (loop, exit_e, &info->assumptions,
&info->number_of_iterations,
&info->number_of_iterationsm1);
1847 | |
1848 | if (info->conds.is_empty ()) |
1849 | return opt_result::failure_at |
1850 | (loc: vect_location, |
1851 | fmt: "not vectorized: complicated exit condition.\n" ); |
1852 | |
1853 | /* Determine what the primary and alternate exit conds are. */ |
1854 | for (unsigned i = 0; i < info->conds.length (); i++) |
1855 | { |
1856 | gcond *cond = info->conds[i]; |
if (exit_e->src == gimple_bb (cond))
std::swap (info->conds[0], info->conds[i]);
1859 | } |
1860 | |
1861 | if (integer_zerop (info->assumptions) |
1862 | || !info->number_of_iterations |
1863 | || chrec_contains_undetermined (info->number_of_iterations)) |
1864 | return opt_result::failure_at |
1865 | (loc: info->conds[0], |
1866 | fmt: "not vectorized: number of iterations cannot be computed.\n" ); |
1867 | |
1868 | if (integer_zerop (info->number_of_iterations)) |
1869 | return opt_result::failure_at |
1870 | (loc: info->conds[0], |
1871 | fmt: "not vectorized: number of iterations = 0.\n" ); |
1872 | |
1873 | if (!(tree_fits_shwi_p (info->number_of_iterations) |
1874 | && tree_to_shwi (info->number_of_iterations) > 0)) |
1875 | { |
1876 | if (dump_enabled_p ()) |
1877 | { |
1878 | dump_printf_loc (MSG_NOTE, vect_location, |
1879 | "Symbolic number of iterations is " ); |
1880 | dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations); |
1881 | dump_printf (MSG_NOTE, "\n" ); |
1882 | } |
1883 | } |
1884 | |
1885 | return opt_result::success (); |
1886 | } |
1887 | |
1888 | /* Create a loop_vec_info for LOOP with SHARED and the |
1889 | vect_analyze_loop_form result. */ |
1890 | |
1891 | loop_vec_info |
1892 | vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared, |
1893 | const vect_loop_form_info *info, |
1894 | loop_vec_info main_loop_info) |
1895 | { |
1896 | loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared); |
1897 | LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1; |
1898 | LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations; |
1899 | LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations; |
1900 | LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info; |
1901 | /* Also record the assumptions for versioning. */ |
1902 | if (!integer_onep (info->assumptions) && !main_loop_info) |
1903 | LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions; |
1904 | |
1905 | for (gcond *cond : info->conds) |
1906 | { |
1907 | stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond); |
1908 | STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type; |
1909 | } |
1910 | |
1911 | for (unsigned i = 1; i < info->conds.length (); i ++) |
LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1913 | LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0]; |
1914 | |
1915 | LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit; |
1916 | |
1917 | if (info->inner_loop_cond) |
1918 | { |
1919 | stmt_vec_info inner_loop_cond_info |
1920 | = loop_vinfo->lookup_stmt (info->inner_loop_cond); |
1921 | STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type; |
1922 | /* If we have an estimate on the number of iterations of the inner |
1923 | loop use that to limit the scale for costing, otherwise use |
1924 | --param vect-inner-loop-cost-factor literally. */ |
1925 | widest_int nit; |
1926 | if (estimated_stmt_executions (loop->inner, &nit)) |
1927 | LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo) |
= wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1929 | } |
1930 | |
1931 | return loop_vinfo; |
1932 | } |
1933 | |
1934 | |
1935 | |
/* Scan the loop stmts and, depending on whether there are any (non-)SLP
statements, update the vectorization factor.  */
1938 | |
1939 | static void |
1940 | vect_update_vf_for_slp (loop_vec_info loop_vinfo) |
1941 | { |
1942 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1943 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
1944 | int nbbs = loop->num_nodes; |
1945 | poly_uint64 vectorization_factor; |
1946 | int i; |
1947 | |
1948 | DUMP_VECT_SCOPE ("vect_update_vf_for_slp" ); |
1949 | |
1950 | vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
1951 | gcc_assert (known_ne (vectorization_factor, 0U)); |
1952 | |
/* If all the stmts in the loop can be SLPed, we perform only SLP, and
the vectorization factor of the loop is the unrolling factor required by
the SLP instances.  If that unrolling factor is 1, we say that we
perform pure SLP on the loop - cross-iteration parallelism is not
exploited.  */
1958 | bool only_slp_in_loop = true; |
1959 | for (i = 0; i < nbbs; i++) |
1960 | { |
1961 | basic_block bb = bbs[i]; |
for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
gsi_next (&si))
1964 | { |
1965 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ()); |
1966 | if (!stmt_info) |
1967 | continue; |
1968 | if ((STMT_VINFO_RELEVANT_P (stmt_info) |
1969 | || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) |
1970 | && !PURE_SLP_STMT (stmt_info)) |
1971 | /* STMT needs both SLP and loop-based vectorization. */ |
1972 | only_slp_in_loop = false; |
1973 | } |
for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
gsi_next (&si))
{
if (is_gimple_debug (gsi_stmt (si)))
continue;
stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1980 | stmt_info = vect_stmt_to_vectorize (stmt_info); |
1981 | if ((STMT_VINFO_RELEVANT_P (stmt_info) |
1982 | || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) |
1983 | && !PURE_SLP_STMT (stmt_info)) |
1984 | /* STMT needs both SLP and loop-based vectorization. */ |
1985 | only_slp_in_loop = false; |
1986 | } |
1987 | } |
1988 | |
1989 | if (only_slp_in_loop) |
1990 | { |
1991 | if (dump_enabled_p ()) |
1992 | dump_printf_loc (MSG_NOTE, vect_location, |
1993 | "Loop contains only SLP stmts\n" ); |
1994 | vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); |
1995 | } |
1996 | else |
1997 | { |
1998 | if (dump_enabled_p ()) |
1999 | dump_printf_loc (MSG_NOTE, vect_location, |
2000 | "Loop contains SLP and non-SLP stmts\n" ); |
2001 | /* Both the vectorization factor and unroll factor have the form |
2002 | GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X, |
2003 | so they must have a common multiple. */ |
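/* For example (hypothetical factors), a loop-based VF of 4 combined with
an SLP unrolling factor of 2 keeps VF = 4, whereas an SLP unrolling
factor of 8 forces VF up to 8.  */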
2004 | vectorization_factor |
= force_common_multiple (vectorization_factor,
2006 | LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); |
2007 | } |
2008 | |
2009 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; |
2010 | if (dump_enabled_p ()) |
2011 | { |
2012 | dump_printf_loc (MSG_NOTE, vect_location, |
2013 | "Updating vectorization factor to " ); |
2014 | dump_dec (MSG_NOTE, vectorization_factor); |
2015 | dump_printf (MSG_NOTE, ".\n" ); |
2016 | } |
2017 | } |
2018 | |
2019 | /* Return true if STMT_INFO describes a double reduction phi and if |
2020 | the other phi in the reduction is also relevant for vectorization. |
2021 | This rejects cases such as: |
2022 | |
2023 | outer1: |
2024 | x_1 = PHI <x_3(outer2), ...>; |
2025 | ... |
2026 | |
2027 | inner: |
2028 | x_2 = ...; |
2029 | ... |
2030 | |
2031 | outer2: |
2032 | x_3 = PHI <x_2(inner)>; |
2033 | |
2034 | if nothing in x_2 or elsewhere makes x_1 relevant. */ |
2035 | |
2036 | static bool |
2037 | vect_active_double_reduction_p (stmt_vec_info stmt_info) |
2038 | { |
2039 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) |
2040 | return false; |
2041 | |
2042 | return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info)); |
2043 | } |
2044 | |
2045 | /* Function vect_analyze_loop_operations. |
2046 | |
2047 | Scan the loop stmts and make sure they are all vectorizable. */ |
2048 | |
2049 | static opt_result |
2050 | vect_analyze_loop_operations (loop_vec_info loop_vinfo) |
2051 | { |
2052 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
2053 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
2054 | int nbbs = loop->num_nodes; |
2055 | int i; |
2056 | stmt_vec_info stmt_info; |
2057 | bool need_to_vectorize = false; |
2058 | bool ok; |
2059 | |
2060 | DUMP_VECT_SCOPE ("vect_analyze_loop_operations" ); |
2061 | |
2062 | auto_vec<stmt_info_for_cost> cost_vec; |
2063 | |
2064 | for (i = 0; i < nbbs; i++) |
2065 | { |
2066 | basic_block bb = bbs[i]; |
2067 | |
2068 | for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si); |
2069 | gsi_next (i: &si)) |
2070 | { |
2071 | gphi *phi = si.phi (); |
2072 | ok = true; |
2073 | |
2074 | stmt_info = loop_vinfo->lookup_stmt (phi); |
2075 | if (dump_enabled_p ()) |
2076 | dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G" , |
2077 | (gimple *) phi); |
2078 | if (virtual_operand_p (op: gimple_phi_result (gs: phi))) |
2079 | continue; |
2080 | |
2081 | /* Inner-loop loop-closed exit phi in outer-loop vectorization |
2082 | (i.e., a phi in the tail of the outer-loop). */ |
2083 | if (! is_loop_header_bb_p (bb)) |
2084 | { |
/* FORNOW: we currently don't support the case where these phis
are not used in the outer loop (unless it is a double reduction,
i.e., this phi is vect_reduction_def), because that case
would require actually doing something here.  */
2089 | if (STMT_VINFO_LIVE_P (stmt_info) |
2090 | && !vect_active_double_reduction_p (stmt_info)) |
2091 | return opt_result::failure_at (loc: phi, |
2092 | fmt: "Unsupported loop-closed phi" |
2093 | " in outer-loop.\n" ); |
2094 | |
2095 | /* If PHI is used in the outer loop, we check that its operand |
2096 | is defined in the inner loop. */ |
2097 | if (STMT_VINFO_RELEVANT_P (stmt_info)) |
2098 | { |
2099 | tree phi_op; |
2100 | |
2101 | if (gimple_phi_num_args (gs: phi) != 1) |
2102 | return opt_result::failure_at (loc: phi, fmt: "unsupported phi" ); |
2103 | |
2104 | phi_op = PHI_ARG_DEF (phi, 0); |
2105 | stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); |
2106 | if (!op_def_info) |
2107 | return opt_result::failure_at (loc: phi, fmt: "unsupported phi\n" ); |
2108 | |
2109 | if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer |
2110 | && (STMT_VINFO_RELEVANT (op_def_info) |
2111 | != vect_used_in_outer_by_reduction)) |
2112 | return opt_result::failure_at (loc: phi, fmt: "unsupported phi\n" ); |
2113 | |
2114 | if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def |
2115 | || (STMT_VINFO_DEF_TYPE (stmt_info) |
2116 | == vect_double_reduction_def)) |
2117 | && !vectorizable_lc_phi (loop_vinfo, |
2118 | stmt_info, NULL, NULL)) |
2119 | return opt_result::failure_at (loc: phi, fmt: "unsupported phi\n" ); |
2120 | } |
2121 | |
2122 | continue; |
2123 | } |
2124 | |
2125 | gcc_assert (stmt_info); |
2126 | |
2127 | if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope |
2128 | || STMT_VINFO_LIVE_P (stmt_info)) |
2129 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def |
2130 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence) |
2131 | /* A scalar-dependence cycle that we don't support. */ |
2132 | return opt_result::failure_at (loc: phi, |
2133 | fmt: "not vectorized:" |
2134 | " scalar dependence cycle.\n" ); |
2135 | |
2136 | if (STMT_VINFO_RELEVANT_P (stmt_info)) |
2137 | { |
2138 | need_to_vectorize = true; |
2139 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def |
2140 | && ! PURE_SLP_STMT (stmt_info)) |
2141 | ok = vectorizable_induction (loop_vinfo, |
2142 | stmt_info, NULL, NULL, |
2143 | &cost_vec); |
2144 | else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
2145 | || (STMT_VINFO_DEF_TYPE (stmt_info) |
2146 | == vect_double_reduction_def) |
2147 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) |
2148 | && ! PURE_SLP_STMT (stmt_info)) |
2149 | ok = vectorizable_reduction (loop_vinfo, |
2150 | stmt_info, NULL, NULL, &cost_vec); |
2151 | else if ((STMT_VINFO_DEF_TYPE (stmt_info) |
2152 | == vect_first_order_recurrence) |
2153 | && ! PURE_SLP_STMT (stmt_info)) |
2154 | ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL, |
2155 | &cost_vec); |
2156 | } |
2157 | |
2158 | /* SLP PHIs are tested by vect_slp_analyze_node_operations. */ |
2159 | if (ok |
2160 | && STMT_VINFO_LIVE_P (stmt_info) |
2161 | && !PURE_SLP_STMT (stmt_info)) |
2162 | ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL, |
2163 | -1, false, &cost_vec); |
2164 | |
2165 | if (!ok) |
2166 | return opt_result::failure_at (loc: phi, |
2167 | fmt: "not vectorized: relevant phi not " |
2168 | "supported: %G" , |
2169 | static_cast <gimple *> (phi)); |
2170 | } |
2171 | |
2172 | for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (i: si); |
2173 | gsi_next (i: &si)) |
2174 | { |
2175 | gimple *stmt = gsi_stmt (i: si); |
2176 | if (!gimple_clobber_p (s: stmt) |
2177 | && !is_gimple_debug (gs: stmt)) |
2178 | { |
2179 | opt_result res |
2180 | = vect_analyze_stmt (loop_vinfo, |
2181 | loop_vinfo->lookup_stmt (stmt), |
2182 | &need_to_vectorize, |
2183 | NULL, NULL, &cost_vec); |
2184 | if (!res) |
2185 | return res; |
2186 | } |
2187 | } |
2188 | } /* bbs */ |
2189 | |
2190 | add_stmt_costs (costs: loop_vinfo->vector_costs, cost_vec: &cost_vec); |
2191 | |
2192 | /* All operations in the loop are either irrelevant (deal with loop |
2193 | control, or dead), or only used outside the loop and can be moved |
2194 | out of the loop (e.g. invariants, inductions). The loop can be |
2195 | optimized away by scalar optimizations. We're better off not |
2196 | touching this loop. */ |
2197 | if (!need_to_vectorize) |
2198 | { |
2199 | if (dump_enabled_p ()) |
2200 | dump_printf_loc (MSG_NOTE, vect_location, |
2201 | "All the computation can be taken out of the loop.\n" ); |
2202 | return opt_result::failure_at |
2203 | (loc: vect_location, |
2204 | fmt: "not vectorized: redundant loop. no profit to vectorize.\n" ); |
2205 | } |
2206 | |
2207 | return opt_result::success (); |
2208 | } |
2209 | |
2210 | /* Return true if we know that the iteration count is smaller than the |
2211 | vectorization factor. Return false if it isn't, or if we can't be sure |
2212 | either way. */ |
2213 | |
2214 | static bool |
2215 | vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo) |
2216 | { |
2217 | unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
2218 | |
2219 | HOST_WIDE_INT max_niter; |
2220 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) |
2221 | max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo); |
2222 | else |
2223 | max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); |
2224 | |
2225 | if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf) |
2226 | return true; |
2227 | |
2228 | return false; |
2229 | } |
2230 | |
2231 | /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it |
2232 | is worthwhile to vectorize. Return 1 if definitely yes, 0 if |
2233 | definitely no, or -1 if it's worth retrying. */ |
2234 | |
2235 | static int |
2236 | vect_analyze_loop_costing (loop_vec_info loop_vinfo, |
2237 | unsigned *suggested_unroll_factor) |
2238 | { |
2239 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
2240 | unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
2241 | |
2242 | /* Only loops that can handle partially-populated vectors can have iteration |
2243 | counts less than the vectorization factor. */ |
2244 | if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
2245 | && vect_known_niters_smaller_than_vf (loop_vinfo)) |
2246 | { |
2247 | if (dump_enabled_p ()) |
2248 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2249 | "not vectorized: iteration count smaller than " |
2250 | "vectorization factor.\n" ); |
2251 | return 0; |
2252 | } |
2253 | |
/* If we know the number of iterations we can do better: for the
epilogue we can also decide whether the main loop leaves us
with enough iterations, preferring a smaller vector epilogue that
is then also possibly used for the case where we skip the vector
loop.  */
2258 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) |
2259 | { |
2260 | widest_int scalar_niters |
2261 | = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1; |
2262 | if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) |
2263 | { |
2264 | loop_vec_info orig_loop_vinfo |
2265 | = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); |
2266 | unsigned lowest_vf |
2267 | = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)); |
2268 | int prolog_peeling = 0; |
2269 | if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
2270 | prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo); |
2271 | if (prolog_peeling >= 0 |
2272 | && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo), |
2273 | lowest_vf)) |
2274 | { |
2275 | unsigned gap |
2276 | = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0; |
2277 | scalar_niters = ((scalar_niters - gap - prolog_peeling) |
2278 | % lowest_vf + gap); |
2279 | } |
2280 | } |
2281 | /* Reject vectorizing for a single scalar iteration, even if |
2282 | we could in principle implement that using partial vectors. */ |
2283 | unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); |
2284 | if (scalar_niters <= peeling_gap + 1) |
2285 | { |
2286 | if (dump_enabled_p ()) |
2287 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2288 | "not vectorized: loop only has a single " |
2289 | "scalar iteration.\n" ); |
2290 | return 0; |
2291 | } |
2292 | |
2293 | if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
2294 | { |
2295 | /* Check that the loop processes at least one full vector. */ |
2296 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2297 | if (known_lt (scalar_niters, vf)) |
2298 | { |
2299 | if (dump_enabled_p ()) |
2300 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2301 | "loop does not have enough iterations " |
2302 | "to support vectorization.\n" ); |
2303 | return 0; |
2304 | } |
2305 | |
2306 | /* If we need to peel an extra epilogue iteration to handle data |
2307 | accesses with gaps, check that there are enough scalar iterations |
2308 | available. |
2309 | |
2310 | The check above is redundant with this one when peeling for gaps, |
2311 | but the distinction is useful for diagnostics. */ |
2312 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) |
2313 | && known_le (scalar_niters, vf)) |
2314 | { |
2315 | if (dump_enabled_p ()) |
2316 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2317 | "loop does not have enough iterations " |
2318 | "to support peeling for gaps.\n" ); |
2319 | return 0; |
2320 | } |
2321 | } |
2322 | } |
2323 | |
/* If using the "very cheap" model, reject cases in which we'd keep
a copy of the scalar code (even if we might be able to vectorize it).  */
2326 | if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP |
2327 | && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) |
2328 | || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) |
2329 | || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))) |
2330 | { |
2331 | if (dump_enabled_p ()) |
2332 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2333 | "some scalar iterations would need to be peeled\n" ); |
2334 | return 0; |
2335 | } |
2336 | |
2337 | int min_profitable_iters, min_profitable_estimate; |
2338 | vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, |
2339 | &min_profitable_estimate, |
2340 | suggested_unroll_factor); |
2341 | |
2342 | if (min_profitable_iters < 0) |
2343 | { |
2344 | if (dump_enabled_p ()) |
2345 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2346 | "not vectorized: vectorization not profitable.\n" ); |
2347 | if (dump_enabled_p ()) |
2348 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2349 | "not vectorized: vector version will never be " |
2350 | "profitable.\n" ); |
2351 | return -1; |
2352 | } |
2353 | |
2354 | int min_scalar_loop_bound = (param_min_vect_loop_bound |
2355 | * assumed_vf); |
2356 | |
/* Use the cost model only if it is more conservative than the
user-specified threshold.  */
2359 | unsigned int th = (unsigned) MAX (min_scalar_loop_bound, |
2360 | min_profitable_iters); |
2361 | |
2362 | LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th; |
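/* Hypothetical numbers: with assumed_vf = 4, --param min-vect-loop-bound=0
and min_profitable_iters = 7, the threshold is 7, so a known iteration
count of 5 makes the check below reject vectorization.  */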
2363 | |
2364 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
2365 | && LOOP_VINFO_INT_NITERS (loop_vinfo) < th) |
2366 | { |
2367 | if (dump_enabled_p ()) |
2368 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2369 | "not vectorized: vectorization not profitable.\n" ); |
2370 | if (dump_enabled_p ()) |
2371 | dump_printf_loc (MSG_NOTE, vect_location, |
2372 | "not vectorized: iteration count smaller than user " |
2373 | "specified loop bound parameter or minimum profitable " |
2374 | "iterations (whichever is more conservative).\n" ); |
2375 | return 0; |
2376 | } |
2377 | |
/* The static profitability threshold min_profitable_estimate includes
2379 | the cost of having to check at runtime whether the scalar loop |
2380 | should be used instead. If it turns out that we don't need or want |
2381 | such a check, the threshold we should use for the static estimate |
2382 | is simply the point at which the vector loop becomes more profitable |
2383 | than the scalar loop. */ |
2384 | if (min_profitable_estimate > min_profitable_iters |
2385 | && !LOOP_REQUIRES_VERSIONING (loop_vinfo) |
2386 | && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) |
2387 | && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) |
2388 | && !vect_apply_runtime_profitability_check_p (loop_vinfo)) |
2389 | { |
2390 | if (dump_enabled_p ()) |
2391 | dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime" |
2392 | " choice between the scalar and vector loops\n" ); |
2393 | min_profitable_estimate = min_profitable_iters; |
2394 | } |
2395 | |
2396 | /* If the vector loop needs multiple iterations to be beneficial then |
2397 | things are probably too close to call, and the conservative thing |
2398 | would be to stick with the scalar code. */ |
2399 | if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP |
2400 | && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo)) |
2401 | { |
2402 | if (dump_enabled_p ()) |
2403 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2404 | "one iteration of the vector loop would be" |
2405 | " more expensive than the equivalent number of" |
2406 | " iterations of the scalar loop\n" ); |
2407 | return 0; |
2408 | } |
2409 | |
2410 | HOST_WIDE_INT estimated_niter; |
2411 | |
2412 | /* If we are vectorizing an epilogue then we know the maximum number of |
2413 | scalar iterations it will cover is at least one lower than the |
2414 | vectorization factor of the main loop. */ |
2415 | if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) |
2416 | estimated_niter |
2417 | = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1; |
2418 | else |
2419 | { |
2420 | estimated_niter = estimated_stmt_executions_int (loop); |
2421 | if (estimated_niter == -1) |
2422 | estimated_niter = likely_max_stmt_executions_int (loop); |
2423 | } |
2424 | if (estimated_niter != -1 |
2425 | && ((unsigned HOST_WIDE_INT) estimated_niter |
2426 | < MAX (th, (unsigned) min_profitable_estimate))) |
2427 | { |
2428 | if (dump_enabled_p ()) |
2429 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2430 | "not vectorized: estimated iteration count too " |
2431 | "small.\n" ); |
2432 | if (dump_enabled_p ()) |
2433 | dump_printf_loc (MSG_NOTE, vect_location, |
2434 | "not vectorized: estimated iteration count smaller " |
2435 | "than specified loop bound parameter or minimum " |
2436 | "profitable iterations (whichever is more " |
2437 | "conservative).\n" ); |
2438 | return -1; |
2439 | } |
2440 | |
2441 | return 1; |
2442 | } |
2443 | |
2444 | static opt_result |
2445 | vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, |
2446 | vec<data_reference_p> *datarefs, |
2447 | unsigned int *n_stmts) |
2448 | { |
2449 | *n_stmts = 0; |
2450 | for (unsigned i = 0; i < loop->num_nodes; i++) |
2451 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bbs[i]); |
2452 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
2453 | { |
2454 | gimple *stmt = gsi_stmt (i: gsi); |
2455 | if (is_gimple_debug (gs: stmt)) |
2456 | continue; |
2457 | ++(*n_stmts); |
2458 | opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs, |
2459 | NULL, 0); |
2460 | if (!res) |
2461 | { |
2462 | if (is_gimple_call (gs: stmt) && loop->safelen) |
2463 | { |
2464 | tree fndecl = gimple_call_fndecl (gs: stmt), op; |
2465 | if (fndecl == NULL_TREE |
2466 | && gimple_call_internal_p (gs: stmt, fn: IFN_MASK_CALL)) |
2467 | { |
2468 | fndecl = gimple_call_arg (gs: stmt, index: 0); |
2469 | gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR); |
2470 | fndecl = TREE_OPERAND (fndecl, 0); |
2471 | gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL); |
2472 | } |
2473 | if (fndecl != NULL_TREE) |
2474 | { |
2475 | cgraph_node *node = cgraph_node::get (decl: fndecl); |
2476 | if (node != NULL && node->simd_clones != NULL) |
2477 | { |
2478 | unsigned int j, n = gimple_call_num_args (gs: stmt); |
2479 | for (j = 0; j < n; j++) |
2480 | { |
2481 | op = gimple_call_arg (gs: stmt, index: j); |
2482 | if (DECL_P (op) |
2483 | || (REFERENCE_CLASS_P (op) |
2484 | && get_base_address (t: op))) |
2485 | break; |
2486 | } |
2487 | op = gimple_call_lhs (gs: stmt); |
2488 | /* Ignore #pragma omp declare simd functions |
2489 | if they don't have data references in the |
2490 | call stmt itself. */ |
2491 | if (j == n |
2492 | && !(op |
2493 | && (DECL_P (op) |
2494 | || (REFERENCE_CLASS_P (op) |
2495 | && get_base_address (t: op))))) |
2496 | continue; |
2497 | } |
2498 | } |
2499 | } |
2500 | return res; |
2501 | } |
2502 | /* If dependence analysis will give up due to the limit on the |
2503 | number of datarefs stop here and fail fatally. */ |
2504 | if (datarefs->length () |
2505 | > (unsigned)param_loop_max_datarefs_for_datadeps) |
2506 | return opt_result::failure_at (loc: stmt, fmt: "exceeded param " |
2507 | "loop-max-datarefs-for-datadeps\n" ); |
2508 | } |
2509 | return opt_result::success (); |
2510 | } |
2511 | |
2512 | /* Look for SLP-only access groups and turn each individual access into its own |
2513 | group. */ |
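/* Illustrative example: a two-element load group { a[2*i], a[2*i+1] } that
was only usable by SLP ends up, when the statements are not SLPed after
all, split into two single-element groups below, each with
DR_GROUP_SIZE 1 and a gap of group_size - 1 (stores and strided leaders
are instead marked strided with a gap of 0).  */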
2514 | static void |
2515 | vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo) |
2516 | { |
2517 | unsigned int i; |
2518 | struct data_reference *dr; |
2519 | |
2520 | DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups" ); |
2521 | |
2522 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
2523 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
2524 | { |
2525 | gcc_assert (DR_REF (dr)); |
2526 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr)); |
2527 | |
2528 | /* Check if the load is a part of an interleaving chain. */ |
2529 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
2530 | { |
2531 | stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info); |
2532 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element); |
2533 | unsigned int group_size = DR_GROUP_SIZE (first_element); |
2534 | |
2535 | /* Check if SLP-only groups. */ |
2536 | if (!STMT_SLP_TYPE (stmt_info) |
2537 | && STMT_VINFO_SLP_VECT_ONLY (first_element)) |
2538 | { |
2539 | /* Dissolve the group. */ |
2540 | STMT_VINFO_SLP_VECT_ONLY (first_element) = false; |
2541 | |
2542 | stmt_vec_info vinfo = first_element; |
2543 | while (vinfo) |
2544 | { |
2545 | stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo); |
2546 | DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo; |
2547 | DR_GROUP_NEXT_ELEMENT (vinfo) = NULL; |
2548 | DR_GROUP_SIZE (vinfo) = 1; |
2549 | if (STMT_VINFO_STRIDED_P (first_element) |
2550 | /* We cannot handle stores with gaps. */ |
2551 | || DR_IS_WRITE (dr_info->dr)) |
2552 | { |
2553 | STMT_VINFO_STRIDED_P (vinfo) = true; |
2554 | DR_GROUP_GAP (vinfo) = 0; |
2555 | } |
2556 | else |
2557 | DR_GROUP_GAP (vinfo) = group_size - 1; |
2558 | /* Duplicate and adjust alignment info, it needs to |
2559 | be present on each group leader, see dr_misalignment. */ |
2560 | if (vinfo != first_element) |
2561 | { |
2562 | dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo); |
2563 | dr_info2->target_alignment = dr_info->target_alignment; |
2564 | int misalignment = dr_info->misalignment; |
2565 | if (misalignment != DR_MISALIGNMENT_UNKNOWN) |
2566 | { |
2567 | HOST_WIDE_INT diff |
2568 | = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr)) |
2569 | - TREE_INT_CST_LOW (DR_INIT (dr_info->dr))); |
2570 | unsigned HOST_WIDE_INT align_c |
2571 | = dr_info->target_alignment.to_constant (); |
2572 | misalignment = (misalignment + diff) % align_c; |
2573 | } |
2574 | dr_info2->misalignment = misalignment; |
2575 | } |
2576 | vinfo = next; |
2577 | } |
2578 | } |
2579 | } |
2580 | } |
2581 | } |
2582 | |
2583 | /* Determine if operating on full vectors for LOOP_VINFO might leave |
2584 | some scalar iterations still to do. If so, decide how we should |
2585 | handle those scalar iterations. The possibilities are: |
2586 | |
2587 | (1) Make LOOP_VINFO operate on partial vectors instead of full vectors. |
2588 | In this case: |
2589 | |
2590 | LOOP_VINFO_USING_PARTIAL_VECTORS_P == true |
2591 | LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false |
2592 | LOOP_VINFO_PEELING_FOR_NITER == false |
2593 | |
2594 | (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop |
2595 | to handle the remaining scalar iterations. In this case: |
2596 | |
2597 | LOOP_VINFO_USING_PARTIAL_VECTORS_P == false |
2598 | LOOP_VINFO_PEELING_FOR_NITER == true |
2599 | |
2600 | There are two choices: |
2601 | |
2602 | (2a) Consider vectorizing the epilogue loop at the same VF as the |
2603 | main loop, but using partial vectors instead of full vectors. |
2604 | In this case: |
2605 | |
2606 | LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true |
2607 | |
2608 | (2b) Consider vectorizing the epilogue loop at lower VFs only. |
2609 | In this case: |
2610 | |
2611 | LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false |
2612 | */ |
2613 | |
2614 | opt_result |
2615 | vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo) |
2616 | { |
2617 | /* Determine whether there would be any scalar iterations left over. */ |
2618 | bool need_peeling_or_partial_vectors_p |
2619 | = vect_need_peeling_or_partial_vectors_p (loop_vinfo); |
2620 | |
2621 | /* Decide whether to vectorize the loop with partial vectors. */ |
2622 | LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; |
2623 | LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; |
2624 | if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) |
2625 | && need_peeling_or_partial_vectors_p) |
2626 | { |
2627 | /* For partial-vector-usage=1, try to push the handling of partial |
2628 | vectors to the epilogue, with the main loop continuing to operate |
2629 | on full vectors. |
2630 | |
If we are unrolling, we also do not want to use partial vectors.  This
is to avoid the overhead of generating multiple masks and also to
avoid having to execute entire iterations of FALSE masked instructions
when dealing with one or fewer full iterations.
2635 | |
2636 | ??? We could then end up failing to use partial vectors if we |
2637 | decide to peel iterations into a prologue, and if the main loop |
2638 | then ends up processing fewer than VF iterations. */ |
2639 | if ((param_vect_partial_vector_usage == 1 |
2640 | || loop_vinfo->suggested_unroll_factor > 1) |
2641 | && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) |
2642 | && !vect_known_niters_smaller_than_vf (loop_vinfo)) |
2643 | LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; |
2644 | else |
2645 | LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; |
2646 | } |
2647 | |
2648 | if (dump_enabled_p ()) |
2649 | dump_printf_loc (MSG_NOTE, vect_location, |
2650 | "operating on %s vectors%s.\n" , |
2651 | LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
2652 | ? "partial" : "full" , |
2653 | LOOP_VINFO_EPILOGUE_P (loop_vinfo) |
2654 | ? " for epilogue loop" : "" ); |
2655 | |
2656 | LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) |
2657 | = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
2658 | && need_peeling_or_partial_vectors_p); |
2659 | |
2660 | return opt_result::success (); |
2661 | } |
2662 | |
/* Function vect_analyze_loop_2.

Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
analyses will record information in some members of LOOP_VINFO.  FATAL
indicates whether some analysis hit a fatal error.  If the pointer
SUGGESTED_UNROLL_FACTOR is non-NULL, it is intended to be filled with
the suggested unroll factor worked out here, while a NULL pointer means
we are going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
holds the SLP decision made when the suggested unroll factor was worked
out.  */
2673 | static opt_result |
2674 | vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, |
2675 | unsigned *suggested_unroll_factor, |
2676 | bool& slp_done_for_suggested_uf) |
2677 | { |
2678 | opt_result ok = opt_result::success (); |
2679 | int res; |
2680 | unsigned int max_vf = MAX_VECTORIZATION_FACTOR; |
2681 | poly_uint64 min_vf = 2; |
2682 | loop_vec_info orig_loop_vinfo = NULL; |
2683 | |
2684 | /* If we are dealing with an epilogue then orig_loop_vinfo points to the |
2685 | loop_vec_info of the first vectorized loop. */ |
2686 | if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) |
2687 | orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); |
2688 | else |
2689 | orig_loop_vinfo = loop_vinfo; |
2690 | gcc_assert (orig_loop_vinfo); |
2691 | |
2692 | /* The first group of checks is independent of the vector size. */ |
2693 | fatal = true; |
2694 | |
2695 | if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo) |
2696 | && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo))) |
2697 | return opt_result::failure_at (loc: vect_location, |
2698 | fmt: "not vectorized: simd if(0)\n" ); |
2699 | |
2700 | /* Find all data references in the loop (which correspond to vdefs/vuses) |
2701 | and analyze their evolution in the loop. */ |
2702 | |
2703 | loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); |
2704 | |
2705 | /* Gather the data references and count stmts in the loop. */ |
2706 | if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ()) |
2707 | { |
2708 | opt_result res |
2709 | = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo), |
&LOOP_VINFO_DATAREFS (loop_vinfo),
&LOOP_VINFO_N_STMTS (loop_vinfo));
2712 | if (!res) |
2713 | { |
2714 | if (dump_enabled_p ()) |
2715 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2716 | "not vectorized: loop contains function " |
2717 | "calls or data references that cannot " |
2718 | "be analyzed\n" ); |
2719 | return res; |
2720 | } |
2721 | loop_vinfo->shared->save_datarefs (); |
2722 | } |
2723 | else |
2724 | loop_vinfo->shared->check_datarefs (); |
2725 | |
2726 | /* Analyze the data references and also adjust the minimal |
2727 | vectorization factor according to the loads and stores. */ |
2728 | |
2729 | ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal); |
2730 | if (!ok) |
2731 | { |
2732 | if (dump_enabled_p ()) |
2733 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2734 | "bad data references.\n" ); |
2735 | return ok; |
2736 | } |
2737 | |
2738 | /* Check if we are applying unroll factor now. */ |
2739 | bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1; |
2740 | gcc_assert (!applying_suggested_uf || !suggested_unroll_factor); |
2741 | |
/* If the SLP decision was false when the suggested unroll factor was
worked out, and we are applying the suggested unroll factor, we can
simply skip all SLP-related analyses this time.  */
2745 | bool slp = !applying_suggested_uf || slp_done_for_suggested_uf; |
2746 | |
2747 | /* Classify all cross-iteration scalar data-flow cycles. |
2748 | Cross-iteration cycles caused by virtual phis are analyzed separately. */ |
2749 | vect_analyze_scalar_cycles (loop_vinfo, slp); |
2750 | |
2751 | vect_pattern_recog (loop_vinfo); |
2752 | |
2753 | vect_fixup_scalar_cycles_with_patterns (loop_vinfo); |
2754 | |
2755 | /* Analyze the access patterns of the data-refs in the loop (consecutive, |
2756 | complex, etc.). FORNOW: Only handle consecutive access pattern. */ |
2757 | |
2758 | ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL); |
2759 | if (!ok) |
2760 | { |
2761 | if (dump_enabled_p ()) |
2762 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2763 | "bad data access.\n" ); |
2764 | return ok; |
2765 | } |
2766 | |
2767 | /* Data-flow analysis to detect stmts that do not need to be vectorized. */ |
2768 | |
2769 | ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal); |
2770 | if (!ok) |
2771 | { |
2772 | if (dump_enabled_p ()) |
2773 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2774 | "unexpected pattern.\n" ); |
2775 | return ok; |
2776 | } |
2777 | |
2778 | /* While the rest of the analysis below depends on it in some way. */ |
2779 | fatal = false; |
2780 | |
2781 | /* Analyze data dependences between the data-refs in the loop |
2782 | and adjust the maximum vectorization factor according to |
2783 | the dependences. |
2784 | FORNOW: fail at the first data dependence that we encounter. */ |
2785 | |
2786 | ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); |
2787 | if (!ok) |
2788 | { |
2789 | if (dump_enabled_p ()) |
2790 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2791 | "bad data dependence.\n" ); |
2792 | return ok; |
2793 | } |
2794 | if (max_vf != MAX_VECTORIZATION_FACTOR |
2795 | && maybe_lt (a: max_vf, b: min_vf)) |
2796 | return opt_result::failure_at (loc: vect_location, fmt: "bad data dependence.\n" ); |
2797 | LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; |
2798 | |
2799 | ok = vect_determine_vectorization_factor (loop_vinfo); |
2800 | if (!ok) |
2801 | { |
2802 | if (dump_enabled_p ()) |
2803 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2804 | "can't determine vectorization factor.\n" ); |
2805 | return ok; |
2806 | } |
2807 | if (max_vf != MAX_VECTORIZATION_FACTOR |
2808 | && maybe_lt (a: max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) |
2809 | return opt_result::failure_at (loc: vect_location, fmt: "bad data dependence.\n" ); |
2810 | |
2811 | /* Compute the scalar iteration cost. */ |
2812 | vect_compute_single_scalar_iteration_cost (loop_vinfo); |
2813 | |
2814 | poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2815 | |
2816 | if (slp) |
2817 | { |
2818 | /* Check the SLP opportunities in the loop, analyze and build |
2819 | SLP trees. */ |
2820 | ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo)); |
2821 | if (!ok) |
2822 | return ok; |
2823 | |
2824 | /* If there are any SLP instances mark them as pure_slp. */ |
2825 | slp = vect_make_slp_decision (loop_vinfo); |
2826 | if (slp) |
2827 | { |
2828 | /* Find stmts that need to be both vectorized and SLPed. */ |
2829 | vect_detect_hybrid_slp (loop_vinfo); |
2830 | |
2831 | /* Update the vectorization factor based on the SLP decision. */ |
2832 | vect_update_vf_for_slp (loop_vinfo); |
2833 | |
2834 | /* Optimize the SLP graph with the vectorization factor fixed. */ |
2835 | vect_optimize_slp (loop_vinfo); |
2836 | |
2837 | /* Gather the loads reachable from the SLP graph entries. */ |
2838 | vect_gather_slp_loads (loop_vinfo); |
2839 | } |
2840 | } |
2841 | |
2842 | bool saved_can_use_partial_vectors_p |
2843 | = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo); |
2844 | |
2845 | /* We don't expect to have to roll back to anything other than an empty |
2846 | set of rgroups. */ |
2847 | gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); |
2848 | |
2849 | /* This is the point where we can re-start analysis with SLP forced off. */ |
2850 | start_over: |
2851 | |
/* Apply the suggested unrolling factor; this was determined by the backend
during finish_cost the first time we ran the analysis for this
vector mode.  */
2855 | if (applying_suggested_uf) |
2856 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor; |
2857 | |
2858 | /* Now the vectorization factor is final. */ |
2859 | poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2860 | gcc_assert (known_ne (vectorization_factor, 0U)); |
2861 | |
2862 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) |
2863 | { |
2864 | dump_printf_loc (MSG_NOTE, vect_location, |
2865 | "vectorization_factor = " ); |
2866 | dump_dec (MSG_NOTE, vectorization_factor); |
2867 | dump_printf (MSG_NOTE, ", niters = %wd\n" , |
2868 | LOOP_VINFO_INT_NITERS (loop_vinfo)); |
2869 | } |
2870 | |
2871 | loop_vinfo->vector_costs = init_cost (vinfo: loop_vinfo, costing_for_scalar: false); |
2872 | |
2873 | /* Analyze the alignment of the data-refs in the loop. |
2874 | Fail if a data reference is found that cannot be vectorized. */ |
2875 | |
2876 | ok = vect_analyze_data_refs_alignment (loop_vinfo); |
2877 | if (!ok) |
2878 | { |
2879 | if (dump_enabled_p ()) |
2880 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2881 | "bad data alignment.\n" ); |
2882 | return ok; |
2883 | } |
2884 | |
2885 | /* Prune the list of ddrs to be tested at run-time by versioning for alias. |
2886 | It is important to call pruning after vect_analyze_data_ref_accesses, |
2887 | since we use grouping information gathered by interleaving analysis. */ |
2888 | ok = vect_prune_runtime_alias_test_list (loop_vinfo); |
2889 | if (!ok) |
2890 | return ok; |
2891 | |
2892 | /* Do not invoke vect_enhance_data_refs_alignment for epilogue |
2893 | vectorization, since we do not want to add extra peeling or |
2894 | add versioning for alignment. */ |
2895 | if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) |
2896 | /* This pass will decide on using loop versioning and/or loop peeling in |
2897 | order to enhance the alignment of data references in the loop. */ |
2898 | ok = vect_enhance_data_refs_alignment (loop_vinfo); |
2899 | if (!ok) |
2900 | return ok; |
2901 | |
2902 | if (slp) |
2903 | { |
2904 | /* Analyze operations in the SLP instances. Note this may |
2905 | remove unsupported SLP instances which makes the above |
2906 | SLP kind detection invalid. */ |
2907 | unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length (); |
2908 | vect_slp_analyze_operations (loop_vinfo); |
2909 | if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size) |
2910 | { |
2911 | ok = opt_result::failure_at (loc: vect_location, |
2912 | fmt: "unsupported SLP instances\n" ); |
2913 | goto again; |
2914 | } |
2915 | |
2916 | /* Check whether any load in ALL SLP instances is possibly permuted. */ |
2917 | slp_tree load_node, slp_root; |
2918 | unsigned i, x; |
2919 | slp_instance instance; |
2920 | bool can_use_lanes = true; |
2921 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance) |
2922 | { |
2923 | slp_root = SLP_INSTANCE_TREE (instance); |
2924 | int group_size = SLP_TREE_LANES (slp_root); |
2925 | tree vectype = SLP_TREE_VECTYPE (slp_root); |
2926 | bool loads_permuted = false; |
2927 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) |
2928 | { |
2929 | if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) |
2930 | continue; |
2931 | unsigned j; |
2932 | stmt_vec_info load_info; |
2933 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) |
2934 | if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) |
2935 | { |
2936 | loads_permuted = true; |
2937 | break; |
2938 | } |
2939 | } |
2940 | |
2941 | /* If the loads and stores can be handled with load/store-lane |
2942 | instructions record it and move on to the next instance. */ |
2943 | if (loads_permuted |
2944 | && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store |
2945 | && vect_store_lanes_supported (vectype, group_size, false) |
2946 | != IFN_LAST) |
2947 | { |
2948 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) |
2949 | if (STMT_VINFO_GROUPED_ACCESS |
2950 | (SLP_TREE_REPRESENTATIVE (load_node))) |
2951 | { |
2952 | stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT |
2953 | (SLP_TREE_REPRESENTATIVE (load_node)); |
2954 | /* Use SLP for strided accesses (or if we can't |
2955 | load-lanes). */ |
2956 | if (STMT_VINFO_STRIDED_P (stmt_vinfo) |
2957 | || vect_load_lanes_supported |
2958 | (STMT_VINFO_VECTYPE (stmt_vinfo), |
2959 | DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST) |
2960 | break; |
2961 | } |
2962 | |
2963 | can_use_lanes |
2964 | = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length (); |
2965 | |
2966 | if (can_use_lanes && dump_enabled_p ()) |
2967 | dump_printf_loc (MSG_NOTE, vect_location, |
2968 | "SLP instance %p can use load/store-lanes\n" , |
2969 | (void *) instance); |
2970 | } |
2971 | else |
2972 | { |
2973 | can_use_lanes = false; |
2974 | break; |
2975 | } |
2976 | } |
2977 | |
2978 | /* If all SLP instances can use load/store-lanes abort SLP and try again |
2979 | with SLP disabled. */ |
2980 | if (can_use_lanes) |
2981 | { |
2982 | ok = opt_result::failure_at (vect_location, |
2983 | "Built SLP cancelled: can use " |
2984 | "load/store-lanes\n"); |
2985 | if (dump_enabled_p ()) |
2986 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2987 | "Built SLP cancelled: all SLP instances support " |
2988 | "load/store-lanes\n" ); |
2989 | goto again; |
2990 | } |
2991 | } |
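/* Illustrative example (assumes a target with load/store-lane support,
   e.g. AArch64 ld2/st2; not part of the analysis itself): an interleaved
   access group such as

     for (i = 0; i < n; i++)
       {
         a[2*i]   = b[2*i]   + 1;
         a[2*i+1] = b[2*i+1] + 2;
       }

   needs load/store permutations when vectorized with SLP, whereas
   load/store-lane instructions handle the grouped accesses directly.
   When every SLP instance is in that situation the code above cancels
   SLP and restarts the analysis without it.  */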
2992 | |
2993 | /* Dissolve SLP-only groups. */ |
2994 | vect_dissolve_slp_only_groups (loop_vinfo); |
2995 | |
2996 | /* Scan all the remaining operations in the loop that are not subject |
2997 | to SLP and make sure they are vectorizable. */ |
2998 | ok = vect_analyze_loop_operations (loop_vinfo); |
2999 | if (!ok) |
3000 | { |
3001 | if (dump_enabled_p ()) |
3002 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3003 | "bad operation or unsupported loop bound.\n" ); |
3004 | return ok; |
3005 | } |
3006 | |
3007 | /* For now we don't expect to mix the masking and length approaches for one |
3008 | loop, so disable the use of partial vectors if both are recorded. */ |
3009 | if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) |
3010 | && !LOOP_VINFO_MASKS (loop_vinfo).is_empty () |
3011 | && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()) |
3012 | { |
3013 | if (dump_enabled_p ()) |
3014 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3015 | "can't vectorize a loop with partial vectors" |
3016 | " because we don't expect to mix different" |
3017 | " approaches with partial vectors for the" |
3018 | " same loop.\n" ); |
3019 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
3020 | } |
3021 | |
3022 | /* If we still have the option of using partial vectors, |
3023 | check whether we can generate the necessary loop controls. */ |
3024 | if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) |
3025 | { |
3026 | if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) |
3027 | { |
3028 | if (!vect_verify_full_masking (loop_vinfo) |
3029 | && !vect_verify_full_masking_avx512 (loop_vinfo)) |
3030 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
3031 | } |
3032 | else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */ |
3033 | if (!vect_verify_loop_lens (loop_vinfo)) |
3034 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
3035 | } |
3036 | |
3037 | /* If we're vectorizing a loop that uses length "controls" and |
3038 | can iterate more than once, we apply the decrementing IV approach |
3039 | to the loop control. */ |
3040 | if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) |
3041 | && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len |
3042 | && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0 |
3043 | && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
3044 | && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), |
3045 | LOOP_VINFO_VECT_FACTOR (loop_vinfo)))) |
3046 | LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true; |
3047 | |
3048 | /* If a loop uses length controls and has a decrementing loop control IV, |
3049 | we will normally pass that IV through a MIN_EXPR to calculate the |
3050 | basis for the length controls. E.g. in a loop that processes one |
3051 | element per scalar iteration, the number of elements would be |
3052 | MIN_EXPR <N, VF>, where N is the number of scalar iterations left. |
3053 | |
3054 | This MIN_EXPR approach allows us to use pointer IVs with an invariant |
3055 | step, since only the final iteration of the vector loop can have |
3056 | inactive lanes. |
3057 | |
3058 | However, some targets have a dedicated instruction for calculating the |
3059 | preferred length, given the total number of elements that still need to |
3060 | be processed. This is encapsulated in the SELECT_VL internal function. |
3061 | |
3062 | If the target supports SELECT_VL, we can use it instead of MIN_EXPR |
3063 | to determine the basis for the length controls. However, unlike the |
3064 | MIN_EXPR calculation, the SELECT_VL calculation can decide to make |
3065 | lanes inactive in any iteration of the vector loop, not just the last |
3066 | iteration. This SELECT_VL approach therefore requires us to use pointer |
3067 | IVs with variable steps. |
3068 | |
3069 | Once we've decided how many elements should be processed by one |
3070 | iteration of the vector loop, we need to populate the rgroup controls. |
3071 | If a loop has multiple rgroups, we need to make sure that those rgroups |
3072 | "line up" (that is, they must be consistent about which elements are |
3073 | active and which aren't). This is done by vect_adjust_loop_lens_control. |
3074 | |
3075 | In principle, it would be possible to use vect_adjust_loop_lens_control |
3076 | on either the result of a MIN_EXPR or the result of a SELECT_VL. |
3077 | However: |
3078 | |
3079 | (1) In practice, it only makes sense to use SELECT_VL when a vector |
3080 | operation will be controlled directly by the result. It is not |
3081 | worth using SELECT_VL if it would only be the input to other |
3082 | calculations. |
3083 | |
3084 | (2) If we use SELECT_VL for an rgroup that has N controls, each associated |
3085 | pointer IV will need N updates by a variable amount (N-1 updates |
3086 | within the iteration and 1 update to move to the next iteration). |
3087 | |
3088 | Because of this, we prefer to use the MIN_EXPR approach whenever there |
3089 | is more than one length control. |
3090 | |
3091 | In addition, SELECT_VL always operates to a granularity of 1 unit. |
3092 | If we wanted to use it to control an SLP operation on N consecutive |
3093 | elements, we would need to make the SELECT_VL inputs measure scalar |
3094 | iterations (rather than elements) and then multiply the SELECT_VL |
3095 | result by N. But using SELECT_VL this way is inefficient because |
3096 | of (1) above. |
3097 | |
3098 | Finally, we don't use SELECT_VL for a single rgroup when both of the |
3099 | following are satisfied: |
3100 | |
3101 | (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true. |
3102 | (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true. |
3103 | |
3104 | Since the variable step of SELECT_VL would make SCEV analysis fail, and |
3105 | we would then lose the benefit of subsequent unroll optimizations, we |
3106 | prefer the MIN_EXPR approach in this situation. */ |
3107 | if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) |
3108 | { |
3109 | tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); |
3110 | if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type, |
3111 | OPTIMIZE_FOR_SPEED) |
3112 | && LOOP_VINFO_LENS (loop_vinfo).length () == 1 |
3113 | && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp |
3114 | && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
3115 | || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())) |
3116 | LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true; |
3117 | } |
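/* A rough sketch of the two length calculations discussed above, in
   GIMPLE-like pseudo code (illustrative only; the actual sequences are
   emitted later, during transformation):

     MIN_EXPR style - invariant pointer step, only the final iteration
     may have inactive lanes:
       len = MIN_EXPR <remaining, VF>;
       ... len-controlled loads/stores ...
       ptr = ptr + VF * sizeof (elt);
       remaining = remaining - VF;

     SELECT_VL style - variable pointer step, any iteration may process
     fewer than VF elements:
       len = SELECT_VL (remaining, VF);
       ... len-controlled loads/stores ...
       ptr = ptr + len * sizeof (elt);
       remaining = remaining - len;  */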
3118 | |
3119 | /* Decide whether this loop_vinfo should use partial vectors or peeling, |
3120 | assuming that the loop will be used as a main loop. We will redo |
3121 | this analysis later if we instead decide to use the loop as an |
3122 | epilogue loop. */ |
3123 | ok = vect_determine_partial_vectors_and_peeling (loop_vinfo); |
3124 | if (!ok) |
3125 | return ok; |
3126 | |
3127 | /* If we're vectorizing an epilogue loop, the vectorized loop either needs |
3128 | to be able to handle fewer than VF scalars, or needs to have a lower VF |
3129 | than the main loop. */ |
3130 | if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) |
3131 | && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
3132 | { |
3133 | poly_uint64 unscaled_vf |
3134 | = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo), |
3135 | orig_loop_vinfo->suggested_unroll_factor); |
3136 | if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf)) |
3137 | return opt_result::failure_at (vect_location, |
3138 | "Vectorization factor too high for" |
3139 | " epilogue loop.\n"); |
3140 | } |
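/* Worked example (illustrative numbers): if the main loop was analyzed
   with VF = 16 using a suggested unroll factor of 2, then
   UNSCALED_VF = 16 / 2 = 8, and an epilogue candidate that cannot use
   partial vectors is only accepted here if its own VF is known to be
   smaller than 8.  */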
3141 | |
3142 | /* Check the costings of the loop make vectorizing worthwhile. */ |
3143 | res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor); |
3144 | if (res < 0) |
3145 | { |
3146 | ok = opt_result::failure_at (vect_location, |
3147 | "Loop costings may not be worthwhile.\n"); |
3148 | goto again; |
3149 | } |
3150 | if (!res) |
3151 | return opt_result::failure_at (vect_location, |
3152 | "Loop costings not worthwhile.\n"); |
3153 | |
3154 | /* If an epilogue loop is required make sure we can create one. */ |
3155 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) |
3156 | || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) |
3157 | { |
3158 | if (dump_enabled_p ()) |
3159 | dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n" ); |
3160 | if (!vect_can_advance_ivs_p (loop_vinfo) |
3161 | || !slpeel_can_duplicate_loop_p (loop, |
3162 | LOOP_VINFO_IV_EXIT (loop_vinfo), |
3163 | LOOP_VINFO_IV_EXIT (loop_vinfo))) |
3164 | { |
3165 | ok = opt_result::failure_at (vect_location, |
3166 | "not vectorized: can't create required " |
3167 | "epilog loop\n"); |
3168 | goto again; |
3169 | } |
3170 | } |
3171 | |
3172 | /* During peeling, we need to check if number of loop iterations is |
3173 | enough for both peeled prolog loop and vector loop. This check |
3174 | can be merged along with threshold check of loop versioning, so |
3175 | increase threshold for this case if necessary. |
3176 | |
3177 | If we are analyzing an epilogue we still want to check what its |
3178 | versioning threshold would be. If we decide to vectorize the epilogues we |
3179 | will want to use the lowest versioning threshold of all epilogues and main |
3180 | loop. This will enable us to enter a vectorized epilogue even when |
3181 | versioning the loop. We can't simply check whether the epilogue requires |
3182 | versioning though since we may have skipped some versioning checks when |
3183 | analyzing the epilogue. For instance, checks for alias versioning will be |
3184 | skipped when dealing with epilogues as we assume we already checked them |
3185 | for the main loop. So instead we always check the 'orig_loop_vinfo'. */ |
3186 | if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)) |
3187 | { |
3188 | poly_uint64 niters_th = 0; |
3189 | unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); |
3190 | |
3191 | if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
3192 | { |
3193 | /* Niters for peeled prolog loop. */ |
3194 | if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) |
3195 | { |
3196 | dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); |
3197 | tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); |
3198 | niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1; |
3199 | } |
3200 | else |
3201 | niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
3202 | } |
3203 | |
3204 | /* Niters for at least one iteration of vectorized loop. */ |
3205 | if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
3206 | niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
3207 | /* One additional iteration because of peeling for gap. */ |
3208 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) |
3209 | niters_th += 1; |
3210 | |
3211 | /* Use the same condition as vect_transform_loop to decide when to use |
3212 | the cost to determine a versioning threshold. */ |
3213 | if (vect_apply_runtime_profitability_check_p (loop_vinfo) |
3214 | && ordered_p (th, niters_th)) |
3215 | niters_th = ordered_max (poly_uint64 (th), niters_th); |
3216 | |
3217 | LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; |
3218 | } |
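/* Worked example (illustrative numbers): with a prologue peel of 3
   iterations for alignment, VF = 8 without partial vectors and peeling
   for gaps required, the code above computes NITERS_TH = 3 + 8 + 1 = 12.
   If the runtime profitability threshold TH is larger, say 15, the
   versioning threshold becomes MAX (15, 12) = 15.  */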
3219 | |
3220 | gcc_assert (known_eq (vectorization_factor, |
3221 | LOOP_VINFO_VECT_FACTOR (loop_vinfo))); |
3222 | |
3223 | slp_done_for_suggested_uf = slp; |
3224 | |
3225 | /* Ok to vectorize! */ |
3226 | LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; |
3227 | return opt_result::success (); |
3228 | |
3229 | again: |
3230 | /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */ |
3231 | gcc_assert (!ok); |
3232 | |
3233 | /* Try again with SLP forced off but if we didn't do any SLP there is |
3234 | no point in re-trying. */ |
3235 | if (!slp) |
3236 | return ok; |
3237 | |
3238 | /* If the slp decision is true when suggested unroll factor is worked |
3239 | out, and we are applying suggested unroll factor, we don't need to |
3240 | re-try any more. */ |
3241 | if (applying_suggested_uf && slp_done_for_suggested_uf) |
3242 | return ok; |
3243 | |
3244 | /* If there are reduction chains re-trying will fail anyway. */ |
3245 | if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()) |
3246 | return ok; |
3247 | |
3248 | /* Likewise if the grouped loads or stores in the SLP cannot be handled |
3249 | via interleaving or lane instructions. */ |
3250 | slp_instance instance; |
3251 | slp_tree node; |
3252 | unsigned i, j; |
3253 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) |
3254 | { |
3255 | stmt_vec_info vinfo; |
3256 | vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]; |
3257 | if (! STMT_VINFO_GROUPED_ACCESS (vinfo)) |
3258 | continue; |
3259 | vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); |
3260 | unsigned int size = DR_GROUP_SIZE (vinfo); |
3261 | tree vectype = STMT_VINFO_VECTYPE (vinfo); |
3262 | if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST |
3263 | && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U) |
3264 | && ! vect_grouped_store_supported (vectype, size)) |
3265 | return opt_result::failure_at (vinfo->stmt, |
3266 | "unsupported grouped store\n"); |
3267 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) |
3268 | { |
3269 | vinfo = SLP_TREE_REPRESENTATIVE (node); |
3270 | if (STMT_VINFO_GROUPED_ACCESS (vinfo)) |
3271 | { |
3272 | vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); |
3273 | bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo); |
3274 | size = DR_GROUP_SIZE (vinfo); |
3275 | vectype = STMT_VINFO_VECTYPE (vinfo); |
3276 | if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST |
3277 | && ! vect_grouped_load_supported (vectype, single_element_p, |
3278 | size)) |
3279 | return opt_result::failure_at (vinfo->stmt, |
3280 | "unsupported grouped load\n"); |
3281 | } |
3282 | } |
3283 | } |
3284 | |
3285 | if (dump_enabled_p ()) |
3286 | dump_printf_loc (MSG_NOTE, vect_location, |
3287 | "re-trying with SLP disabled\n" ); |
3288 | |
3289 | /* Roll back state appropriately. No SLP this time. */ |
3290 | slp = false; |
3291 | /* Restore the vectorization factor as it was without SLP. */ |
3292 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor; |
3293 | /* Free the SLP instances. */ |
3294 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance) |
3295 | vect_free_slp_instance (instance); |
3296 | LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); |
3297 | /* Reset SLP type to loop_vect on all stmts. */ |
3298 | for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) |
3299 | { |
3300 | basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; |
3301 | for (gimple_stmt_iterator si = gsi_start_phis (bb); |
3302 | !gsi_end_p (si); gsi_next (&si)) |
3303 | { |
3304 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); |
3305 | STMT_SLP_TYPE (stmt_info) = loop_vect; |
3306 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
3307 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
3308 | { |
3309 | /* vectorizable_reduction adjusts reduction stmt def-types, |
3310 | restore them to that of the PHI. */ |
3311 | STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info)) |
3312 | = STMT_VINFO_DEF_TYPE (stmt_info); |
3313 | STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize |
3314 | (STMT_VINFO_REDUC_DEF (stmt_info))) |
3315 | = STMT_VINFO_DEF_TYPE (stmt_info); |
3316 | } |
3317 | } |
3318 | for (gimple_stmt_iterator si = gsi_start_bb (bb); |
3319 | !gsi_end_p (si); gsi_next (&si)) |
3320 | { |
3321 | if (is_gimple_debug (gsi_stmt (si))) |
3322 | continue; |
3323 | stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); |
3324 | STMT_SLP_TYPE (stmt_info) = loop_vect; |
3325 | if (STMT_VINFO_IN_PATTERN_P (stmt_info)) |
3326 | { |
3327 | stmt_vec_info pattern_stmt_info |
3328 | = STMT_VINFO_RELATED_STMT (stmt_info); |
3329 | if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info)) |
3330 | STMT_VINFO_IN_PATTERN_P (stmt_info) = false; |
3331 | |
3332 | gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); |
3333 | STMT_SLP_TYPE (pattern_stmt_info) = loop_vect; |
3334 | for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq); |
3335 | !gsi_end_p (pi); gsi_next (&pi)) |
3336 | STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi))) |
3337 | = loop_vect; |
3338 | } |
3339 | } |
3340 | } |
3341 | /* Free optimized alias test DDRS. */ |
3342 | LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0); |
3343 | LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); |
3344 | LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release (); |
3345 | /* Reset target cost data. */ |
3346 | delete loop_vinfo->vector_costs; |
3347 | loop_vinfo->vector_costs = nullptr; |
3348 | /* Reset accumulated rgroup information. */ |
3349 | LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty (); |
3350 | release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec); |
3351 | release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo)); |
3352 | /* Reset assorted flags. */ |
3353 | LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; |
3354 | LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; |
3355 | LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; |
3356 | LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0; |
3357 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) |
3358 | = saved_can_use_partial_vectors_p; |
3359 | |
3360 | goto start_over; |
3361 | } |
3362 | |
3363 | /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears |
3364 | to be better than vectorizing it using OLD_LOOP_VINFO. Assume that |
3365 | OLD_LOOP_VINFO is better unless something specifically indicates |
3366 | otherwise. |
3367 | |
3368 | Note that this deliberately isn't a partial order. */ |
3369 | |
3370 | static bool |
3371 | vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, |
3372 | loop_vec_info old_loop_vinfo) |
3373 | { |
3374 | struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); |
3375 | gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); |
3376 | |
3377 | poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); |
3378 | poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); |
3379 | |
3380 | /* Always prefer a VF of loop->simdlen over any other VF. */ |
3381 | if (loop->simdlen) |
3382 | { |
3383 | bool new_simdlen_p = known_eq (new_vf, loop->simdlen); |
3384 | bool old_simdlen_p = known_eq (old_vf, loop->simdlen); |
3385 | if (new_simdlen_p != old_simdlen_p) |
3386 | return new_simdlen_p; |
3387 | } |
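/* For example (illustrative): for a loop marked with
   "#pragma omp simd simdlen(8)" a candidate whose VF is known to equal 8
   is preferred here over one whose VF is not, regardless of cost; only
   when both or neither candidate matches the requested simdlen do we
   fall through to the cost comparison below.  */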
3388 | |
3389 | const auto *old_costs = old_loop_vinfo->vector_costs; |
3390 | const auto *new_costs = new_loop_vinfo->vector_costs; |
3391 | if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo)) |
3392 | return new_costs->better_epilogue_loop_than_p (old_costs, main_loop); |
3393 | |
3394 | return new_costs->better_main_loop_than_p (old_costs); |
3395 | } |
3396 | |
3397 | /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return |
3398 | true if we should. */ |
3399 | |
3400 | static bool |
3401 | vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, |
3402 | loop_vec_info old_loop_vinfo) |
3403 | { |
3404 | if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo)) |
3405 | return false; |
3406 | |
3407 | if (dump_enabled_p ()) |
3408 | dump_printf_loc (MSG_NOTE, vect_location, |
3409 | "***** Preferring vector mode %s to vector mode %s\n" , |
3410 | GET_MODE_NAME (new_loop_vinfo->vector_mode), |
3411 | GET_MODE_NAME (old_loop_vinfo->vector_mode)); |
3412 | return true; |
3413 | } |
3414 | |
3415 | /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is |
3416 | not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance |
3417 | MODE_I to the next mode useful to analyze. |
3418 | Return the loop_vinfo on success and wrapped null on failure. */ |
3419 | |
3420 | static opt_loop_vec_info |
3421 | vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, |
3422 | const vect_loop_form_info *loop_form_info, |
3423 | loop_vec_info main_loop_vinfo, |
3424 | const vector_modes &vector_modes, unsigned &mode_i, |
3425 | machine_mode &autodetected_vector_mode, |
3426 | bool &fatal) |
3427 | { |
3428 | loop_vec_info loop_vinfo |
3429 | = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo); |
3430 | |
3431 | machine_mode vector_mode = vector_modes[mode_i]; |
3432 | loop_vinfo->vector_mode = vector_mode; |
3433 | unsigned int suggested_unroll_factor = 1; |
3434 | bool slp_done_for_suggested_uf = false; |
3435 | |
3436 | /* Run the main analysis. */ |
3437 | opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, |
3438 | &suggested_unroll_factor, |
3439 | slp_done_for_suggested_uf); |
3440 | if (dump_enabled_p ()) |
3441 | dump_printf_loc (MSG_NOTE, vect_location, |
3442 | "***** Analysis %s with vector mode %s\n", |
3443 | res ? "succeeded" : "failed", |
3444 | GET_MODE_NAME (loop_vinfo->vector_mode)); |
3445 | |
3446 | if (res && !main_loop_vinfo && suggested_unroll_factor > 1) |
3447 | { |
3448 | if (dump_enabled_p ()) |
3449 | dump_printf_loc (MSG_NOTE, vect_location, |
3450 | "***** Re-trying analysis for unrolling" |
3451 | " with unroll factor %d and slp %s.\n" , |
3452 | suggested_unroll_factor, |
3453 | slp_done_for_suggested_uf ? "on" : "off" ); |
3454 | loop_vec_info unroll_vinfo |
3455 | = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo); |
3456 | unroll_vinfo->vector_mode = vector_mode; |
3457 | unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor; |
3458 | opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL, |
3459 | slp_done_for_suggested_uf); |
3460 | if (new_res) |
3461 | { |
3462 | delete loop_vinfo; |
3463 | loop_vinfo = unroll_vinfo; |
3464 | } |
3465 | else |
3466 | delete unroll_vinfo; |
3467 | } |
3468 | |
3469 | /* Remember the autodetected vector mode. */ |
3470 | if (vector_mode == VOIDmode) |
3471 | autodetected_vector_mode = loop_vinfo->vector_mode; |
3472 | |
3473 | /* Advance mode_i, first skipping modes that would result in the |
3474 | same analysis result. */ |
3475 | while (mode_i + 1 < vector_modes.length () |
3476 | && vect_chooses_same_modes_p (loop_vinfo, |
3477 | vector_modes[mode_i + 1])) |
3478 | { |
3479 | if (dump_enabled_p ()) |
3480 | dump_printf_loc (MSG_NOTE, vect_location, |
3481 | "***** The result for vector mode %s would" |
3482 | " be the same\n" , |
3483 | GET_MODE_NAME (vector_modes[mode_i + 1])); |
3484 | mode_i += 1; |
3485 | } |
3486 | if (mode_i + 1 < vector_modes.length () |
3487 | && VECTOR_MODE_P (autodetected_vector_mode) |
3488 | && (related_vector_mode (vector_modes[mode_i + 1], |
3489 | GET_MODE_INNER (autodetected_vector_mode)) |
3490 | == autodetected_vector_mode) |
3491 | && (related_vector_mode (autodetected_vector_mode, |
3492 | GET_MODE_INNER (vector_modes[mode_i + 1])) |
3493 | == vector_modes[mode_i + 1])) |
3494 | { |
3495 | if (dump_enabled_p ()) |
3496 | dump_printf_loc (MSG_NOTE, vect_location, |
3497 | "***** Skipping vector mode %s, which would" |
3498 | " repeat the analysis for %s\n" , |
3499 | GET_MODE_NAME (vector_modes[mode_i + 1]), |
3500 | GET_MODE_NAME (autodetected_vector_mode)); |
3501 | mode_i += 1; |
3502 | } |
3503 | mode_i++; |
3504 | |
3505 | if (!res) |
3506 | { |
3507 | delete loop_vinfo; |
3508 | if (fatal) |
3509 | gcc_checking_assert (main_loop_vinfo == NULL); |
3510 | return opt_loop_vec_info::propagate_failure (res); |
3511 | } |
3512 | |
3513 | return opt_loop_vec_info::success (loop_vinfo); |
3514 | } |
3515 | |
3516 | /* Function vect_analyze_loop. |
3517 | |
3518 | Apply a set of analyses on LOOP, and create a loop_vec_info struct |
3519 | for it. The different analyses will record information in the |
3520 | loop_vec_info struct. */ |
3521 | opt_loop_vec_info |
3522 | vect_analyze_loop (class loop *loop, vec_info_shared *shared) |
3523 | { |
3524 | DUMP_VECT_SCOPE ("analyze_loop_nest" ); |
3525 | |
3526 | if (loop_outer (loop) |
3527 | && loop_vec_info_for_loop (loop_outer (loop)) |
3528 | && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) |
3529 | return opt_loop_vec_info::failure_at (vect_location, |
3530 | "outer-loop already vectorized.\n"); |
3531 | |
3532 | if (!find_loop_nest (loop, &shared->loop_nest)) |
3533 | return opt_loop_vec_info::failure_at |
3534 | (vect_location, |
3535 | "not vectorized: loop nest containing two or more consecutive inner" |
3536 | " loops cannot be vectorized\n"); |
3537 | |
3538 | /* Analyze the loop form. */ |
3539 | vect_loop_form_info loop_form_info; |
3540 | opt_result res = vect_analyze_loop_form (loop, &loop_form_info); |
3541 | if (!res) |
3542 | { |
3543 | if (dump_enabled_p ()) |
3544 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3545 | "bad loop form.\n" ); |
3546 | return opt_loop_vec_info::propagate_failure (res); |
3547 | } |
3548 | if (!integer_onep (loop_form_info.assumptions)) |
3549 | { |
3550 | /* We consider to vectorize this loop by versioning it under |
3551 | some assumptions. In order to do this, we need to clear |
3552 | existing information computed by scev and niter analyzer. */ |
3553 | scev_reset_htab (); |
3554 | free_numbers_of_iterations_estimates (loop); |
3555 | /* Also set flag for this loop so that following scev and niter |
3556 | analysis are done under the assumptions. */ |
3557 | loop_constraint_set (loop, LOOP_C_FINITE); |
3558 | } |
3559 | |
3560 | auto_vector_modes vector_modes; |
3561 | /* Autodetect first vector size we try. */ |
3562 | vector_modes.safe_push (VOIDmode); |
3563 | unsigned int autovec_flags |
3564 | = targetm.vectorize.autovectorize_vector_modes (&vector_modes, |
3565 | loop->simdlen != 0); |
3566 | bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS) |
3567 | && !unlimited_cost_model (loop)); |
3568 | machine_mode autodetected_vector_mode = VOIDmode; |
3569 | opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL); |
3570 | unsigned int mode_i = 0; |
3571 | unsigned HOST_WIDE_INT simdlen = loop->simdlen; |
3572 | |
3573 | /* Keep track of the VF for each mode. Initialize all to 0 which indicates |
3574 | a mode has not been analyzed. */ |
3575 | auto_vec<poly_uint64, 8> cached_vf_per_mode; |
3576 | for (unsigned i = 0; i < vector_modes.length (); ++i) |
3577 | cached_vf_per_mode.safe_push (0); |
3578 | |
3579 | /* First determine the main loop vectorization mode, either the first |
3580 | one that works, starting with auto-detecting the vector mode and then |
3581 | following the targets order of preference, or the one with the |
3582 | lowest cost if pick_lowest_cost_p. */ |
3583 | while (1) |
3584 | { |
3585 | bool fatal; |
3586 | unsigned int last_mode_i = mode_i; |
3587 | /* Set cached VF to -1 prior to analysis, which indicates a mode has |
3588 | failed. */ |
3589 | cached_vf_per_mode[last_mode_i] = -1; |
3590 | opt_loop_vec_info loop_vinfo |
3591 | = vect_analyze_loop_1 (loop, shared, &loop_form_info, |
3592 | NULL, vector_modes, mode_i, |
3593 | autodetected_vector_mode, fatal); |
3594 | if (fatal) |
3595 | break; |
3596 | |
3597 | if (loop_vinfo) |
3598 | { |
3599 | /* Analysis has been successful, so update the VF value. The |
3600 | VF should always be a multiple of unroll_factor and we want to |
3601 | capture the original VF here. */ |
3602 | cached_vf_per_mode[last_mode_i] |
3603 | = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo), |
3604 | loop_vinfo->suggested_unroll_factor); |
3605 | /* Once we hit the desired simdlen for the first time, |
3606 | discard any previous attempts. */ |
3607 | if (simdlen |
3608 | && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen)) |
3609 | { |
3610 | delete first_loop_vinfo; |
3611 | first_loop_vinfo = opt_loop_vec_info::success (NULL); |
3612 | simdlen = 0; |
3613 | } |
3614 | else if (pick_lowest_cost_p |
3615 | && first_loop_vinfo |
3616 | && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo)) |
3617 | { |
3618 | /* Pick loop_vinfo over first_loop_vinfo. */ |
3619 | delete first_loop_vinfo; |
3620 | first_loop_vinfo = opt_loop_vec_info::success (NULL); |
3621 | } |
3622 | if (first_loop_vinfo == NULL) |
3623 | first_loop_vinfo = loop_vinfo; |
3624 | else |
3625 | { |
3626 | delete loop_vinfo; |
3627 | loop_vinfo = opt_loop_vec_info::success (NULL); |
3628 | } |
3629 | |
3630 | /* Commit to first_loop_vinfo if we have no reason to try |
3631 | alternatives. */ |
3632 | if (!simdlen && !pick_lowest_cost_p) |
3633 | break; |
3634 | } |
3635 | if (mode_i == vector_modes.length () |
3636 | || autodetected_vector_mode == VOIDmode) |
3637 | break; |
3638 | |
3639 | /* Try the next biggest vector size. */ |
3640 | if (dump_enabled_p ()) |
3641 | dump_printf_loc (MSG_NOTE, vect_location, |
3642 | "***** Re-trying analysis with vector mode %s\n" , |
3643 | GET_MODE_NAME (vector_modes[mode_i])); |
3644 | } |
3645 | if (!first_loop_vinfo) |
3646 | return opt_loop_vec_info::propagate_failure (res); |
3647 | |
3648 | if (dump_enabled_p ()) |
3649 | dump_printf_loc (MSG_NOTE, vect_location, |
3650 | "***** Choosing vector mode %s\n" , |
3651 | GET_MODE_NAME (first_loop_vinfo->vector_mode)); |
3652 | |
3653 | /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is |
3654 | enabled, SIMDUID is not set, it is the innermost loop and we have |
3655 | either already found the loop's SIMDLEN or there was no SIMDLEN to |
3656 | begin with. |
3657 | TODO: Enable epilogue vectorization for loops with SIMDUID set. */ |
3658 | bool vect_epilogues = (!simdlen |
3659 | && loop->inner == NULL |
3660 | && param_vect_epilogues_nomask |
3661 | && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo) |
3662 | && !loop->simduid); |
3663 | if (!vect_epilogues) |
3664 | return first_loop_vinfo; |
3665 | |
3666 | /* Now analyze first_loop_vinfo for epilogue vectorization. */ |
3667 | poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo); |
3668 | |
3669 | /* For epilogues start the analysis from the first mode. The motivation |
3670 | behind starting from the beginning comes from cases where the VECTOR_MODES |
3671 | array may contain length-agnostic and length-specific modes. Their |
3672 | ordering is not guaranteed, so we could end up picking a mode for the main |
3673 | loop that is after the epilogue's optimal mode. */ |
3674 | vector_modes[0] = autodetected_vector_mode; |
3675 | mode_i = 0; |
3676 | |
3677 | bool supports_partial_vectors = |
3678 | partial_vectors_supported_p () && param_vect_partial_vector_usage != 0; |
3679 | poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo); |
3680 | |
3681 | while (1) |
3682 | { |
3683 | /* If the target does not support partial vectors we can shorten the |
3684 | number of modes to analyze for the epilogue as we know we can't pick a |
3685 | mode that would lead to a VF at least as big as the |
3686 | FIRST_VINFO_VF. */ |
3687 | if (!supports_partial_vectors |
3688 | && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf)) |
3689 | { |
3690 | mode_i++; |
3691 | if (mode_i == vector_modes.length ()) |
3692 | break; |
3693 | continue; |
3694 | } |
3695 | |
3696 | if (dump_enabled_p ()) |
3697 | dump_printf_loc (MSG_NOTE, vect_location, |
3698 | "***** Re-trying epilogue analysis with vector " |
3699 | "mode %s\n" , GET_MODE_NAME (vector_modes[mode_i])); |
3700 | |
3701 | bool fatal; |
3702 | opt_loop_vec_info loop_vinfo |
3703 | = vect_analyze_loop_1 (loop, shared, &loop_form_info, |
3704 | first_loop_vinfo, |
3705 | vector_modes, mode_i, |
3706 | autodetected_vector_mode, fatal); |
3707 | if (fatal) |
3708 | break; |
3709 | |
3710 | if (loop_vinfo) |
3711 | { |
3712 | if (pick_lowest_cost_p) |
3713 | { |
3714 | /* Keep trying to roll back vectorization attempts while the |
3715 | loop_vec_infos they produced were worse than this one. */ |
3716 | vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos; |
3717 | while (!vinfos.is_empty () |
3718 | && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ())) |
3719 | { |
3720 | gcc_assert (vect_epilogues); |
3721 | delete vinfos.pop (); |
3722 | } |
3723 | } |
3724 | /* For now only allow one epilogue loop. */ |
3725 | if (first_loop_vinfo->epilogue_vinfos.is_empty ()) |
3726 | { |
3727 | first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); |
3728 | poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); |
3729 | gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo) |
3730 | || maybe_ne (lowest_th, 0U)); |
3731 | /* Keep track of the known smallest versioning |
3732 | threshold. */ |
3733 | if (ordered_p (lowest_th, th)) |
3734 | lowest_th = ordered_min (lowest_th, th); |
3735 | } |
3736 | else |
3737 | { |
3738 | delete loop_vinfo; |
3739 | loop_vinfo = opt_loop_vec_info::success (NULL); |
3740 | } |
3741 | |
3742 | /* For now only allow one epilogue loop, but allow |
3743 | pick_lowest_cost_p to replace it, so commit to the |
3744 | first epilogue if we have no reason to try alternatives. */ |
3745 | if (!pick_lowest_cost_p) |
3746 | break; |
3747 | } |
3748 | |
3749 | if (mode_i == vector_modes.length ()) |
3750 | break; |
3751 | |
3752 | } |
3753 | |
3754 | if (!first_loop_vinfo->epilogue_vinfos.is_empty ()) |
3755 | { |
3756 | LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th; |
3757 | if (dump_enabled_p ()) |
3758 | dump_printf_loc (MSG_NOTE, vect_location, |
3759 | "***** Choosing epilogue vector mode %s\n" , |
3760 | GET_MODE_NAME |
3761 | (first_loop_vinfo->epilogue_vinfos[0]->vector_mode)); |
3762 | } |
3763 | |
3764 | return first_loop_vinfo; |
3765 | } |
3766 | |
3767 | /* Return true if there is an in-order reduction function for CODE, storing |
3768 | it in *REDUC_FN if so. */ |
3769 | |
3770 | static bool |
3771 | fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn) |
3772 | { |
3773 | /* We support MINUS_EXPR by negating the operand. This also preserves an |
3774 | initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 + |
3775 | (-0.0) = -0.0. */ |
3776 | if (code == PLUS_EXPR || code == MINUS_EXPR) |
3777 | { |
3778 | *reduc_fn = IFN_FOLD_LEFT_PLUS; |
3779 | return true; |
3780 | } |
3781 | return false; |
3782 | } |
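/* For example (illustrative): an in-order reduction such as

     for (i = 0; i < n; i++)
       res -= a[i];

   is handled by rewriting each step as res = res + (-a[i]), so it maps
   onto IFN_FOLD_LEFT_PLUS with the loaded elements negated, as described
   in the comment above.  */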
3783 | |
3784 | /* Function reduction_fn_for_scalar_code |
3785 | |
3786 | Input: |
3787 | CODE - tree_code of a reduction operation. |
3788 | |
3789 | Output: |
3790 | REDUC_FN - the corresponding internal function to be used to reduce the |
3791 | vector of partial results into a single scalar result, or IFN_LAST |
3792 | if the operation is a supported reduction operation, but does not have |
3793 | such an internal function. |
3794 | |
3795 | Return FALSE if CODE currently cannot be vectorized as reduction. */ |
3796 | |
3797 | bool |
3798 | reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn) |
3799 | { |
3800 | if (code.is_tree_code ()) |
3801 | switch (tree_code (code)) |
3802 | { |
3803 | case MAX_EXPR: |
3804 | *reduc_fn = IFN_REDUC_MAX; |
3805 | return true; |
3806 | |
3807 | case MIN_EXPR: |
3808 | *reduc_fn = IFN_REDUC_MIN; |
3809 | return true; |
3810 | |
3811 | case PLUS_EXPR: |
3812 | *reduc_fn = IFN_REDUC_PLUS; |
3813 | return true; |
3814 | |
3815 | case BIT_AND_EXPR: |
3816 | *reduc_fn = IFN_REDUC_AND; |
3817 | return true; |
3818 | |
3819 | case BIT_IOR_EXPR: |
3820 | *reduc_fn = IFN_REDUC_IOR; |
3821 | return true; |
3822 | |
3823 | case BIT_XOR_EXPR: |
3824 | *reduc_fn = IFN_REDUC_XOR; |
3825 | return true; |
3826 | |
3827 | case MULT_EXPR: |
3828 | case MINUS_EXPR: |
3829 | *reduc_fn = IFN_LAST; |
3830 | return true; |
3831 | |
3832 | default: |
3833 | return false; |
3834 | } |
3835 | else |
3836 | switch (combined_fn (code)) |
3837 | { |
3838 | CASE_CFN_FMAX: |
3839 | *reduc_fn = IFN_REDUC_FMAX; |
3840 | return true; |
3841 | |
3842 | CASE_CFN_FMIN: |
3843 | *reduc_fn = IFN_REDUC_FMIN; |
3844 | return true; |
3845 | |
3846 | default: |
3847 | return false; |
3848 | } |
3849 | } |
3850 | |
3851 | /* If there is a neutral value X such that a reduction would not be affected |
3852 | by the introduction of additional X elements, return that X, otherwise |
3853 | return null. CODE is the code of the reduction and SCALAR_TYPE is type |
3854 | of the scalar elements. If the reduction has just a single initial value |
3855 | then INITIAL_VALUE is that value, otherwise it is null. |
3856 | If AS_INITIAL is TRUE the value is supposed to be used as initial value. |
3857 | In that case no signed zero is returned. */ |
3858 | |
3859 | tree |
3860 | neutral_op_for_reduction (tree scalar_type, code_helper code, |
3861 | tree initial_value, bool as_initial) |
3862 | { |
3863 | if (code.is_tree_code ()) |
3864 | switch (tree_code (code)) |
3865 | { |
3866 | case DOT_PROD_EXPR: |
3867 | case SAD_EXPR: |
3868 | case MINUS_EXPR: |
3869 | case BIT_IOR_EXPR: |
3870 | case BIT_XOR_EXPR: |
3871 | return build_zero_cst (scalar_type); |
3872 | case WIDEN_SUM_EXPR: |
3873 | case PLUS_EXPR: |
3874 | if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type)) |
3875 | return build_real (scalar_type, dconstm0); |
3876 | else |
3877 | return build_zero_cst (scalar_type); |
3878 | |
3879 | case MULT_EXPR: |
3880 | return build_one_cst (scalar_type); |
3881 | |
3882 | case BIT_AND_EXPR: |
3883 | return build_all_ones_cst (scalar_type); |
3884 | |
3885 | case MAX_EXPR: |
3886 | case MIN_EXPR: |
3887 | return initial_value; |
3888 | |
3889 | default: |
3890 | return NULL_TREE; |
3891 | } |
3892 | else |
3893 | switch (combined_fn (code)) |
3894 | { |
3895 | CASE_CFN_FMIN: |
3896 | CASE_CFN_FMAX: |
3897 | return initial_value; |
3898 | |
3899 | default: |
3900 | return NULL_TREE; |
3901 | } |
3902 | } |
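/* For example (illustrative): a PLUS_EXPR reduction can normally be padded
   with extra 0.0 elements, but when signed zeros are honoured and the value
   is not used as the initial value we must pad with -0.0 instead, since
   x + (-0.0) == x for every x whereas -0.0 + (+0.0) == +0.0 would lose the
   sign of a -0.0 result.  Similarly MULT_EXPR is padded with 1 and
   BIT_AND_EXPR with an all-ones value.  */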
3903 | |
3904 | /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement |
3905 | STMT is printed with a message MSG. */ |
3906 | |
3907 | static void |
3908 | report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) |
3909 | { |
3910 | dump_printf_loc (msg_type, vect_location, "%s%G" , msg, stmt); |
3911 | } |
3912 | |
3913 | /* Return true if we need an in-order (fold-left) reduction for operation |
3914 | CODE on type TYPE, i.e. if the reduction must preserve the original |
3915 | scalar evaluation order. */ |
3916 | |
3917 | bool |
3918 | needs_fold_left_reduction_p (tree type, code_helper code) |
3919 | { |
3920 | /* CHECKME: check for !flag_finite_math_only too? */ |
3921 | if (SCALAR_FLOAT_TYPE_P (type)) |
3922 | { |
3923 | if (code.is_tree_code ()) |
3924 | switch (tree_code (code)) |
3925 | { |
3926 | case MIN_EXPR: |
3927 | case MAX_EXPR: |
3928 | return false; |
3929 | |
3930 | default: |
3931 | return !flag_associative_math; |
3932 | } |
3933 | else |
3934 | switch (combined_fn (code)) |
3935 | { |
3936 | CASE_CFN_FMIN: |
3937 | CASE_CFN_FMAX: |
3938 | return false; |
3939 | |
3940 | default: |
3941 | return !flag_associative_math; |
3942 | } |
3943 | } |
3944 | |
3945 | if (INTEGRAL_TYPE_P (type)) |
3946 | return (!code.is_tree_code () |
3947 | || !operation_no_trapping_overflow (type, tree_code (code))); |
3948 | |
3949 | if (SAT_FIXED_POINT_TYPE_P (type)) |
3950 | return true; |
3951 | |
3952 | return false; |
3953 | } |
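/* For example (illustrative): a single-precision sum like

     float s = 0.0f;
     for (i = 0; i < n; i++)
       s += a[i];

   must be computed in order unless -fassociative-math is in effect,
   because FP addition is not associative: (1e20f + -1e20f) + 1.0f
   evaluates to 1.0f while 1e20f + (-1e20f + 1.0f) evaluates to 0.0f.
   Integer reductions only need the fold-left path when the operation
   may trap on overflow.  */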
3954 | |
3955 | /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and |
3956 | has a handled computation expression. Store the main reduction |
3957 | operation in *CODE. */ |
3958 | |
3959 | static bool |
3960 | check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, |
3961 | tree loop_arg, code_helper *code, |
3962 | vec<std::pair<ssa_op_iter, use_operand_p> > &path) |
3963 | { |
3964 | auto_bitmap visited; |
3965 | tree lookfor = PHI_RESULT (phi); |
3966 | ssa_op_iter curri; |
3967 | use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); |
3968 | while (USE_FROM_PTR (curr) != loop_arg) |
3969 | curr = op_iter_next_use (&curri); |
3970 | curri.i = curri.numops; |
3971 | do |
3972 | { |
3973 | path.safe_push (std::make_pair (curri, curr)); |
3974 | tree use = USE_FROM_PTR (curr); |
3975 | if (use == lookfor) |
3976 | break; |
3977 | gimple *def = SSA_NAME_DEF_STMT (use); |
3978 | if (gimple_nop_p (def) |
3979 | || ! flow_bb_inside_loop_p (loop, gimple_bb (def))) |
3980 | { |
3981 | pop: |
3982 | do |
3983 | { |
3984 | std::pair<ssa_op_iter, use_operand_p> x = path.pop (); |
3985 | curri = x.first; |
3986 | curr = x.second; |
3987 | do |
3988 | curr = op_iter_next_use (&curri); |
3989 | /* Skip already visited or non-SSA operands (from iterating |
3990 | over PHI args). */ |
3991 | while (curr != NULL_USE_OPERAND_P |
3992 | && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME |
3993 | || ! bitmap_set_bit (visited, |
3994 | SSA_NAME_VERSION |
3995 | (USE_FROM_PTR (curr))))); |
3996 | } |
3997 | while (curr == NULL_USE_OPERAND_P && ! path.is_empty ()); |
3998 | if (curr == NULL_USE_OPERAND_P) |
3999 | break; |
4000 | } |
4001 | else |
4002 | { |
4003 | if (gimple_code (g: def) == GIMPLE_PHI) |
4004 | curr = op_iter_init_phiuse (&curri, as_a <gphi *> (def), SSA_OP_USE); |
4005 | else |
4006 | curr = op_iter_init_use (&curri, def, SSA_OP_USE); |
4007 | while (curr != NULL_USE_OPERAND_P |
4008 | && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME |
4009 | || ! bitmap_set_bit (visited, |
4010 | SSA_NAME_VERSION |
4011 | (USE_FROM_PTR (curr))))) |
4012 | curr = op_iter_next_use (&curri); |
4013 | if (curr == NULL_USE_OPERAND_P) |
4014 | goto pop; |
4015 | } |
4016 | } |
4017 | while (1); |
4018 | if (dump_file && (dump_flags & TDF_DETAILS)) |
4019 | { |
4020 | dump_printf_loc (MSG_NOTE, loc, "reduction path: " ); |
4021 | unsigned i; |
4022 | std::pair<ssa_op_iter, use_operand_p> *x; |
4023 | FOR_EACH_VEC_ELT (path, i, x) |
4024 | dump_printf (MSG_NOTE, "%T " , USE_FROM_PTR (x->second)); |
4025 | dump_printf (MSG_NOTE, "\n" ); |
4026 | } |
4027 | |
4028 | /* Check whether the reduction path detected is valid. */ |
4029 | bool fail = path.length () == 0; |
4030 | bool neg = false; |
4031 | int sign = -1; |
4032 | *code = ERROR_MARK; |
4033 | for (unsigned i = 1; i < path.length (); ++i) |
4034 | { |
4035 | gimple *use_stmt = USE_STMT (path[i].second); |
4036 | gimple_match_op op; |
4037 | if (!gimple_extract_op (use_stmt, &op)) |
4038 | { |
4039 | fail = true; |
4040 | break; |
4041 | } |
4042 | unsigned int opi = op.num_ops; |
4043 | if (gassign *assign = dyn_cast<gassign *> (use_stmt)) |
4044 | { |
4045 | /* The following makes sure we can compute the operand index |
4046 | easily and it mostly disallows chaining via COND_EXPR condition |
4047 | operands. */ |
4048 | for (opi = 0; opi < op.num_ops; ++opi) |
4049 | if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use) |
4050 | break; |
4051 | } |
4052 | else if (gcall *call = dyn_cast<gcall *> (use_stmt)) |
4053 | { |
4054 | for (opi = 0; opi < op.num_ops; ++opi) |
4055 | if (gimple_call_arg_ptr (call, opi) == path[i].second->use) |
4056 | break; |
4057 | } |
4058 | if (opi == op.num_ops) |
4059 | { |
4060 | fail = true; |
4061 | break; |
4062 | } |
4063 | op.code = canonicalize_code (op.code, op.type); |
4064 | if (op.code == MINUS_EXPR) |
4065 | { |
4066 | op.code = PLUS_EXPR; |
4067 | /* Track whether we negate the reduction value each iteration. */ |
4068 | if (op.ops[1] == op.ops[opi]) |
4069 | neg = ! neg; |
4070 | } |
4071 | if (CONVERT_EXPR_CODE_P (op.code) |
4072 | && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))) |
4073 | ; |
4074 | else if (*code == ERROR_MARK) |
4075 | { |
4076 | *code = op.code; |
4077 | sign = TYPE_SIGN (op.type); |
4078 | } |
4079 | else if (op.code != *code) |
4080 | { |
4081 | fail = true; |
4082 | break; |
4083 | } |
4084 | else if ((op.code == MIN_EXPR |
4085 | || op.code == MAX_EXPR) |
4086 | && sign != TYPE_SIGN (op.type)) |
4087 | { |
4088 | fail = true; |
4089 | break; |
4090 | } |
4091 | /* Check there's only a single stmt the op is used on. For the |
4092 | not value-changing tail and the last stmt allow out-of-loop uses. |
4093 | ??? We could relax this and handle arbitrary live stmts by |
4094 | forcing a scalar epilogue for example. */ |
4095 | imm_use_iterator imm_iter; |
4096 | use_operand_p use_p; |
4097 | gimple *op_use_stmt; |
4098 | unsigned cnt = 0; |
4099 | bool cond_fn_p = op.code.is_internal_fn () |
4100 | && (conditional_internal_fn_code (internal_fn (op.code)) |
4101 | != ERROR_MARK); |
4102 | |
4103 | FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi]) |
4104 | { |
4105 | /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have |
4106 | op1 twice (once as definition, once as else) in the same operation. |
4107 | Allow this. */ |
4108 | if (cond_fn_p) |
4109 | { |
4110 | gcall *call = dyn_cast<gcall *> (use_stmt); |
4111 | unsigned else_pos |
4112 | = internal_fn_else_index (internal_fn (op.code)); |
4113 | |
4114 | for (unsigned int j = 0; j < gimple_call_num_args (call); ++j) |
4115 | { |
4116 | if (j == else_pos) |
4117 | continue; |
4118 | if (gimple_call_arg (call, j) == op.ops[opi]) |
4119 | cnt++; |
4120 | } |
4121 | } |
4122 | else if (!is_gimple_debug (op_use_stmt) |
4123 | && (*code != ERROR_MARK |
4124 | || flow_bb_inside_loop_p (loop, |
4125 | gimple_bb (op_use_stmt)))) |
4126 | FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) |
4127 | cnt++; |
4128 | } |
4129 | |
4130 | if (cnt != 1) |
4131 | { |
4132 | fail = true; |
4133 | break; |
4134 | } |
4135 | } |
4136 | return ! fail && ! neg && *code != ERROR_MARK; |
4137 | } |
4138 | |
4139 | bool |
4140 | check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, |
4141 | tree loop_arg, enum tree_code code) |
4142 | { |
4143 | auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; |
4144 | code_helper code_; |
4145 | return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path) |
4146 | && code_ == code); |
4147 | } |
4148 | |
4149 | |
4150 | |
4151 | /* Function vect_is_simple_reduction |
4152 | |
4153 | (1) Detect a cross-iteration def-use cycle that represents a simple |
4154 | reduction computation. We look for the following pattern: |
4155 | |
4156 | loop_header: |
4157 | a1 = phi < a0, a2 > |
4158 | a3 = ... |
4159 | a2 = operation (a3, a1) |
4160 | |
4161 | or |
4162 | |
4163 | a3 = ... |
4164 | loop_header: |
4165 | a1 = phi < a0, a2 > |
4166 | a2 = operation (a3, a1) |
4167 | |
4168 | such that: |
4169 | 1. operation is commutative and associative and it is safe to |
4170 | change the order of the computation |
4171 | 2. no uses for a2 in the loop (a2 is used out of the loop) |
4172 | 3. no uses of a1 in the loop besides the reduction operation |
4173 | 4. no uses of a1 outside the loop. |
4174 | |
4175 | Conditions 1,4 are tested here. |
4176 | Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. |
4177 | |
4178 | (2) Detect a cross-iteration def-use cycle in nested loops, i.e., |
4179 | nested cycles. |
4180 | |
4181 | (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double |
4182 | reductions: |
4183 | |
4184 | a1 = phi < a0, a2 > |
4185 | inner loop (def of a3) |
4186 | a2 = phi < a3 > |
4187 | |
4188 | (4) Detect condition expressions, i.e.: |
4189 | for (int i = 0; i < N; i++) |
4190 | if (a[i] < val) |
4191 | ret_val = a[i]; |
4192 | |
4193 | */ |
4194 | |
4195 | static stmt_vec_info |
4196 | vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, |
4197 | bool *double_reduc, bool *reduc_chain_p, bool slp) |
4198 | { |
4199 | gphi *phi = as_a <gphi *> (phi_info->stmt); |
4200 | gimple *phi_use_stmt = NULL; |
4201 | imm_use_iterator imm_iter; |
4202 | use_operand_p use_p; |
4203 | |
4204 | *double_reduc = false; |
4205 | *reduc_chain_p = false; |
4206 | STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION; |
4207 | |
4208 | tree phi_name = PHI_RESULT (phi); |
4209 | /* ??? If there are no uses of the PHI result the inner loop reduction |
4210 | won't be detected as possibly double-reduction by vectorizable_reduction |
4211 | because that tries to walk the PHI arg from the preheader edge which |
4212 | can be constant. See PR60382. */ |
4213 | if (has_zero_uses (phi_name)) |
4214 | return NULL; |
4215 | class loop *loop = (gimple_bb (phi))->loop_father; |
4216 | unsigned nphi_def_loop_uses = 0; |
4217 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) |
4218 | { |
4219 | gimple *use_stmt = USE_STMT (use_p); |
4220 | if (is_gimple_debug (use_stmt)) |
4221 | continue; |
4222 | |
4223 | if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) |
4224 | { |
4225 | if (dump_enabled_p ()) |
4226 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4227 | "intermediate value used outside loop.\n" ); |
4228 | |
4229 | return NULL; |
4230 | } |
4231 | |
4232 | /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have |
4233 | op1 twice (once as definition, once as else) in the same operation. |
4234 | Only count it as one. */ |
4235 | if (use_stmt != phi_use_stmt) |
4236 | { |
4237 | nphi_def_loop_uses++; |
4238 | phi_use_stmt = use_stmt; |
4239 | } |
4240 | } |
4241 | |
4242 | tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); |
4243 | if (TREE_CODE (latch_def) != SSA_NAME) |
4244 | { |
4245 | if (dump_enabled_p ()) |
4246 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4247 | "reduction: not ssa_name: %T\n" , latch_def); |
4248 | return NULL; |
4249 | } |
4250 | |
4251 | stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def); |
4252 | if (!def_stmt_info |
4253 | || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) |
4254 | return NULL; |
4255 | |
4256 | bool nested_in_vect_loop |
4257 | = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop); |
4258 | unsigned nlatch_def_loop_uses = 0; |
4259 | auto_vec<gphi *, 3> lcphis; |
4260 | bool inner_loop_of_double_reduc = false; |
4261 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def) |
4262 | { |
4263 | gimple *use_stmt = USE_STMT (use_p); |
4264 | if (is_gimple_debug (use_stmt)) |
4265 | continue; |
4266 | if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) |
4267 | nlatch_def_loop_uses++; |
4268 | else |
4269 | { |
4270 | /* We can have more than one loop-closed PHI. */ |
4271 | lcphis.safe_push (as_a <gphi *> (use_stmt)); |
4272 | if (nested_in_vect_loop |
4273 | && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt)) |
4274 | == vect_double_reduction_def)) |
4275 | inner_loop_of_double_reduc = true; |
4276 | } |
4277 | } |
4278 | |
4279 | /* If we are vectorizing an inner reduction we are executing that |
4280 | in the original order only in case we are not dealing with a |
4281 | double reduction. */ |
4282 | if (nested_in_vect_loop && !inner_loop_of_double_reduc) |
4283 | { |
4284 | if (dump_enabled_p ()) |
4285 | report_vect_op (MSG_NOTE, def_stmt_info->stmt, |
4286 | "detected nested cycle: "); |
4287 | return def_stmt_info; |
4288 | } |
4289 | |
4290 | /* When the inner loop of a double reduction ends up with more than |
4291 | one loop-closed PHI we have failed to classify alternate such |
4292 | PHIs as double reduction, leading to wrong code. See PR103237. */ |
4293 | if (inner_loop_of_double_reduc && lcphis.length () != 1) |
4294 | { |
4295 | if (dump_enabled_p ()) |
4296 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4297 | "unhandled double reduction\n"); |
4298 | return NULL; |
4299 | } |
4300 | |
4301 | /* If this isn't a nested cycle or if the nested cycle reduction value |
4302 | is used outside of the inner loop we cannot handle uses of the reduction |
4303 | value. */ |
4304 | if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1) |
4305 | { |
4306 | if (dump_enabled_p ()) |
4307 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4308 | "reduction used in loop.\n" ); |
4309 | return NULL; |
4310 | } |
4311 | |
4312 | /* If DEF_STMT is a phi node itself, we expect it to have a single argument |
4313 | defined in the inner loop. */ |
4314 | if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt)) |
4315 | { |
4316 | tree op1 = PHI_ARG_DEF (def_stmt, 0); |
4317 | if (gimple_phi_num_args (def_stmt) != 1 |
4318 | || TREE_CODE (op1) != SSA_NAME) |
4319 | { |
4320 | if (dump_enabled_p ()) |
4321 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4322 | "unsupported phi node definition.\n" ); |
4323 | |
4324 | return NULL; |
4325 | } |
4326 | |
4327 | /* Verify there is an inner cycle composed of the PHI phi_use_stmt |
4328 | and the latch definition op1. */ |
4329 | gimple *def1 = SSA_NAME_DEF_STMT (op1); |
4330 | if (gimple_bb (def1) |
4331 | && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) |
4332 | && loop->inner |
4333 | && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) |
4334 | && (is_gimple_assign (def1) || is_gimple_call (def1)) |
4335 | && is_a <gphi *> (phi_use_stmt) |
4336 | && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)) |
4337 | && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt, |
4338 | loop_latch_edge (loop->inner)))) |
4339 | { |
4340 | if (dump_enabled_p ()) |
4341 | report_vect_op (MSG_NOTE, def_stmt, |
4342 | "detected double reduction: "); |
4343 | |
4344 | *double_reduc = true; |
4345 | return def_stmt_info; |
4346 | } |
4347 | |
4348 | return NULL; |
4349 | } |
4350 | |
4351 | /* Look for the expression computing latch_def from the loop PHI result. */ |
4352 | auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; |
4353 | code_helper code; |
4354 | if (check_reduction_path (vect_location, loop, phi, latch_def, &code, |
4355 | path)) |
4356 | { |
4357 | STMT_VINFO_REDUC_CODE (phi_info) = code; |
4358 | if (code == COND_EXPR && !nested_in_vect_loop) |
4359 | STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION; |
4360 | |
4361 | /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP |
4362 | reduction chain for which the additional restriction is that |
4363 | all operations in the chain are the same. */ |
4364 | auto_vec<stmt_vec_info, 8> reduc_chain; |
4365 | unsigned i; |
4366 | bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR; |
4367 | for (i = path.length () - 1; i >= 1; --i) |
4368 | { |
4369 | gimple *stmt = USE_STMT (path[i].second); |
4370 | stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt); |
4371 | gimple_match_op op; |
4372 | if (!gimple_extract_op (stmt, &op)) |
4373 | gcc_unreachable (); |
4374 | if (gassign *assign = dyn_cast<gassign *> (stmt)) |
4375 | STMT_VINFO_REDUC_IDX (stmt_info) |
4376 | = path[i].second->use - gimple_assign_rhs1_ptr (assign); |
4377 | else |
4378 | { |
4379 | gcall *call = as_a<gcall *> (stmt); |
4380 | STMT_VINFO_REDUC_IDX (stmt_info) |
4381 | = path[i].second->use - gimple_call_arg_ptr (call, 0); |
4382 | } |
4383 | bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code) |
4384 | && (i == 1 || i == path.length () - 1)); |
4385 | if ((op.code != code && !leading_conversion) |
4386 | /* We can only handle the final value in epilogue |
4387 | generation for reduction chains. */ |
4388 | || (i != 1 && !has_single_use (var: gimple_get_lhs (stmt)))) |
4389 | is_slp_reduc = false; |
	  /* For reduction chains we support trailing/leading
	     conversions.  We do not store those in the actual chain.  */
4392 | if (leading_conversion) |
4393 | continue; |
4394 | reduc_chain.safe_push (obj: stmt_info); |
4395 | } |
4396 | if (slp && is_slp_reduc && reduc_chain.length () > 1) |
4397 | { |
4398 | for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) |
4399 | { |
4400 | REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; |
4401 | REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; |
4402 | } |
4403 | REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; |
4404 | REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; |
4405 | |
4406 | /* Save the chain for further analysis in SLP detection. */ |
4407 | LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (obj: reduc_chain[0]); |
4408 | REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length (); |
4409 | |
4410 | *reduc_chain_p = true; |
4411 | if (dump_enabled_p ()) |
4412 | dump_printf_loc (MSG_NOTE, vect_location, |
4413 | "reduction: detected reduction chain\n" ); |
4414 | } |
4415 | else if (dump_enabled_p ()) |
4416 | dump_printf_loc (MSG_NOTE, vect_location, |
4417 | "reduction: detected reduction\n" ); |
4418 | |
4419 | return def_stmt_info; |
4420 | } |
4421 | |
4422 | if (dump_enabled_p ()) |
4423 | dump_printf_loc (MSG_NOTE, vect_location, |
4424 | "reduction: unknown pattern\n" ); |
4425 | |
4426 | return NULL; |
4427 | } |
4428 | |
4429 | /* Estimate the number of peeled epilogue iterations for LOOP_VINFO. |
4430 | PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations, |
4431 | or -1 if not known. */ |
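/* Illustrative example (the numbers are assumptions, not taken from any
   target): with an assumed VF of 8, PEEL_ITERS_PROLOGUE of 3 and a known
   niter count of 100, the epilogue peels (100 - 3) % 8 = 1 iteration;
   if either quantity is unknown, the fallback estimate below is
   VF / 2 = 4.  */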
4432 | |
4433 | static int |
4434 | vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue) |
4435 | { |
4436 | int assumed_vf = vect_vf_for_cost (loop_vinfo); |
4437 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1) |
4438 | { |
4439 | if (dump_enabled_p ()) |
4440 | dump_printf_loc (MSG_NOTE, vect_location, |
4441 | "cost model: epilogue peel iters set to vf/2 " |
4442 | "because loop iterations are unknown .\n" ); |
4443 | return assumed_vf / 2; |
4444 | } |
4445 | else |
4446 | { |
4447 | int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); |
4448 | peel_iters_prologue = MIN (niters, peel_iters_prologue); |
4449 | int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf; |
      /* If we need to peel for gaps but no epilogue peeling would otherwise
	 be required, we have to peel VF iterations.  */
4452 | if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue) |
4453 | peel_iters_epilogue = assumed_vf; |
4454 | return peel_iters_epilogue; |
4455 | } |
4456 | } |
4457 | |
4458 | /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ |
4459 | int |
4460 | vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, |
4461 | int *peel_iters_epilogue, |
4462 | stmt_vector_for_cost *scalar_cost_vec, |
4463 | stmt_vector_for_cost *prologue_cost_vec, |
4464 | stmt_vector_for_cost *epilogue_cost_vec) |
4465 | { |
4466 | int retval = 0; |
4467 | |
4468 | *peel_iters_epilogue |
4469 | = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue); |
4470 | |
4471 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) |
4472 | { |
      /* If peeled iterations are known but the number of scalar loop
	 iterations is unknown, count a taken branch per peeled loop.  */
4475 | if (peel_iters_prologue > 0) |
4476 | retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, |
4477 | vect_prologue); |
4478 | if (*peel_iters_epilogue > 0) |
4479 | retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken, |
4480 | vect_epilogue); |
4481 | } |
4482 | |
4483 | stmt_info_for_cost *si; |
4484 | int j; |
4485 | if (peel_iters_prologue) |
4486 | FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) |
4487 | retval += record_stmt_cost (body_cost_vec: prologue_cost_vec, |
4488 | count: si->count * peel_iters_prologue, |
4489 | kind: si->kind, stmt_info: si->stmt_info, misalign: si->misalign, |
4490 | where: vect_prologue); |
4491 | if (*peel_iters_epilogue) |
4492 | FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) |
4493 | retval += record_stmt_cost (body_cost_vec: epilogue_cost_vec, |
4494 | count: si->count * *peel_iters_epilogue, |
4495 | kind: si->kind, stmt_info: si->stmt_info, misalign: si->misalign, |
4496 | where: vect_epilogue); |
4497 | |
4498 | return retval; |
4499 | } |
4500 | |
4501 | /* Function vect_estimate_min_profitable_iters |
4502 | |
4503 | Return the number of iterations required for the vector version of the |
4504 | loop to be profitable relative to the cost of the scalar version of the |
4505 | loop. |
4506 | |
4507 | *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold |
4508 | of iterations for vectorization. -1 value means loop vectorization |
4509 | is not profitable. This returned value may be used for dynamic |
4510 | profitability check. |
4511 | |
4512 | *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used |
4513 | for static check against estimated number of iterations. */ |
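/* A sketch of the intuition behind the two thresholds (not authoritative):
   the runtime threshold below folds the cost model check into the scalar
   path (the - SOC terms), since the check guards a dynamic choice between
   the two versions, whereas the static estimate charges it to the vector
   path (the + SOC terms), because a compile-time decision leaves the
   scalar version free of any guard.  */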
4514 | |
4515 | static void |
4516 | vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, |
4517 | int *ret_min_profitable_niters, |
4518 | int *ret_min_profitable_estimate, |
4519 | unsigned *suggested_unroll_factor) |
4520 | { |
4521 | int min_profitable_iters; |
4522 | int min_profitable_estimate; |
4523 | int peel_iters_prologue; |
4524 | int peel_iters_epilogue; |
4525 | unsigned vec_inside_cost = 0; |
4526 | int vec_outside_cost = 0; |
4527 | unsigned vec_prologue_cost = 0; |
4528 | unsigned vec_epilogue_cost = 0; |
4529 | int scalar_single_iter_cost = 0; |
4530 | int scalar_outside_cost = 0; |
4531 | int assumed_vf = vect_vf_for_cost (loop_vinfo); |
4532 | int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
4533 | vector_costs *target_cost_data = loop_vinfo->vector_costs; |
4534 | |
4535 | /* Cost model disabled. */ |
4536 | if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
4537 | { |
4538 | if (dump_enabled_p ()) |
4539 | dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n" ); |
4540 | *ret_min_profitable_niters = 0; |
4541 | *ret_min_profitable_estimate = 0; |
4542 | return; |
4543 | } |
4544 | |
4545 | /* Requires loop versioning tests to handle misalignment. */ |
4546 | if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) |
4547 | { |
4548 | /* FIXME: Make cost depend on complexity of individual check. */ |
4549 | unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); |
4550 | (void) add_stmt_cost (costs: target_cost_data, count: len, kind: scalar_stmt, where: vect_prologue); |
4551 | if (dump_enabled_p ()) |
4552 | dump_printf (MSG_NOTE, |
4553 | "cost model: Adding cost of checks for loop " |
4554 | "versioning to treat misalignment.\n" ); |
4555 | } |
4556 | |
4557 | /* Requires loop versioning with alias checks. */ |
4558 | if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) |
4559 | { |
4560 | /* FIXME: Make cost depend on complexity of individual check. */ |
4561 | unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length (); |
4562 | (void) add_stmt_cost (costs: target_cost_data, count: len, kind: scalar_stmt, where: vect_prologue); |
4563 | len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length (); |
4564 | if (len) |
4565 | /* Count LEN - 1 ANDs and LEN comparisons. */ |
4566 | (void) add_stmt_cost (costs: target_cost_data, count: len * 2 - 1, |
4567 | kind: scalar_stmt, where: vect_prologue); |
4568 | len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length (); |
4569 | if (len) |
4570 | { |
4571 | /* Count LEN - 1 ANDs and LEN comparisons. */ |
4572 | unsigned int nstmts = len * 2 - 1; |
4573 | /* +1 for each bias that needs adding. */ |
4574 | for (unsigned int i = 0; i < len; ++i) |
4575 | if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) |
4576 | nstmts += 1; |
4577 | (void) add_stmt_cost (costs: target_cost_data, count: nstmts, |
4578 | kind: scalar_stmt, where: vect_prologue); |
4579 | } |
4580 | if (dump_enabled_p ()) |
4581 | dump_printf (MSG_NOTE, |
4582 | "cost model: Adding cost of checks for loop " |
4583 | "versioning aliasing.\n" ); |
4584 | } |
4585 | |
4586 | /* Requires loop versioning with niter checks. */ |
4587 | if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) |
4588 | { |
4589 | /* FIXME: Make cost depend on complexity of individual check. */ |
4590 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: vector_stmt, |
4591 | NULL, NULL, NULL_TREE, misalign: 0, where: vect_prologue); |
4592 | if (dump_enabled_p ()) |
4593 | dump_printf (MSG_NOTE, |
4594 | "cost model: Adding cost of checks for loop " |
4595 | "versioning niters.\n" ); |
4596 | } |
4597 | |
4598 | if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
4599 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: cond_branch_taken, |
4600 | where: vect_prologue); |
4601 | |
4602 | /* Count statements in scalar loop. Using this as scalar cost for a single |
4603 | iteration for now. |
4604 | |
4605 | TODO: Add outer loop support. |
4606 | |
4607 | TODO: Consider assigning different costs to different scalar |
4608 | statements. */ |
4609 | |
4610 | scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost (); |
4611 | |
4612 | /* Add additional cost for the peeled instructions in prologue and epilogue |
4613 | loop. (For fully-masked loops there will be no peeling.) |
4614 | |
4615 | FORNOW: If we don't know the value of peel_iters for prologue or epilogue |
4616 | at compile-time - we assume it's vf/2 (the worst would be vf-1). |
4617 | |
4618 | TODO: Build an expression that represents peel_iters for prologue and |
4619 | epilogue to be used in a run-time test. */ |
4620 | |
4621 | bool prologue_need_br_taken_cost = false; |
4622 | bool prologue_need_br_not_taken_cost = false; |
4623 | |
4624 | /* Calculate peel_iters_prologue. */ |
4625 | if (vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
4626 | peel_iters_prologue = 0; |
4627 | else if (npeel < 0) |
4628 | { |
4629 | peel_iters_prologue = assumed_vf / 2; |
4630 | if (dump_enabled_p ()) |
4631 | dump_printf (MSG_NOTE, "cost model: " |
4632 | "prologue peel iters set to vf/2.\n" ); |
4633 | |
4634 | /* If peeled iterations are unknown, count a taken branch and a not taken |
4635 | branch per peeled loop. Even if scalar loop iterations are known, |
4636 | vector iterations are not known since peeled prologue iterations are |
4637 | not known. Hence guards remain the same. */ |
4638 | prologue_need_br_taken_cost = true; |
4639 | prologue_need_br_not_taken_cost = true; |
4640 | } |
4641 | else |
4642 | { |
4643 | peel_iters_prologue = npeel; |
4644 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0) |
	/* If peeled iterations are known but the number of scalar loop
	   iterations is unknown, count a taken branch per peeled loop.  */
4647 | prologue_need_br_taken_cost = true; |
4648 | } |
4649 | |
4650 | bool epilogue_need_br_taken_cost = false; |
4651 | bool epilogue_need_br_not_taken_cost = false; |
4652 | |
4653 | /* Calculate peel_iters_epilogue. */ |
4654 | if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
4655 | /* We need to peel exactly one iteration for gaps. */ |
4656 | peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0; |
4657 | else if (npeel < 0) |
4658 | { |
      /* If peeling for alignment is unknown, the loop bound of the main
	 loop becomes unknown.  */
4661 | peel_iters_epilogue = assumed_vf / 2; |
4662 | if (dump_enabled_p ()) |
4663 | dump_printf (MSG_NOTE, "cost model: " |
4664 | "epilogue peel iters set to vf/2 because " |
4665 | "peeling for alignment is unknown.\n" ); |
4666 | |
4667 | /* See the same reason above in peel_iters_prologue calculation. */ |
4668 | epilogue_need_br_taken_cost = true; |
4669 | epilogue_need_br_not_taken_cost = true; |
4670 | } |
4671 | else |
4672 | { |
4673 | peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue: npeel); |
4674 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0) |
	/* If peeled iterations are known but the number of scalar loop
	   iterations is unknown, count a taken branch per peeled loop.  */
4677 | epilogue_need_br_taken_cost = true; |
4678 | } |
4679 | |
4680 | stmt_info_for_cost *si; |
4681 | int j; |
4682 | /* Add costs associated with peel_iters_prologue. */ |
4683 | if (peel_iters_prologue) |
4684 | FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) |
4685 | { |
4686 | (void) add_stmt_cost (costs: target_cost_data, |
4687 | count: si->count * peel_iters_prologue, kind: si->kind, |
4688 | stmt_info: si->stmt_info, node: si->node, vectype: si->vectype, |
4689 | misalign: si->misalign, where: vect_prologue); |
4690 | } |
4691 | |
4692 | /* Add costs associated with peel_iters_epilogue. */ |
4693 | if (peel_iters_epilogue) |
4694 | FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) |
4695 | { |
4696 | (void) add_stmt_cost (costs: target_cost_data, |
4697 | count: si->count * peel_iters_epilogue, kind: si->kind, |
4698 | stmt_info: si->stmt_info, node: si->node, vectype: si->vectype, |
4699 | misalign: si->misalign, where: vect_epilogue); |
4700 | } |
4701 | |
4702 | /* Add possible cond_branch_taken/cond_branch_not_taken cost. */ |
4703 | |
4704 | if (prologue_need_br_taken_cost) |
4705 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: cond_branch_taken, |
4706 | where: vect_prologue); |
4707 | |
4708 | if (prologue_need_br_not_taken_cost) |
4709 | (void) add_stmt_cost (costs: target_cost_data, count: 1, |
4710 | kind: cond_branch_not_taken, where: vect_prologue); |
4711 | |
4712 | if (epilogue_need_br_taken_cost) |
4713 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: cond_branch_taken, |
4714 | where: vect_epilogue); |
4715 | |
4716 | if (epilogue_need_br_not_taken_cost) |
4717 | (void) add_stmt_cost (costs: target_cost_data, count: 1, |
4718 | kind: cond_branch_not_taken, where: vect_epilogue); |
4719 | |
4720 | /* Take care of special costs for rgroup controls of partial vectors. */ |
4721 | if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
4722 | && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) |
4723 | == vect_partial_vectors_avx512)) |
4724 | { |
4725 | /* Calculate how many masks we need to generate. */ |
4726 | unsigned int num_masks = 0; |
4727 | bool need_saturation = false; |
4728 | for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec) |
4729 | if (rgm.type) |
4730 | { |
4731 | unsigned nvectors = rgm.factor; |
4732 | num_masks += nvectors; |
4733 | if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type)) |
4734 | < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo))) |
4735 | need_saturation = true; |
4736 | } |
4737 | |
      /* ??? The target isn't able to identify the costs below as
	 producing masks so it cannot penalize cases where we'd run
	 out of mask registers, for example.  */
4741 | |
4742 | /* ??? We are also failing to account for smaller vector masks |
4743 | we generate by splitting larger masks in vect_get_loop_mask. */ |
4744 | |
4745 | /* In the worst case, we need to generate each mask in the prologue |
4746 | and in the loop body. We need one splat per group and one |
4747 | compare per mask. |
4748 | |
4749 | Sometimes the prologue mask will fold to a constant, |
4750 | so the actual prologue cost might be smaller. However, it's |
4751 | simpler and safer to use the worst-case cost; if this ends up |
4752 | being the tie-breaker between vectorizing or not, then it's |
4753 | probably better not to vectorize. */ |
4754 | (void) add_stmt_cost (costs: target_cost_data, |
4755 | count: num_masks |
4756 | + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (), |
4757 | kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0, |
4758 | where: vect_prologue); |
4759 | (void) add_stmt_cost (costs: target_cost_data, |
4760 | count: num_masks |
4761 | + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (), |
4762 | kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0, where: vect_body); |
4763 | |
4764 | /* When we need saturation we need it both in the prologue and |
4765 | the epilogue. */ |
4766 | if (need_saturation) |
4767 | { |
4768 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: scalar_stmt, |
4769 | NULL, NULL, NULL_TREE, misalign: 0, where: vect_prologue); |
4770 | (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: scalar_stmt, |
4771 | NULL, NULL, NULL_TREE, misalign: 0, where: vect_body); |
4772 | } |
4773 | } |
4774 | else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
4775 | && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) |
4776 | == vect_partial_vectors_while_ult)) |
4777 | { |
4778 | /* Calculate how many masks we need to generate. */ |
4779 | unsigned int num_masks = 0; |
4780 | rgroup_controls *rgm; |
4781 | unsigned int num_vectors_m1; |
4782 | FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, |
4783 | num_vectors_m1, rgm) |
4784 | if (rgm->type) |
4785 | num_masks += num_vectors_m1 + 1; |
4786 | gcc_assert (num_masks > 0); |
4787 | |
4788 | /* In the worst case, we need to generate each mask in the prologue |
4789 | and in the loop body. One of the loop body mask instructions |
4790 | replaces the comparison in the scalar loop, and since we don't |
4791 | count the scalar comparison against the scalar body, we shouldn't |
4792 | count that vector instruction against the vector body either. |
4793 | |
4794 | Sometimes we can use unpacks instead of generating prologue |
4795 | masks and sometimes the prologue mask will fold to a constant, |
4796 | so the actual prologue cost might be smaller. However, it's |
4797 | simpler and safer to use the worst-case cost; if this ends up |
4798 | being the tie-breaker between vectorizing or not, then it's |
4799 | probably better not to vectorize. */ |
4800 | (void) add_stmt_cost (costs: target_cost_data, count: num_masks, |
4801 | kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0, |
4802 | where: vect_prologue); |
4803 | (void) add_stmt_cost (costs: target_cost_data, count: num_masks - 1, |
4804 | kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0, |
4805 | where: vect_body); |
4806 | } |
4807 | else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) |
4808 | { |
4809 | /* Referring to the functions vect_set_loop_condition_partial_vectors |
4810 | and vect_set_loop_controls_directly, we need to generate each |
4811 | length in the prologue and in the loop body if required. Although |
4812 | there are some possible optimizations, we consider the worst case |
4813 | here. */ |
4814 | |
4815 | bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo); |
4816 | signed char partial_load_store_bias |
4817 | = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); |
4818 | bool need_iterate_p |
4819 | = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) |
4820 | && !vect_known_niters_smaller_than_vf (loop_vinfo)); |
4821 | |
4822 | /* Calculate how many statements to be added. */ |
4823 | unsigned int prologue_stmts = 0; |
4824 | unsigned int body_stmts = 0; |
4825 | |
4826 | rgroup_controls *rgc; |
4827 | unsigned int num_vectors_m1; |
4828 | FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) |
4829 | if (rgc->type) |
4830 | { |
4831 | /* May need one SHIFT for nitems_total computation. */ |
4832 | unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor; |
4833 | if (nitems != 1 && !niters_known_p) |
4834 | prologue_stmts += 1; |
4835 | |
4836 | /* May need one MAX and one MINUS for wrap around. */ |
4837 | if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc)) |
4838 | prologue_stmts += 2; |
4839 | |
	  /* Need one MAX and one MINUS for each batch limit except for
	     the 1st one.  */
4842 | prologue_stmts += num_vectors_m1 * 2; |
4843 | |
4844 | unsigned int num_vectors = num_vectors_m1 + 1; |
4845 | |
4846 | /* Need to set up lengths in prologue, only one MIN required |
4847 | for each since start index is zero. */ |
4848 | prologue_stmts += num_vectors; |
4849 | |
4850 | /* If we have a non-zero partial load bias, we need one PLUS |
4851 | to adjust the load length. */ |
4852 | if (partial_load_store_bias != 0) |
4853 | body_stmts += 1; |
4854 | |
4855 | /* Each may need two MINs and one MINUS to update lengths in body |
4856 | for next iteration. */ |
4857 | if (need_iterate_p) |
4858 | body_stmts += 3 * num_vectors; |
4859 | } |
4860 | |
4861 | (void) add_stmt_cost (costs: target_cost_data, count: prologue_stmts, |
4862 | kind: scalar_stmt, where: vect_prologue); |
4863 | (void) add_stmt_cost (costs: target_cost_data, count: body_stmts, |
4864 | kind: scalar_stmt, where: vect_body); |
4865 | } |
4866 | |
4867 | /* FORNOW: The scalar outside cost is incremented in one of the |
4868 | following ways: |
4869 | |
4870 | 1. The vectorizer checks for alignment and aliasing and generates |
4871 | a condition that allows dynamic vectorization. A cost model |
     check is ANDed with the versioning condition.  Hence the scalar code
     path now has the added cost of the versioning check.
4874 | |
4875 | if (cost > th & versioning_check) |
4876 | jmp to vector code |
4877 | |
     Hence the run-time scalar cost is incremented by a not-taken branch cost.
4879 | |
4880 | 2. The vectorizer then checks if a prologue is required. If the |
4881 | cost model check was not done before during versioning, it has to |
4882 | be done before the prologue check. |
4883 | |
4884 | if (cost <= th) |
4885 | prologue = scalar_iters |
4886 | if (prologue == 0) |
4887 | jmp to vector code |
4888 | else |
4889 | execute prologue |
4890 | if (prologue == num_iters) |
4891 | go to exit |
4892 | |
4893 | Hence the run-time scalar cost is incremented by a taken branch, |
4894 | plus a not-taken branch, plus a taken branch cost. |
4895 | |
4896 | 3. The vectorizer then checks if an epilogue is required. If the |
4897 | cost model check was not done before during prologue check, it |
4898 | has to be done with the epilogue check. |
4899 | |
4900 | if (prologue == 0) |
4901 | jmp to vector code |
4902 | else |
4903 | execute prologue |
4904 | if (prologue == num_iters) |
4905 | go to exit |
4906 | vector code: |
4907 | if ((cost <= th) | (scalar_iters-prologue-epilogue == 0)) |
4908 | jmp to epilogue |
4909 | |
4910 | Hence the run-time scalar cost should be incremented by 2 taken |
4911 | branches. |
4912 | |
     TODO: The back end may reorder the BBs differently and reverse
4914 | conditions/branch directions. Change the estimates below to |
4915 | something more reasonable. */ |
4916 | |
  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     does not carry cost model guard costs.  */
4920 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
4921 | || LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
4922 | { |
4923 | /* Cost model check occurs at versioning. */ |
4924 | if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
4925 | scalar_outside_cost += vect_get_stmt_cost (type_of_cost: cond_branch_not_taken); |
4926 | else |
4927 | { |
4928 | /* Cost model check occurs at prologue generation. */ |
4929 | if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) |
4930 | scalar_outside_cost += 2 * vect_get_stmt_cost (type_of_cost: cond_branch_taken) |
4931 | + vect_get_stmt_cost (type_of_cost: cond_branch_not_taken); |
4932 | /* Cost model check occurs at epilogue generation. */ |
4933 | else |
4934 | scalar_outside_cost += 2 * vect_get_stmt_cost (type_of_cost: cond_branch_taken); |
4935 | } |
4936 | } |
4937 | |
4938 | /* Complete the target-specific cost calculations. */ |
4939 | finish_cost (costs: loop_vinfo->vector_costs, scalar_costs: loop_vinfo->scalar_costs, |
4940 | prologue_cost: &vec_prologue_cost, body_cost: &vec_inside_cost, epilogue_cost: &vec_epilogue_cost, |
4941 | suggested_unroll_factor); |
4942 | |
4943 | if (suggested_unroll_factor && *suggested_unroll_factor > 1 |
4944 | && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR |
4945 | && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) * |
4946 | *suggested_unroll_factor, |
4947 | LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo))) |
4948 | { |
4949 | if (dump_enabled_p ()) |
4950 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4951 | "can't unroll as unrolled vectorization factor larger" |
4952 | " than maximum vectorization factor: " |
4953 | HOST_WIDE_INT_PRINT_UNSIGNED "\n" , |
4954 | LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); |
4955 | *suggested_unroll_factor = 1; |
4956 | } |
4957 | |
4958 | vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); |
4959 | |
4960 | if (dump_enabled_p ()) |
4961 | { |
4962 | dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n" ); |
4963 | dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n" , |
4964 | vec_inside_cost); |
4965 | dump_printf (MSG_NOTE, " Vector prologue cost: %d\n" , |
4966 | vec_prologue_cost); |
4967 | dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n" , |
4968 | vec_epilogue_cost); |
4969 | dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n" , |
4970 | scalar_single_iter_cost); |
4971 | dump_printf (MSG_NOTE, " Scalar outside cost: %d\n" , |
4972 | scalar_outside_cost); |
4973 | dump_printf (MSG_NOTE, " Vector outside cost: %d\n" , |
4974 | vec_outside_cost); |
4975 | dump_printf (MSG_NOTE, " prologue iterations: %d\n" , |
4976 | peel_iters_prologue); |
4977 | dump_printf (MSG_NOTE, " epilogue iterations: %d\n" , |
4978 | peel_iters_epilogue); |
4979 | } |
4980 | |
4981 | /* Calculate number of iterations required to make the vector version |
4982 | profitable, relative to the loop bodies only. The following condition |
4983 | must hold true: |
4984 | SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC |
4985 | where |
4986 | SIC = scalar iteration cost, VIC = vector iteration cost, |
4987 | VOC = vector outside cost, VF = vectorization factor, |
4988 | NPEEL = prologue iterations + epilogue iterations, |
4989 | SOC = scalar outside cost for run time cost model check. */ |
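  /* Purely illustrative numbers (assumptions, not from a target cost table):
     with SIC = 4, VIC = 10, VF = 4, NPEEL = 2, VOC = 20 and SOC = 6, each
     vector iteration saves SIC * VF - VIC = 6 units and the overhead
     VOC - SIC * NPEEL - SOC = 6 units is amortized after
     6 / 6 + 1 = 2 vector iterations, matching the computation below.  */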
4990 | |
4991 | int saving_per_viter = (scalar_single_iter_cost * assumed_vf |
4992 | - vec_inside_cost); |
4993 | if (saving_per_viter <= 0) |
4994 | { |
4995 | if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) |
4996 | warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd, |
4997 | "vectorization did not happen for a simd loop" ); |
4998 | |
4999 | if (dump_enabled_p ()) |
5000 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
5001 | "cost model: the vector iteration cost = %d " |
5002 | "divided by the scalar iteration cost = %d " |
5003 | "is greater or equal to the vectorization factor = %d" |
5004 | ".\n" , |
5005 | vec_inside_cost, scalar_single_iter_cost, assumed_vf); |
5006 | *ret_min_profitable_niters = -1; |
5007 | *ret_min_profitable_estimate = -1; |
5008 | return; |
5009 | } |
5010 | |
5011 | /* ??? The "if" arm is written to handle all cases; see below for what |
5012 | we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */ |
5013 | if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
5014 | { |
5015 | /* Rewriting the condition above in terms of the number of |
5016 | vector iterations (vniters) rather than the number of |
5017 | scalar iterations (niters) gives: |
5018 | |
5019 | SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC |
5020 | |
5021 | <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC |
5022 | |
5023 | For integer N, X and Y when X > 0: |
5024 | |
5025 | N * X > Y <==> N >= (Y /[floor] X) + 1. */ |
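      /* E.g. with Y = 27 and X = 6: N * 6 > 27 exactly when
	 N >= 27 / 6 + 1 = 5 under integer (floor) division.  */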
5026 | int outside_overhead = (vec_outside_cost |
5027 | - scalar_single_iter_cost * peel_iters_prologue |
5028 | - scalar_single_iter_cost * peel_iters_epilogue |
5029 | - scalar_outside_cost); |
5030 | /* We're only interested in cases that require at least one |
5031 | vector iteration. */ |
5032 | int min_vec_niters = 1; |
5033 | if (outside_overhead > 0) |
5034 | min_vec_niters = outside_overhead / saving_per_viter + 1; |
5035 | |
5036 | if (dump_enabled_p ()) |
5037 | dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n" , |
5038 | min_vec_niters); |
5039 | |
5040 | if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
5041 | { |
5042 | /* Now that we know the minimum number of vector iterations, |
5043 | find the minimum niters for which the scalar cost is larger: |
5044 | |
5045 | SIC * niters > VIC * vniters + VOC - SOC |
5046 | |
5047 | We know that the minimum niters is no more than |
5048 | vniters * VF + NPEEL, but it might be (and often is) less |
5049 | than that if a partial vector iteration is cheaper than the |
5050 | equivalent scalar code. */ |
5051 | int threshold = (vec_inside_cost * min_vec_niters |
5052 | + vec_outside_cost |
5053 | - scalar_outside_cost); |
5054 | if (threshold <= 0) |
5055 | min_profitable_iters = 1; |
5056 | else |
5057 | min_profitable_iters = threshold / scalar_single_iter_cost + 1; |
5058 | } |
5059 | else |
5060 | /* Convert the number of vector iterations into a number of |
5061 | scalar iterations. */ |
5062 | min_profitable_iters = (min_vec_niters * assumed_vf |
5063 | + peel_iters_prologue |
5064 | + peel_iters_epilogue); |
5065 | } |
5066 | else |
5067 | { |
5068 | min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) |
5069 | * assumed_vf |
5070 | - vec_inside_cost * peel_iters_prologue |
5071 | - vec_inside_cost * peel_iters_epilogue); |
5072 | if (min_profitable_iters <= 0) |
5073 | min_profitable_iters = 0; |
5074 | else |
5075 | { |
5076 | min_profitable_iters /= saving_per_viter; |
5077 | |
5078 | if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) |
5079 | <= (((int) vec_inside_cost * min_profitable_iters) |
5080 | + (((int) vec_outside_cost - scalar_outside_cost) |
5081 | * assumed_vf))) |
5082 | min_profitable_iters++; |
5083 | } |
5084 | } |
5085 | |
5086 | if (dump_enabled_p ()) |
5087 | dump_printf (MSG_NOTE, |
5088 | " Calculated minimum iters for profitability: %d\n" , |
5089 | min_profitable_iters); |
5090 | |
5091 | if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
5092 | && min_profitable_iters < (assumed_vf + peel_iters_prologue)) |
5093 | /* We want the vectorized loop to execute at least once. */ |
5094 | min_profitable_iters = assumed_vf + peel_iters_prologue; |
5095 | else if (min_profitable_iters < peel_iters_prologue) |
5096 | /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the |
5097 | vectorized loop executes at least once. */ |
5098 | min_profitable_iters = peel_iters_prologue; |
5099 | |
5100 | if (dump_enabled_p ()) |
5101 | dump_printf_loc (MSG_NOTE, vect_location, |
5102 | " Runtime profitability threshold = %d\n" , |
5103 | min_profitable_iters); |
5104 | |
5105 | *ret_min_profitable_niters = min_profitable_iters; |
5106 | |
5107 | /* Calculate number of iterations required to make the vector version |
5108 | profitable, relative to the loop bodies only. |
5109 | |
     The non-vectorized variant costs SIC * niters and it must win over the
     vector variant on the expected loop trip count.  The following condition
     must hold true:
5112 | SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */ |
5113 | |
5114 | if (vec_outside_cost <= 0) |
5115 | min_profitable_estimate = 0; |
5116 | /* ??? This "else if" arm is written to handle all cases; see below for |
5117 | what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */ |
5118 | else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
5119 | { |
5120 | /* This is a repeat of the code above, but with + SOC rather |
5121 | than - SOC. */ |
5122 | int outside_overhead = (vec_outside_cost |
5123 | - scalar_single_iter_cost * peel_iters_prologue |
5124 | - scalar_single_iter_cost * peel_iters_epilogue |
5125 | + scalar_outside_cost); |
5126 | int min_vec_niters = 1; |
5127 | if (outside_overhead > 0) |
5128 | min_vec_niters = outside_overhead / saving_per_viter + 1; |
5129 | |
5130 | if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
5131 | { |
5132 | int threshold = (vec_inside_cost * min_vec_niters |
5133 | + vec_outside_cost |
5134 | + scalar_outside_cost); |
5135 | min_profitable_estimate = threshold / scalar_single_iter_cost + 1; |
5136 | } |
5137 | else |
5138 | min_profitable_estimate = (min_vec_niters * assumed_vf |
5139 | + peel_iters_prologue |
5140 | + peel_iters_epilogue); |
5141 | } |
5142 | else |
5143 | { |
5144 | min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) |
5145 | * assumed_vf |
5146 | - vec_inside_cost * peel_iters_prologue |
5147 | - vec_inside_cost * peel_iters_epilogue) |
5148 | / ((scalar_single_iter_cost * assumed_vf) |
5149 | - vec_inside_cost); |
5150 | } |
5151 | min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters); |
5152 | if (dump_enabled_p ()) |
5153 | dump_printf_loc (MSG_NOTE, vect_location, |
5154 | " Static estimate profitability threshold = %d\n" , |
5155 | min_profitable_estimate); |
5156 | |
5157 | *ret_min_profitable_estimate = min_profitable_estimate; |
5158 | } |
5159 | |
5160 | /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET |
5161 | vector elements (not bits) for a vector with NELT elements. */ |
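/* Illustration (an added example, not from the original comment): OFFSET = 2
   and NELT = 8 give the stepped series 2, 3, 4, which vec_perm_indices
   extends to the selector {2, 3, 4, 5, 6, 7, 8, 9}; in a two-input
   permutation the indices 8 and 9 select from the second operand.  */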
5162 | static void |
5163 | calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt, |
5164 | vec_perm_builder *sel) |
5165 | { |
5166 | /* The encoding is a single stepped pattern. Any wrap-around is handled |
5167 | by vec_perm_indices. */ |
5168 | sel->new_vector (full_nelts: nelt, npatterns: 1, nelts_per_pattern: 3); |
5169 | for (unsigned int i = 0; i < 3; i++) |
5170 | sel->quick_push (obj: i + offset); |
5171 | } |
5172 | |
5173 | /* Checks whether the target supports whole-vector shifts for vectors of mode |
5174 | MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_ |
5175 | it supports vec_perm_const with masks for all necessary shift amounts. */ |
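/* For a fixed-length mode with eight elements, for example, the loop below
   checks permutations corresponding to shifts by 4, 2 and 1 elements
   (an illustrative case, not an exhaustive list).  */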
5176 | static bool |
5177 | have_whole_vector_shift (machine_mode mode) |
5178 | { |
5179 | if (optab_handler (op: vec_shr_optab, mode) != CODE_FOR_nothing) |
5180 | return true; |
5181 | |
5182 | /* Variable-length vectors should be handled via the optab. */ |
5183 | unsigned int nelt; |
5184 | if (!GET_MODE_NUNITS (mode).is_constant (const_value: &nelt)) |
5185 | return false; |
5186 | |
5187 | vec_perm_builder sel; |
5188 | vec_perm_indices indices; |
5189 | for (unsigned int i = nelt / 2; i >= 1; i /= 2) |
5190 | { |
5191 | calc_vec_perm_mask_for_shift (offset: i, nelt, sel: &sel); |
5192 | indices.new_vector (sel, 2, nelt); |
5193 | if (!can_vec_perm_const_p (mode, mode, indices, false)) |
5194 | return false; |
5195 | } |
5196 | return true; |
5197 | } |
5198 | |
5199 | /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose |
5200 | multiplication operands have differing signs and (b) we intend |
5201 | to emulate the operation using a series of signed DOT_PROD_EXPRs. |
5202 | See vect_emulate_mixed_dot_prod for the actual sequence used. */ |
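/* A hedged illustration: a reduction such as
     sum += (int) schar_a[i] * (int) uchar_b[i]
   has mixed-sign multiplication operands; if the target lacks the mixed-sign
   DOT_PROD_EXPR variant for the input vector type, the operation is emulated
   and this predicate returns true.  */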
5203 | |
5204 | static bool |
5205 | vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo, |
5206 | stmt_vec_info stmt_info) |
5207 | { |
5208 | gassign *assign = dyn_cast<gassign *> (p: stmt_info->stmt); |
5209 | if (!assign || gimple_assign_rhs_code (gs: assign) != DOT_PROD_EXPR) |
5210 | return false; |
5211 | |
5212 | tree rhs1 = gimple_assign_rhs1 (gs: assign); |
5213 | tree rhs2 = gimple_assign_rhs2 (gs: assign); |
5214 | if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2))) |
5215 | return false; |
5216 | |
5217 | stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); |
5218 | gcc_assert (reduc_info->is_reduc_info); |
5219 | return !directly_supported_p (DOT_PROD_EXPR, |
5220 | STMT_VINFO_REDUC_VECTYPE_IN (reduc_info), |
5221 | optab_vector_mixed_sign); |
5222 | } |
5223 | |
5224 | /* TODO: Close dependency between vect_model_*_cost and vectorizable_* |
5225 | functions. Design better to avoid maintenance issues. */ |
5226 | |
5227 | /* Function vect_model_reduction_cost. |
5228 | |
5229 | Models cost for a reduction operation, including the vector ops |
5230 | generated within the strip-mine loop in some cases, the initial |
5231 | definition before the loop, and the epilogue code that must be generated. */ |
5232 | |
5233 | static void |
5234 | vect_model_reduction_cost (loop_vec_info loop_vinfo, |
5235 | stmt_vec_info stmt_info, internal_fn reduc_fn, |
5236 | vect_reduction_type reduction_type, |
5237 | int ncopies, stmt_vector_for_cost *cost_vec) |
5238 | { |
5239 | int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0; |
5240 | tree vectype; |
5241 | machine_mode mode; |
5242 | class loop *loop = NULL; |
5243 | |
5244 | if (loop_vinfo) |
5245 | loop = LOOP_VINFO_LOOP (loop_vinfo); |
5246 | |
5247 | /* Condition reductions generate two reductions in the loop. */ |
5248 | if (reduction_type == COND_REDUCTION) |
5249 | ncopies *= 2; |
5250 | |
5251 | vectype = STMT_VINFO_VECTYPE (stmt_info); |
5252 | mode = TYPE_MODE (vectype); |
5253 | stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
5254 | |
5255 | gimple_match_op op; |
5256 | if (!gimple_extract_op (orig_stmt_info->stmt, &op)) |
5257 | gcc_unreachable (); |
5258 | |
5259 | bool emulated_mixed_dot_prod |
5260 | = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info); |
5261 | if (reduction_type == EXTRACT_LAST_REDUCTION) |
5262 | /* No extra instructions are needed in the prologue. The loop body |
5263 | operations are costed in vectorizable_condition. */ |
5264 | inside_cost = 0; |
5265 | else if (reduction_type == FOLD_LEFT_REDUCTION) |
5266 | { |
5267 | /* No extra instructions needed in the prologue. */ |
5268 | prologue_cost = 0; |
5269 | |
5270 | if (reduc_fn != IFN_LAST) |
5271 | /* Count one reduction-like operation per vector. */ |
5272 | inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vec_to_scalar, |
5273 | stmt_info, misalign: 0, where: vect_body); |
5274 | else |
5275 | { |
5276 | /* Use NELEMENTS extracts and NELEMENTS scalar ops. */ |
5277 | unsigned int nelements = ncopies * vect_nunits_for_cost (vec_type: vectype); |
5278 | inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: nelements, |
5279 | kind: vec_to_scalar, stmt_info, misalign: 0, |
5280 | where: vect_body); |
5281 | inside_cost += record_stmt_cost (body_cost_vec: cost_vec, count: nelements, |
5282 | kind: scalar_stmt, stmt_info, misalign: 0, |
5283 | where: vect_body); |
5284 | } |
5285 | } |
5286 | else |
5287 | { |
5288 | /* Add in the cost of the initial definitions. */ |
5289 | int prologue_stmts; |
5290 | if (reduction_type == COND_REDUCTION) |
5291 | /* For cond reductions we have four vectors: initial index, step, |
5292 | initial result of the data reduction, initial value of the index |
5293 | reduction. */ |
5294 | prologue_stmts = 4; |
5295 | else if (emulated_mixed_dot_prod) |
5296 | /* We need the initial reduction value and two invariants: |
5297 | one that contains the minimum signed value and one that |
5298 | contains half of its negative. */ |
5299 | prologue_stmts = 3; |
5300 | else |
5301 | prologue_stmts = 1; |
5302 | prologue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: prologue_stmts, |
5303 | kind: scalar_to_vec, stmt_info, misalign: 0, |
5304 | where: vect_prologue); |
5305 | } |
5306 | |
5307 | /* Determine cost of epilogue code. |
5308 | |
5309 | We have a reduction operator that will reduce the vector in one statement. |
5310 | Also requires scalar extract. */ |
5311 | |
5312 | if (!loop || !nested_in_vect_loop_p (loop, stmt_info: orig_stmt_info)) |
5313 | { |
5314 | if (reduc_fn != IFN_LAST) |
5315 | { |
5316 | if (reduction_type == COND_REDUCTION) |
5317 | { |
	      /* An EQ stmt and a COND_EXPR stmt.  */
5319 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 2, |
5320 | kind: vector_stmt, stmt_info, misalign: 0, |
5321 | where: vect_epilogue); |
5322 | /* Reduction of the max index and a reduction of the found |
5323 | values. */ |
5324 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 2, |
5325 | kind: vec_to_scalar, stmt_info, misalign: 0, |
5326 | where: vect_epilogue); |
5327 | /* A broadcast of the max value. */ |
5328 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1, |
5329 | kind: scalar_to_vec, stmt_info, misalign: 0, |
5330 | where: vect_epilogue); |
5331 | } |
5332 | else |
5333 | { |
5334 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1, kind: vector_stmt, |
5335 | stmt_info, misalign: 0, where: vect_epilogue); |
5336 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1, |
5337 | kind: vec_to_scalar, stmt_info, misalign: 0, |
5338 | where: vect_epilogue); |
5339 | } |
5340 | } |
5341 | else if (reduction_type == COND_REDUCTION) |
5342 | { |
5343 | unsigned estimated_nunits = vect_nunits_for_cost (vec_type: vectype); |
5344 | /* Extraction of scalar elements. */ |
5345 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, |
5346 | count: 2 * estimated_nunits, |
5347 | kind: vec_to_scalar, stmt_info, misalign: 0, |
5348 | where: vect_epilogue); |
5349 | /* Scalar max reductions via COND_EXPR / MAX_EXPR. */ |
5350 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, |
5351 | count: 2 * estimated_nunits - 3, |
5352 | kind: scalar_stmt, stmt_info, misalign: 0, |
5353 | where: vect_epilogue); |
5354 | } |
5355 | else if (reduction_type == EXTRACT_LAST_REDUCTION |
5356 | || reduction_type == FOLD_LEFT_REDUCTION) |
	/* No extra instructions are needed in the epilogue.  */
5358 | ; |
5359 | else |
5360 | { |
5361 | int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); |
5362 | tree bitsize = TYPE_SIZE (op.type); |
5363 | int element_bitsize = tree_to_uhwi (bitsize); |
5364 | int nelements = vec_size_in_bits / element_bitsize; |
5365 | |
5366 | if (op.code == COND_EXPR) |
5367 | op.code = MAX_EXPR; |
5368 | |
5369 | /* We have a whole vector shift available. */ |
5370 | if (VECTOR_MODE_P (mode) |
5371 | && directly_supported_p (op.code, vectype) |
5372 | && have_whole_vector_shift (mode)) |
5373 | { |
5374 | /* Final reduction via vector shifts and the reduction operator. |
5375 | Also requires scalar extract. */ |
5376 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, |
5377 | count: exact_log2 (x: nelements) * 2, |
5378 | kind: vector_stmt, stmt_info, misalign: 0, |
5379 | where: vect_epilogue); |
5380 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1, |
5381 | kind: vec_to_scalar, stmt_info, misalign: 0, |
5382 | where: vect_epilogue); |
5383 | } |
5384 | else |
5385 | /* Use extracts and reduction op for final reduction. For N |
5386 | elements, we have N extracts and N-1 reduction ops. */ |
5387 | epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, |
5388 | count: nelements + nelements - 1, |
5389 | kind: vector_stmt, stmt_info, misalign: 0, |
5390 | where: vect_epilogue); |
5391 | } |
5392 | } |
5393 | |
5394 | if (dump_enabled_p ()) |
5395 | dump_printf (MSG_NOTE, |
5396 | "vect_model_reduction_cost: inside_cost = %d, " |
5397 | "prologue_cost = %d, epilogue_cost = %d .\n" , inside_cost, |
5398 | prologue_cost, epilogue_cost); |
5399 | } |
5400 | |
5401 | /* SEQ is a sequence of instructions that initialize the reduction |
5402 | described by REDUC_INFO. Emit them in the appropriate place. */ |
5403 | |
5404 | static void |
5405 | vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo, |
5406 | stmt_vec_info reduc_info, gimple *seq) |
5407 | { |
5408 | if (reduc_info->reused_accumulator) |
5409 | { |
5410 | /* When reusing an accumulator from the main loop, we only need |
5411 | initialization instructions if the main loop can be skipped. |
5412 | In that case, emit the initialization instructions at the end |
5413 | of the guard block that does the skip. */ |
5414 | edge skip_edge = loop_vinfo->skip_main_loop_edge; |
5415 | gcc_assert (skip_edge); |
5416 | gimple_stmt_iterator gsi = gsi_last_bb (bb: skip_edge->src); |
5417 | gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); |
5418 | } |
5419 | else |
5420 | { |
5421 | /* The normal case: emit the initialization instructions on the |
5422 | preheader edge. */ |
5423 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
5424 | gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq); |
5425 | } |
5426 | } |
5427 | |
5428 | /* Function get_initial_def_for_reduction |
5429 | |
5430 | Input: |
5431 | REDUC_INFO - the info_for_reduction |
5432 | INIT_VAL - the initial value of the reduction variable |
5433 | NEUTRAL_OP - a value that has no effect on the reduction, as per |
5434 | neutral_op_for_reduction |
5435 | |
5436 | Output: |
   Return a vector variable, initialized according to the operation that
   REDUC_INFO describes.  This vector will be used as the initial value
5439 | of the vector of partial results. |
5440 | |
5441 | The value we need is a vector in which element 0 has value INIT_VAL |
5442 | and every other element has value NEUTRAL_OP. */ |
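/* For example (illustrative): a sum reduction of ints with INIT_VAL 10 and
   NEUTRAL_OP 0 on a four-element vector yields { 10, 0, 0, 0 }; for a MIN or
   MAX reduction the neutral value equals INIT_VAL, so the result degenerates
   to a splat of INIT_VAL.  */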
5443 | |
5444 | static tree |
5445 | get_initial_def_for_reduction (loop_vec_info loop_vinfo, |
5446 | stmt_vec_info reduc_info, |
5447 | tree init_val, tree neutral_op) |
5448 | { |
5449 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
5450 | tree scalar_type = TREE_TYPE (init_val); |
5451 | tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); |
5452 | tree init_def; |
5453 | gimple_seq stmts = NULL; |
5454 | |
5455 | gcc_assert (vectype); |
5456 | |
5457 | gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) |
5458 | || SCALAR_FLOAT_TYPE_P (scalar_type)); |
5459 | |
5460 | gcc_assert (nested_in_vect_loop_p (loop, reduc_info) |
5461 | || loop == (gimple_bb (reduc_info->stmt))->loop_father); |
5462 | |
5463 | if (operand_equal_p (init_val, neutral_op)) |
5464 | { |
5465 | /* If both elements are equal then the vector described above is |
5466 | just a splat. */ |
5467 | neutral_op = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: neutral_op); |
5468 | init_def = gimple_build_vector_from_val (seq: &stmts, type: vectype, op: neutral_op); |
5469 | } |
5470 | else |
5471 | { |
5472 | neutral_op = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: neutral_op); |
5473 | init_val = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: init_val); |
5474 | if (!TYPE_VECTOR_SUBPARTS (node: vectype).is_constant ()) |
5475 | { |
5476 | /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into |
5477 | element 0. */ |
5478 | init_def = gimple_build_vector_from_val (seq: &stmts, type: vectype, |
5479 | op: neutral_op); |
5480 | init_def = gimple_build (seq: &stmts, fn: CFN_VEC_SHL_INSERT, |
5481 | type: vectype, args: init_def, args: init_val); |
5482 | } |
5483 | else |
5484 | { |
5485 | /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */ |
5486 | tree_vector_builder elts (vectype, 1, 2); |
5487 | elts.quick_push (obj: init_val); |
5488 | elts.quick_push (obj: neutral_op); |
5489 | init_def = gimple_build_vector (seq: &stmts, builder: &elts); |
5490 | } |
5491 | } |
5492 | |
5493 | if (stmts) |
5494 | vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, seq: stmts); |
5495 | return init_def; |
5496 | } |
5497 | |
5498 | /* Get at the initial defs for the reduction PHIs for REDUC_INFO, |
5499 | which performs a reduction involving GROUP_SIZE scalar statements. |
5500 | NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP |
5501 | is nonnull, introducing extra elements of that value will not change the |
5502 | result. */ |
5503 | |
5504 | static void |
5505 | get_initial_defs_for_reduction (loop_vec_info loop_vinfo, |
5506 | stmt_vec_info reduc_info, |
5507 | vec<tree> *vec_oprnds, |
5508 | unsigned int number_of_vectors, |
5509 | unsigned int group_size, tree neutral_op) |
5510 | { |
5511 | vec<tree> &initial_values = reduc_info->reduc_initial_values; |
5512 | unsigned HOST_WIDE_INT nunits; |
5513 | unsigned j, number_of_places_left_in_vector; |
5514 | tree vector_type = STMT_VINFO_VECTYPE (reduc_info); |
5515 | unsigned int i; |
5516 | |
5517 | gcc_assert (group_size == initial_values.length () || neutral_op); |
5518 | |
5519 | /* NUMBER_OF_COPIES is the number of times we need to use the same values in |
5520 | created vectors. It is greater than 1 if unrolling is performed. |
5521 | |
5522 | For example, we have two scalar operands, s1 and s2 (e.g., group of |
5523 | strided accesses of size two), while NUNITS is four (i.e., four scalars |
5524 | of this type can be packed in a vector). The output vector will contain |
5525 | two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES |
5526 | will be 2). |
5527 | |
5528 | If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several |
5529 | vectors containing the operands. |
5530 | |
5531 | For example, NUNITS is four as before, and the group size is 8 |
5532 | (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and |
5533 | {s5, s6, s7, s8}. */ |
5534 | |
5535 | if (!TYPE_VECTOR_SUBPARTS (node: vector_type).is_constant (const_value: &nunits)) |
5536 | nunits = group_size; |
5537 | |
5538 | number_of_places_left_in_vector = nunits; |
5539 | bool constant_p = true; |
5540 | tree_vector_builder elts (vector_type, nunits, 1); |
5541 | elts.quick_grow (len: nunits); |
5542 | gimple_seq ctor_seq = NULL; |
5543 | for (j = 0; j < nunits * number_of_vectors; ++j) |
5544 | { |
5545 | tree op; |
5546 | i = j % group_size; |
5547 | |
      /* Get the def before the loop.  In a reduction chain we have only
	 one initial value; otherwise we have as many initial values as
	 there are PHIs in the group.  */
5550 | if (i >= initial_values.length () || (j > i && neutral_op)) |
5551 | op = neutral_op; |
5552 | else |
5553 | op = initial_values[i]; |
5554 | |
5555 | /* Create 'vect_ = {op0,op1,...,opn}'. */ |
5556 | number_of_places_left_in_vector--; |
5557 | elts[nunits - number_of_places_left_in_vector - 1] = op; |
5558 | if (!CONSTANT_CLASS_P (op)) |
5559 | constant_p = false; |
5560 | |
5561 | if (number_of_places_left_in_vector == 0) |
5562 | { |
5563 | tree init; |
5564 | if (constant_p && !neutral_op |
5565 | ? multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vector_type), b: nunits) |
5566 | : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) |
5567 | /* Build the vector directly from ELTS. */ |
5568 | init = gimple_build_vector (seq: &ctor_seq, builder: &elts); |
5569 | else if (neutral_op) |
5570 | { |
5571 | /* Build a vector of the neutral value and shift the |
5572 | other elements into place. */ |
5573 | init = gimple_build_vector_from_val (seq: &ctor_seq, type: vector_type, |
5574 | op: neutral_op); |
5575 | int k = nunits; |
5576 | while (k > 0 && elts[k - 1] == neutral_op) |
5577 | k -= 1; |
5578 | while (k > 0) |
5579 | { |
5580 | k -= 1; |
5581 | init = gimple_build (seq: &ctor_seq, fn: CFN_VEC_SHL_INSERT, |
5582 | type: vector_type, args: init, args: elts[k]); |
5583 | } |
5584 | } |
5585 | else |
5586 | { |
5587 | /* First time round, duplicate ELTS to fill the |
5588 | required number of vectors. */ |
5589 | duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type, |
5590 | elts, number_of_vectors, *vec_oprnds); |
5591 | break; |
5592 | } |
5593 | vec_oprnds->quick_push (obj: init); |
5594 | |
5595 | number_of_places_left_in_vector = nunits; |
5596 | elts.new_vector (type: vector_type, npatterns: nunits, nelts_per_pattern: 1); |
5597 | elts.quick_grow (len: nunits); |
5598 | constant_p = true; |
5599 | } |
5600 | } |
5601 | if (ctor_seq != NULL) |
5602 | vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, seq: ctor_seq); |
5603 | } |
5604 | |
5605 | /* For a statement STMT_INFO taking part in a reduction operation return |
5606 | the stmt_vec_info the meta information is stored on. */ |
5607 | |
5608 | stmt_vec_info |
5609 | info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info) |
5610 | { |
5611 | stmt_info = vect_orig_stmt (stmt_info); |
5612 | gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info)); |
5613 | if (!is_a <gphi *> (p: stmt_info->stmt) |
5614 | || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) |
5615 | stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); |
5616 | gphi *phi = as_a <gphi *> (p: stmt_info->stmt); |
5617 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
5618 | { |
5619 | if (gimple_phi_num_args (gs: phi) == 1) |
5620 | stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); |
5621 | } |
5622 | else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) |
5623 | { |
5624 | stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi)); |
5625 | if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def) |
5626 | stmt_info = info; |
5627 | } |
5628 | return stmt_info; |
5629 | } |
5630 | |
5631 | /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that |
5632 | REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise |
5633 | return false. */ |
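/* A typical case (an assumption for illustration): the main loop accumulates
   into a V8SI vector while the epilogue uses V4SI; the checks below verify
   that the wider accumulator can be halved down to V4SI through supported
   vec_extract and reduction operations before the epilogue carries it on.  */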
5634 | |
5635 | static bool |
5636 | vect_find_reusable_accumulator (loop_vec_info loop_vinfo, |
5637 | stmt_vec_info reduc_info) |
5638 | { |
5639 | loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); |
5640 | if (!main_loop_vinfo) |
5641 | return false; |
5642 | |
5643 | if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION) |
5644 | return false; |
5645 | |
5646 | unsigned int num_phis = reduc_info->reduc_initial_values.length (); |
5647 | auto_vec<tree, 16> main_loop_results (num_phis); |
5648 | auto_vec<tree, 16> initial_values (num_phis); |
5649 | if (edge main_loop_edge = loop_vinfo->main_loop_edge) |
5650 | { |
5651 | /* The epilogue loop can be entered either from the main loop or |
5652 | from an earlier guard block. */ |
5653 | edge skip_edge = loop_vinfo->skip_main_loop_edge; |
5654 | for (tree incoming_value : reduc_info->reduc_initial_values) |
5655 | { |
5656 | /* Look for: |
5657 | |
5658 | INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop), |
5659 | INITIAL_VALUE(guard block)>. */ |
5660 | gcc_assert (TREE_CODE (incoming_value) == SSA_NAME); |
5661 | |
5662 | gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value)); |
5663 | gcc_assert (gimple_bb (phi) == main_loop_edge->dest); |
5664 | |
5665 | tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge); |
5666 | tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge); |
5667 | |
5668 | main_loop_results.quick_push (obj: from_main_loop); |
5669 | initial_values.quick_push (obj: from_skip); |
5670 | } |
5671 | } |
5672 | else |
5673 | /* The main loop dominates the epilogue loop. */ |
5674 | main_loop_results.splice (src: reduc_info->reduc_initial_values); |
5675 | |
5676 | /* See if the main loop has the kind of accumulator we need. */ |
5677 | vect_reusable_accumulator *accumulator |
5678 | = main_loop_vinfo->reusable_accumulators.get (k: main_loop_results[0]); |
5679 | if (!accumulator |
5680 | || num_phis != accumulator->reduc_info->reduc_scalar_results.length () |
5681 | || !std::equal (first1: main_loop_results.begin (), last1: main_loop_results.end (), |
5682 | first2: accumulator->reduc_info->reduc_scalar_results.begin ())) |
5683 | return false; |
5684 | |
5685 | /* Handle the case where we can reduce wider vectors to narrower ones. */ |
5686 | tree vectype = STMT_VINFO_VECTYPE (reduc_info); |
5687 | tree old_vectype = TREE_TYPE (accumulator->reduc_input); |
5688 | unsigned HOST_WIDE_INT m; |
5689 | if (!constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: old_vectype), |
5690 | b: TYPE_VECTOR_SUBPARTS (node: vectype), multiple: &m)) |
5691 | return false; |
5692 | /* Check the intermediate vector types and operations are available. */ |
5693 | tree prev_vectype = old_vectype; |
5694 | poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (node: old_vectype); |
5695 | while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype))) |
5696 | { |
5697 | intermediate_nunits = exact_div (a: intermediate_nunits, b: 2); |
5698 | tree intermediate_vectype = get_related_vectype_for_scalar_type |
5699 | (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits); |
5700 | if (!intermediate_vectype |
5701 | || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info), |
5702 | intermediate_vectype) |
5703 | || !can_vec_extract (TYPE_MODE (prev_vectype), |
5704 | TYPE_MODE (intermediate_vectype))) |
5705 | return false; |
5706 | prev_vectype = intermediate_vectype; |
5707 | } |
5708 | |
5709 | /* Non-SLP reductions might apply an adjustment after the reduction |
5710 | operation, in order to simplify the initialization of the accumulator. |
5711 | If the epilogue loop carries on from where the main loop left off, |
5712 | it should apply the same adjustment to the final reduction result. |
5713 | |
5714 | If the epilogue loop can also be entered directly (rather than via |
5715 | the main loop), we need to be able to handle that case in the same way, |
5716 | with the same adjustment. (In principle we could add a PHI node |
5717 | to select the correct adjustment, but in practice that shouldn't be |
5718 | necessary.) */ |
5719 | tree main_adjustment |
5720 | = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info); |
5721 | if (loop_vinfo->main_loop_edge && main_adjustment) |
5722 | { |
5723 | gcc_assert (num_phis == 1); |
5724 | tree initial_value = initial_values[0]; |
5725 | /* Check that we can use INITIAL_VALUE as the adjustment and |
5726 | initialize the accumulator with a neutral value instead. */ |
5727 | if (!operand_equal_p (initial_value, main_adjustment)) |
5728 | return false; |
5729 | code_helper code = STMT_VINFO_REDUC_CODE (reduc_info); |
5730 | initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value), |
5731 | code, initial_value); |
5732 | } |
5733 | STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment; |
5734 | reduc_info->reduc_initial_values.truncate (0); |
5735 | reduc_info->reduc_initial_values.splice (initial_values); |
5736 | reduc_info->reused_accumulator = accumulator; |
5737 | return true; |
5738 | } |
5739 | |
5740 | /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation |
5741 | CODE, emitting any new stmts to SEQ. Returns a vector def of VECTYPE. */ |
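/* For instance, one halving step on a four-element vector {a, b, c, d}
   with a PLUS_EXPR CODE conceptually emits (sketch only, not the exact
   GIMPLE built below):

     lo = BIT_FIELD_REF <v, half_size, 0>;          // {a, b}
     hi = BIT_FIELD_REF <v, half_size, half_size>;  // {c, d}
     v' = lo + hi;                                  // {a+c, b+d}

   and the halving repeats until the requested VECTYPE is reached.  */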
5742 | |
5743 | static tree |
5744 | vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code, |
5745 | gimple_seq *seq) |
5746 | { |
5747 | unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant (); |
5748 | unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); |
5749 | tree stype = TREE_TYPE (vectype); |
5750 | tree new_temp = vec_def; |
5751 | while (nunits > nunits1) |
5752 | { |
5753 | nunits /= 2; |
5754 | tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), |
5755 | stype, nunits); |
5756 | unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); |
5757 | |
5758 | /* The target has to make sure we support lowpart/highpart |
5759 | extraction, either via direct vector extract or through |
5760 | integer mode punning. */ |
5761 | tree dst1, dst2; |
5762 | gimple *epilog_stmt; |
5763 | if (convert_optab_handler (vec_extract_optab, |
5764 | TYPE_MODE (TREE_TYPE (new_temp)), |
5765 | TYPE_MODE (vectype1)) |
5766 | != CODE_FOR_nothing) |
5767 | { |
5768 | /* Extract sub-vectors directly once vec_extract becomes |
5769 | a conversion optab. */ |
5770 | dst1 = make_ssa_name (vectype1); |
5771 | epilog_stmt |
5772 | = gimple_build_assign (dst1, BIT_FIELD_REF, |
5773 | build3 (BIT_FIELD_REF, vectype1, |
5774 | new_temp, TYPE_SIZE (vectype1), |
5775 | bitsize_int (0))); |
5776 | gimple_seq_add_stmt_without_update (seq, epilog_stmt); |
5777 | dst2 = make_ssa_name (vectype1); |
5778 | epilog_stmt |
5779 | = gimple_build_assign (dst2, BIT_FIELD_REF, |
5780 | build3 (BIT_FIELD_REF, vectype1, |
5781 | new_temp, TYPE_SIZE (vectype1), |
5782 | bitsize_int (bitsize))); |
5783 | gimple_seq_add_stmt_without_update (seq, epilog_stmt); |
5784 | } |
5785 | else |
5786 | { |
5787 | /* Extract via punning to appropriately sized integer mode |
5788 | vector. */ |
5789 | tree eltype = build_nonstandard_integer_type (bitsize, 1); |
5790 | tree etype = build_vector_type (eltype, 2); |
5791 | gcc_assert (convert_optab_handler (vec_extract_optab, |
5792 | TYPE_MODE (etype), |
5793 | TYPE_MODE (eltype)) |
5794 | != CODE_FOR_nothing); |
5795 | tree tem = make_ssa_name (etype); |
5796 | epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, |
5797 | build1 (VIEW_CONVERT_EXPR, |
5798 | etype, new_temp)); |
5799 | gimple_seq_add_stmt_without_update (seq, epilog_stmt); |
5800 | new_temp = tem; |
5801 | tem = make_ssa_name (eltype); |
5802 | epilog_stmt |
5803 | = gimple_build_assign (tem, BIT_FIELD_REF, |
5804 | build3 (BIT_FIELD_REF, eltype, |
5805 | new_temp, TYPE_SIZE (eltype), |
5806 | bitsize_int (0))); |
5807 | gimple_seq_add_stmt_without_update (seq, epilog_stmt); |
5808 | dst1 = make_ssa_name (vectype1); |
5809 | epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, |
5810 | build1 (VIEW_CONVERT_EXPR, |
5811 | vectype1, tem)); |
5812 | gimple_seq_add_stmt_without_update (seq, epilog_stmt); |
5813 | tem = make_ssa_name (eltype); |
5814 | epilog_stmt |
5815 | = gimple_build_assign (tem, BIT_FIELD_REF, |
5816 | build3 (BIT_FIELD_REF, eltype, |
5817 | new_temp, TYPE_SIZE (eltype), |
5818 | bitsize_int (bitsize))); |
5819 | gimple_seq_add_stmt_without_update (seq, epilog_stmt); |
5820 | dst2 = make_ssa_name (vectype1); |
5821 | epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, |
5822 | build1 (VIEW_CONVERT_EXPR, |
5823 | vectype1, tem)); |
5824 | gimple_seq_add_stmt_without_update (seq, epilog_stmt); |
5825 | } |
5826 | |
5827 | new_temp = gimple_build (seq, code, vectype1, dst1, dst2); |
5828 | } |
5829 | |
5830 | return new_temp; |
5831 | } |
5832 | |
5833 | /* Function vect_create_epilog_for_reduction |
5834 | |
5835 | Create code at the loop-epilog to finalize the result of a reduction |
5836 | computation. |
5837 | |
5838 | STMT_INFO is the scalar reduction stmt that is being vectorized. |
5839 | SLP_NODE is an SLP node containing a group of reduction statements. The |
5840 | first one in this group is STMT_INFO. |
5841 | SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE |
5842 | REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi |
5843 | (counting from 0) |
5844 | |
5845 | This function: |
5846 | 1. Completes the reduction def-use cycles. |
5847 | 2. "Reduces" each vector of partial results VECT_DEFS into a single result, |
5848 | by calling the function specified by REDUC_FN if available, or by |
5849 | other means (whole-vector shifts or a scalar loop). |
5850 | The function also creates a new phi node at the loop exit to preserve |
5851 | loop-closed form, as illustrated below. |
5852 | |
5853 | The flow at the entry to this function: |
5854 | |
5855 | loop: |
5856 | vec_def = phi <vec_init, null> # REDUCTION_PHI |
5857 | VECT_DEF = vector_stmt # vectorized form of STMT_INFO |
5858 | s_loop = scalar_stmt # (scalar) STMT_INFO |
5859 | loop_exit: |
5860 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
5861 | use <s_out0> |
5862 | use <s_out0> |
5863 | |
5864 | The above is transformed by this function into: |
5865 | |
5866 | loop: |
5867 | vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI |
5868 | VECT_DEF = vector_stmt # vectorized form of STMT_INFO |
5869 | s_loop = scalar_stmt # (scalar) STMT_INFO |
5870 | loop_exit: |
5871 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
5872 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI |
5873 | v_out2 = reduce <v_out1> |
5874 | s_out3 = extract_field <v_out2, 0> |
5875 | s_out4 = adjust_result <s_out3> |
5876 | use <s_out4> |
5877 | use <s_out4> |
5878 | */ |
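/* As a concrete (illustrative) example, for a 4-lane integer sum the
   epilogue conceptually computes:

     v_out1 = {s0, s1, s2, s3}       # per-lane partial sums
     s_out3 = REDUC_PLUS <v_out1>    # = s0 + s1 + s2 + s3
     s_out4 = s_out3 + adjustment    # e.g. the original initial value

   The exact statements depend on which scheme is chosen below.  */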
5879 | |
5880 | static void |
5881 | vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, |
5882 | stmt_vec_info stmt_info, |
5883 | slp_tree slp_node, |
5884 | slp_instance slp_node_instance) |
5885 | { |
5886 | stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); |
5887 | gcc_assert (reduc_info->is_reduc_info); |
5888 | /* For double reductions we need to get at the inner loop reduction |
5889 | stmt which has the meta info attached. Our stmt_info is that of the |
5890 | loop-closed PHI of the inner loop which we remember as |
5891 | def for the reduction PHI generation. */ |
5892 | bool double_reduc = false; |
5893 | stmt_vec_info rdef_info = stmt_info; |
5894 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
5895 | { |
5896 | gcc_assert (!slp_node); |
5897 | double_reduc = true; |
5898 | stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def |
5899 | (stmt_info->stmt, 0)); |
5900 | stmt_info = vect_stmt_to_vectorize (stmt_info); |
5901 | } |
5902 | gphi *reduc_def_stmt |
5903 | = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt); |
5904 | code_helper code = STMT_VINFO_REDUC_CODE (reduc_info); |
5905 | internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); |
5906 | tree vectype; |
5907 | machine_mode mode; |
5908 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; |
5909 | basic_block exit_bb; |
5910 | tree scalar_dest; |
5911 | tree scalar_type; |
5912 | gimple *new_phi = NULL, *phi = NULL; |
5913 | gimple_stmt_iterator exit_gsi; |
5914 | tree new_temp = NULL_TREE, new_name, new_scalar_dest; |
5915 | gimple *epilog_stmt = NULL; |
5916 | gimple *exit_phi; |
5917 | tree bitsize; |
5918 | tree def; |
5919 | tree orig_name, scalar_result; |
5920 | imm_use_iterator imm_iter, phi_imm_iter; |
5921 | use_operand_p use_p, phi_use_p; |
5922 | gimple *use_stmt; |
5923 | auto_vec<tree> reduc_inputs; |
5924 | int j, i; |
5925 | vec<tree> &scalar_results = reduc_info->reduc_scalar_results; |
5926 | unsigned int group_size = 1, k; |
5927 | auto_vec<gimple *> phis; |
5928 | /* SLP reduction without reduction chain, e.g., |
5929 | # a1 = phi <a2, a0> |
5930 | # b1 = phi <b2, b0> |
5931 | a2 = operation (a1) |
5932 | b2 = operation (b1) */ |
5933 | bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); |
5934 | bool direct_slp_reduc; |
5935 | tree induction_index = NULL_TREE; |
5936 | |
5937 | if (slp_node) |
5938 | group_size = SLP_TREE_LANES (slp_node); |
5939 | |
5940 | if (nested_in_vect_loop_p (loop, stmt_info)) |
5941 | { |
5942 | outer_loop = loop; |
5943 | loop = loop->inner; |
5944 | gcc_assert (!slp_node && double_reduc); |
5945 | } |
5946 | |
5947 | vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); |
5948 | gcc_assert (vectype); |
5949 | mode = TYPE_MODE (vectype); |
5950 | |
5951 | tree induc_val = NULL_TREE; |
5952 | tree adjustment_def = NULL; |
5953 | if (slp_node) |
5954 | ; |
5955 | else |
5956 | { |
5957 | /* Optimize: for induction condition reduction, if we can't use zero |
5958 | for induc_val, use initial_def. */ |
5959 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
5960 | induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); |
5961 | else if (double_reduc) |
5962 | ; |
5963 | else |
5964 | adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); |
5965 | } |
5966 | |
5967 | stmt_vec_info single_live_out_stmt[] = { stmt_info }; |
5968 | array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt; |
5969 | if (slp_reduc) |
5970 | /* All statements produce live-out values. */ |
5971 | live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node); |
5972 | else if (slp_node) |
5973 | { |
5974 | /* The last statement in the reduction chain produces the live-out |
5975 | value. Note SLP optimization can shuffle scalar stmts to |
5976 | optimize permutations so we have to search for the last stmt. */ |
5977 | for (k = 0; k < group_size; ++k) |
5978 | if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k])) |
5979 | { |
5980 | single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k]; |
5981 | break; |
5982 | } |
5983 | } |
5984 | |
5985 | unsigned vec_num; |
5986 | int ncopies; |
5987 | if (slp_node) |
5988 | { |
5989 | vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length (); |
5990 | ncopies = 1; |
5991 | } |
5992 | else |
5993 | { |
5994 | stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt); |
5995 | vec_num = 1; |
5996 | ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length (); |
5997 | } |
5998 | |
5999 | /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) |
6000 | which is updated with the current index of the loop for every match of |
6001 | the original loop's cond_expr (VEC_STMT). This results in a vector |
6002 | containing the last time the condition passed for that vector lane. |
6003 | The first match will be a 1 to allow 0 to be used for non-matching |
6004 | indexes. If there are no matches at all then the vector will be all |
6005 | zeroes. |
6006 | |
6007 | PR92772: This algorithm is broken for architectures that support |
6008 | masked vectors, but do not provide fold_extract_last. */ |
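  /* Illustrative example (made-up values): with four lanes, if the
     condition last matched at scalar iterations 2 and 7, which map to
     lanes 1 and 2, the index vector ends up as {0, 2, 7, 0}.  Its
     maximum, 7, identifies the lane whose data value is the final
     result.  */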
6009 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) |
6010 | { |
6011 | auto_vec<std::pair<tree, bool>, 2> ccompares; |
6012 | stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); |
6013 | cond_info = vect_stmt_to_vectorize (cond_info); |
6014 | while (cond_info != reduc_info) |
6015 | { |
6016 | if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) |
6017 | { |
6018 | gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0]; |
6019 | gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); |
6020 | ccompares.safe_push |
6021 | (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)), |
6022 | STMT_VINFO_REDUC_IDX (cond_info) == 2)); |
6023 | } |
6024 | cond_info |
6025 | = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, |
6026 | 1 + STMT_VINFO_REDUC_IDX |
6027 | (cond_info))); |
6028 | cond_info = vect_stmt_to_vectorize (cond_info); |
6029 | } |
6030 | gcc_assert (ccompares.length () != 0); |
6031 | |
6032 | tree indx_before_incr, indx_after_incr; |
6033 | poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); |
6034 | int scalar_precision |
6035 | = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); |
6036 | tree cr_index_scalar_type = make_unsigned_type (scalar_precision); |
6037 | tree cr_index_vector_type = get_related_vectype_for_scalar_type |
6038 | (TYPE_MODE (vectype), cr_index_scalar_type, |
6039 | TYPE_VECTOR_SUBPARTS (vectype)); |
6040 | |
6041 | /* First we create a simple vector induction variable which starts |
6042 | with the values {1,2,3,...} (SERIES_VECT) and increments by the |
6043 | vector size (STEP). */ |
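      /* E.g. with four lanes (illustrative lane count) the induction
         variable takes the values {1,2,3,4}, {5,6,7,8}, ... on successive
         vector iterations.  */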
6044 | |
6045 | /* Create a {1,2,3,...} vector. */ |
6046 | tree series_vect = build_index_vector (cr_index_vector_type, 1, 1); |
6047 | |
6048 | /* Create a vector of the step value. */ |
6049 | tree step = build_int_cst (cr_index_scalar_type, nunits_out); |
6050 | tree vec_step = build_vector_from_val (cr_index_vector_type, step); |
6051 | |
6052 | /* Create an induction variable. */ |
6053 | gimple_stmt_iterator incr_gsi; |
6054 | bool insert_after; |
6055 | standard_iv_increment_position (loop, &incr_gsi, &insert_after); |
6056 | create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi, |
6057 | insert_after, &indx_before_incr, &indx_after_incr); |
6058 | |
6059 | /* Next create a new phi node vector (NEW_PHI_TREE) which starts |
6060 | filled with zeros (VEC_ZERO). */ |
6061 | |
6062 | /* Create a vector of 0s. */ |
6063 | tree zero = build_zero_cst (cr_index_scalar_type); |
6064 | tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); |
6065 | |
6066 | /* Create a vector phi node. */ |
6067 | tree new_phi_tree = make_ssa_name (cr_index_vector_type); |
6068 | new_phi = create_phi_node (new_phi_tree, loop->header); |
6069 | add_phi_arg (as_a <gphi *> (new_phi), vec_zero, |
6070 | loop_preheader_edge (loop), UNKNOWN_LOCATION); |
6071 | |
6072 | /* Now take the condition from the loop's original cond_exprs |
6073 | and produce a new cond_expr (INDEX_COND_EXPR) which for |
6074 | every match uses values from the induction variable |
6075 | (INDEX_BEFORE_INCR) otherwise uses values from the phi node |
6076 | (NEW_PHI_TREE). |
6077 | Finally, we update the phi (NEW_PHI_TREE) to take the value of |
6078 | the new cond_expr (INDEX_COND_EXPR). */ |
6079 | gimple_seq stmts = NULL; |
6080 | for (int i = ccompares.length () - 1; i != -1; --i) |
6081 | { |
6082 | tree ccompare = ccompares[i].first; |
6083 | if (ccompares[i].second) |
6084 | new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, |
6085 | cr_index_vector_type, |
6086 | ccompare, |
6087 | indx_before_incr, new_phi_tree); |
6088 | else |
6089 | new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, |
6090 | cr_index_vector_type, |
6091 | ccompare, |
6092 | new_phi_tree, indx_before_incr); |
6093 | } |
6094 | gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT); |
6095 | |
6096 | /* Update the phi with the vec cond. */ |
6097 | induction_index = new_phi_tree; |
6098 | add_phi_arg (as_a <gphi *> (new_phi), induction_index, |
6099 | loop_latch_edge (loop), UNKNOWN_LOCATION); |
6100 | } |
6101 | |
6102 | /* 2. Create epilog code. |
6103 | The reduction epilog code operates across the elements of the vector |
6104 | of partial results computed by the vectorized loop. |
6105 | The reduction epilog code consists of: |
6106 | |
6107 | step 1: compute the scalar result in a vector (v_out2) |
6108 | step 2: extract the scalar result (s_out3) from the vector (v_out2) |
6109 | step 3: adjust the scalar result (s_out3) if needed. |
6110 | |
6111 | Step 1 can be accomplished using one of the following three schemes: |
6112 | (scheme 1) using reduc_fn, if available. |
6113 | (scheme 2) using whole-vector shifts, if available. |
6114 | (scheme 3) using a scalar loop. In this case steps 1+2 above are |
6115 | combined. |
6116 | |
6117 | The overall epilog code looks like this: |
6118 | |
6119 | s_out0 = phi <s_loop> # original EXIT_PHI |
6120 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI |
6121 | v_out2 = reduce <v_out1> # step 1 |
6122 | s_out3 = extract_field <v_out2, 0> # step 2 |
6123 | s_out4 = adjust_result <s_out3> # step 3 |
6124 | |
6125 | (step 3 is optional, and steps 1 and 2 may be combined). |
6126 | Lastly, the uses of s_out0 are replaced by s_out4. */ |
6127 | |
6128 | |
6129 | /* 2.1 Create new loop-exit-phis to preserve loop-closed form: |
6130 | v_out1 = phi <VECT_DEF> |
6131 | Store them in NEW_PHIS. */ |
6132 | if (double_reduc) |
6133 | loop = outer_loop; |
6134 | exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest; |
6135 | exit_gsi = gsi_after_labels (exit_bb); |
6136 | reduc_inputs.create (slp_node ? vec_num : ncopies); |
6137 | for (unsigned i = 0; i < vec_num; i++) |
6138 | { |
6139 | gimple_seq stmts = NULL; |
6140 | if (slp_node) |
6141 | def = vect_get_slp_vect_def (slp_node, i); |
6142 | else |
6143 | def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]); |
6144 | for (j = 0; j < ncopies; j++) |
6145 | { |
6146 | tree new_def = copy_ssa_name (def); |
6147 | phi = create_phi_node (new_def, exit_bb); |
6148 | if (j) |
6149 | def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); |
6150 | SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def); |
6151 | new_def = gimple_convert (&stmts, vectype, new_def); |
6152 | reduc_inputs.quick_push (new_def); |
6153 | } |
6154 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6155 | } |
6156 | |
6157 | /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 |
6158 | (i.e. when reduc_fn is not available) and in the final adjustment |
6159 | code (if needed). Also get the original scalar reduction variable as |
6160 | defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it |
6161 | represents a reduction pattern), the tree-code and scalar-def are |
6162 | taken from the original stmt that the pattern-stmt (STMT) replaces. |
6163 | Otherwise (it is a regular reduction) - the tree-code and scalar-def |
6164 | are taken from STMT. */ |
6165 | |
6166 | stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
6167 | if (orig_stmt_info != stmt_info) |
6168 | { |
6169 | /* Reduction pattern */ |
6170 | gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); |
6171 | gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); |
6172 | } |
6173 | |
6174 | scalar_dest = gimple_get_lhs (orig_stmt_info->stmt); |
6175 | scalar_type = TREE_TYPE (scalar_dest); |
6176 | scalar_results.truncate (0); |
6177 | scalar_results.reserve_exact (group_size); |
6178 | new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); |
6179 | bitsize = TYPE_SIZE (scalar_type); |
6180 | |
6181 | /* True if we should implement SLP_REDUC using native reduction operations |
6182 | instead of scalar operations. */ |
6183 | direct_slp_reduc = (reduc_fn != IFN_LAST |
6184 | && slp_reduc |
6185 | && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()); |
6186 | |
6187 | /* In case of reduction chain, e.g., |
6188 | # a1 = phi <a3, a0> |
6189 | a2 = operation (a1) |
6190 | a3 = operation (a2), |
6191 | |
6192 | we may end up with more than one vector result. Here we reduce them |
6193 | to one vector. |
6194 | |
6195 | The same is true for a SLP reduction, e.g., |
6196 | # a1 = phi <a2, a0> |
6197 | # b1 = phi <b2, b0> |
6198 | a2 = operation (a1) |
6199 | b2 = operation (b1), |
6200 | |
6201 | where we can end up with more than one vector as well. We can |
6202 | easily accumulate vectors when the number of vector elements is |
6203 | a multiple of the SLP group size. |
6204 | |
6205 | The same is true if we couldn't use a single def-use cycle. */ |
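  /* As a sketch for a PLUS reduction, two partial result vectors
     {a0,a1,a2,a3} and {b0,b1,b2,b3} are first combined element-wise into
     {a0+b0, a1+b1, a2+b2, a3+b3}, and only that single vector is handed
     to the final reduction code below (illustrative values).  */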
6206 | if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) |
6207 | || direct_slp_reduc |
6208 | || (slp_reduc |
6209 | && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)) |
6210 | || ncopies > 1) |
6211 | { |
6212 | gimple_seq stmts = NULL; |
6213 | tree single_input = reduc_inputs[0]; |
6214 | for (k = 1; k < reduc_inputs.length (); k++) |
6215 | single_input = gimple_build (&stmts, code, vectype, |
6216 | single_input, reduc_inputs[k]); |
6217 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6218 | |
6219 | reduc_inputs.truncate (0); |
6220 | reduc_inputs.safe_push (single_input); |
6221 | } |
6222 | |
6223 | tree orig_reduc_input = reduc_inputs[0]; |
6224 | |
6225 | /* If this loop is an epilogue loop that can be skipped after the |
6226 | main loop, we can only share a reduction operation between the |
6227 | main loop and the epilogue if we put it at the target of the |
6228 | skip edge. |
6229 | |
6230 | We can still reuse accumulators if this check fails. Doing so has |
6231 | the minor(?) benefit of making the epilogue loop's scalar result |
6232 | independent of the main loop's scalar result. */ |
6233 | bool unify_with_main_loop_p = false; |
6234 | if (reduc_info->reused_accumulator |
6235 | && loop_vinfo->skip_this_loop_edge |
6236 | && single_succ_p (exit_bb) |
6237 | && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest) |
6238 | { |
6239 | unify_with_main_loop_p = true; |
6240 | |
6241 | basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest; |
6242 | reduc_inputs[0] = make_ssa_name (vectype); |
6243 | gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block); |
6244 | add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb), |
6245 | UNKNOWN_LOCATION); |
6246 | add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input, |
6247 | loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION); |
6248 | exit_gsi = gsi_after_labels (reduc_block); |
6249 | } |
6250 | |
6251 | /* Shouldn't be used beyond this point. */ |
6252 | exit_bb = nullptr; |
6253 | |
6254 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION |
6255 | && reduc_fn != IFN_LAST) |
6256 | { |
6257 | /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing |
6258 | various data values where the condition matched and another vector |
6259 | (INDUCTION_INDEX) containing all the indexes of those matches. We |
6260 | need to extract the last matching index (which will be the index with |
6261 | highest value) and use this to index into the data vector. |
6262 | For the case where there were no matches, the data vector will contain |
6263 | all default values and the index vector will be all zeros. */ |
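      /* Worked example (illustrative values): with REDUC_INPUTS 0 equal
         to {d0, d1, d2, d3} and INDUCTION_INDEX equal to {0, 2, 7, 0},
         the maximum index 7 selects lane 2, so the final scalar result
         is d2.  */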
6264 | |
6265 | /* Get various versions of the type of the vector of indexes. */ |
6266 | tree index_vec_type = TREE_TYPE (induction_index); |
6267 | gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); |
6268 | tree index_scalar_type = TREE_TYPE (index_vec_type); |
6269 | tree index_vec_cmp_type = truth_type_for (index_vec_type); |
6270 | |
6271 | /* Get an unsigned integer version of the type of the data vector. */ |
6272 | int scalar_precision |
6273 | = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); |
6274 | tree scalar_type_unsigned = make_unsigned_type (scalar_precision); |
6275 | tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned, |
6276 | vectype); |
6277 | |
6278 | /* First we need to create a vector (ZERO_VEC) of zeros and another |
6279 | vector (MAX_INDEX_VEC) filled with the last matching index, which we |
6280 | can create using a MAX reduction and then expanding. |
6281 | In the case where the loop never made any matches, the max index will |
6282 | be zero. */ |
6283 | |
6284 | /* Vector of {0, 0, 0,...}. */ |
6285 | tree zero_vec = build_zero_cst (vectype); |
6286 | |
6287 | /* Find maximum value from the vector of found indexes. */ |
6288 | tree max_index = make_ssa_name (index_scalar_type); |
6289 | gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, |
6290 | 1, induction_index); |
6291 | gimple_call_set_lhs (max_index_stmt, max_index); |
6292 | gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT); |
6293 | |
6294 | /* Vector of {max_index, max_index, max_index,...}. */ |
6295 | tree max_index_vec = make_ssa_name (index_vec_type); |
6296 | tree max_index_vec_rhs = build_vector_from_val (index_vec_type, |
6297 | max_index); |
6298 | gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec, |
6299 | max_index_vec_rhs); |
6300 | gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT); |
6301 | |
6302 | /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes |
6303 | with the vector (INDUCTION_INDEX) of found indexes, choosing values |
6304 | from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC) |
6305 | otherwise. Only one value should match, resulting in a vector |
6306 | (VEC_COND) with one data value and the rest zeros. |
6307 | In the case where the loop never made any matches, every index will |
6308 | match, resulting in a vector with all data values (which will all be |
6309 | the default value). */ |
6310 | |
6311 | /* Compare the max index vector to the vector of found indexes to find |
6312 | the position of the max value. */ |
6313 | tree vec_compare = make_ssa_name (index_vec_cmp_type); |
6314 | gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR, |
6315 | induction_index, |
6316 | max_index_vec); |
6317 | gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT); |
6318 | |
6319 | /* Use the compare to choose either values from the data vector or |
6320 | zero. */ |
6321 | tree vec_cond = make_ssa_name (vectype); |
6322 | gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, |
6323 | vec_compare, |
6324 | reduc_inputs[0], |
6325 | zero_vec); |
6326 | gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); |
6327 | |
6328 | /* Finally we need to extract the data value from the vector (VEC_COND) |
6329 | into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR |
6330 | reduction, but because this doesn't exist, we can use a MAX reduction |
6331 | instead. The data value might be signed or a float so we need to cast |
6332 | it first. |
6333 | In the case where the loop never made any matches, the data values are |
6334 | all identical, and so will reduce down correctly. */ |
6335 | |
6336 | /* Make the matched data values unsigned. */ |
6337 | tree vec_cond_cast = make_ssa_name (vectype_unsigned); |
6338 | tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned, |
6339 | vec_cond); |
6340 | gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast, |
6341 | VIEW_CONVERT_EXPR, |
6342 | vec_cond_cast_rhs); |
6343 | gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT); |
6344 | |
6345 | /* Reduce down to a scalar value. */ |
6346 | tree data_reduc = make_ssa_name (scalar_type_unsigned); |
6347 | gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX, |
6348 | 1, vec_cond_cast); |
6349 | gimple_call_set_lhs (data_reduc_stmt, data_reduc); |
6350 | gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); |
6351 | |
6352 | /* Convert the reduced value back to the result type and set as the |
6353 | result. */ |
6354 | gimple_seq stmts = NULL; |
6355 | new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, |
6356 | data_reduc); |
6357 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6358 | scalar_results.safe_push (new_temp); |
6359 | } |
6360 | else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION |
6361 | && reduc_fn == IFN_LAST) |
6362 | { |
6363 | /* Condition reduction without supported IFN_REDUC_MAX. Generate |
6364 | idx = 0; |
6365 | idx_val = induction_index[0]; |
6366 | val = data_reduc[0]; |
6367 | for (idx = 0, val = init, i = 0; i < nelts; ++i) |
6368 | if (induction_index[i] > idx_val) |
6369 | val = data_reduc[i], idx_val = induction_index[i]; |
6370 | return val; */ |
6371 | |
6372 | tree data_eltype = TREE_TYPE (vectype); |
6373 | tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); |
6374 | unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); |
6375 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); |
6376 | /* Enforced by vectorizable_reduction, which ensures we have target |
6377 | support before allowing a conditional reduction on variable-length |
6378 | vectors. */ |
6379 | unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant (); |
6380 | tree idx_val = NULL_TREE, val = NULL_TREE; |
6381 | for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size) |
6382 | { |
6383 | tree old_idx_val = idx_val; |
6384 | tree old_val = val; |
6385 | idx_val = make_ssa_name (idx_eltype); |
6386 | epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF, |
6387 | build3 (BIT_FIELD_REF, idx_eltype, |
6388 | induction_index, |
6389 | bitsize_int (el_size), |
6390 | bitsize_int (off))); |
6391 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6392 | val = make_ssa_name (data_eltype); |
6393 | epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, |
6394 | build3 (BIT_FIELD_REF, |
6395 | data_eltype, |
6396 | reduc_inputs[0], |
6397 | bitsize_int (el_size), |
6398 | bitsize_int (off))); |
6399 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6400 | if (off != 0) |
6401 | { |
6402 | tree new_idx_val = idx_val; |
6403 | if (off != v_size - el_size) |
6404 | { |
6405 | new_idx_val = make_ssa_name (idx_eltype); |
6406 | epilog_stmt = gimple_build_assign (new_idx_val, |
6407 | MAX_EXPR, idx_val, |
6408 | old_idx_val); |
6409 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6410 | } |
6411 | tree cond = make_ssa_name (boolean_type_node); |
6412 | epilog_stmt = gimple_build_assign (cond, GT_EXPR, |
6413 | idx_val, old_idx_val); |
6414 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6415 | tree new_val = make_ssa_name (data_eltype); |
6416 | epilog_stmt = gimple_build_assign (new_val, COND_EXPR, |
6417 | cond, val, old_val); |
6418 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6419 | idx_val = new_idx_val; |
6420 | val = new_val; |
6421 | } |
6422 | } |
6423 | /* Convert the reduced value back to the result type and set as the |
6424 | result. */ |
6425 | gimple_seq stmts = NULL; |
6426 | val = gimple_convert (&stmts, scalar_type, val); |
6427 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6428 | scalar_results.safe_push (val); |
6429 | } |
6430 | |
6431 | /* 2.3 Create the reduction code, using one of the three schemes described |
6432 | above. In SLP we simply need to extract all the elements from the |
6433 | vector (without reducing them), so we use scalar shifts. */ |
6434 | else if (reduc_fn != IFN_LAST && !slp_reduc) |
6435 | { |
6436 | tree tmp; |
6437 | tree vec_elem_type; |
6438 | |
6439 | /* Case 1: Create: |
6440 | v_out2 = reduc_expr <v_out1> */ |
6441 | |
6442 | if (dump_enabled_p ()) |
6443 | dump_printf_loc (MSG_NOTE, vect_location, |
6444 | "Reduce using direct vector reduction.\n"); |
6445 | |
6446 | gimple_seq stmts = NULL; |
6447 | vec_elem_type = TREE_TYPE (vectype); |
6448 | new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn), |
6449 | vec_elem_type, reduc_inputs[0]); |
6450 | new_temp = gimple_convert (&stmts, scalar_type, new_temp); |
6451 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6452 | |
6453 | if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
6454 | && induc_val) |
6455 | { |
6456 | /* Earlier we set the initial value to be a vector of induc_val |
6457 | values. Check the result and if it is induc_val then replace |
6458 | with the original initial value, unless induc_val is |
6459 | the same as initial_def already. */ |
6460 | tree zcompare = make_ssa_name (boolean_type_node); |
6461 | epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, |
6462 | new_temp, induc_val); |
6463 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6464 | tree initial_def = reduc_info->reduc_initial_values[0]; |
6465 | tmp = make_ssa_name (new_scalar_dest); |
6466 | epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, |
6467 | initial_def, new_temp); |
6468 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6469 | new_temp = tmp; |
6470 | } |
6471 | |
6472 | scalar_results.safe_push (new_temp); |
6473 | } |
6474 | else if (direct_slp_reduc) |
6475 | { |
6476 | /* Here we create one vector for each of the REDUC_GROUP_SIZE results, |
6477 | with the elements for other SLP statements replaced with the |
6478 | neutral value. We can then do a normal reduction on each vector. */ |
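      /* E.g. for a group of two reductions a and b with input lanes
         {a0, b0, a1, b1} (sketch only):
           the result for a reduces {a0, neutral, a1, neutral}
           the result for b reduces {neutral, b0, neutral, b1}  */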
6479 | |
6480 | /* Enforced by vectorizable_reduction. */ |
6481 | gcc_assert (reduc_inputs.length () == 1); |
6482 | gcc_assert (pow2p_hwi (group_size)); |
6483 | |
6484 | gimple_seq seq = NULL; |
6485 | |
6486 | /* Build a vector {0, 1, 2, ...}, with the same number of elements |
6487 | and the same element size as VECTYPE. */ |
6488 | tree index = build_index_vector (vectype, 0, 1); |
6489 | tree index_type = TREE_TYPE (index); |
6490 | tree index_elt_type = TREE_TYPE (index_type); |
6491 | tree mask_type = truth_type_for (index_type); |
6492 | |
6493 | /* Create a vector that, for each element, identifies which of |
6494 | the REDUC_GROUP_SIZE results should use it. */ |
6495 | tree index_mask = build_int_cst (index_elt_type, group_size - 1); |
6496 | index = gimple_build (&seq, BIT_AND_EXPR, index_type, index, |
6497 | build_vector_from_val (index_type, index_mask)); |
6498 | |
6499 | /* Get a neutral vector value. This is simply a splat of the neutral |
6500 | scalar value if we have one, otherwise the initial scalar value |
6501 | is itself a neutral value. */ |
6502 | tree vector_identity = NULL_TREE; |
6503 | tree neutral_op = NULL_TREE; |
6504 | if (slp_node) |
6505 | { |
6506 | tree initial_value = NULL_TREE; |
6507 | if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
6508 | initial_value = reduc_info->reduc_initial_values[0]; |
6509 | neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code, |
6510 | initial_value, false); |
6511 | } |
6512 | if (neutral_op) |
6513 | vector_identity = gimple_build_vector_from_val (&seq, vectype, |
6514 | neutral_op); |
6515 | for (unsigned int i = 0; i < group_size; ++i) |
6516 | { |
6517 | /* If there's no universal neutral value, we can use the |
6518 | initial scalar value from the original PHI. This is used |
6519 | for MIN and MAX reduction, for example. */ |
6520 | if (!neutral_op) |
6521 | { |
6522 | tree scalar_value = reduc_info->reduc_initial_values[i]; |
6523 | scalar_value = gimple_convert (&seq, TREE_TYPE (vectype), |
6524 | scalar_value); |
6525 | vector_identity = gimple_build_vector_from_val (&seq, vectype, |
6526 | scalar_value); |
6527 | } |
6528 | |
6529 | /* Calculate the equivalent of: |
6530 | |
6531 | sel[j] = (index[j] == i); |
6532 | |
6533 | which selects the elements of REDUC_INPUTS[0] that should |
6534 | be included in the result. */ |
6535 | tree compare_val = build_int_cst (index_elt_type, i); |
6536 | compare_val = build_vector_from_val (index_type, compare_val); |
6537 | tree sel = gimple_build (&seq, EQ_EXPR, mask_type, |
6538 | index, compare_val); |
6539 | |
6540 | /* Calculate the equivalent of: |
6541 | |
6542 | vec = sel ? reduc_inputs[0] : vector_identity; |
6543 | |
6544 | VEC is now suitable for a full vector reduction. */ |
6545 | tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype, |
6546 | sel, reduc_inputs[0], vector_identity); |
6547 | |
6548 | /* Do the reduction and convert it to the appropriate type. */ |
6549 | tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn), |
6550 | TREE_TYPE (vectype), vec); |
6551 | scalar = gimple_convert (&seq, scalar_type, scalar); |
6552 | scalar_results.safe_push (scalar); |
6553 | } |
6554 | gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT); |
6555 | } |
6556 | else |
6557 | { |
6558 | bool reduce_with_shift; |
6559 | tree vec_temp; |
6560 | |
6561 | gcc_assert (slp_reduc || reduc_inputs.length () == 1); |
6562 | |
6563 | /* See if the target wants to do the final (shift) reduction |
6564 | in a vector mode of smaller size and first reduce upper/lower |
6565 | halves against each other. */ |
6566 | enum machine_mode mode1 = mode; |
6567 | tree stype = TREE_TYPE (vectype); |
6568 | unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); |
6569 | unsigned nunits1 = nunits; |
6570 | if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode |
6571 | && reduc_inputs.length () == 1) |
6572 | { |
6573 | nunits1 = GET_MODE_NUNITS (mode1).to_constant (); |
6574 | /* For SLP reductions we have to make sure lanes match up, but |
6575 | since we're doing individual element final reduction reducing |
6576 | vector width here is even more important. |
6577 | ??? We can also separate lanes with permutes, for the common |
6578 | case of power-of-two group-size odd/even extracts would work. */ |
6579 | if (slp_reduc && nunits != nunits1) |
6580 | { |
6581 | nunits1 = least_common_multiple (nunits1, group_size); |
6582 | gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits); |
6583 | } |
6584 | } |
6585 | if (!slp_reduc |
6586 | && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) |
6587 | nunits1 = GET_MODE_NUNITS (mode1).to_constant (); |
6588 | |
6589 | tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), |
6590 | stype, nunits1); |
6591 | reduce_with_shift = have_whole_vector_shift (mode1); |
6592 | if (!VECTOR_MODE_P (mode1) |
6593 | || !directly_supported_p (code, vectype1)) |
6594 | reduce_with_shift = false; |
6595 | |
6596 | /* First reduce the vector to the desired vector size we should |
6597 | do shift reduction on by combining upper and lower halves. */ |
6598 | gimple_seq stmts = NULL; |
6599 | new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1, |
6600 | code, &stmts); |
6601 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6602 | reduc_inputs[0] = new_temp; |
6603 | |
6604 | if (reduce_with_shift && !slp_reduc) |
6605 | { |
6606 | int element_bitsize = tree_to_uhwi (bitsize); |
6607 | /* Enforced by vectorizable_reduction, which disallows SLP reductions |
6608 | for variable-length vectors and also requires direct target support |
6609 | for loop reductions. */ |
6610 | int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); |
6611 | int nelements = vec_size_in_bits / element_bitsize; |
6612 | vec_perm_builder sel; |
6613 | vec_perm_indices indices; |
6614 | |
6615 | int elt_offset; |
6616 | |
6617 | tree zero_vec = build_zero_cst (vectype1); |
6618 | /* Case 2: Create: |
6619 | for (offset = nelements/2; offset >= 1; offset/=2) |
6620 | { |
6621 | Create: va' = vec_shift <va, offset> |
6622 | Create: va = vop <va, va'> |
6623 | } */ |
6624 | |
6625 | tree rhs; |
6626 | |
6627 | if (dump_enabled_p ()) |
6628 | dump_printf_loc (MSG_NOTE, vect_location, |
6629 | "Reduce using vector shifts\n"); |
6630 | |
6631 | gimple_seq stmts = NULL; |
6632 | new_temp = gimple_convert (&stmts, vectype1, new_temp); |
6633 | for (elt_offset = nelements / 2; |
6634 | elt_offset >= 1; |
6635 | elt_offset /= 2) |
6636 | { |
6637 | calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); |
6638 | indices.new_vector (sel, 2, nelements); |
6639 | tree mask = vect_gen_perm_mask_any (vectype1, indices); |
6640 | new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1, |
6641 | new_temp, zero_vec, mask); |
6642 | new_temp = gimple_build (&stmts, code, |
6643 | vectype1, new_name, new_temp); |
6644 | } |
6645 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6646 | |
6647 | /* 2.4 Extract the final scalar result. Create: |
6648 | s_out3 = extract_field <v_out2, bitpos> */ |
6649 | |
6650 | if (dump_enabled_p ()) |
6651 | dump_printf_loc (MSG_NOTE, vect_location, |
6652 | "extract scalar result\n"); |
6653 | |
6654 | rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, |
6655 | bitsize, bitsize_zero_node); |
6656 | epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); |
6657 | new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); |
6658 | gimple_assign_set_lhs (epilog_stmt, new_temp); |
6659 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6660 | scalar_results.safe_push (new_temp); |
6661 | } |
6662 | else |
6663 | { |
6664 | /* Case 3: Create: |
6665 | s = extract_field <v_out2, 0> |
6666 | for (offset = element_size; |
6667 | offset < vector_size; |
6668 | offset += element_size;) |
6669 | { |
6670 | Create: s' = extract_field <v_out2, offset> |
6671 | Create: s = op <s, s'> // For non SLP cases |
6672 | } */ |
6673 | |
6674 | if (dump_enabled_p ()) |
6675 | dump_printf_loc (MSG_NOTE, vect_location, |
6676 | "Reduce using scalar code.\n"); |
6677 | |
6678 | int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); |
6679 | int element_bitsize = tree_to_uhwi (bitsize); |
6680 | tree compute_type = TREE_TYPE (vectype); |
6681 | gimple_seq stmts = NULL; |
6682 | FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp) |
6683 | { |
6684 | int bit_offset; |
6685 | new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type, |
6686 | vec_temp, bitsize, bitsize_zero_node); |
6687 | |
6688 | /* In SLP we don't need to apply reduction operation, so we just |
6689 | collect s' values in SCALAR_RESULTS. */ |
6690 | if (slp_reduc) |
6691 | scalar_results.safe_push (new_temp); |
6692 | |
6693 | for (bit_offset = element_bitsize; |
6694 | bit_offset < vec_size_in_bits; |
6695 | bit_offset += element_bitsize) |
6696 | { |
6697 | tree bitpos = bitsize_int (bit_offset); |
6698 | new_name = gimple_build (&stmts, BIT_FIELD_REF, |
6699 | compute_type, vec_temp, |
6700 | bitsize, bitpos); |
6701 | if (slp_reduc) |
6702 | { |
6703 | /* In SLP we don't need to apply reduction operation, so |
6704 | we just collect s' values in SCALAR_RESULTS. */ |
6705 | new_temp = new_name; |
6706 | scalar_results.safe_push (new_name); |
6707 | } |
6708 | else |
6709 | new_temp = gimple_build (&stmts, code, compute_type, |
6710 | new_name, new_temp); |
6711 | } |
6712 | } |
6713 | |
6714 | /* The only case where we need to reduce scalar results in SLP is |
6715 | unrolling. If the size of SCALAR_RESULTS is greater than |
6716 | REDUC_GROUP_SIZE, we reduce them combining elements modulo |
6717 | REDUC_GROUP_SIZE. */ |
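          /* E.g. with REDUC_GROUP_SIZE 2 and extracted scalars
             {a0, b0, a1, b1}, the surviving results are a0 op a1 and
             b0 op b1 (sketch only).  */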
6718 | if (slp_reduc) |
6719 | { |
6720 | tree res, first_res, new_res; |
6721 | |
6722 | /* Reduce multiple scalar results in case of SLP unrolling. */ |
6723 | for (j = group_size; scalar_results.iterate (j, &res); |
6724 | j++) |
6725 | { |
6726 | first_res = scalar_results[j % group_size]; |
6727 | new_res = gimple_build (&stmts, code, compute_type, |
6728 | first_res, res); |
6729 | scalar_results[j % group_size] = new_res; |
6730 | } |
6731 | scalar_results.truncate (group_size); |
6732 | for (k = 0; k < group_size; k++) |
6733 | scalar_results[k] = gimple_convert (&stmts, scalar_type, |
6734 | scalar_results[k]); |
6735 | } |
6736 | else |
6737 | { |
6738 | /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ |
6739 | new_temp = gimple_convert (&stmts, scalar_type, new_temp); |
6740 | scalar_results.safe_push (new_temp); |
6741 | } |
6742 | |
6743 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6744 | } |
6745 | |
6746 | if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
6747 | && induc_val) |
6748 | { |
6749 | /* Earlier we set the initial value to be a vector of induc_val |
6750 | values. Check the result and if it is induc_val then replace |
6751 | with the original initial value, unless induc_val is |
6752 | the same as initial_def already. */ |
6753 | tree zcompare = make_ssa_name (boolean_type_node); |
6754 | epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp, |
6755 | induc_val); |
6756 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6757 | tree initial_def = reduc_info->reduc_initial_values[0]; |
6758 | tree tmp = make_ssa_name (new_scalar_dest); |
6759 | epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, |
6760 | initial_def, new_temp); |
6761 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); |
6762 | scalar_results[0] = tmp; |
6763 | } |
6764 | } |
6765 | |
6766 | /* 2.5 Adjust the final result by the initial value of the reduction |
6767 | variable. (When such adjustment is not needed, then |
6768 | 'adjustment_def' is zero). For example, if code is PLUS we create: |
6769 | new_temp = loop_exit_def + adjustment_def */ |
6770 | |
6771 | if (adjustment_def) |
6772 | { |
6773 | gcc_assert (!slp_reduc); |
6774 | gimple_seq stmts = NULL; |
6775 | if (double_reduc) |
6776 | { |
6777 | gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); |
6778 | adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); |
6779 | new_temp = gimple_build (&stmts, code, vectype, |
6780 | reduc_inputs[0], adjustment_def); |
6781 | } |
6782 | else |
6783 | { |
6784 | new_temp = scalar_results[0]; |
6785 | gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); |
6786 | adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype), |
6787 | adjustment_def); |
6788 | new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp); |
6789 | new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype), |
6790 | new_temp, adjustment_def); |
6791 | new_temp = gimple_convert (&stmts, scalar_type, new_temp); |
6792 | } |
6793 | |
6794 | epilog_stmt = gimple_seq_last_stmt (stmts); |
6795 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
6796 | scalar_results[0] = new_temp; |
6797 | } |
6798 | |
6799 | /* Record this operation if it could be reused by the epilogue loop. */ |
6800 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION |
6801 | && reduc_inputs.length () == 1) |
6802 | loop_vinfo->reusable_accumulators.put (scalar_results[0], |
6803 | { orig_reduc_input, reduc_info }); |
6804 | |
6805 | if (double_reduc) |
6806 | loop = outer_loop; |
6807 | |
6808 | /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit |
6809 | phis with new adjusted scalar results, i.e., replace use <s_out0> |
6810 | with use <s_out4>. |
6811 | |
6812 | Transform: |
6813 | loop_exit: |
6814 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
6815 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI |
6816 | v_out2 = reduce <v_out1> |
6817 | s_out3 = extract_field <v_out2, 0> |
6818 | s_out4 = adjust_result <s_out3> |
6819 | use <s_out0> |
6820 | use <s_out0> |
6821 | |
6822 | into: |
6823 | |
6824 | loop_exit: |
6825 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
6826 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI |
6827 | v_out2 = reduce <v_out1> |
6828 | s_out3 = extract_field <v_out2, 0> |
6829 | s_out4 = adjust_result <s_out3> |
6830 | use <s_out4> |
6831 | use <s_out4> */ |
6832 | |
6833 | gcc_assert (live_out_stmts.size () == scalar_results.length ()); |
6834 | for (k = 0; k < live_out_stmts.size (); k++) |
6835 | { |
6836 | stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]); |
6837 | scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt); |
6838 | |
6839 | phis.create (3); |
6840 | /* Find the loop-closed-use at the loop exit of the original scalar |
6841 | result. (The reduction result is expected to have two immediate uses, |
6842 | one at the latch block, and one at the loop exit). For double |
6843 | reductions we are looking for exit phis of the outer loop. */ |
6844 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) |
6845 | { |
6846 | if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) |
6847 | { |
6848 | if (!is_gimple_debug (USE_STMT (use_p))) |
6849 | phis.safe_push (USE_STMT (use_p)); |
6850 | } |
6851 | else |
6852 | { |
6853 | if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI) |
6854 | { |
6855 | tree phi_res = PHI_RESULT (USE_STMT (use_p)); |
6856 | |
6857 | FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res) |
6858 | { |
6859 | if (!flow_bb_inside_loop_p (loop, |
6860 | gimple_bb (USE_STMT (phi_use_p))) |
6861 | && !is_gimple_debug (USE_STMT (phi_use_p))) |
6862 | phis.safe_push (USE_STMT (phi_use_p)); |
6863 | } |
6864 | } |
6865 | } |
6866 | } |
6867 | |
6868 | FOR_EACH_VEC_ELT (phis, i, exit_phi) |
6869 | { |
6870 | /* Replace the uses: */ |
6871 | orig_name = PHI_RESULT (exit_phi); |
6872 | |
6873 | /* Look for a single use at the target of the skip edge. */ |
6874 | if (unify_with_main_loop_p) |
6875 | { |
6876 | use_operand_p use_p; |
6877 | gimple *user; |
6878 | if (!single_imm_use (orig_name, &use_p, &user)) |
6879 | gcc_unreachable (); |
6880 | orig_name = gimple_get_lhs (user); |
6881 | } |
6882 | |
6883 | scalar_result = scalar_results[k]; |
6884 | FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) |
6885 | { |
6886 | FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) |
6887 | SET_USE (use_p, scalar_result); |
6888 | update_stmt (use_stmt); |
6889 | } |
6890 | } |
6891 | |
6892 | phis.release (); |
6893 | } |
6894 | } |
6895 | |
6896 | /* Return a vector of type VECTYPE that is equal to the vector select |
6897 | operation "MASK ? VEC : IDENTITY". Insert the select statements |
6898 | before GSI. */ |
6899 | |
6900 | static tree |
6901 | merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype, |
6902 | tree vec, tree identity) |
6903 | { |
6904 | tree cond = make_temp_ssa_name (vectype, NULL, "cond"); |
6905 | gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR, |
6906 | mask, vec, identity); |
6907 | gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); |
6908 | return cond; |
6909 | } |
6910 | |
6911 | /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right |
6912 | order, starting with LHS. Insert the extraction statements before GSI and |
6913 | associate the new scalar SSA names with variable SCALAR_DEST. |
6914 | If MASK is nonzero, mask the input and then operate on it unconditionally. |
6915 | Return the SSA name for the result. */ |
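/* E.g. for a four-element VECTOR_RHS {v0, v1, v2, v3} this conceptually
   emits

     lhs = ((((lhs CODE v0) CODE v1) CODE v2) CODE v3)

   preserving the strict left-to-right evaluation order (sketch only).  */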
6916 | |
6917 | static tree |
6918 | vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, |
6919 | tree_code code, tree lhs, tree vector_rhs, |
6920 | tree mask) |
6921 | { |
6922 | tree vectype = TREE_TYPE (vector_rhs); |
6923 | tree scalar_type = TREE_TYPE (vectype); |
6924 | tree bitsize = TYPE_SIZE (scalar_type); |
6925 | unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); |
6926 | unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize); |
6927 | |
6928 | /* Re-create a VEC_COND_EXPR to mask the input here in order to be able |
6929 | to perform an unconditional element-wise reduction of it. */ |
6930 | if (mask) |
6931 | { |
6932 | tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL, |
6933 | "masked_vector_rhs"); |
6934 | tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE, |
6935 | false); |
6936 | tree vector_identity = build_vector_from_val (vectype, neutral_op); |
6937 | gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR, |
6938 | mask, vector_rhs, vector_identity); |
6939 | gsi_insert_before (gsi, select, GSI_SAME_STMT); |
6940 | vector_rhs = masked_vector_rhs; |
6941 | } |
6942 | |
6943 | for (unsigned HOST_WIDE_INT bit_offset = 0; |
6944 | bit_offset < vec_size_in_bits; |
6945 | bit_offset += element_bitsize) |
6946 | { |
6947 | tree bitpos = bitsize_int (bit_offset); |
6948 | tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs, |
6949 | bitsize, bitpos); |
6950 | |
6951 | gassign *stmt = gimple_build_assign (scalar_dest, rhs); |
6952 | rhs = make_ssa_name (scalar_dest, stmt); |
6953 | gimple_assign_set_lhs (stmt, rhs); |
6954 | gsi_insert_before (gsi, stmt, GSI_SAME_STMT); |
6955 | |
6956 | stmt = gimple_build_assign (scalar_dest, code, lhs, rhs); |
6957 | tree new_name = make_ssa_name (scalar_dest, stmt); |
6958 | gimple_assign_set_lhs (stmt, new_name); |
6959 | gsi_insert_before (gsi, stmt, GSI_SAME_STMT); |
6960 | lhs = new_name; |
6961 | } |
6962 | return lhs; |
6963 | } |
6964 | |
6965 | /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the |
6966 | type of the vector input. */ |
6967 | |
6968 | static internal_fn |
6969 | get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in) |
6970 | { |
6971 | internal_fn mask_reduc_fn; |
6972 | internal_fn mask_len_reduc_fn; |
6973 | |
6974 | switch (reduc_fn) |
6975 | { |
6976 | case IFN_FOLD_LEFT_PLUS: |
6977 | mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS; |
6978 | mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS; |
6979 | break; |
6980 | |
6981 | default: |
6982 | return IFN_LAST; |
6983 | } |
6984 | |
6985 | if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in, |
6986 | OPTIMIZE_FOR_SPEED)) |
6987 | return mask_reduc_fn; |
6988 | if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in, |
6989 | OPTIMIZE_FOR_SPEED)) |
6990 | return mask_len_reduc_fn; |
6991 | return IFN_LAST; |
6992 | } |
6993 | |
6994 | /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the |
6995 | statement that sets the live-out value. REDUC_DEF_STMT is the phi |
6996 | statement. CODE is the operation performed by STMT_INFO and OPS are |
6997 | its scalar operands. REDUC_INDEX is the index of the operand in |
6998 | OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that |
6999 | implements in-order reduction, or IFN_LAST if we should open-code it. |
7000 | VECTYPE_IN is the type of the vector input. MASKS specifies the masks |
7001 | that should be used to control the operation in a fully-masked loop. */ |
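/* As a rough sketch of the fully-masked case, each vector iteration
   conceptually performs

     res = MASK_FOLD_LEFT_PLUS <res, vec, loop_mask>

   for a PLUS reduction, i.e. inactive lanes are skipped while active
   lanes are accumulated strictly in order.  */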
7002 | |
7003 | static bool |
7004 | vectorize_fold_left_reduction (loop_vec_info loop_vinfo, |
7005 | stmt_vec_info stmt_info, |
7006 | gimple_stmt_iterator *gsi, |
7007 | gimple **vec_stmt, slp_tree slp_node, |
7008 | gimple *reduc_def_stmt, |
7009 | code_helper code, internal_fn reduc_fn, |
7010 | tree *ops, int num_ops, tree vectype_in, |
7011 | int reduc_index, vec_loop_masks *masks, |
7012 | vec_loop_lens *lens) |
7013 | { |
7014 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
7015 | tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); |
7016 | internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in); |
7017 | |
7018 | int ncopies; |
7019 | if (slp_node) |
7020 | ncopies = 1; |
7021 | else |
7022 | ncopies = vect_get_num_copies (loop_vinfo, vectype_in); |
7023 | |
7024 | gcc_assert (!nested_in_vect_loop_p (loop, stmt_info)); |
7025 | gcc_assert (ncopies == 1); |
7026 | |
7027 | bool is_cond_op = false; |
7028 | if (!code.is_tree_code ()) |
7029 | { |
7030 | code = conditional_internal_fn_code (internal_fn (code)); |
7031 | gcc_assert (code != ERROR_MARK); |
7032 | is_cond_op = true; |
7033 | } |
7034 | |
7035 | gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op); |
7036 | |
7037 | if (slp_node) |
7038 | { |
7039 | if (is_cond_op) |
7040 | { |
7041 | if (dump_enabled_p ()) |
7042 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7043 | "fold-left reduction on SLP not supported.\n" ); |
7044 | return false; |
7045 | } |
7046 | |
7047 | gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), |
7048 | TYPE_VECTOR_SUBPARTS (vectype_in))); |
7049 | } |
7050 | |
7051 | /* The operands either come from a binary operation or an IFN_COND operation. |
7052 | The former is a gimple assign with binary rhs and the latter is a |
7053 | gimple call with four arguments. */ |
7054 | gcc_assert (num_ops == 2 || num_ops == 4); |
7055 | tree op0, opmask; |
7056 | if (!is_cond_op) |
7057 | op0 = ops[1 - reduc_index]; |
7058 | else |
7059 | { |
7060 | op0 = ops[2]; |
7061 | opmask = ops[0]; |
7062 | gcc_assert (!slp_node); |
7063 | } |
7064 | |
7065 | int group_size = 1; |
7066 | stmt_vec_info scalar_dest_def_info; |
7067 | auto_vec<tree> vec_oprnds0, vec_opmask; |
7068 | if (slp_node) |
7069 | { |
7070 | auto_vec<vec<tree> > vec_defs (2); |
7071 | vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs); |
vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7073 | vec_defs[0].release (); |
7074 | vec_defs[1].release (); |
7075 | group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); |
7076 | scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; |
7077 | } |
7078 | else |
7079 | { |
vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
op0, &vec_oprnds0);
7082 | scalar_dest_def_info = stmt_info; |
7083 | |
7084 | /* For an IFN_COND_OP we also need the vector mask operand. */ |
7085 | if (is_cond_op) |
vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
opmask, &vec_opmask);
7088 | } |
7089 | |
7090 | gimple *sdef = scalar_dest_def_info->stmt; |
7091 | tree scalar_dest = gimple_get_lhs (sdef); |
7092 | tree scalar_type = TREE_TYPE (scalar_dest); |
tree reduc_var = gimple_phi_result (reduc_def_stmt);
7094 | |
7095 | int vec_num = vec_oprnds0.length (); |
7096 | gcc_assert (vec_num == 1 || slp_node); |
7097 | tree vec_elem_type = TREE_TYPE (vectype_out); |
7098 | gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type)); |
7099 | |
7100 | tree vector_identity = NULL_TREE; |
7101 | if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) |
7102 | { |
7103 | vector_identity = build_zero_cst (vectype_out); |
7104 | if (!HONOR_SIGNED_ZEROS (vectype_out)) |
7105 | ; |
7106 | else |
7107 | { |
7108 | gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out)); |
7109 | vector_identity = const_unop (NEGATE_EXPR, vectype_out, |
7110 | vector_identity); |
7111 | } |
7112 | } |
7113 | |
7114 | tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL); |
7115 | int i; |
7116 | tree def0; |
7117 | FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) |
7118 | { |
7119 | gimple *new_stmt; |
7120 | tree mask = NULL_TREE; |
7121 | tree len = NULL_TREE; |
7122 | tree bias = NULL_TREE; |
7123 | if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) |
7124 | mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i); |
7125 | else if (is_cond_op) |
7126 | mask = vec_opmask[0]; |
7127 | if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) |
7128 | { |
7129 | len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in, |
7130 | i, 1); |
7131 | signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); |
7132 | bias = build_int_cst (intQI_type_node, biasval); |
7133 | if (!is_cond_op) |
7134 | mask = build_minus_one_cst (truth_type_for (vectype_in)); |
7135 | } |
7136 | |
7137 | /* Handle MINUS by adding the negative. */ |
7138 | if (reduc_fn != IFN_LAST && code == MINUS_EXPR) |
7139 | { |
tree negated = make_ssa_name (vectype_out);
7141 | new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0); |
7142 | gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); |
7143 | def0 = negated; |
7144 | } |
7145 | |
7146 | if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
7147 | && mask && mask_reduc_fn == IFN_LAST) |
def0 = merge_with_identity (gsi, mask, vectype_out, def0,
vector_identity);
7150 | |
7151 | /* On the first iteration the input is simply the scalar phi |
7152 | result, and for subsequent iterations it is the output of |
7153 | the preceding operation. */ |
7154 | if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST)) |
7155 | { |
7156 | if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS) |
7157 | new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var, |
7158 | def0, mask, len, bias); |
7159 | else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS) |
7160 | new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var, |
7161 | def0, mask); |
7162 | else |
7163 | new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, |
7164 | def0); |
7165 | /* For chained SLP reductions the output of the previous reduction |
7166 | operation serves as the input of the next. For the final statement |
7167 | the output cannot be a temporary - we reuse the original |
7168 | scalar destination of the last statement. */ |
7169 | if (i != vec_num - 1) |
7170 | { |
7171 | gimple_set_lhs (new_stmt, scalar_dest_var); |
reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7173 | gimple_set_lhs (new_stmt, reduc_var); |
7174 | } |
7175 | } |
7176 | else |
7177 | { |
reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
tree_code (code), reduc_var, def0,
7180 | mask); |
7181 | new_stmt = SSA_NAME_DEF_STMT (reduc_var); |
7182 | /* Remove the statement, so that we can use the same code paths |
7183 | as for statements that we've just created. */ |
7184 | gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt); |
7185 | gsi_remove (&tmp_gsi, true); |
7186 | } |
7187 | |
7188 | if (i == vec_num - 1) |
7189 | { |
7190 | gimple_set_lhs (new_stmt, scalar_dest); |
7191 | vect_finish_replace_stmt (loop_vinfo, |
7192 | scalar_dest_def_info, |
7193 | new_stmt); |
7194 | } |
7195 | else |
7196 | vect_finish_stmt_generation (loop_vinfo, |
7197 | scalar_dest_def_info, |
7198 | new_stmt, gsi); |
7199 | |
7200 | if (slp_node) |
slp_node->push_vec_def (new_stmt);
7202 | else |
7203 | { |
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7205 | *vec_stmt = new_stmt; |
7206 | } |
7207 | } |
7208 | |
7209 | return true; |
7210 | } |
7211 | |
7212 | /* Function is_nonwrapping_integer_induction. |
7213 | |
Check whether STMT_VINFO (which is part of loop LOOP) is an integer
induction that increments and does not overflow.  */
7216 | |
7217 | static bool |
7218 | is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop) |
7219 | { |
gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7221 | tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo); |
7222 | tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo); |
7223 | tree lhs_type = TREE_TYPE (gimple_phi_result (phi)); |
7224 | widest_int ni, max_loop_value, lhs_max; |
7225 | wi::overflow_type overflow = wi::OVF_NONE; |
7226 | |
7227 | /* Make sure the loop is integer based. */ |
7228 | if (TREE_CODE (base) != INTEGER_CST |
7229 | || TREE_CODE (step) != INTEGER_CST) |
7230 | return false; |
7231 | |
7232 | /* Check that the max size of the loop will not wrap. */ |
7233 | |
7234 | if (TYPE_OVERFLOW_UNDEFINED (lhs_type)) |
7235 | return true; |
7236 | |
7237 | if (! max_stmt_executions (loop, &ni)) |
7238 | return false; |
7239 | |
max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
&overflow);
7242 | if (overflow) |
7243 | return false; |
7244 | |
max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
TYPE_SIGN (lhs_type), &overflow);
7247 | if (overflow) |
7248 | return false; |
7249 | |
return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7251 | <= TYPE_PRECISION (lhs_type)); |
7252 | } |
7253 | |
7254 | /* Check if masking can be supported by inserting a conditional expression. |
7255 | CODE is the code for the operation. COND_FN is the conditional internal |
7256 | function, if it exists. VECTYPE_IN is the type of the vector input. */ |
7257 | static bool |
7258 | use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn, |
7259 | tree vectype_in) |
7260 | { |
7261 | if (cond_fn != IFN_LAST |
7262 | && direct_internal_fn_supported_p (cond_fn, vectype_in, |
7263 | OPTIMIZE_FOR_SPEED)) |
7264 | return false; |
7265 | |
7266 | if (code.is_tree_code ()) |
7267 | switch (tree_code (code)) |
7268 | { |
7269 | case DOT_PROD_EXPR: |
7270 | case SAD_EXPR: |
7271 | return true; |
7272 | |
7273 | default: |
7274 | break; |
7275 | } |
7276 | return false; |
7277 | } |
7278 | |
7279 | /* Insert a conditional expression to enable masked vectorization. CODE is the |
7280 | code for the operation. VOP is the array of operands. MASK is the loop |
7281 | mask. GSI is a statement iterator used to place the new conditional |
7282 | expression. */ |
7283 | static void |
7284 | build_vect_cond_expr (code_helper code, tree vop[3], tree mask, |
7285 | gimple_stmt_iterator *gsi) |
7286 | { |
7287 | switch (tree_code (code)) |
7288 | { |
7289 | case DOT_PROD_EXPR: |
7290 | { |
7291 | tree vectype = TREE_TYPE (vop[1]); |
7292 | tree zero = build_zero_cst (vectype); |
tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7294 | gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, |
7295 | mask, vop[1], zero); |
7296 | gsi_insert_before (gsi, select, GSI_SAME_STMT); |
7297 | vop[1] = masked_op1; |
7298 | break; |
7299 | } |
7300 | |
7301 | case SAD_EXPR: |
7302 | { |
7303 | tree vectype = TREE_TYPE (vop[1]); |
tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7305 | gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, |
7306 | mask, vop[1], vop[0]); |
7307 | gsi_insert_before (gsi, select, GSI_SAME_STMT); |
7308 | vop[1] = masked_op1; |
7309 | break; |
7310 | } |
7311 | |
7312 | default: |
7313 | gcc_unreachable (); |
7314 | } |
7315 | } |
7316 | |
7317 | /* Function vectorizable_reduction. |
7318 | |
7319 | Check if STMT_INFO performs a reduction operation that can be vectorized. |
7320 | If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized |
7321 | stmt to replace it, put it in VEC_STMT, and insert it at GSI. |
7322 | Return true if STMT_INFO is vectorizable in this way. |
7323 | |
7324 | This function also handles reduction idioms (patterns) that have been |
7325 | recognized in advance during vect_pattern_recog. In this case, STMT_INFO |
7326 | may be of this form: |
7327 | X = pattern_expr (arg0, arg1, ..., X) |
7328 | and its STMT_VINFO_RELATED_STMT points to the last stmt in the original |
7329 | sequence that had been detected and replaced by the pattern-stmt |
7330 | (STMT_INFO). |
7331 | |
7332 | This function also handles reduction of condition expressions, for example: |
7333 | for (int i = 0; i < N; i++) |
7334 | if (a[i] < value) |
7335 | last = a[i]; |
7336 | This is handled by vectorising the loop and creating an additional vector |
7337 | containing the loop indexes for which "a[i] < value" was true. In the |
7338 | function epilogue this is reduced to a single max value and then used to |
7339 | index into the vector of results. |
7340 | |
7341 | In some cases of reduction patterns, the type of the reduction variable X is |
7342 | different than the type of the other arguments of STMT_INFO. |
7343 | In such cases, the vectype that is used when transforming STMT_INFO into |
7344 | a vector stmt is different than the vectype that is used to determine the |
7345 | vectorization factor, because it consists of a different number of elements |
7346 | than the actual number of elements that are being operated upon in parallel. |
7347 | |
7348 | For example, consider an accumulation of shorts into an int accumulator. |
7349 | On some targets it's possible to vectorize this pattern operating on 8 |
7350 | shorts at a time (hence, the vectype for purposes of determining the |
7351 | vectorization factor should be V8HI); on the other hand, the vectype that |
7352 | is used to create the vector form is actually V4SI (the type of the result). |
7353 | |
7354 | Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that |
7355 | indicates what is the actual level of parallelism (V8HI in the example), so |
7356 | that the right vectorization factor would be derived. This vectype |
7357 | corresponds to the type of arguments to the reduction stmt, and should *NOT* |
7358 | be used to create the vectorized stmt. The right vectype for the vectorized |
7359 | stmt is obtained from the type of the result X: |
7360 | get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) |
7361 | |
7362 | This means that, contrary to "regular" reductions (or "regular" stmts in |
7363 | general), the following equation: |
7364 | STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) |
7365 | does *NOT* necessarily hold for reduction patterns. */ |
7366 | |
7367 | bool |
7368 | vectorizable_reduction (loop_vec_info loop_vinfo, |
7369 | stmt_vec_info stmt_info, slp_tree slp_node, |
7370 | slp_instance slp_node_instance, |
7371 | stmt_vector_for_cost *cost_vec) |
7372 | { |
7373 | tree vectype_in = NULL_TREE; |
7374 | tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE }; |
7375 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
7376 | enum vect_def_type cond_reduc_dt = vect_unknown_def_type; |
7377 | stmt_vec_info cond_stmt_vinfo = NULL; |
7378 | int i; |
7379 | int ncopies; |
7380 | bool single_defuse_cycle = false; |
7381 | bool nested_cycle = false; |
7382 | bool double_reduc = false; |
7383 | int vec_num; |
7384 | tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; |
7385 | tree cond_reduc_val = NULL_TREE; |
7386 | |
7387 | /* Make sure it was already recognized as a reduction computation. */ |
7388 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def |
7389 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def |
7390 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) |
7391 | return false; |
7392 | |
7393 | /* The stmt we store reduction analysis meta on. */ |
stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7395 | reduc_info->is_reduc_info = true; |
7396 | |
7397 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) |
7398 | { |
if (is_a <gphi *> (stmt_info->stmt))
7400 | { |
7401 | if (slp_node) |
7402 | { |
7403 | /* We eventually need to set a vector type on invariant |
7404 | arguments. */ |
7405 | unsigned j; |
7406 | slp_tree child; |
7407 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) |
7408 | if (!vect_maybe_update_slp_op_vectype |
7409 | (child, SLP_TREE_VECTYPE (slp_node))) |
7410 | { |
7411 | if (dump_enabled_p ()) |
7412 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7413 | "incompatible vector types for " |
7414 | "invariants\n" ); |
7415 | return false; |
7416 | } |
7417 | } |
7418 | /* Analysis for double-reduction is done on the outer |
7419 | loop PHI, nested cycles have no further restrictions. */ |
7420 | STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; |
7421 | } |
7422 | else |
7423 | STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; |
7424 | return true; |
7425 | } |
7426 | |
7427 | stmt_vec_info orig_stmt_of_analysis = stmt_info; |
7428 | stmt_vec_info phi_info = stmt_info; |
if (!is_a <gphi *> (stmt_info->stmt))
7430 | { |
7431 | STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; |
7432 | return true; |
7433 | } |
7434 | if (slp_node) |
7435 | { |
7436 | slp_node_instance->reduc_phis = slp_node; |
7437 | /* ??? We're leaving slp_node to point to the PHIs, we only |
7438 | need it to get at the number of vector stmts which wasn't |
7439 | yet initialized for the instance root. */ |
7440 | } |
7441 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) |
7442 | { |
7443 | use_operand_p use_p; |
7444 | gimple *use_stmt; |
bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
&use_p, &use_stmt);
7447 | gcc_assert (res); |
7448 | phi_info = loop_vinfo->lookup_stmt (use_stmt); |
7449 | } |
7450 | |
7451 | /* PHIs should not participate in patterns. */ |
7452 | gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); |
gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7454 | |
7455 | /* Verify following REDUC_IDX from the latch def leads us back to the PHI |
7456 | and compute the reduction chain length. Discover the real |
7457 | reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */ |
7458 | tree reduc_def |
7459 | = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, |
7460 | loop_latch_edge |
7461 | (gimple_bb (reduc_def_phi)->loop_father)); |
7462 | unsigned reduc_chain_length = 0; |
7463 | bool only_slp_reduc_chain = true; |
7464 | stmt_info = NULL; |
7465 | slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL; |
7466 | while (reduc_def != PHI_RESULT (reduc_def_phi)) |
7467 | { |
7468 | stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); |
stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7470 | if (STMT_VINFO_REDUC_IDX (vdef) == -1) |
7471 | { |
7472 | if (dump_enabled_p ()) |
7473 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7474 | "reduction chain broken by patterns.\n" ); |
7475 | return false; |
7476 | } |
7477 | if (!REDUC_GROUP_FIRST_ELEMENT (vdef)) |
7478 | only_slp_reduc_chain = false; |
7479 | /* For epilogue generation live members of the chain need |
7480 | to point back to the PHI via their original stmt for |
7481 | info_for_reduction to work. For SLP we need to look at |
7482 | all lanes here - even though we only will vectorize from |
7483 | the SLP node with live lane zero the other live lanes also |
7484 | need to be identified as part of a reduction to be able |
7485 | to skip code generation for them. */ |
7486 | if (slp_for_stmt_info) |
7487 | { |
7488 | for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info)) |
7489 | if (STMT_VINFO_LIVE_P (s)) |
7490 | STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info; |
7491 | } |
7492 | else if (STMT_VINFO_LIVE_P (vdef)) |
7493 | STMT_VINFO_REDUC_DEF (def) = phi_info; |
7494 | gimple_match_op op; |
7495 | if (!gimple_extract_op (vdef->stmt, &op)) |
7496 | { |
7497 | if (dump_enabled_p ()) |
7498 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7499 | "reduction chain includes unsupported" |
7500 | " statement type.\n" ); |
7501 | return false; |
7502 | } |
7503 | if (CONVERT_EXPR_CODE_P (op.code)) |
7504 | { |
7505 | if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))) |
7506 | { |
7507 | if (dump_enabled_p ()) |
7508 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7509 | "conversion in the reduction chain.\n" ); |
7510 | return false; |
7511 | } |
7512 | } |
7513 | else if (!stmt_info) |
7514 | /* First non-conversion stmt. */ |
7515 | stmt_info = vdef; |
7516 | reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)]; |
7517 | reduc_chain_length++; |
7518 | if (!stmt_info && slp_node) |
7519 | slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; |
7520 | } |
7521 | /* PHIs should not participate in patterns. */ |
7522 | gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); |
7523 | |
7524 | if (nested_in_vect_loop_p (loop, stmt_info)) |
7525 | { |
7526 | loop = loop->inner; |
7527 | nested_cycle = true; |
7528 | } |
7529 | |
7530 | /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last |
7531 | element. */ |
7532 | if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
7533 | { |
7534 | gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info)); |
7535 | stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); |
7536 | } |
7537 | if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
7538 | gcc_assert (slp_node |
7539 | && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); |
7540 | |
7541 | /* 1. Is vectorizable reduction? */ |
7542 | /* Not supportable if the reduction variable is used in the loop, unless |
7543 | it's a reduction chain. */ |
7544 | if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer |
7545 | && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
7546 | return false; |
7547 | |
/* Reductions that are not used even in an enclosing outer-loop
7549 | are expected to be "live" (used out of the loop). */ |
7550 | if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope |
7551 | && !STMT_VINFO_LIVE_P (stmt_info)) |
7552 | return false; |
7553 | |
7554 | /* 2. Has this been recognized as a reduction pattern? |
7555 | |
7556 | Check if STMT represents a pattern that has been recognized |
7557 | in earlier analysis stages. For stmts that represent a pattern, |
7558 | the STMT_VINFO_RELATED_STMT field records the last stmt in |
7559 | the original sequence that constitutes the pattern. */ |
7560 | |
7561 | stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); |
7562 | if (orig_stmt_info) |
7563 | { |
7564 | gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); |
7565 | gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); |
7566 | } |
7567 | |
7568 | /* 3. Check the operands of the operation. The first operands are defined |
7569 | inside the loop body. The last operand is the reduction variable, |
7570 | which is defined by the loop-header-phi. */ |
7571 | |
7572 | tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); |
7573 | STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; |
7574 | gimple_match_op op; |
7575 | if (!gimple_extract_op (stmt_info->stmt, &op)) |
7576 | gcc_unreachable (); |
7577 | bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR |
7578 | || op.code == WIDEN_SUM_EXPR |
7579 | || op.code == SAD_EXPR); |
7580 | |
7581 | if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type) |
7582 | && !SCALAR_FLOAT_TYPE_P (op.type)) |
7583 | return false; |
7584 | |
7585 | /* Do not try to vectorize bit-precision reductions. */ |
if (!type_has_mode_precision_p (op.type))
7587 | return false; |
7588 | |
7589 | /* For lane-reducing ops we're reducing the number of reduction PHIs |
7590 | which means the only use of that may be in the lane-reducing operation. */ |
7591 | if (lane_reduc_code_p |
7592 | && reduc_chain_length != 1 |
7593 | && !only_slp_reduc_chain) |
7594 | { |
7595 | if (dump_enabled_p ()) |
7596 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7597 | "lane-reducing reduction with extra stmts.\n" ); |
7598 | return false; |
7599 | } |
7600 | |
7601 | /* All uses but the last are expected to be defined in the loop. |
7602 | The last use is the reduction variable. In case of nested cycle this |
7603 | assumption is not true: we use reduc_index to record the index of the |
7604 | reduction variable. */ |
7605 | slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops); |
7606 | /* We need to skip an extra operand for COND_EXPRs with embedded |
7607 | comparison. */ |
7608 | unsigned opno_adjust = 0; |
7609 | if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0])) |
7610 | opno_adjust = 1; |
7611 | for (i = 0; i < (int) op.num_ops; i++) |
7612 | { |
7613 | /* The condition of COND_EXPR is checked in vectorizable_condition(). */ |
7614 | if (i == 0 && op.code == COND_EXPR) |
7615 | continue; |
7616 | |
7617 | stmt_vec_info def_stmt_info; |
7618 | enum vect_def_type dt; |
7619 | if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info, |
7620 | i + opno_adjust, &op.ops[i], &slp_op[i], &dt, |
7621 | &vectype_op[i], &def_stmt_info)) |
7622 | { |
7623 | if (dump_enabled_p ()) |
7624 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7625 | "use not simple.\n" ); |
7626 | return false; |
7627 | } |
7628 | if (i == STMT_VINFO_REDUC_IDX (stmt_info)) |
7629 | continue; |
7630 | |
7631 | /* For an IFN_COND_OP we might hit the reduction definition operand |
7632 | twice (once as definition, once as else). */ |
7633 | if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)]) |
7634 | continue; |
7635 | |
7636 | /* There should be only one cycle def in the stmt, the one |
7637 | leading to reduc_def. */ |
7638 | if (VECTORIZABLE_CYCLE_DEF (dt)) |
7639 | return false; |
7640 | |
7641 | if (!vectype_op[i]) |
7642 | vectype_op[i] |
7643 | = get_vectype_for_scalar_type (loop_vinfo, |
7644 | TREE_TYPE (op.ops[i]), slp_op[i]); |
7645 | |
7646 | /* To properly compute ncopies we are interested in the widest |
7647 | non-reduction input type in case we're looking at a widening |
7648 | accumulation that we later handle in vect_transform_reduction. */ |
7649 | if (lane_reduc_code_p |
7650 | && vectype_op[i] |
7651 | && (!vectype_in |
7652 | || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) |
7653 | < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i])))))) |
7654 | vectype_in = vectype_op[i]; |
7655 | |
7656 | if (op.code == COND_EXPR) |
7657 | { |
7658 | /* Record how the non-reduction-def value of COND_EXPR is defined. */ |
7659 | if (dt == vect_constant_def) |
7660 | { |
7661 | cond_reduc_dt = dt; |
7662 | cond_reduc_val = op.ops[i]; |
7663 | } |
7664 | if (dt == vect_induction_def |
7665 | && def_stmt_info |
&& is_nonwrapping_integer_induction (def_stmt_info, loop))
7667 | { |
7668 | cond_reduc_dt = dt; |
7669 | cond_stmt_vinfo = def_stmt_info; |
7670 | } |
7671 | } |
7672 | } |
7673 | if (!vectype_in) |
7674 | vectype_in = STMT_VINFO_VECTYPE (phi_info); |
7675 | STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; |
7676 | |
7677 | enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); |
7678 | STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; |
7679 | /* If we have a condition reduction, see if we can simplify it further. */ |
7680 | if (v_reduc_type == COND_REDUCTION) |
7681 | { |
7682 | if (slp_node) |
7683 | return false; |
7684 | |
7685 | /* When the condition uses the reduction value in the condition, fail. */ |
7686 | if (STMT_VINFO_REDUC_IDX (stmt_info) == 0) |
7687 | { |
7688 | if (dump_enabled_p ()) |
7689 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7690 | "condition depends on previous iteration\n" ); |
7691 | return false; |
7692 | } |
7693 | |
7694 | if (reduc_chain_length == 1 |
7695 | && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in, |
7696 | OPTIMIZE_FOR_SPEED) |
7697 | || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST, |
7698 | vectype_in, |
7699 | OPTIMIZE_FOR_SPEED))) |
7700 | { |
7701 | if (dump_enabled_p ()) |
7702 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7703 | "optimizing condition reduction with" |
7704 | " FOLD_EXTRACT_LAST.\n" ); |
7705 | STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION; |
7706 | } |
7707 | else if (cond_reduc_dt == vect_induction_def) |
7708 | { |
7709 | tree base |
7710 | = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); |
7711 | tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); |
7712 | |
7713 | gcc_assert (TREE_CODE (base) == INTEGER_CST |
7714 | && TREE_CODE (step) == INTEGER_CST); |
7715 | cond_reduc_val = NULL_TREE; |
7716 | enum tree_code cond_reduc_op_code = ERROR_MARK; |
7717 | tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo)); |
7718 | if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base))) |
7719 | ; |
7720 | /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR |
7721 | above base; punt if base is the minimum value of the type for |
7722 | MAX_EXPR or maximum value of the type for MIN_EXPR for now. */ |
7723 | else if (tree_int_cst_sgn (step) == -1) |
7724 | { |
7725 | cond_reduc_op_code = MIN_EXPR; |
7726 | if (tree_int_cst_sgn (base) == -1) |
7727 | cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); |
else if (tree_int_cst_lt (base,
TYPE_MAX_VALUE (TREE_TYPE (base))))
7730 | cond_reduc_val |
7731 | = int_const_binop (PLUS_EXPR, base, integer_one_node); |
7732 | } |
7733 | else |
7734 | { |
7735 | cond_reduc_op_code = MAX_EXPR; |
7736 | if (tree_int_cst_sgn (base) == 1) |
7737 | cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); |
7738 | else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)), |
base))
7740 | cond_reduc_val |
7741 | = int_const_binop (MINUS_EXPR, base, integer_one_node); |
7742 | } |
7743 | if (cond_reduc_val) |
7744 | { |
7745 | if (dump_enabled_p ()) |
7746 | dump_printf_loc (MSG_NOTE, vect_location, |
7747 | "condition expression based on " |
7748 | "integer induction.\n" ); |
7749 | STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code; |
7750 | STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) |
7751 | = cond_reduc_val; |
7752 | STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION; |
7753 | } |
7754 | } |
7755 | else if (cond_reduc_dt == vect_constant_def) |
7756 | { |
7757 | enum vect_def_type cond_initial_dt; |
tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7759 | vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); |
7760 | if (cond_initial_dt == vect_constant_def |
7761 | && types_compatible_p (TREE_TYPE (cond_initial_val), |
7762 | TREE_TYPE (cond_reduc_val))) |
7763 | { |
7764 | tree e = fold_binary (LE_EXPR, boolean_type_node, |
7765 | cond_initial_val, cond_reduc_val); |
7766 | if (e && (integer_onep (e) || integer_zerop (e))) |
7767 | { |
7768 | if (dump_enabled_p ()) |
7769 | dump_printf_loc (MSG_NOTE, vect_location, |
7770 | "condition expression based on " |
7771 | "compile time constant.\n" ); |
7772 | /* Record reduction code at analysis stage. */ |
7773 | STMT_VINFO_REDUC_CODE (reduc_info) |
7774 | = integer_onep (e) ? MAX_EXPR : MIN_EXPR; |
7775 | STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION; |
7776 | } |
7777 | } |
7778 | } |
7779 | } |
7780 | |
7781 | if (STMT_VINFO_LIVE_P (phi_info)) |
7782 | return false; |
7783 | |
7784 | if (slp_node) |
7785 | ncopies = 1; |
7786 | else |
ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7788 | |
7789 | gcc_assert (ncopies >= 1); |
7790 | |
poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7792 | |
7793 | if (nested_cycle) |
7794 | { |
7795 | gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) |
7796 | == vect_double_reduction_def); |
7797 | double_reduc = true; |
7798 | } |
7799 | |
7800 | /* 4.2. Check support for the epilog operation. |
7801 | |
7802 | If STMT represents a reduction pattern, then the type of the |
7803 | reduction variable may be different than the type of the rest |
7804 | of the arguments. For example, consider the case of accumulation |
of shorts into an int accumulator.  The original code:
7806 | S1: int_a = (int) short_a; |
7807 | orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; |
7808 | |
7809 | was replaced with: |
7810 | STMT: int_acc = widen_sum <short_a, int_acc> |
7811 | |
7812 | This means that: |
7813 | 1. The tree-code that is used to create the vector operation in the |
7814 | epilog code (that reduces the partial results) is not the |
7815 | tree-code of STMT, but is rather the tree-code of the original |
7816 | stmt from the pattern that STMT is replacing. I.e, in the example |
7817 | above we want to use 'widen_sum' in the loop, but 'plus' in the |
7818 | epilog. |
7819 | 2. The type (mode) we use to check available target support |
7820 | for the vector operation to be created in the *epilog*, is |
7821 | determined by the type of the reduction variable (in the example |
7822 | above we'd check this: optab_handler (plus_optab, vect_int_mode])). |
7823 | However the type (mode) we use to check available target support |
7824 | for the vector operation to be created *inside the loop*, is |
7825 | determined by the type of the other arguments to STMT (in the |
7826 | example we'd check this: optab_handler (widen_sum_optab, |
7827 | vect_short_mode)). |
7828 | |
7829 | This is contrary to "regular" reductions, in which the types of all |
7830 | the arguments are the same as the type of the reduction variable. |
7831 | For "regular" reductions we can therefore use the same vector type |
7832 | (and also the same tree-code) when generating the epilog code and |
7833 | when generating the code inside the loop. */ |
7834 | |
7835 | code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info); |
7836 | |
/* If-conversion may already have created a conditional operation like
IFN_COND_ADD.  Use the internal code for the following checks.  */
7839 | if (orig_code.is_internal_fn ()) |
7840 | { |
7841 | tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code)); |
7842 | orig_code = new_code != ERROR_MARK ? new_code : orig_code; |
7843 | } |
7844 | |
7845 | STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; |
7846 | |
7847 | vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); |
7848 | if (reduction_type == TREE_CODE_REDUCTION) |
7849 | { |
7850 | /* Check whether it's ok to change the order of the computation. |
7851 | Generally, when vectorizing a reduction we change the order of the |
7852 | computation. This may change the behavior of the program in some |
7853 | cases, so we need to check that this is ok. One exception is when |
7854 | vectorizing an outer-loop: the inner-loop is executed sequentially, |
7855 | and therefore vectorizing reductions in the inner-loop during |
7856 | outer-loop vectorization is safe. Likewise when we are vectorizing |
7857 | a series of reductions using SLP and the VF is one the reductions |
7858 | are performed in scalar order. */ |
7859 | if (slp_node |
7860 | && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) |
7861 | && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u)) |
7862 | ; |
else if (needs_fold_left_reduction_p (op.type, orig_code))
7864 | { |
7865 | /* When vectorizing a reduction chain w/o SLP the reduction PHI |
is not directly used in stmt.  */
7867 | if (!only_slp_reduc_chain |
7868 | && reduc_chain_length != 1) |
7869 | { |
7870 | if (dump_enabled_p ()) |
7871 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7872 | "in-order reduction chain without SLP.\n" ); |
7873 | return false; |
7874 | } |
7875 | STMT_VINFO_REDUC_TYPE (reduc_info) |
7876 | = reduction_type = FOLD_LEFT_REDUCTION; |
7877 | } |
7878 | else if (!commutative_binary_op_p (orig_code, op.type) |
7879 | || !associative_binary_op_p (orig_code, op.type)) |
7880 | { |
7881 | if (dump_enabled_p ()) |
7882 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7883 | "reduction: not commutative/associative\n" ); |
7884 | return false; |
7885 | } |
7886 | } |
7887 | |
7888 | if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) |
7889 | && ncopies > 1) |
7890 | { |
7891 | if (dump_enabled_p ()) |
7892 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7893 | "multiple types in double reduction or condition " |
7894 | "reduction or fold-left reduction.\n" ); |
7895 | return false; |
7896 | } |
7897 | |
7898 | internal_fn reduc_fn = IFN_LAST; |
7899 | if (reduction_type == TREE_CODE_REDUCTION |
7900 | || reduction_type == FOLD_LEFT_REDUCTION |
7901 | || reduction_type == INTEGER_INDUC_COND_REDUCTION |
7902 | || reduction_type == CONST_COND_REDUCTION) |
7903 | { |
7904 | if (reduction_type == FOLD_LEFT_REDUCTION |
? fold_left_reduction_fn (orig_code, &reduc_fn)
: reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7907 | { |
7908 | if (reduc_fn != IFN_LAST |
7909 | && !direct_internal_fn_supported_p (reduc_fn, vectype_out, |
7910 | OPTIMIZE_FOR_SPEED)) |
7911 | { |
7912 | if (dump_enabled_p ()) |
7913 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7914 | "reduc op not supported by target.\n" ); |
7915 | |
7916 | reduc_fn = IFN_LAST; |
7917 | } |
7918 | } |
7919 | else |
7920 | { |
7921 | if (!nested_cycle || double_reduc) |
7922 | { |
7923 | if (dump_enabled_p ()) |
7924 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7925 | "no reduc code for scalar code.\n" ); |
7926 | |
7927 | return false; |
7928 | } |
7929 | } |
7930 | } |
7931 | else if (reduction_type == COND_REDUCTION) |
7932 | { |
7933 | int scalar_precision |
7934 | = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type)); |
7935 | cr_index_scalar_type = make_unsigned_type (scalar_precision); |
7936 | cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type, |
7937 | vectype_out); |
7938 | |
7939 | if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, |
7940 | OPTIMIZE_FOR_SPEED)) |
7941 | reduc_fn = IFN_REDUC_MAX; |
7942 | } |
7943 | STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn; |
7944 | |
7945 | if (reduction_type != EXTRACT_LAST_REDUCTION |
7946 | && (!nested_cycle || double_reduc) |
7947 | && reduc_fn == IFN_LAST |
7948 | && !nunits_out.is_constant ()) |
7949 | { |
7950 | if (dump_enabled_p ()) |
7951 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7952 | "missing target support for reduction on" |
7953 | " variable-length vectors.\n" ); |
7954 | return false; |
7955 | } |
7956 | |
7957 | /* For SLP reductions, see if there is a neutral value we can use. */ |
7958 | tree neutral_op = NULL_TREE; |
7959 | if (slp_node) |
7960 | { |
7961 | tree initial_value = NULL_TREE; |
7962 | if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL) |
initial_value = vect_phi_initial_value (reduc_def_phi);
7964 | neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out), |
orig_code, initial_value);
7966 | } |
7967 | |
7968 | if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) |
7969 | { |
7970 | /* We can't support in-order reductions of code such as this: |
7971 | |
7972 | for (int i = 0; i < n1; ++i) |
7973 | for (int j = 0; j < n2; ++j) |
7974 | l += a[j]; |
7975 | |
7976 | since GCC effectively transforms the loop when vectorizing: |
7977 | |
7978 | for (int i = 0; i < n1 / VF; ++i) |
7979 | for (int j = 0; j < n2; ++j) |
7980 | for (int k = 0; k < VF; ++k) |
7981 | l += a[j]; |
7982 | |
7983 | which is a reassociation of the original operation. */ |
7984 | if (dump_enabled_p ()) |
7985 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7986 | "in-order double reduction not supported.\n" ); |
7987 | |
7988 | return false; |
7989 | } |
7990 | |
7991 | if (reduction_type == FOLD_LEFT_REDUCTION |
7992 | && slp_node |
7993 | && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
7994 | { |
7995 | /* We cannot use in-order reductions in this case because there is |
7996 | an implicit reassociation of the operations involved. */ |
7997 | if (dump_enabled_p ()) |
7998 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7999 | "in-order unchained SLP reductions not supported.\n" ); |
8000 | return false; |
8001 | } |
8002 | |
8003 | /* For double reductions, and for SLP reductions with a neutral value, |
8004 | we construct a variable-length initial vector by loading a vector |
8005 | full of the neutral value and then shift-and-inserting the start |
8006 | values into the low-numbered elements. */ |
8007 | if ((double_reduc || neutral_op) |
8008 | && !nunits_out.is_constant () |
8009 | && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT, |
8010 | vectype_out, OPTIMIZE_FOR_SPEED)) |
8011 | { |
8012 | if (dump_enabled_p ()) |
8013 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8014 | "reduction on variable-length vectors requires" |
8015 | " target support for a vector-shift-and-insert" |
8016 | " operation.\n" ); |
8017 | return false; |
8018 | } |
8019 | |
8020 | /* Check extra constraints for variable-length unchained SLP reductions. */ |
8021 | if (slp_node |
8022 | && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) |
8023 | && !nunits_out.is_constant ()) |
8024 | { |
8025 | /* We checked above that we could build the initial vector when |
8026 | there's a neutral element value. Check here for the case in |
8027 | which each SLP statement has its own initial value and in which |
8028 | that value needs to be repeated for every instance of the |
8029 | statement within the initial vector. */ |
8030 | unsigned int group_size = SLP_TREE_LANES (slp_node); |
8031 | if (!neutral_op |
8032 | && !can_duplicate_and_interleave_p (loop_vinfo, group_size, |
8033 | TREE_TYPE (vectype_out))) |
8034 | { |
8035 | if (dump_enabled_p ()) |
8036 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8037 | "unsupported form of SLP reduction for" |
8038 | " variable-length vectors: cannot build" |
8039 | " initial vector.\n" ); |
8040 | return false; |
8041 | } |
8042 | /* The epilogue code relies on the number of elements being a multiple |
8043 | of the group size. The duplicate-and-interleave approach to setting |
8044 | up the initial vector does too. */ |
if (!multiple_p (nunits_out, group_size))
8046 | { |
8047 | if (dump_enabled_p ()) |
8048 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8049 | "unsupported form of SLP reduction for" |
8050 | " variable-length vectors: the vector size" |
8051 | " is not a multiple of the number of results.\n" ); |
8052 | return false; |
8053 | } |
8054 | } |
8055 | |
8056 | if (reduction_type == COND_REDUCTION) |
8057 | { |
8058 | widest_int ni; |
8059 | |
8060 | if (! max_loop_iterations (loop, &ni)) |
8061 | { |
8062 | if (dump_enabled_p ()) |
8063 | dump_printf_loc (MSG_NOTE, vect_location, |
8064 | "loop count not known, cannot create cond " |
8065 | "reduction.\n" ); |
8066 | return false; |
8067 | } |
8068 | /* Convert backedges to iterations. */ |
8069 | ni += 1; |
8070 | |
8071 | /* The additional index will be the same type as the condition. Check |
8072 | that the loop can fit into this less one (because we'll use up the |
8073 | zero slot for when there are no matches). */ |
8074 | tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type); |
if (wi::geu_p (ni, wi::to_widest (max_index)))
8076 | { |
8077 | if (dump_enabled_p ()) |
8078 | dump_printf_loc (MSG_NOTE, vect_location, |
8079 | "loop size is greater than data size.\n" ); |
8080 | return false; |
8081 | } |
8082 | } |
8083 | |
8084 | /* In case the vectorization factor (VF) is bigger than the number |
8085 | of elements that we can fit in a vectype (nunits), we have to generate |
more than one vector stmt - i.e. - we need to "unroll" the
8087 | vector stmt by a factor VF/nunits. For more details see documentation |
8088 | in vectorizable_operation. */ |
8089 | |
8090 | /* If the reduction is used in an outer loop we need to generate |
8091 | VF intermediate results, like so (e.g. for ncopies=2): |
8092 | r0 = phi (init, r0) |
8093 | r1 = phi (init, r1) |
8094 | r0 = x0 + r0; |
8095 | r1 = x1 + r1; |
8096 | (i.e. we generate VF results in 2 registers). |
8097 | In this case we have a separate def-use cycle for each copy, and therefore |
8098 | for each copy we get the vector def for the reduction variable from the |
8099 | respective phi node created for this copy. |
8100 | |
8101 | Otherwise (the reduction is unused in the loop nest), we can combine |
8102 | together intermediate results, like so (e.g. for ncopies=2): |
8103 | r = phi (init, r) |
8104 | r = x0 + r; |
8105 | r = x1 + r; |
8106 | (i.e. we generate VF/2 results in a single register). |
8107 | In this case for each copy we get the vector def for the reduction variable |
8108 | from the vectorized reduction operation generated in the previous iteration. |
8109 | |
8110 | This only works when we see both the reduction PHI and its only consumer |
8111 | in vectorizable_reduction and there are no intermediate stmts |
8112 | participating. When unrolling we want each unrolled iteration to have its |
8113 | own reduction accumulator since one of the main goals of unrolling a |
8114 | reduction is to reduce the aggregate loop-carried latency. */ |
8115 | if (ncopies > 1 |
8116 | && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) |
8117 | && reduc_chain_length == 1 |
8118 | && loop_vinfo->suggested_unroll_factor == 1) |
8119 | single_defuse_cycle = true; |
8120 | |
8121 | if (single_defuse_cycle || lane_reduc_code_p) |
8122 | { |
8123 | gcc_assert (op.code != COND_EXPR); |
8124 | |
8125 | /* 4. Supportable by target? */ |
8126 | bool ok = true; |
8127 | |
8128 | /* 4.1. check support for the operation in the loop |
8129 | |
8130 | This isn't necessary for the lane reduction codes, since they |
8131 | can only be produced by pattern matching, and it's up to the |
8132 | pattern matcher to test for support. The main reason for |
8133 | specifically skipping this step is to avoid rechecking whether |
8134 | mixed-sign dot-products can be implemented using signed |
8135 | dot-products. */ |
8136 | machine_mode vec_mode = TYPE_MODE (vectype_in); |
8137 | if (!lane_reduc_code_p |
8138 | && !directly_supported_p (op.code, vectype_in, optab_vector)) |
8139 | { |
8140 | if (dump_enabled_p ()) |
8141 | dump_printf (MSG_NOTE, "op not supported by target.\n" ); |
if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8143 | || !vect_can_vectorize_without_simd_p (op.code)) |
8144 | ok = false; |
8145 | else |
8146 | if (dump_enabled_p ()) |
8147 | dump_printf (MSG_NOTE, "proceeding using word mode.\n" ); |
8148 | } |
8149 | |
8150 | if (vect_emulated_vector_p (vectype_in) |
8151 | && !vect_can_vectorize_without_simd_p (op.code)) |
8152 | { |
8153 | if (dump_enabled_p ()) |
8154 | dump_printf (MSG_NOTE, "using word mode not possible.\n" ); |
8155 | return false; |
8156 | } |
8157 | |
/* Lane-reducing operations have to go through vect_transform_reduction.
8159 | For the other cases try without the single cycle optimization. */ |
8160 | if (!ok) |
8161 | { |
8162 | if (lane_reduc_code_p) |
8163 | return false; |
8164 | else |
8165 | single_defuse_cycle = false; |
8166 | } |
8167 | } |
8168 | STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; |
8169 | |
8170 | /* If the reduction stmt is one of the patterns that have lane |
8171 | reduction embedded we cannot handle the case of ! single_defuse_cycle. */ |
8172 | if ((ncopies > 1 && ! single_defuse_cycle) |
8173 | && lane_reduc_code_p) |
8174 | { |
8175 | if (dump_enabled_p ()) |
8176 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8177 | "multi def-use cycle not possible for lane-reducing " |
8178 | "reduction operation\n" ); |
8179 | return false; |
8180 | } |
8181 | |
8182 | if (slp_node |
8183 | && !(!single_defuse_cycle |
8184 | && !lane_reduc_code_p |
8185 | && reduction_type != FOLD_LEFT_REDUCTION)) |
8186 | for (i = 0; i < (int) op.num_ops; i++) |
8187 | if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) |
8188 | { |
8189 | if (dump_enabled_p ()) |
8190 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8191 | "incompatible vector types for invariants\n" ); |
8192 | return false; |
8193 | } |
8194 | |
8195 | if (slp_node) |
8196 | vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
8197 | else |
8198 | vec_num = 1; |
8199 | |
8200 | vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, |
8201 | reduction_type, ncopies, cost_vec); |
8202 | /* Cost the reduction op inside the loop if transformed via |
8203 | vect_transform_reduction. Otherwise this is costed by the |
8204 | separate vectorizable_* routines. */ |
8205 | if (single_defuse_cycle || lane_reduc_code_p) |
8206 | { |
8207 | int factor = 1; |
8208 | if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info)) |
8209 | /* Three dot-products and a subtraction. */ |
8210 | factor = 4; |
record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
stmt_info, 0, vect_body);
8213 | } |
8214 | |
8215 | if (dump_enabled_p () |
8216 | && reduction_type == FOLD_LEFT_REDUCTION) |
8217 | dump_printf_loc (MSG_NOTE, vect_location, |
8218 | "using an in-order (fold-left) reduction.\n" ); |
8219 | STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; |
8220 | /* All but single defuse-cycle optimized, lane-reducing and fold-left |
8221 | reductions go through their own vectorizable_* routines. */ |
8222 | if (!single_defuse_cycle |
8223 | && !lane_reduc_code_p |
8224 | && reduction_type != FOLD_LEFT_REDUCTION) |
8225 | { |
8226 | stmt_vec_info tem |
8227 | = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); |
8228 | if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem)) |
8229 | { |
8230 | gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem)); |
8231 | tem = REDUC_GROUP_FIRST_ELEMENT (tem); |
8232 | } |
8233 | STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def; |
8234 | STMT_VINFO_DEF_TYPE (tem) = vect_internal_def; |
8235 | } |
8236 | else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) |
8237 | { |
8238 | vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); |
8239 | vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); |
8240 | internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type); |
8241 | |
8242 | if (reduction_type != FOLD_LEFT_REDUCTION |
&& !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8244 | && (cond_fn == IFN_LAST |
8245 | || !direct_internal_fn_supported_p (cond_fn, vectype_in, |
8246 | OPTIMIZE_FOR_SPEED))) |
8247 | { |
8248 | if (dump_enabled_p ()) |
8249 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8250 | "can't operate on partial vectors because" |
8251 | " no conditional operation is available.\n" ); |
8252 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
8253 | } |
8254 | else if (reduction_type == FOLD_LEFT_REDUCTION |
8255 | && reduc_fn == IFN_LAST |
8256 | && !expand_vec_cond_expr_p (vectype_in, |
8257 | truth_type_for (vectype_in), |
8258 | SSA_NAME)) |
8259 | { |
8260 | if (dump_enabled_p ()) |
8261 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8262 | "can't operate on partial vectors because" |
8263 | " no conditional operation is available.\n" ); |
8264 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
8265 | } |
8266 | else if (reduction_type == FOLD_LEFT_REDUCTION |
8267 | && internal_fn_mask_index (reduc_fn) == -1 |
8268 | && FLOAT_TYPE_P (vectype_in) |
8269 | && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in)) |
8270 | { |
8271 | if (dump_enabled_p ()) |
8272 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8273 | "can't operate on partial vectors because" |
8274 | " signed zeros cannot be preserved.\n" ); |
8275 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
8276 | } |
8277 | else |
8278 | { |
8279 | internal_fn mask_reduc_fn |
8280 | = get_masked_reduction_fn (reduc_fn, vectype_in); |
8281 | |
8282 | if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS) |
8283 | vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, |
8284 | vectype_in, 1); |
8285 | else |
8286 | vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, |
8287 | vectype_in, NULL); |
8288 | } |
8289 | } |
8290 | return true; |
8291 | } |
8292 | |
8293 | /* STMT_INFO is a dot-product reduction whose multiplication operands |
8294 | have different signs. Emit a sequence to emulate the operation |
8295 | using a series of signed DOT_PROD_EXPRs and return the last |
8296 | statement generated. VEC_DEST is the result of the vector operation |
8297 | and VOP lists its inputs. */ |
8298 | |
8299 | static gassign * |
8300 | vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, |
8301 | gimple_stmt_iterator *gsi, tree vec_dest, |
8302 | tree vop[3]) |
8303 | { |
8304 | tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest)); |
8305 | tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0])); |
8306 | tree narrow_elttype = TREE_TYPE (narrow_vectype); |
8307 | gimple *new_stmt; |
8308 | |
/* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
8310 | if (!TYPE_UNSIGNED (TREE_TYPE (vop[0]))) |
std::swap (vop[0], vop[1]);
8312 | |
8313 | /* Convert all inputs to signed types. */ |
8314 | for (int i = 0; i < 3; ++i) |
8315 | if (TYPE_UNSIGNED (TREE_TYPE (vop[i]))) |
8316 | { |
tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8318 | new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]); |
8319 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8320 | vop[i] = tmp; |
8321 | } |
8322 | |
8323 | /* In the comments below we assume 8-bit inputs for simplicity, |
8324 | but the approach works for any full integer type. */ |
8325 | |
8326 | /* Create a vector of -128. */ |
8327 | tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype); |
8328 | tree min_narrow = build_vector_from_val (narrow_vectype, |
8329 | min_narrow_elttype); |
8330 | |
8331 | /* Create a vector of 64. */ |
auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8334 | half_narrow = build_vector_from_val (narrow_vectype, half_narrow); |
8335 | |
8336 | /* Emit: SUB_RES = VOP[0] - 128. */ |
tree sub_res = make_ssa_name (narrow_vectype);
8338 | new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow); |
8339 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8340 | |
8341 | /* Emit: |
8342 | |
8343 | STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>; |
8344 | STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>; |
STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8346 | |
8347 | on the basis that x * y == (x - 128) * y + 64 * y + 64 * y |
8348 | Doing the two 64 * y steps first allows more time to compute x. */ |
tree stage1 = make_ssa_name (wide_vectype);
8350 | new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR, |
8351 | vop[1], half_narrow, vop[2]); |
8352 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8353 | |
tree stage2 = make_ssa_name (wide_vectype);
8355 | new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR, |
8356 | vop[1], half_narrow, stage1); |
8357 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8358 | |
tree stage3 = make_ssa_name (wide_vectype);
8360 | new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR, |
8361 | sub_res, vop[1], stage2); |
8362 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8363 | |
8364 | /* Convert STAGE3 to the reduction type. */ |
8365 | return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3); |
8366 | } |
8367 | |
8368 | /* Transform the definition stmt STMT_INFO of a reduction PHI backedge |
8369 | value. */ |
8370 | |
8371 | bool |
8372 | vect_transform_reduction (loop_vec_info loop_vinfo, |
8373 | stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, |
8374 | gimple **vec_stmt, slp_tree slp_node) |
8375 | { |
8376 | tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); |
8377 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
8378 | int i; |
8379 | int ncopies; |
8380 | int vec_num; |
8381 | |
stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8383 | gcc_assert (reduc_info->is_reduc_info); |
8384 | |
8385 | if (nested_in_vect_loop_p (loop, stmt_info)) |
8386 | { |
8387 | loop = loop->inner; |
8388 | gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def); |
8389 | } |
8390 | |
8391 | gimple_match_op op; |
8392 | if (!gimple_extract_op (stmt_info->stmt, &op)) |
8393 | gcc_unreachable (); |
8394 | |
8395 | /* All uses but the last are expected to be defined in the loop. |
8396 | The last use is the reduction variable. In case of nested cycle this |
8397 | assumption is not true: we use reduc_index to record the index of the |
8398 | reduction variable. */ |
8399 | stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); |
gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8401 | int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); |
8402 | tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); |
8403 | |
8404 | if (slp_node) |
8405 | { |
8406 | ncopies = 1; |
8407 | vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
8408 | } |
8409 | else |
8410 | { |
ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8412 | vec_num = 1; |
8413 | } |
8414 | |
8415 | code_helper code = canonicalize_code (op.code, op.type); |
8416 | internal_fn cond_fn = get_conditional_internal_fn (code, op.type); |
8417 | |
8418 | vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); |
8419 | vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); |
8420 | bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in); |
8421 | |
8422 | /* Transform. */ |
8423 | tree new_temp = NULL_TREE; |
8424 | auto_vec<tree> vec_oprnds0; |
8425 | auto_vec<tree> vec_oprnds1; |
8426 | auto_vec<tree> vec_oprnds2; |
8427 | tree def0; |
8428 | |
8429 | if (dump_enabled_p ()) |
8430 | dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n" ); |
8431 | |
8432 | /* FORNOW: Multiple types are not supported for condition. */ |
8433 | if (code == COND_EXPR) |
8434 | gcc_assert (ncopies == 1); |
8435 | |
8436 | /* A binary COND_OP reduction must have the same definition and else |
8437 | value. */ |
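/* I.e. the reduction statement has a form like
res = .COND_ADD (C, res, val, res)
so lanes where C is false simply pass the accumulator through. */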
8438 | bool cond_fn_p = code.is_internal_fn () |
8439 | && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK; |
8440 | if (cond_fn_p) |
8441 | { |
8442 | gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB |
8443 | || code == IFN_COND_MUL || code == IFN_COND_AND |
8444 | || code == IFN_COND_IOR || code == IFN_COND_XOR); |
8445 | gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3])); |
8446 | } |
8447 | |
8448 | bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); |
8449 | |
8450 | vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); |
8451 | if (reduction_type == FOLD_LEFT_REDUCTION) |
8452 | { |
8453 | internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); |
8454 | gcc_assert (code.is_tree_code () || cond_fn_p); |
8455 | return vectorize_fold_left_reduction |
8456 | (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_stmt: reduc_def_phi, |
8457 | code, reduc_fn, ops: op.ops, num_ops: op.num_ops, vectype_in, |
8458 | reduc_index, masks, lens); |
8459 | } |
8460 | |
8461 | bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); |
8462 | gcc_assert (single_defuse_cycle |
8463 | || code == DOT_PROD_EXPR |
8464 | || code == WIDEN_SUM_EXPR |
8465 | || code == SAD_EXPR); |
8466 | |
8467 | /* Create the destination vector */ |
8468 | tree scalar_dest = gimple_get_lhs (stmt_info->stmt); |
8469 | tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out); |
8470 | |
8471 | /* Get NCOPIES vector definitions for all operands except the reduction |
8472 | definition. */ |
8473 | vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, |
8474 | single_defuse_cycle && reduc_index == 0 |
8475 | ? NULL_TREE : op.ops[0], &vec_oprnds0, |
8476 | single_defuse_cycle && reduc_index == 1 |
8477 | ? NULL_TREE : op.ops[1], &vec_oprnds1, |
8478 | op.num_ops == 4 |
8479 | || (op.num_ops == 3 |
8480 | && !(single_defuse_cycle && reduc_index == 2)) |
8481 | ? op.ops[2] : NULL_TREE, &vec_oprnds2); |
8482 | |
8483 | /* For single def-use cycles get one copy of the vectorized reduction |
8484 | definition. */ |
8485 | if (single_defuse_cycle) |
8486 | { |
8487 | gcc_assert (!slp_node); |
8488 | vect_get_vec_defs_for_operand (vinfo: loop_vinfo, stmt_info, 1, |
8489 | op: op.ops[reduc_index], |
8490 | reduc_index == 0 ? &vec_oprnds0 |
8491 | : (reduc_index == 1 ? &vec_oprnds1 |
8492 | : &vec_oprnds2)); |
8493 | } |
8494 | |
8495 | bool emulated_mixed_dot_prod |
8496 | = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info); |
8497 | FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) |
8498 | { |
8499 | gimple *new_stmt; |
8500 | tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; |
8501 | if (masked_loop_p && !mask_by_cond_expr) |
8502 | { |
8503 | /* No conditional ifns have been defined for dot-product yet. */ |
8504 | gcc_assert (code != DOT_PROD_EXPR); |
8505 | |
8506 | /* Make sure that the reduction accumulator is vop[0]. */ |
8507 | if (reduc_index == 1) |
8508 | { |
8509 | gcc_assert (commutative_binary_op_p (code, op.type)); |
8510 | std::swap (a&: vop[0], b&: vop[1]); |
8511 | } |
8512 | tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, |
8513 | vec_num * ncopies, vectype_in, i); |
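/* The call below computes MASK ? VOP[0] OP VOP[1] : VOP[0], so inactive
lanes keep the previous value of the accumulator. */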
8514 | gcall *call = gimple_build_call_internal (cond_fn, 4, mask, |
8515 | vop[0], vop[1], vop[0]); |
8516 | new_temp = make_ssa_name (var: vec_dest, stmt: call); |
8517 | gimple_call_set_lhs (gs: call, lhs: new_temp); |
8518 | gimple_call_set_nothrow (s: call, nothrow_p: true); |
8519 | vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi); |
8520 | new_stmt = call; |
8521 | } |
8522 | else |
8523 | { |
8524 | if (op.num_ops >= 3) |
8525 | vop[2] = vec_oprnds2[i]; |
8526 | |
8527 | if (masked_loop_p && mask_by_cond_expr) |
8528 | { |
8529 | tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, |
8530 | vec_num * ncopies, vectype_in, i); |
8531 | build_vect_cond_expr (code, vop, mask, gsi); |
8532 | } |
8533 | |
8534 | if (emulated_mixed_dot_prod) |
8535 | new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi, |
8536 | vec_dest, vop); |
8537 | |
8538 | else if (code.is_internal_fn () && !cond_fn_p) |
8539 | new_stmt = gimple_build_call_internal (internal_fn (code), |
8540 | op.num_ops, |
8541 | vop[0], vop[1], vop[2]); |
8542 | else if (code.is_internal_fn () && cond_fn_p) |
8543 | new_stmt = gimple_build_call_internal (internal_fn (code), |
8544 | op.num_ops, |
8545 | vop[0], vop[1], vop[2], |
8546 | vop[1]); |
8547 | else |
8548 | new_stmt = gimple_build_assign (vec_dest, tree_code (op.code), |
8549 | vop[0], vop[1], vop[2]); |
8550 | new_temp = make_ssa_name (var: vec_dest, stmt: new_stmt); |
8551 | gimple_set_lhs (new_stmt, new_temp); |
8552 | vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi); |
8553 | } |
8554 | |
8555 | if (slp_node) |
8556 | slp_node->push_vec_def (def: new_stmt); |
8557 | else if (single_defuse_cycle |
8558 | && i < ncopies - 1) |
8559 | { |
8560 | if (reduc_index == 0) |
8561 | vec_oprnds0.safe_push (obj: gimple_get_lhs (new_stmt)); |
8562 | else if (reduc_index == 1) |
8563 | vec_oprnds1.safe_push (obj: gimple_get_lhs (new_stmt)); |
8564 | else if (reduc_index == 2) |
8565 | vec_oprnds2.safe_push (obj: gimple_get_lhs (new_stmt)); |
8566 | } |
8567 | else |
8568 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_stmt); |
8569 | } |
8570 | |
8571 | if (!slp_node) |
8572 | *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; |
8573 | |
8574 | return true; |
8575 | } |
8576 | |
8577 | /* Transform phase of a cycle PHI. */ |
8578 | |
8579 | bool |
8580 | vect_transform_cycle_phi (loop_vec_info loop_vinfo, |
8581 | stmt_vec_info stmt_info, gimple **vec_stmt, |
8582 | slp_tree slp_node, slp_instance slp_node_instance) |
8583 | { |
8584 | tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); |
8585 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
8586 | int i; |
8587 | int ncopies; |
8588 | int j; |
8589 | bool nested_cycle = false; |
8590 | int vec_num; |
8591 | |
8592 | if (nested_in_vect_loop_p (loop, stmt_info)) |
8593 | { |
8594 | loop = loop->inner; |
8595 | nested_cycle = true; |
8596 | } |
8597 | |
8598 | stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); |
8599 | reduc_stmt_info = vect_stmt_to_vectorize (stmt_info: reduc_stmt_info); |
8600 | stmt_vec_info reduc_info = info_for_reduction (vinfo: loop_vinfo, stmt_info); |
8601 | gcc_assert (reduc_info->is_reduc_info); |
8602 | |
8603 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION |
8604 | || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION) |
8605 | /* Leave the scalar phi in place. */ |
8606 | return true; |
8607 | |
8608 | tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); |
8609 | /* For a nested cycle we do not fill the above. */ |
8610 | if (!vectype_in) |
8611 | vectype_in = STMT_VINFO_VECTYPE (stmt_info); |
8612 | gcc_assert (vectype_in); |
8613 | |
8614 | if (slp_node) |
8615 | { |
8616 | /* The size vect_schedule_slp_instance computes is off for us. */ |
8617 | vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo) |
8618 | * SLP_TREE_LANES (slp_node), vectype: vectype_in); |
8619 | ncopies = 1; |
8620 | } |
8621 | else |
8622 | { |
8623 | vec_num = 1; |
8624 | ncopies = vect_get_num_copies (loop_vinfo, vectype: vectype_in); |
8625 | } |
8626 | |
8627 | /* Check whether we should use a single PHI node and accumulate |
8628 | vectors to one before the backedge. */ |
8629 | if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info)) |
8630 | ncopies = 1; |
8631 | |
8632 | /* Create the destination vector */ |
8633 | gphi *phi = as_a <gphi *> (p: stmt_info->stmt); |
8634 | tree vec_dest = vect_create_destination_var (gimple_phi_result (gs: phi), |
8635 | vectype_out); |
8636 | |
8637 | /* Get the loop-entry arguments. */ |
8638 | tree vec_initial_def = NULL_TREE; |
8639 | auto_vec<tree> vec_initial_defs; |
8640 | if (slp_node) |
8641 | { |
8642 | vec_initial_defs.reserve (nelems: vec_num); |
8643 | if (nested_cycle) |
8644 | { |
8645 | unsigned phi_idx = loop_preheader_edge (loop)->dest_idx; |
8646 | vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx], |
8647 | &vec_initial_defs); |
8648 | } |
8649 | else |
8650 | { |
8651 | gcc_assert (slp_node == slp_node_instance->reduc_phis); |
8652 | vec<tree> &initial_values = reduc_info->reduc_initial_values; |
8653 | vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node); |
8654 | |
8655 | unsigned int num_phis = stmts.length (); |
8656 | if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) |
8657 | num_phis = 1; |
8658 | initial_values.reserve (nelems: num_phis); |
8659 | for (unsigned int i = 0; i < num_phis; ++i) |
8660 | { |
8661 | gphi *this_phi = as_a<gphi *> (p: stmts[i]->stmt); |
8662 | initial_values.quick_push (obj: vect_phi_initial_value (phi: this_phi)); |
8663 | } |
8664 | if (vec_num == 1) |
8665 | vect_find_reusable_accumulator (loop_vinfo, reduc_info); |
8666 | if (!initial_values.is_empty ()) |
8667 | { |
8668 | tree initial_value |
8669 | = (num_phis == 1 ? initial_values[0] : NULL_TREE); |
8670 | code_helper code = STMT_VINFO_REDUC_CODE (reduc_info); |
8671 | tree neutral_op |
8672 | = neutral_op_for_reduction (TREE_TYPE (vectype_out), |
8673 | code, initial_value); |
8674 | get_initial_defs_for_reduction (loop_vinfo, reduc_info, |
8675 | vec_oprnds: &vec_initial_defs, number_of_vectors: vec_num, |
8676 | group_size: stmts.length (), neutral_op); |
8677 | } |
8678 | } |
8679 | } |
8680 | else |
8681 | { |
/* Get the scalar def before the loop that defines the initial
value of the reduction variable. */
8684 | tree initial_def = vect_phi_initial_value (phi); |
8685 | reduc_info->reduc_initial_values.safe_push (obj: initial_def); |
8686 | /* Optimize: if initial_def is for REDUC_MAX smaller than the base |
8687 | and we can't use zero for induc_val, use initial_def. Similarly |
8688 | for REDUC_MIN and initial_def larger than the base. */ |
8689 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) |
8690 | { |
8691 | tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); |
8692 | if (TREE_CODE (initial_def) == INTEGER_CST |
8693 | && !integer_zerop (induc_val) |
8694 | && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR |
8695 | && tree_int_cst_lt (t1: initial_def, t2: induc_val)) |
8696 | || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR |
8697 | && tree_int_cst_lt (t1: induc_val, t2: initial_def)))) |
8698 | { |
8699 | induc_val = initial_def; |
/* Communicate to epilogue generation that we used the
initial_def. */
8702 | STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE; |
8703 | } |
8704 | vec_initial_def = build_vector_from_val (vectype_out, induc_val); |
8705 | } |
8706 | else if (nested_cycle) |
8707 | { |
8708 | /* Do not use an adjustment def as that case is not supported |
8709 | correctly if ncopies is not one. */ |
8710 | vect_get_vec_defs_for_operand (vinfo: loop_vinfo, reduc_stmt_info, |
8711 | ncopies, op: initial_def, |
8712 | &vec_initial_defs); |
8713 | } |
8714 | else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION |
8715 | || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) |
8716 | /* Fill the initial vector with the initial scalar value. */ |
8717 | vec_initial_def |
8718 | = get_initial_def_for_reduction (loop_vinfo, reduc_info: reduc_stmt_info, |
8719 | init_val: initial_def, neutral_op: initial_def); |
8720 | else |
8721 | { |
8722 | if (ncopies == 1) |
8723 | vect_find_reusable_accumulator (loop_vinfo, reduc_info); |
8724 | if (!reduc_info->reduc_initial_values.is_empty ()) |
8725 | { |
8726 | initial_def = reduc_info->reduc_initial_values[0]; |
8727 | code_helper code = STMT_VINFO_REDUC_CODE (reduc_info); |
8728 | tree neutral_op |
8729 | = neutral_op_for_reduction (TREE_TYPE (initial_def), |
8730 | code, initial_value: initial_def); |
8731 | gcc_assert (neutral_op); |
8732 | /* Try to simplify the vector initialization by applying an |
8733 | adjustment after the reduction has been performed. */ |
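/* For example (illustration only): for a sum reduction with initial
value 10 the vector accumulator can start at the neutral value
{0, 0, ...} and the 10 is added back after the final reduction in
the epilogue. */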
8734 | if (!reduc_info->reused_accumulator |
8735 | && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
8736 | && !operand_equal_p (neutral_op, initial_def)) |
8737 | { |
8738 | STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) |
8739 | = initial_def; |
8740 | initial_def = neutral_op; |
8741 | } |
8742 | vec_initial_def |
8743 | = get_initial_def_for_reduction (loop_vinfo, reduc_info, |
8744 | init_val: initial_def, neutral_op); |
8745 | } |
8746 | } |
8747 | } |
8748 | |
8749 | if (vec_initial_def) |
8750 | { |
8751 | vec_initial_defs.create (nelems: ncopies); |
8752 | for (i = 0; i < ncopies; ++i) |
8753 | vec_initial_defs.quick_push (obj: vec_initial_def); |
8754 | } |
8755 | |
8756 | if (auto *accumulator = reduc_info->reused_accumulator) |
8757 | { |
8758 | tree def = accumulator->reduc_input; |
8759 | if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def))) |
8760 | { |
8761 | unsigned int nreduc; |
8762 | bool res = constant_multiple_p (a: TYPE_VECTOR_SUBPARTS |
8763 | (TREE_TYPE (def)), |
8764 | b: TYPE_VECTOR_SUBPARTS (node: vectype_out), |
8765 | multiple: &nreduc); |
8766 | gcc_assert (res); |
8767 | gimple_seq stmts = NULL; |
8768 | /* Reduce the single vector to a smaller one. */ |
8769 | if (nreduc != 1) |
8770 | { |
8771 | /* Perform the reduction in the appropriate type. */ |
8772 | tree rvectype = vectype_out; |
8773 | if (!useless_type_conversion_p (TREE_TYPE (vectype_out), |
8774 | TREE_TYPE (TREE_TYPE (def)))) |
8775 | rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)), |
8776 | TYPE_VECTOR_SUBPARTS |
8777 | (node: vectype_out)); |
8778 | def = vect_create_partial_epilog (vec_def: def, vectype: rvectype, |
8779 | STMT_VINFO_REDUC_CODE |
8780 | (reduc_info), |
8781 | seq: &stmts); |
8782 | } |
8783 | /* The epilogue loop might use a different vector mode, like |
8784 | VNx2DI vs. V2DI. */ |
8785 | if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def))) |
8786 | { |
8787 | tree reduc_type = build_vector_type_for_mode |
8788 | (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out)); |
8789 | def = gimple_convert (seq: &stmts, type: reduc_type, op: def); |
8790 | } |
8791 | /* Adjust the input so we pick up the partially reduced value |
8792 | for the skip edge in vect_create_epilog_for_reduction. */ |
8793 | accumulator->reduc_input = def; |
8794 | /* And the reduction could be carried out using a different sign. */ |
8795 | if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def))) |
8796 | def = gimple_convert (seq: &stmts, type: vectype_out, op: def); |
8797 | if (loop_vinfo->main_loop_edge) |
8798 | { |
/* While we'd like to insert on the edge, this would split
blocks and disturb bookkeeping; we will also eventually
need this on the skip edge. Rely on sinking to fix up
optimal placement and insert in the pred. */
8803 | gimple_stmt_iterator gsi |
8804 | = gsi_last_bb (bb: loop_vinfo->main_loop_edge->src); |
8805 | /* Insert before a cond that eventually skips the |
8806 | epilogue. */ |
8807 | if (!gsi_end_p (i: gsi) && stmt_ends_bb_p (gsi_stmt (i: gsi))) |
8808 | gsi_prev (i: &gsi); |
8809 | gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING); |
8810 | } |
8811 | else |
8812 | gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), |
8813 | stmts); |
8814 | } |
8815 | if (loop_vinfo->main_loop_edge) |
8816 | vec_initial_defs[0] |
8817 | = vect_get_main_loop_result (loop_vinfo, def, |
8818 | vec_initial_defs[0]); |
8819 | else |
8820 | vec_initial_defs.safe_push (obj: def); |
8821 | } |
8822 | |
8823 | /* Generate the reduction PHIs upfront. */ |
8824 | for (i = 0; i < vec_num; i++) |
8825 | { |
8826 | tree vec_init_def = vec_initial_defs[i]; |
8827 | for (j = 0; j < ncopies; j++) |
8828 | { |
8829 | /* Create the reduction-phi that defines the reduction |
8830 | operand. */ |
8831 | gphi *new_phi = create_phi_node (vec_dest, loop->header); |
8832 | |
8833 | /* Set the loop-entry arg of the reduction-phi. */ |
8834 | if (j != 0 && nested_cycle) |
8835 | vec_init_def = vec_initial_defs[j]; |
8836 | add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop), |
8837 | UNKNOWN_LOCATION); |
8838 | |
8839 | /* The loop-latch arg is set in epilogue processing. */ |
8840 | |
8841 | if (slp_node) |
8842 | slp_node->push_vec_def (def: new_phi); |
8843 | else |
8844 | { |
8845 | if (j == 0) |
8846 | *vec_stmt = new_phi; |
8847 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_phi); |
8848 | } |
8849 | } |
8850 | } |
8851 | |
8852 | return true; |
8853 | } |
8854 | |
8855 | /* Vectorizes LC PHIs. */ |
8856 | |
8857 | bool |
8858 | vectorizable_lc_phi (loop_vec_info loop_vinfo, |
8859 | stmt_vec_info stmt_info, gimple **vec_stmt, |
8860 | slp_tree slp_node) |
8861 | { |
8862 | if (!loop_vinfo |
8863 | || !is_a <gphi *> (p: stmt_info->stmt) |
8864 | || gimple_phi_num_args (gs: stmt_info->stmt) != 1) |
8865 | return false; |
8866 | |
8867 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def |
8868 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) |
8869 | return false; |
8870 | |
8871 | if (!vec_stmt) /* transformation not required. */ |
8872 | { |
/* Deal with copies from externs or constants that are disguised as
loop-closed PHI nodes (PR97886). */
8875 | if (slp_node |
8876 | && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0], |
8877 | SLP_TREE_VECTYPE (slp_node))) |
8878 | { |
8879 | if (dump_enabled_p ()) |
8880 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8881 | "incompatible vector types for invariants\n" ); |
8882 | return false; |
8883 | } |
8884 | STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type; |
8885 | return true; |
8886 | } |
8887 | |
8888 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
8889 | tree scalar_dest = gimple_phi_result (gs: stmt_info->stmt); |
8890 | basic_block bb = gimple_bb (g: stmt_info->stmt); |
8891 | edge e = single_pred_edge (bb); |
8892 | tree vec_dest = vect_create_destination_var (scalar_dest, vectype); |
8893 | auto_vec<tree> vec_oprnds; |
8894 | vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, |
8895 | !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1, |
8896 | gimple_phi_arg_def (gs: stmt_info->stmt, index: 0), &vec_oprnds); |
8897 | for (unsigned i = 0; i < vec_oprnds.length (); i++) |
8898 | { |
8899 | /* Create the vectorized LC PHI node. */ |
8900 | gphi *new_phi = create_phi_node (vec_dest, bb); |
8901 | add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION); |
8902 | if (slp_node) |
8903 | slp_node->push_vec_def (def: new_phi); |
8904 | else |
8905 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_phi); |
8906 | } |
8907 | if (!slp_node) |
8908 | *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; |
8909 | |
8910 | return true; |
8911 | } |
8912 | |
8913 | /* Vectorizes PHIs. */ |
8914 | |
8915 | bool |
8916 | vectorizable_phi (vec_info *, |
8917 | stmt_vec_info stmt_info, gimple **vec_stmt, |
8918 | slp_tree slp_node, stmt_vector_for_cost *cost_vec) |
8919 | { |
8920 | if (!is_a <gphi *> (p: stmt_info->stmt) || !slp_node) |
8921 | return false; |
8922 | |
8923 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def) |
8924 | return false; |
8925 | |
8926 | tree vectype = SLP_TREE_VECTYPE (slp_node); |
8927 | |
8928 | if (!vec_stmt) /* transformation not required. */ |
8929 | { |
8930 | slp_tree child; |
8931 | unsigned i; |
8932 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child) |
8933 | if (!child) |
8934 | { |
8935 | if (dump_enabled_p ()) |
8936 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8937 | "PHI node with unvectorized backedge def\n" ); |
8938 | return false; |
8939 | } |
8940 | else if (!vect_maybe_update_slp_op_vectype (child, vectype)) |
8941 | { |
8942 | if (dump_enabled_p ()) |
8943 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8944 | "incompatible vector types for invariants\n" ); |
8945 | return false; |
8946 | } |
8947 | else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def |
8948 | && !useless_type_conversion_p (vectype, |
8949 | SLP_TREE_VECTYPE (child))) |
8950 | { |
/* With bools we can have mask and non-mask precision vectors
or different non-mask precisions. While pattern recognition is
supposed to guarantee consistency here, bugs in it can cause
mismatches (PR103489 and PR103800 for example).
Deal with them here instead of ICEing later. */
8956 | if (dump_enabled_p ()) |
8957 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8958 | "incompatible vector type setup from " |
8959 | "bool pattern detection\n" ); |
8960 | return false; |
8961 | } |
8962 | |
8963 | /* For single-argument PHIs assume coalescing which means zero cost |
8964 | for the scalar and the vector PHIs. This avoids artificially |
8965 | favoring the vector path (but may pessimize it in some cases). */ |
8966 | if (gimple_phi_num_args (gs: as_a <gphi *> (p: stmt_info->stmt)) > 1) |
8967 | record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node), |
8968 | vector_stmt, stmt_info, vectype, 0, vect_body); |
8969 | STMT_VINFO_TYPE (stmt_info) = phi_info_type; |
8970 | return true; |
8971 | } |
8972 | |
8973 | tree scalar_dest = gimple_phi_result (gs: stmt_info->stmt); |
8974 | basic_block bb = gimple_bb (g: stmt_info->stmt); |
8975 | tree vec_dest = vect_create_destination_var (scalar_dest, vectype); |
8976 | auto_vec<gphi *> new_phis; |
8977 | for (unsigned i = 0; i < gimple_phi_num_args (gs: stmt_info->stmt); ++i) |
8978 | { |
8979 | slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; |
8980 | |
8981 | /* Skip not yet vectorized defs. */ |
8982 | if (SLP_TREE_DEF_TYPE (child) == vect_internal_def |
8983 | && SLP_TREE_VEC_DEFS (child).is_empty ()) |
8984 | continue; |
8985 | |
8986 | auto_vec<tree> vec_oprnds; |
8987 | vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds); |
8988 | if (!new_phis.exists ()) |
8989 | { |
8990 | new_phis.create (nelems: vec_oprnds.length ()); |
8991 | for (unsigned j = 0; j < vec_oprnds.length (); j++) |
8992 | { |
8993 | /* Create the vectorized LC PHI node. */ |
8994 | new_phis.quick_push (obj: create_phi_node (vec_dest, bb)); |
8995 | slp_node->push_vec_def (def: new_phis[j]); |
8996 | } |
8997 | } |
8998 | edge e = gimple_phi_arg_edge (phi: as_a <gphi *> (p: stmt_info->stmt), i); |
8999 | for (unsigned j = 0; j < vec_oprnds.length (); j++) |
9000 | add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION); |
9001 | } |
9002 | /* We should have at least one already vectorized child. */ |
9003 | gcc_assert (new_phis.exists ()); |
9004 | |
9005 | return true; |
9006 | } |
9007 | |
9008 | /* Vectorizes first order recurrences. An overview of the transformation |
9009 | is described below. Suppose we have the following loop. |
9010 | |
9011 | int t = 0; |
9012 | for (int i = 0; i < n; ++i) |
9013 | { |
9014 | b[i] = a[i] - t; |
9015 | t = a[i]; |
9016 | } |
9017 | |
9018 | There is a first-order recurrence on 'a'. For this loop, the scalar IR |
9019 | looks (simplified) like: |
9020 | |
9021 | scalar.preheader: |
9022 | init = 0; |
9023 | |
9024 | scalar.body: |
9025 | i = PHI <0(scalar.preheader), i+1(scalar.body)> |
_2 = PHI <init(scalar.preheader), _1(scalar.body)>
9027 | _1 = a[i] |
9028 | b[i] = _1 - _2 |
9029 | if (i < n) goto scalar.body |
9030 | |
In this example, _2 is a recurrence because its value depends on the
9032 | previous iteration. We vectorize this as (VF = 4) |
9033 | |
9034 | vector.preheader: |
9035 | vect_init = vect_cst(..., ..., ..., 0) |
9036 | |
9037 | vector.body |
9038 | i = PHI <0(vector.preheader), i+4(vector.body)> |
9039 | vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)> |
9040 | vect_2 = a[i, i+1, i+2, i+3]; |
9041 | vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 }) |
9042 | b[i, i+1, i+2, i+3] = vect_2 - vect_3 |
9043 | if (..) goto vector.body |
9044 | |
9045 | In this function, vectorizable_recurr, we code generate both the |
9046 | vector PHI node and the permute since those together compute the |
9047 | vectorized value of the scalar PHI. We do not yet have the |
9048 | backedge value to fill in there nor into the vec_perm. Those |
9049 | are filled in maybe_set_vectorized_backedge_value and |
9050 | vect_schedule_scc. |
9051 | |
9052 | TODO: Since the scalar loop does not have a use of the recurrence |
9053 | outside of the loop the natural way to implement peeling via |
9054 | vectorizing the live value doesn't work. For now peeling of loops |
9055 | with a recurrence is not implemented. For SLP the supported cases |
9056 | are restricted to those requiring a single vector recurrence PHI. */ |
9057 | |
9058 | bool |
9059 | vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, |
9060 | gimple **vec_stmt, slp_tree slp_node, |
9061 | stmt_vector_for_cost *cost_vec) |
9062 | { |
9063 | if (!loop_vinfo || !is_a<gphi *> (p: stmt_info->stmt)) |
9064 | return false; |
9065 | |
9066 | gphi *phi = as_a<gphi *> (p: stmt_info->stmt); |
9067 | |
9068 | /* So far we only support first-order recurrence auto-vectorization. */ |
9069 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence) |
9070 | return false; |
9071 | |
9072 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
9073 | unsigned ncopies; |
9074 | if (slp_node) |
9075 | ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
9076 | else |
9077 | ncopies = vect_get_num_copies (loop_vinfo, vectype); |
9078 | poly_int64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype); |
9079 | unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1; |
9080 | /* We need to be able to make progress with a single vector. */ |
9081 | if (maybe_gt (dist * 2, nunits)) |
9082 | { |
9083 | if (dump_enabled_p ()) |
9084 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9085 | "first order recurrence exceeds half of " |
9086 | "a vector\n" ); |
9087 | return false; |
9088 | } |
9089 | |
/* First-order recurrence autovectorization needs to handle a permutation
with indices = [nunits-dist, nunits-dist+1, ...] (dist is 1 when not
doing SLP). */
9092 | vec_perm_builder sel (nunits, 1, 3); |
9093 | for (int i = 0; i < 3; ++i) |
9094 | sel.quick_push (obj: nunits - dist + i); |
9095 | vec_perm_indices indices (sel, 2, nunits); |
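/* E.g. for nunits = 4 and a single lane this expands to the indices
{ 3, 4, 5, 6 }: the last element of the previous vector followed by
the first three elements of the current one, as in the example above. */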
9096 | |
9097 | if (!vec_stmt) /* transformation not required. */ |
9098 | { |
9099 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype), |
9100 | indices)) |
9101 | return false; |
9102 | |
9103 | if (slp_node) |
9104 | { |
9105 | /* We eventually need to set a vector type on invariant |
9106 | arguments. */ |
9107 | unsigned j; |
9108 | slp_tree child; |
9109 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) |
9110 | if (!vect_maybe_update_slp_op_vectype |
9111 | (child, SLP_TREE_VECTYPE (slp_node))) |
9112 | { |
9113 | if (dump_enabled_p ()) |
9114 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9115 | "incompatible vector types for " |
9116 | "invariants\n" ); |
9117 | return false; |
9118 | } |
9119 | } |
9120 | /* The recurrence costs the initialization vector and one permute |
9121 | for each copy. */ |
9122 | unsigned prologue_cost = record_stmt_cost (body_cost_vec: cost_vec, count: 1, kind: scalar_to_vec, |
9123 | stmt_info, misalign: 0, where: vect_prologue); |
9124 | unsigned inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vector_stmt, |
9125 | stmt_info, misalign: 0, where: vect_body); |
9126 | if (dump_enabled_p ()) |
9127 | dump_printf_loc (MSG_NOTE, vect_location, |
9128 | "vectorizable_recurr: inside_cost = %d, " |
9129 | "prologue_cost = %d .\n" , inside_cost, |
9130 | prologue_cost); |
9131 | |
9132 | STMT_VINFO_TYPE (stmt_info) = recurr_info_type; |
9133 | return true; |
9134 | } |
9135 | |
9136 | edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); |
9137 | basic_block bb = gimple_bb (g: phi); |
tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9139 | if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader))) |
9140 | { |
9141 | gimple_seq stmts = NULL; |
9142 | preheader = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: preheader); |
9143 | gsi_insert_seq_on_edge_immediate (pe, stmts); |
9144 | } |
9145 | tree vec_init = build_vector_from_val (vectype, preheader); |
9146 | vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL); |
9147 | |
9148 | /* Create the vectorized first-order PHI node. */ |
9149 | tree vec_dest = vect_get_new_vect_var (vectype, |
9150 | vect_simple_var, "vec_recur_" ); |
9151 | gphi *new_phi = create_phi_node (vec_dest, bb); |
9152 | add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION); |
9153 | |
/* Insert the shuffles needed for first-order recurrence autovectorization:
result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9156 | tree perm = vect_gen_perm_mask_checked (vectype, indices); |
9157 | |
9158 | /* Insert the required permute after the latch definition. The |
9159 | second and later operands are tentative and will be updated when we have |
9160 | vectorized the latch definition. */ |
9161 | edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo)); |
9162 | gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le)); |
9163 | gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def); |
9164 | gsi_next (i: &gsi2); |
9165 | |
9166 | for (unsigned i = 0; i < ncopies; ++i) |
9167 | { |
9168 | vec_dest = make_ssa_name (var: vectype); |
9169 | gassign *vperm |
9170 | = gimple_build_assign (vec_dest, VEC_PERM_EXPR, |
9171 | i == 0 ? gimple_phi_result (gs: new_phi) : NULL, |
9172 | NULL, perm); |
9173 | vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2); |
9174 | |
9175 | if (slp_node) |
9176 | slp_node->push_vec_def (def: vperm); |
9177 | else |
9178 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: vperm); |
9179 | } |
9180 | |
9181 | if (!slp_node) |
9182 | *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; |
9183 | return true; |
9184 | } |
9185 | |
9186 | /* Return true if VECTYPE represents a vector that requires lowering |
9187 | by the vector lowering pass. */ |
9188 | |
9189 | bool |
9190 | vect_emulated_vector_p (tree vectype) |
9191 | { |
9192 | return (!VECTOR_MODE_P (TYPE_MODE (vectype)) |
9193 | && (!VECTOR_BOOLEAN_TYPE_P (vectype) |
9194 | || TYPE_PRECISION (TREE_TYPE (vectype)) != 1)); |
9195 | } |
9196 | |
9197 | /* Return true if we can emulate CODE on an integer mode representation |
9198 | of a vector. */ |
9199 | |
9200 | bool |
9201 | vect_can_vectorize_without_simd_p (tree_code code) |
9202 | { |
9203 | switch (code) |
9204 | { |
9205 | case PLUS_EXPR: |
9206 | case MINUS_EXPR: |
9207 | case NEGATE_EXPR: |
9208 | case BIT_AND_EXPR: |
9209 | case BIT_IOR_EXPR: |
9210 | case BIT_XOR_EXPR: |
9211 | case BIT_NOT_EXPR: |
9212 | return true; |
9213 | |
9214 | default: |
9215 | return false; |
9216 | } |
9217 | } |
9218 | |
9219 | /* Likewise, but taking a code_helper. */ |
9220 | |
9221 | bool |
9222 | vect_can_vectorize_without_simd_p (code_helper code) |
9223 | { |
9224 | return (code.is_tree_code () |
9225 | && vect_can_vectorize_without_simd_p (code: tree_code (code))); |
9226 | } |
9227 | |
9228 | /* Create vector init for vectorized iv. */ |
9229 | static tree |
9230 | vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, |
9231 | tree step_expr, poly_uint64 nunits, |
9232 | tree vectype, |
9233 | enum vect_induction_op_type induction_type) |
9234 | { |
9235 | unsigned HOST_WIDE_INT const_nunits; |
9236 | tree vec_shift, vec_init, new_name; |
9237 | unsigned i; |
9238 | tree itype = TREE_TYPE (vectype); |
9239 | |
/* iv_loop is the loop to be vectorized.  Create the vector of the first
VF values of the induction; for four units this is
mul:      vec_init = [X, X*S, X*S^2, X*S^3]
shl/shr:  vec_init = [X, X << S, X << 2*S, X << 3*S] (resp. >>)
neg:      vec_init = [X, -X, X, -X]
(S = step_expr, X = init_expr). */
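/* For example (illustration only): a mul induction with X = 3, S = 2 and
four units yields vec_init = { 3, 6, 12, 24 }. */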
9242 | new_name = gimple_convert (seq: stmts, type: itype, op: init_expr); |
9243 | switch (induction_type) |
9244 | { |
9245 | case vect_step_op_shr: |
9246 | case vect_step_op_shl: |
9247 | /* Build the Initial value from shift_expr. */ |
9248 | vec_init = gimple_build_vector_from_val (seq: stmts, |
9249 | type: vectype, |
9250 | op: new_name); |
9251 | vec_shift = gimple_build (seq: stmts, code: VEC_SERIES_EXPR, type: vectype, |
9252 | ops: build_zero_cst (itype), ops: step_expr); |
9253 | vec_init = gimple_build (seq: stmts, |
9254 | code: (induction_type == vect_step_op_shr |
9255 | ? RSHIFT_EXPR : LSHIFT_EXPR), |
9256 | type: vectype, ops: vec_init, ops: vec_shift); |
9257 | break; |
9258 | |
9259 | case vect_step_op_neg: |
9260 | { |
9261 | vec_init = gimple_build_vector_from_val (seq: stmts, |
9262 | type: vectype, |
9263 | op: new_name); |
9264 | tree vec_neg = gimple_build (seq: stmts, code: NEGATE_EXPR, |
9265 | type: vectype, ops: vec_init); |
9266 | /* The encoding has 2 interleaved stepped patterns. */ |
9267 | vec_perm_builder sel (nunits, 2, 3); |
9268 | sel.quick_grow (len: 6); |
9269 | for (i = 0; i < 3; i++) |
9270 | { |
9271 | sel[2 * i] = i; |
9272 | sel[2 * i + 1] = i + nunits; |
9273 | } |
9274 | vec_perm_indices indices (sel, 2, nunits); |
9275 | /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may |
9276 | fail when vec_init is const vector. In that situation vec_perm is not |
9277 | really needed. */ |
9278 | tree perm_mask_even |
9279 | = vect_gen_perm_mask_any (vectype, indices); |
9280 | vec_init = gimple_build (seq: stmts, code: VEC_PERM_EXPR, |
9281 | type: vectype, |
9282 | ops: vec_init, ops: vec_neg, |
9283 | ops: perm_mask_even); |
9284 | } |
9285 | break; |
9286 | |
9287 | case vect_step_op_mul: |
9288 | { |
9289 | /* Use unsigned mult to avoid UD integer overflow. */ |
9290 | gcc_assert (nunits.is_constant (&const_nunits)); |
9291 | tree utype = unsigned_type_for (itype); |
9292 | tree uvectype = build_vector_type (utype, |
9293 | TYPE_VECTOR_SUBPARTS (node: vectype)); |
9294 | new_name = gimple_convert (seq: stmts, type: utype, op: new_name); |
9295 | vec_init = gimple_build_vector_from_val (seq: stmts, |
9296 | type: uvectype, |
9297 | op: new_name); |
9298 | tree_vector_builder elts (uvectype, const_nunits, 1); |
9299 | tree elt_step = build_one_cst (utype); |
9300 | |
9301 | elts.quick_push (obj: elt_step); |
9302 | for (i = 1; i < const_nunits; i++) |
9303 | { |
/* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i). */
9305 | elt_step = gimple_build (seq: stmts, code: MULT_EXPR, |
9306 | type: utype, ops: elt_step, ops: step_expr); |
9307 | elts.quick_push (obj: elt_step); |
9308 | } |
9309 | /* Create a vector from [new_name_0, new_name_1, ..., |
9310 | new_name_nunits-1]. */ |
9311 | tree vec_mul = gimple_build_vector (seq: stmts, builder: &elts); |
9312 | vec_init = gimple_build (seq: stmts, code: MULT_EXPR, type: uvectype, |
9313 | ops: vec_init, ops: vec_mul); |
9314 | vec_init = gimple_convert (seq: stmts, type: vectype, op: vec_init); |
9315 | } |
9316 | break; |
9317 | |
9318 | default: |
9319 | gcc_unreachable (); |
9320 | } |
9321 | |
9322 | return vec_init; |
9323 | } |
9324 | |
/* Peel INIT_EXPR by SKIP_NITERS iterations for INDUCTION_TYPE. */
9326 | tree |
9327 | vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, |
9328 | tree skip_niters, tree step_expr, |
9329 | enum vect_induction_op_type induction_type) |
9330 | { |
9331 | gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST); |
9332 | tree type = TREE_TYPE (init_expr); |
9333 | unsigned prec = TYPE_PRECISION (type); |
9334 | switch (induction_type) |
9335 | { |
9336 | case vect_step_op_neg: |
9337 | if (TREE_INT_CST_LOW (skip_niters) % 2) |
9338 | init_expr = gimple_build (seq: stmts, code: NEGATE_EXPR, type, ops: init_expr); |
9339 | /* else no change. */ |
9340 | break; |
9341 | |
9342 | case vect_step_op_shr: |
9343 | case vect_step_op_shl: |
9344 | skip_niters = gimple_convert (seq: stmts, type, op: skip_niters); |
9345 | step_expr = gimple_build (seq: stmts, code: MULT_EXPR, type, ops: step_expr, ops: skip_niters); |
/* When the shift amount >= precision, we need to avoid UD.
In the original loop there is no UD, and according to the semantics
init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9349 | if (!tree_fits_uhwi_p (step_expr) |
9350 | || tree_to_uhwi (step_expr) >= prec) |
9351 | { |
9352 | if (induction_type == vect_step_op_shl |
9353 | || TYPE_UNSIGNED (type)) |
9354 | init_expr = build_zero_cst (type); |
9355 | else |
9356 | init_expr = gimple_build (seq: stmts, code: RSHIFT_EXPR, type, |
9357 | ops: init_expr, |
9358 | ops: wide_int_to_tree (type, cst: prec - 1)); |
9359 | } |
9360 | else |
9361 | init_expr = gimple_build (seq: stmts, code: (induction_type == vect_step_op_shr |
9362 | ? RSHIFT_EXPR : LSHIFT_EXPR), |
9363 | type, ops: init_expr, ops: step_expr); |
9364 | break; |
9365 | |
9366 | case vect_step_op_mul: |
9367 | { |
9368 | tree utype = unsigned_type_for (type); |
9369 | init_expr = gimple_convert (seq: stmts, type: utype, op: init_expr); |
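/* The peeled start value is init_expr * pow (step_expr, skip_niters)
computed modulo 2^prec; the exponentiation is done below with mpz and
the final multiplication in the unsigned type so overflow wraps. */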
9370 | wide_int skipn = wi::to_wide (t: skip_niters); |
9371 | wide_int begin = wi::to_wide (t: step_expr); |
9372 | auto_mpz base, exp, mod, res; |
9373 | wi::to_mpz (begin, base, TYPE_SIGN (type)); |
9374 | wi::to_mpz (skipn, exp, UNSIGNED); |
9375 | mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type)); |
9376 | mpz_powm (res, base, exp, mod); |
9377 | begin = wi::from_mpz (type, res, TYPE_SIGN (type)); |
9378 | tree mult_expr = wide_int_to_tree (type: utype, cst: begin); |
9379 | init_expr = gimple_build (seq: stmts, code: MULT_EXPR, type: utype, |
9380 | ops: init_expr, ops: mult_expr); |
9381 | init_expr = gimple_convert (seq: stmts, type, op: init_expr); |
9382 | } |
9383 | break; |
9384 | |
9385 | default: |
9386 | gcc_unreachable (); |
9387 | } |
9388 | |
9389 | return init_expr; |
9390 | } |
9391 | |
9392 | /* Create vector step for vectorized iv. */ |
9393 | static tree |
9394 | vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr, |
9395 | poly_uint64 vf, |
9396 | enum vect_induction_op_type induction_type) |
9397 | { |
9398 | tree expr = build_int_cst (TREE_TYPE (step_expr), vf); |
9399 | tree new_name = NULL; |
9400 | /* Step should be pow (step, vf) for mult induction. */ |
9401 | if (induction_type == vect_step_op_mul) |
9402 | { |
9403 | gcc_assert (vf.is_constant ()); |
9404 | wide_int begin = wi::to_wide (t: step_expr); |
9405 | |
9406 | for (unsigned i = 0; i != vf.to_constant () - 1; i++) |
9407 | begin = wi::mul (x: begin, y: wi::to_wide (t: step_expr)); |
9408 | |
9409 | new_name = wide_int_to_tree (TREE_TYPE (step_expr), cst: begin); |
9410 | } |
9411 | else if (induction_type == vect_step_op_neg) |
9412 | /* Do nothing. */ |
9413 | ; |
9414 | else |
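/* For the shift inductions the per-vector-iteration step is the total
shift amount over VF scalar iterations, i.e. VF * STEP. */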
9415 | new_name = gimple_build (seq: stmts, code: MULT_EXPR, TREE_TYPE (step_expr), |
9416 | ops: expr, ops: step_expr); |
9417 | return new_name; |
9418 | } |
9419 | |
9420 | static tree |
9421 | vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo, |
9422 | stmt_vec_info stmt_info, |
9423 | tree new_name, tree vectype, |
9424 | enum vect_induction_op_type induction_type) |
9425 | { |
9426 | /* No step is needed for neg induction. */ |
9427 | if (induction_type == vect_step_op_neg) |
9428 | return NULL; |
9429 | |
9430 | tree t = unshare_expr (new_name); |
9431 | gcc_assert (CONSTANT_CLASS_P (new_name) |
9432 | || TREE_CODE (new_name) == SSA_NAME); |
9433 | tree new_vec = build_vector_from_val (vectype, t); |
9434 | tree vec_step = vect_init_vector (loop_vinfo, stmt_info, |
9435 | new_vec, vectype, NULL); |
9436 | return vec_step; |
9437 | } |
9438 | |
9439 | /* Update vectorized iv with vect_step, induc_def is init. */ |
9440 | static tree |
9441 | vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype, |
9442 | tree induc_def, tree vec_step, |
9443 | enum vect_induction_op_type induction_type) |
9444 | { |
9445 | tree vec_def = induc_def; |
9446 | switch (induction_type) |
9447 | { |
9448 | case vect_step_op_mul: |
9449 | { |
9450 | /* Use unsigned mult to avoid UD integer overflow. */ |
9451 | tree uvectype |
9452 | = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)), |
9453 | TYPE_VECTOR_SUBPARTS (node: vectype)); |
9454 | vec_def = gimple_convert (seq: stmts, type: uvectype, op: vec_def); |
9455 | vec_step = gimple_convert (seq: stmts, type: uvectype, op: vec_step); |
9456 | vec_def = gimple_build (seq: stmts, code: MULT_EXPR, type: uvectype, |
9457 | ops: vec_def, ops: vec_step); |
9458 | vec_def = gimple_convert (seq: stmts, type: vectype, op: vec_def); |
9459 | } |
9460 | break; |
9461 | |
9462 | case vect_step_op_shr: |
9463 | vec_def = gimple_build (seq: stmts, code: RSHIFT_EXPR, type: vectype, |
9464 | ops: vec_def, ops: vec_step); |
9465 | break; |
9466 | |
9467 | case vect_step_op_shl: |
9468 | vec_def = gimple_build (seq: stmts, code: LSHIFT_EXPR, type: vectype, |
9469 | ops: vec_def, ops: vec_step); |
9470 | break; |
9471 | case vect_step_op_neg: |
9472 | vec_def = induc_def; |
9473 | /* Do nothing. */ |
9474 | break; |
9475 | default: |
9476 | gcc_unreachable (); |
9477 | } |
9478 | |
9479 | return vec_def; |
9480 | |
9481 | } |
9482 | |
9483 | /* Function vectorizable_induction |
9484 | |
Check if STMT_INFO performs a nonlinear induction computation that can be
9486 | vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create |
9487 | a vectorized phi to replace it, put it in VEC_STMT, and add it to the same |
9488 | basic block. |
9489 | Return true if STMT_INFO is vectorizable in this way. */ |
9490 | |
9491 | static bool |
9492 | vectorizable_nonlinear_induction (loop_vec_info loop_vinfo, |
9493 | stmt_vec_info stmt_info, |
9494 | gimple **vec_stmt, slp_tree slp_node, |
9495 | stmt_vector_for_cost *cost_vec) |
9496 | { |
9497 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
9498 | unsigned ncopies; |
9499 | bool nested_in_vect_loop = false; |
9500 | class loop *iv_loop; |
9501 | tree vec_def; |
9502 | edge pe = loop_preheader_edge (loop); |
9503 | basic_block new_bb; |
9504 | tree vec_init, vec_step; |
9505 | tree new_name; |
9506 | gimple *new_stmt; |
9507 | gphi *induction_phi; |
9508 | tree induc_def, vec_dest; |
9509 | tree init_expr, step_expr; |
9510 | tree niters_skip; |
9511 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
9512 | unsigned i; |
9513 | gimple_stmt_iterator si; |
9514 | |
9515 | gphi *phi = dyn_cast <gphi *> (p: stmt_info->stmt); |
9516 | |
9517 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
9518 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype); |
9519 | enum vect_induction_op_type induction_type |
9520 | = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); |
9521 | |
9522 | gcc_assert (induction_type > vect_step_op_add); |
9523 | |
9524 | if (slp_node) |
9525 | ncopies = 1; |
9526 | else |
9527 | ncopies = vect_get_num_copies (loop_vinfo, vectype); |
9528 | gcc_assert (ncopies >= 1); |
9529 | |
9530 | /* FORNOW. Only handle nonlinear induction in the same loop. */ |
9531 | if (nested_in_vect_loop_p (loop, stmt_info)) |
9532 | { |
9533 | if (dump_enabled_p ()) |
9534 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9535 | "nonlinear induction in nested loop.\n" ); |
9536 | return false; |
9537 | } |
9538 | |
9539 | iv_loop = loop; |
9540 | gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); |
9541 | |
/* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
update for each iv and a permutation to generate the wanted vector iv. */
9544 | if (slp_node) |
9545 | { |
9546 | if (dump_enabled_p ()) |
9547 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9548 | "SLP induction not supported for nonlinear" |
9549 | " induction.\n" ); |
9550 | return false; |
9551 | } |
9552 | |
9553 | if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype))) |
9554 | { |
9555 | if (dump_enabled_p ()) |
9556 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9557 | "floating point nonlinear induction vectorization" |
9558 | " not supported.\n" ); |
9559 | return false; |
9560 | } |
9561 | |
9562 | step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); |
9563 | init_expr = vect_phi_initial_value (phi); |
9564 | gcc_assert (step_expr != NULL_TREE && init_expr != NULL |
9565 | && TREE_CODE (step_expr) == INTEGER_CST); |
/* step_expr should be aligned with init_expr,
i.e. for uint64 a >> 1 the step is int but a vector<uint64> shift is used. */
9568 | step_expr = fold_convert (TREE_TYPE (vectype), step_expr); |
9569 | |
9570 | if (TREE_CODE (init_expr) == INTEGER_CST) |
9571 | init_expr = fold_convert (TREE_TYPE (vectype), init_expr); |
9572 | else |
9573 | gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype), |
9574 | TREE_TYPE (init_expr))); |
9575 | |
9576 | switch (induction_type) |
9577 | { |
9578 | case vect_step_op_neg: |
9579 | if (TREE_CODE (init_expr) != INTEGER_CST |
9580 | && TREE_CODE (init_expr) != REAL_CST) |
9581 | { |
9582 | /* Check for backend support of NEGATE_EXPR and vec_perm. */ |
9583 | if (!directly_supported_p (NEGATE_EXPR, vectype)) |
9584 | return false; |
9585 | |
9586 | /* The encoding has 2 interleaved stepped patterns. */ |
9587 | vec_perm_builder sel (nunits, 2, 3); |
9588 | machine_mode mode = TYPE_MODE (vectype); |
9589 | sel.quick_grow (len: 6); |
9590 | for (i = 0; i < 3; i++) |
9591 | { |
9592 | sel[i * 2] = i; |
9593 | sel[i * 2 + 1] = i + nunits; |
9594 | } |
9595 | vec_perm_indices indices (sel, 2, nunits); |
9596 | if (!can_vec_perm_const_p (mode, mode, indices)) |
9597 | return false; |
9598 | } |
9599 | break; |
9600 | |
9601 | case vect_step_op_mul: |
9602 | { |
9603 | /* Check for backend support of MULT_EXPR. */ |
9604 | if (!directly_supported_p (MULT_EXPR, vectype)) |
9605 | return false; |
9606 | |
/* ??? How to construct the step vector for a variable-length vector:
[ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9609 | if (!vf.is_constant ()) |
9610 | return false; |
9611 | } |
9612 | break; |
9613 | |
9614 | case vect_step_op_shr: |
9615 | /* Check for backend support of RSHIFT_EXPR. */ |
9616 | if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector)) |
9617 | return false; |
9618 | |
9619 | /* Don't shift more than type precision to avoid UD. */ |
9620 | if (!tree_fits_uhwi_p (step_expr) |
9621 | || maybe_ge (nunits * tree_to_uhwi (step_expr), |
9622 | TYPE_PRECISION (TREE_TYPE (init_expr)))) |
9623 | return false; |
9624 | break; |
9625 | |
9626 | case vect_step_op_shl: |
/* Check for backend support of LSHIFT_EXPR. */
9628 | if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector)) |
9629 | return false; |
9630 | |
9631 | /* Don't shift more than type precision to avoid UD. */ |
9632 | if (!tree_fits_uhwi_p (step_expr) |
9633 | || maybe_ge (nunits * tree_to_uhwi (step_expr), |
9634 | TYPE_PRECISION (TREE_TYPE (init_expr)))) |
9635 | return false; |
9636 | |
9637 | break; |
9638 | |
9639 | default: |
9640 | gcc_unreachable (); |
9641 | } |
9642 | |
9643 | if (!vec_stmt) /* transformation not required. */ |
9644 | { |
9645 | unsigned inside_cost = 0, prologue_cost = 0; |
/* loop cost for vec_loop. */
9648 | inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vector_stmt, |
9649 | stmt_info, misalign: 0, where: vect_body); |
9650 | |
/* Neg induction doesn't have any inside_cost. */
9653 | if (induction_type == vect_step_op_neg) |
9654 | inside_cost = 0; |
9655 | |
9656 | /* prologue cost for vec_init and vec_step. */ |
9657 | prologue_cost = record_stmt_cost (body_cost_vec: cost_vec, count: 2, kind: scalar_to_vec, |
9658 | stmt_info, misalign: 0, where: vect_prologue); |
9659 | |
9660 | if (dump_enabled_p ()) |
9661 | dump_printf_loc (MSG_NOTE, vect_location, |
9662 | "vect_model_induction_cost: inside_cost = %d, " |
9663 | "prologue_cost = %d. \n" , inside_cost, |
9664 | prologue_cost); |
9665 | |
9666 | STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; |
9667 | DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction" ); |
9668 | return true; |
9669 | } |
9670 | |
9671 | /* Transform. */ |
9672 | |
/* Compute a vector variable, initialized with the first VF values of
the nonlinear induction variable; see vect_create_nonlinear_iv_init
for the per-induction-type form (e.g. [X, X*S, X*S^2, X*S^3] for a
mul induction with IV_PHI='X' and step S and a vector of 4 units). */
9677 | |
9678 | if (dump_enabled_p ()) |
9679 | dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n" ); |
9680 | |
9681 | pe = loop_preheader_edge (iv_loop); |
9682 | /* Find the first insertion point in the BB. */ |
9683 | basic_block bb = gimple_bb (g: phi); |
9684 | si = gsi_after_labels (bb); |
9685 | |
9686 | gimple_seq stmts = NULL; |
9687 | |
9688 | niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); |
9689 | /* If we are using the loop mask to "peel" for alignment then we need |
9690 | to adjust the start value here. */ |
9691 | if (niters_skip != NULL_TREE) |
9692 | init_expr = vect_peel_nonlinear_iv_init (stmts: &stmts, init_expr, skip_niters: niters_skip, |
9693 | step_expr, induction_type); |
9694 | |
9695 | vec_init = vect_create_nonlinear_iv_init (stmts: &stmts, init_expr, |
9696 | step_expr, nunits, vectype, |
9697 | induction_type); |
9698 | if (stmts) |
9699 | { |
9700 | new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
9701 | gcc_assert (!new_bb); |
9702 | } |
9703 | |
9704 | stmts = NULL; |
9705 | new_name = vect_create_nonlinear_iv_step (stmts: &stmts, step_expr, |
9706 | vf, induction_type); |
9707 | if (stmts) |
9708 | { |
9709 | new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
9710 | gcc_assert (!new_bb); |
9711 | } |
9712 | |
9713 | vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, |
9714 | new_name, vectype, |
9715 | induction_type); |
9716 | /* Create the following def-use cycle: |
9717 | loop prolog: |
9718 | vec_init = ... |
9719 | vec_step = ... |
9720 | loop: |
9721 | vec_iv = PHI <vec_init, vec_loop> |
9722 | ... |
9723 | STMT |
9724 | ... |
9725 | vec_loop = vec_iv + vec_step; */ |
9726 | |
9727 | /* Create the induction-phi that defines the induction-operand. */ |
9728 | vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_" ); |
9729 | induction_phi = create_phi_node (vec_dest, iv_loop->header); |
9730 | induc_def = PHI_RESULT (induction_phi); |
9731 | |
9732 | /* Create the iv update inside the loop. */ |
9733 | stmts = NULL; |
9734 | vec_def = vect_update_nonlinear_iv (stmts: &stmts, vectype, |
9735 | induc_def, vec_step, |
9736 | induction_type); |
9737 | |
9738 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
9739 | new_stmt = SSA_NAME_DEF_STMT (vec_def); |
9740 | |
9741 | /* Set the arguments of the phi node: */ |
9742 | add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); |
9743 | add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
9744 | UNKNOWN_LOCATION); |
9745 | |
9746 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: induction_phi); |
9747 | *vec_stmt = induction_phi; |
9748 | |
9749 | /* In case that vectorization factor (VF) is bigger than the number |
9750 | of elements that we can fit in a vectype (nunits), we have to generate |
9751 | more than one vector stmt - i.e - we need to "unroll" the |
9752 | vector stmt by a factor VF/nunits. For more details see documentation |
9753 | in vectorizable_operation. */ |
9754 | |
9755 | if (ncopies > 1) |
9756 | { |
9757 | stmts = NULL; |
9758 | /* FORNOW. This restriction should be relaxed. */ |
9759 | gcc_assert (!nested_in_vect_loop); |
9760 | |
9761 | new_name = vect_create_nonlinear_iv_step (stmts: &stmts, step_expr, |
9762 | vf: nunits, induction_type); |
9763 | |
9764 | vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, |
9765 | new_name, vectype, |
9766 | induction_type); |
9767 | vec_def = induc_def; |
9768 | for (i = 1; i < ncopies; i++) |
9769 | { |
9770 | /* vec_i = vec_prev + vec_step. */ |
9771 | stmts = NULL; |
9772 | vec_def = vect_update_nonlinear_iv (stmts: &stmts, vectype, |
9773 | induc_def: vec_def, vec_step, |
9774 | induction_type); |
9775 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
9776 | new_stmt = SSA_NAME_DEF_STMT (vec_def); |
9777 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_stmt); |
9778 | } |
9779 | } |
9780 | |
9781 | if (dump_enabled_p ()) |
9782 | dump_printf_loc (MSG_NOTE, vect_location, |
9783 | "transform induction: created def-use cycle: %G%G" , |
9784 | (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def)); |
9785 | |
9786 | return true; |
9787 | } |
9788 | |
9789 | /* Function vectorizable_induction |
9790 | |
9791 | Check if STMT_INFO performs an induction computation that can be vectorized. |
9792 | If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized |
9793 | phi to replace it, put it in VEC_STMT, and add it to the same basic block. |
9794 | Return true if STMT_INFO is vectorizable in this way. */ |
9795 | |
9796 | bool |
9797 | vectorizable_induction (loop_vec_info loop_vinfo, |
9798 | stmt_vec_info stmt_info, |
9799 | gimple **vec_stmt, slp_tree slp_node, |
9800 | stmt_vector_for_cost *cost_vec) |
9801 | { |
9802 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
9803 | unsigned ncopies; |
9804 | bool nested_in_vect_loop = false; |
9805 | class loop *iv_loop; |
9806 | tree vec_def; |
9807 | edge pe = loop_preheader_edge (loop); |
9808 | basic_block new_bb; |
9809 | tree new_vec, vec_init, vec_step, t; |
9810 | tree new_name; |
9811 | gimple *new_stmt; |
9812 | gphi *induction_phi; |
9813 | tree induc_def, vec_dest; |
9814 | tree init_expr, step_expr; |
9815 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
9816 | unsigned i; |
9817 | tree expr; |
9818 | gimple_stmt_iterator si; |
9819 | enum vect_induction_op_type induction_type |
9820 | = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); |
9821 | |
9822 | gphi *phi = dyn_cast <gphi *> (p: stmt_info->stmt); |
9823 | if (!phi) |
9824 | return false; |
9825 | |
9826 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
9827 | return false; |
9828 | |
9829 | /* Make sure it was recognized as induction computation. */ |
9830 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) |
9831 | return false; |
9832 | |
9833 | /* Handle nonlinear induction in a separate place. */ |
9834 | if (induction_type != vect_step_op_add) |
9835 | return vectorizable_nonlinear_induction (loop_vinfo, stmt_info, |
9836 | vec_stmt, slp_node, cost_vec); |
9837 | |
9838 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
9839 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype); |
9840 | |
9841 | if (slp_node) |
9842 | ncopies = 1; |
9843 | else |
9844 | ncopies = vect_get_num_copies (loop_vinfo, vectype); |
9845 | gcc_assert (ncopies >= 1); |
9846 | |
9847 | /* FORNOW. These restrictions should be relaxed. */ |
9848 | if (nested_in_vect_loop_p (loop, stmt_info)) |
9849 | { |
9850 | imm_use_iterator imm_iter; |
9851 | use_operand_p use_p; |
9852 | gimple *exit_phi; |
9853 | edge latch_e; |
9854 | tree loop_arg; |
9855 | |
9856 | if (ncopies > 1) |
9857 | { |
9858 | if (dump_enabled_p ()) |
9859 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9860 | "multiple types in nested loop.\n" ); |
9861 | return false; |
9862 | } |
9863 | |
9864 | exit_phi = NULL; |
9865 | latch_e = loop_latch_edge (loop->inner); |
9866 | loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); |
9867 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) |
9868 | { |
9869 | gimple *use_stmt = USE_STMT (use_p); |
9870 | if (is_gimple_debug (gs: use_stmt)) |
9871 | continue; |
9872 | |
9873 | if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (g: use_stmt))) |
9874 | { |
9875 | exit_phi = use_stmt; |
9876 | break; |
9877 | } |
9878 | } |
9879 | if (exit_phi) |
9880 | { |
9881 | stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi); |
9882 | if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo) |
9883 | && !STMT_VINFO_LIVE_P (exit_phi_vinfo))) |
9884 | { |
9885 | if (dump_enabled_p ()) |
9886 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9887 | "inner-loop induction only used outside " |
9888 | "of the outer vectorized loop.\n" ); |
9889 | return false; |
9890 | } |
9891 | } |
9892 | |
9893 | nested_in_vect_loop = true; |
9894 | iv_loop = loop->inner; |
9895 | } |
9896 | else |
9897 | iv_loop = loop; |
9898 | gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); |
9899 | |
9900 | if (slp_node && !nunits.is_constant ()) |
9901 | { |
9902 | /* The current SLP code creates the step value element-by-element. */ |
9903 | if (dump_enabled_p ()) |
9904 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9905 | "SLP induction not supported for variable-length" |
9906 | " vectors.\n" ); |
9907 | return false; |
9908 | } |
9909 | |
9910 | if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float) |
9911 | { |
9912 | if (dump_enabled_p ()) |
9913 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9914 | "floating point induction vectorization disabled\n" ); |
9915 | return false; |
9916 | } |
9917 | |
9918 | step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); |
9919 | gcc_assert (step_expr != NULL_TREE); |
9920 | tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype); |
9921 | |
9922 | /* Check for backend support of PLUS/MINUS_EXPR. */ |
9923 | if (!directly_supported_p (PLUS_EXPR, step_vectype) |
9924 | || !directly_supported_p (MINUS_EXPR, step_vectype)) |
9925 | return false; |
9926 | |
9927 | if (!vec_stmt) /* transformation not required. */ |
9928 | { |
9929 | unsigned inside_cost = 0, prologue_cost = 0; |
9930 | if (slp_node) |
9931 | { |
9932 | /* We eventually need to set a vector type on invariant |
9933 | arguments. */ |
9934 | unsigned j; |
9935 | slp_tree child; |
9936 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child) |
9937 | if (!vect_maybe_update_slp_op_vectype |
9938 | (child, SLP_TREE_VECTYPE (slp_node))) |
9939 | { |
9940 | if (dump_enabled_p ()) |
9941 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
9942 | "incompatible vector types for " |
9943 | "invariants\n" ); |
9944 | return false; |
9945 | } |
9946 | /* loop cost for vec_loop. */ |
9947 | inside_cost |
9948 | = record_stmt_cost (cost_vec, |
9949 | SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node), |
9950 | vector_stmt, stmt_info, 0, vect_body); |
9951 | /* prologue cost for vec_init (if not nested) and step. */ |
9952 | prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop, |
9953 | scalar_to_vec, |
9954 | stmt_info, 0, vect_prologue); |
9955 | } |
9956 | else /* if (!slp_node) */ |
9957 | { |
9958 | /* loop cost for vec_loop. */ |
9959 | inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, |
9960 | stmt_info, 0, vect_body); |
9961 | /* prologue cost for vec_init and vec_step. */ |
9962 | prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, |
9963 | stmt_info, 0, vect_prologue); |
9964 | } |
9965 | if (dump_enabled_p ()) |
9966 | dump_printf_loc (MSG_NOTE, vect_location, |
9967 | "vect_model_induction_cost: inside_cost = %d, " |
9968 | "prologue_cost = %d .\n" , inside_cost, |
9969 | prologue_cost); |
9970 | |
9971 | STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; |
9972 | DUMP_VECT_SCOPE ("vectorizable_induction" ); |
9973 | return true; |
9974 | } |
9975 | |
9976 | /* Transform. */ |
9977 | |
9978 | /* Compute a vector variable, initialized with the first VF values of |
9979 | the induction variable. E.g., for an iv with IV_PHI='X' and |
9980 | evolution S, for a vector of 4 units, we want to compute: |
9981 | [X, X + S, X + 2*S, X + 3*S]. */ |
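| /* Concretely (illustrative values only): for X == 5, S == 3 and a |
| four-element vector the initial vector is [5, 8, 11, 14], and each |
| vector iteration then adds VF * S == 12 to every lane.  */ |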
9982 | |
9983 | if (dump_enabled_p ()) |
9984 | dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n" ); |
9985 | |
9986 | pe = loop_preheader_edge (iv_loop); |
9987 | /* Find the first insertion point in the BB. */ |
9988 | basic_block bb = gimple_bb (phi); |
9989 | si = gsi_after_labels (bb); |
9990 | |
9991 | /* For SLP induction we have to generate several IVs as for example |
9992 | with group size 3 we need |
9993 | [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1] |
9994 | [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */ |
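| /* As a concrete illustration (hypothetical values): with initial values |
| i0 = 0, i1 = 10, i2 = 20 and steps S0 = S1 = S2 = 1, element k of the |
| concatenated IV vectors is inits[k % 3] + (k / 3) * steps[k % 3], i.e. |
| [0, 10, 20, 1] [11, 21, 2, 12] [22, 3, 13, 23].  */ |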
9995 | if (slp_node) |
9996 | { |
9997 | /* Enforced above. */ |
9998 | unsigned int const_nunits = nunits.to_constant (); |
9999 | |
10000 | /* The initial values are vectorized, but any lanes > group_size |
10001 | need adjustment. */ |
10002 | slp_tree init_node |
10003 | = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx]; |
10004 | |
10005 | /* Gather steps. Since we do not vectorize inductions as |
10006 | cycles we have to reconstruct the step from SCEV data. */ |
10007 | unsigned group_size = SLP_TREE_LANES (slp_node); |
10008 | tree *steps = XALLOCAVEC (tree, group_size); |
10009 | tree *inits = XALLOCAVEC (tree, group_size); |
10010 | stmt_vec_info phi_info; |
10011 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info) |
10012 | { |
10013 | steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); |
10014 | if (!init_node) |
10015 | inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt), |
10016 | pe->dest_idx); |
10017 | } |
10018 | |
10019 | /* Now generate the IVs. */ |
10020 | unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
10021 | gcc_assert ((const_nunits * nvects) % group_size == 0); |
10022 | unsigned nivs; |
10023 | if (nested_in_vect_loop) |
10024 | nivs = nvects; |
10025 | else |
10026 | { |
10027 | /* Compute the number of distinct IVs we need. First reduce |
10028 | group_size if it is a multiple of const_nunits so we get |
10029 | one IV for a group_size of 4 but const_nunits 2. */ |
10030 | unsigned group_sizep = group_size; |
10031 | if (group_sizep % const_nunits == 0) |
10032 | group_sizep = group_sizep / const_nunits; |
10033 | nivs = least_common_multiple (group_sizep, |
10034 | const_nunits) / const_nunits; |
10035 | } |
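| /* Worked example (values chosen for illustration): for group_size == 4 |
| and const_nunits == 2, group_sizep becomes 2 and nivs == lcm (2, 2) / 2 |
| == 1, so one IV serves the whole group.  For group_size == 3 and |
| const_nunits == 4 nothing divides, so nivs == lcm (3, 4) / 4 == 3, |
| matching the three vectors shown in the example above.  */ |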
10036 | tree stept = TREE_TYPE (step_vectype); |
10037 | tree lupdate_mul = NULL_TREE; |
10038 | if (!nested_in_vect_loop) |
10039 | { |
10040 | /* The number of iterations covered in one vector iteration. */ |
10041 | unsigned lup_mul = (nvects * const_nunits) / group_size; |
10042 | lupdate_mul |
10043 | = build_vector_from_val (step_vectype, |
10044 | SCALAR_FLOAT_TYPE_P (stept) |
10045 | ? build_real_from_wide (stept, lup_mul, |
10046 | UNSIGNED) |
10047 | : build_int_cstu (stept, lup_mul)); |
10048 | } |
10049 | tree peel_mul = NULL_TREE; |
10050 | gimple_seq init_stmts = NULL; |
10051 | if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) |
10052 | { |
10053 | if (SCALAR_FLOAT_TYPE_P (stept)) |
10054 | peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, |
10055 | LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); |
10056 | else |
10057 | peel_mul = gimple_convert (&init_stmts, stept, |
10058 | LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); |
10059 | peel_mul = gimple_build_vector_from_val (&init_stmts, |
10060 | step_vectype, peel_mul); |
10061 | } |
10062 | unsigned ivn; |
10063 | auto_vec<tree> vec_steps; |
10064 | for (ivn = 0; ivn < nivs; ++ivn) |
10065 | { |
10066 | tree_vector_builder step_elts (step_vectype, const_nunits, 1); |
10067 | tree_vector_builder init_elts (vectype, const_nunits, 1); |
10068 | tree_vector_builder mul_elts (step_vectype, const_nunits, 1); |
10069 | for (unsigned eltn = 0; eltn < const_nunits; ++eltn) |
10070 | { |
10071 | /* The scalar steps of the IVs. */ |
10072 | tree elt = steps[(ivn*const_nunits + eltn) % group_size]; |
10073 | elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt); |
10074 | step_elts.quick_push (elt); |
10075 | if (!init_node) |
10076 | { |
10077 | /* The scalar inits of the IVs if not vectorized. */ |
10078 | elt = inits[(ivn*const_nunits + eltn) % group_size]; |
10079 | if (!useless_type_conversion_p (TREE_TYPE (vectype), |
10080 | TREE_TYPE (elt))) |
10081 | elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR, |
10082 | TREE_TYPE (vectype), elt); |
10083 | init_elts.quick_push (elt); |
10084 | } |
10085 | /* The number of steps to add to the initial values. */ |
10086 | unsigned mul_elt = (ivn*const_nunits + eltn) / group_size; |
10087 | mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept) |
10088 | ? build_real_from_wide (stept, |
10089 | mul_elt, UNSIGNED) |
10090 | : build_int_cstu (stept, mul_elt)); |
10091 | } |
10092 | vec_step = gimple_build_vector (&init_stmts, &step_elts); |
10093 | vec_steps.safe_push (vec_step); |
10094 | tree step_mul = gimple_build_vector (&init_stmts, &mul_elts); |
10095 | if (peel_mul) |
10096 | step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype, |
10097 | step_mul, peel_mul); |
10098 | if (!init_node) |
10099 | vec_init = gimple_build_vector (&init_stmts, &init_elts); |
10100 | |
10101 | /* Create the induction-phi that defines the induction-operand. */ |
10102 | vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, |
10103 | "vec_iv_" ); |
10104 | induction_phi = create_phi_node (vec_dest, iv_loop->header); |
10105 | induc_def = PHI_RESULT (induction_phi); |
10106 | |
10107 | /* Create the iv update inside the loop */ |
10108 | tree up = vec_step; |
10109 | if (lupdate_mul) |
10110 | up = gimple_build (&init_stmts, MULT_EXPR, step_vectype, |
10111 | vec_step, lupdate_mul); |
10112 | gimple_seq stmts = NULL; |
10113 | vec_def = gimple_convert (&stmts, step_vectype, induc_def); |
10114 | vec_def = gimple_build (&stmts, |
10115 | PLUS_EXPR, step_vectype, vec_def, up); |
10116 | vec_def = gimple_convert (&stmts, vectype, vec_def); |
10117 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
10118 | add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
10119 | UNKNOWN_LOCATION); |
10120 | |
10121 | if (init_node) |
10122 | vec_init = vect_get_slp_vect_def (init_node, ivn); |
10123 | if (!nested_in_vect_loop |
10124 | && !integer_zerop (step_mul)) |
10125 | { |
10126 | vec_def = gimple_convert (&init_stmts, step_vectype, vec_init); |
10127 | up = gimple_build (&init_stmts, MULT_EXPR, step_vectype, |
10128 | vec_step, step_mul); |
10129 | vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype, |
10130 | vec_def, up); |
10131 | vec_init = gimple_convert (&init_stmts, vectype, vec_def); |
10132 | } |
10133 | |
10134 | /* Set the arguments of the phi node: */ |
10135 | add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); |
10136 | |
10137 | slp_node->push_vec_def (induction_phi); |
10138 | } |
10139 | if (!nested_in_vect_loop) |
10140 | { |
10141 | /* Fill up to the number of vectors we need for the whole group. */ |
10142 | nivs = least_common_multiple (group_size, |
10143 | const_nunits) / const_nunits; |
10144 | vec_steps.reserve (nivs-ivn); |
10145 | for (; ivn < nivs; ++ivn) |
10146 | { |
10147 | slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]); |
10148 | vec_steps.quick_push (vec_steps[0]); |
10149 | } |
10150 | } |
10151 | |
10152 | /* Re-use IVs when we can. We are generating further vector |
10153 | stmts by adding VF' * stride to the IVs generated above. */ |
10154 | if (ivn < nvects) |
10155 | { |
10156 | unsigned vfp |
10157 | = least_common_multiple (group_size, const_nunits) / group_size; |
10158 | tree lupdate_mul |
10159 | = build_vector_from_val (step_vectype, |
10160 | SCALAR_FLOAT_TYPE_P (stept) |
10161 | ? build_real_from_wide (stept, |
10162 | vfp, UNSIGNED) |
10163 | : build_int_cstu (stept, vfp)); |
10164 | for (; ivn < nvects; ++ivn) |
10165 | { |
10166 | gimple *iv |
10167 | = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]); |
10168 | tree def = gimple_get_lhs (iv); |
10169 | if (ivn < 2*nivs) |
10170 | vec_steps[ivn - nivs] |
10171 | = gimple_build (&init_stmts, MULT_EXPR, step_vectype, |
10172 | vec_steps[ivn - nivs], lupdate_mul); |
10173 | gimple_seq stmts = NULL; |
10174 | def = gimple_convert (&stmts, step_vectype, def); |
10175 | def = gimple_build (&stmts, PLUS_EXPR, step_vectype, |
10176 | def, vec_steps[ivn % nivs]); |
10177 | def = gimple_convert (&stmts, vectype, def); |
10178 | if (gimple_code (iv) == GIMPLE_PHI) |
10179 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
10180 | else |
10181 | { |
10182 | gimple_stmt_iterator tgsi = gsi_for_stmt (iv); |
10183 | gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING); |
10184 | } |
10185 | slp_node->push_vec_def (def); |
10186 | } |
10187 | } |
10188 | |
10189 | new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts); |
10190 | gcc_assert (!new_bb); |
10191 | |
10192 | return true; |
10193 | } |
10194 | |
10195 | init_expr = vect_phi_initial_value (phi); |
10196 | |
10197 | gimple_seq stmts = NULL; |
10198 | if (!nested_in_vect_loop) |
10199 | { |
10200 | /* Convert the initial value to the IV update type. */ |
10201 | tree new_type = TREE_TYPE (step_expr); |
10202 | init_expr = gimple_convert (&stmts, new_type, init_expr); |
10203 | |
10204 | /* If we are using the loop mask to "peel" for alignment then we need |
10205 | to adjust the start value here. */ |
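| /* For instance (illustrative numbers only): with skip_niters == 3 and a |
| step of 4 the start value becomes X - 12, so lane 3 of the initial |
| vector [X-12, X-8, X-4, X, ...] carries the original initial value for |
| the first iteration the mask actually enables.  */ |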
10206 | tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); |
10207 | if (skip_niters != NULL_TREE) |
10208 | { |
10209 | if (FLOAT_TYPE_P (vectype)) |
10210 | skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type, |
10211 | skip_niters); |
10212 | else |
10213 | skip_niters = gimple_convert (&stmts, new_type, skip_niters); |
10214 | tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type, |
10215 | skip_niters, step_expr); |
10216 | init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, |
10217 | init_expr, skip_step); |
10218 | } |
10219 | } |
10220 | |
10221 | if (stmts) |
10222 | { |
10223 | new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
10224 | gcc_assert (!new_bb); |
10225 | } |
10226 | |
10227 | /* Create the vector that holds the initial_value of the induction. */ |
10228 | if (nested_in_vect_loop) |
10229 | { |
10230 | /* iv_loop is nested in the loop to be vectorized. init_expr had already |
10231 | been created during vectorization of previous stmts. We obtain it |
10232 | from the STMT_VINFO_VEC_STMT of the defining stmt. */ |
10233 | auto_vec<tree> vec_inits; |
10234 | vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1, |
10235 | init_expr, &vec_inits); |
10236 | vec_init = vec_inits[0]; |
10237 | /* If the initial value is not of proper type, convert it. */ |
10238 | if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init))) |
10239 | { |
10240 | new_stmt |
10241 | = gimple_build_assign (vect_get_new_ssa_name (vectype, |
10242 | vect_simple_var, |
10243 | "vec_iv_" ), |
10244 | VIEW_CONVERT_EXPR, |
10245 | build1 (VIEW_CONVERT_EXPR, vectype, |
10246 | vec_init)); |
10247 | vec_init = gimple_assign_lhs (new_stmt); |
10248 | new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop), |
10249 | new_stmt); |
10250 | gcc_assert (!new_bb); |
10251 | } |
10252 | } |
10253 | else |
10254 | { |
10255 | /* iv_loop is the loop to be vectorized. Create: |
10256 | vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ |
10257 | stmts = NULL; |
10258 | new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr); |
10259 | |
10260 | unsigned HOST_WIDE_INT const_nunits; |
10261 | if (nunits.is_constant (&const_nunits)) |
10262 | { |
10263 | tree_vector_builder elts (step_vectype, const_nunits, 1); |
10264 | elts.quick_push (new_name); |
10265 | for (i = 1; i < const_nunits; i++) |
10266 | { |
10267 | /* Create: new_name_i = new_name + step_expr */ |
10268 | new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), |
10269 | new_name, step_expr); |
10270 | elts.quick_push (new_name); |
10271 | } |
10272 | /* Create a vector from [new_name_0, new_name_1, ..., |
10273 | new_name_nunits-1] */ |
10274 | vec_init = gimple_build_vector (&stmts, &elts); |
10275 | } |
10276 | else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) |
10277 | /* Build the initial value directly from a VEC_SERIES_EXPR. */ |
10278 | vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype, |
10279 | new_name, step_expr); |
10280 | else |
10281 | { |
10282 | /* Build: |
10283 | [base, base, base, ...] |
10284 | + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */ |
10285 | gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); |
10286 | gcc_assert (flag_associative_math); |
10287 | tree index = build_index_vector (step_vectype, 0, 1); |
10288 | tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype, |
10289 | new_name); |
10290 | tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype, |
10291 | step_expr); |
10292 | vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index); |
10293 | vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype, |
10294 | vec_init, step_vec); |
10295 | vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype, |
10296 | vec_init, base_vec); |
10297 | } |
10298 | vec_init = gimple_convert (&stmts, vectype, vec_init); |
10299 | |
10300 | if (stmts) |
10301 | { |
10302 | new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); |
10303 | gcc_assert (!new_bb); |
10304 | } |
10305 | } |
10306 | |
10307 | |
10308 | /* Create the vector that holds the step of the induction. */ |
10309 | if (nested_in_vect_loop) |
10310 | /* iv_loop is nested in the loop to be vectorized. Generate: |
10311 | vec_step = [S, S, S, S] */ |
10312 | new_name = step_expr; |
10313 | else |
10314 | { |
10315 | /* iv_loop is the loop to be vectorized. Generate: |
10316 | vec_step = [VF*S, VF*S, VF*S, VF*S] */ |
10317 | gimple_seq seq = NULL; |
10318 | if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) |
10319 | { |
10320 | expr = build_int_cst (integer_type_node, vf); |
10321 | expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); |
10322 | } |
10323 | else |
10324 | expr = build_int_cst (TREE_TYPE (step_expr), vf); |
10325 | new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), |
10326 | expr, step_expr); |
10327 | if (seq) |
10328 | { |
10329 | new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); |
10330 | gcc_assert (!new_bb); |
10331 | } |
10332 | } |
10333 | |
10334 | t = unshare_expr (new_name); |
10335 | gcc_assert (CONSTANT_CLASS_P (new_name) |
10336 | || TREE_CODE (new_name) == SSA_NAME); |
10337 | new_vec = build_vector_from_val (step_vectype, t); |
10338 | vec_step = vect_init_vector (loop_vinfo, stmt_info, |
10339 | new_vec, step_vectype, NULL); |
10340 | |
10341 | |
10342 | /* Create the following def-use cycle: |
10343 | loop prolog: |
10344 | vec_init = ... |
10345 | vec_step = ... |
10346 | loop: |
10347 | vec_iv = PHI <vec_init, vec_loop> |
10348 | ... |
10349 | STMT |
10350 | ... |
10351 | vec_loop = vec_iv + vec_step; */ |
10352 | |
10353 | /* Create the induction-phi that defines the induction-operand. */ |
10354 | vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_" ); |
10355 | induction_phi = create_phi_node (vec_dest, iv_loop->header); |
10356 | induc_def = PHI_RESULT (induction_phi); |
10357 | |
10358 | /* Create the iv update inside the loop */ |
10359 | stmts = NULL; |
10360 | vec_def = gimple_convert (&stmts, step_vectype, induc_def); |
10361 | vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step); |
10362 | vec_def = gimple_convert (&stmts, vectype, vec_def); |
10363 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
10364 | new_stmt = SSA_NAME_DEF_STMT (vec_def); |
10365 | |
10366 | /* Set the arguments of the phi node: */ |
10367 | add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); |
10368 | add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
10369 | UNKNOWN_LOCATION); |
10370 | |
10371 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi); |
10372 | *vec_stmt = induction_phi; |
10373 | |
10374 | /* If the vectorization factor (VF) is bigger than the number |
10375 | of elements that we can fit in a vectype (nunits), we have to generate |
10376 | more than one vector stmt, i.e. we need to "unroll" the |
10377 | vector stmt by a factor of VF/nunits. For more details see the documentation |
10378 | in vectorizable_operation. */ |
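| /* For instance (illustrative): with four-element vectors and VF == 8 we |
| get ncopies == 2; the second copy is the PHI result plus 4 * S in every |
| lane, and one further addition (PHI result plus 8 * S) becomes the |
| value fed back on the loop latch below.  */ |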
10379 | |
10380 | if (ncopies > 1) |
10381 | { |
10382 | gimple_seq seq = NULL; |
10383 | /* FORNOW. This restriction should be relaxed. */ |
10384 | gcc_assert (!nested_in_vect_loop); |
10385 | |
10386 | /* Create the vector that holds the step of the induction. */ |
10387 | if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) |
10388 | { |
10389 | expr = build_int_cst (integer_type_node, nunits); |
10390 | expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); |
10391 | } |
10392 | else |
10393 | expr = build_int_cst (TREE_TYPE (step_expr), nunits); |
10394 | new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), |
10395 | expr, step_expr); |
10396 | if (seq) |
10397 | { |
10398 | new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); |
10399 | gcc_assert (!new_bb); |
10400 | } |
10401 | |
10402 | t = unshare_expr (new_name); |
10403 | gcc_assert (CONSTANT_CLASS_P (new_name) |
10404 | || TREE_CODE (new_name) == SSA_NAME); |
10405 | new_vec = build_vector_from_val (step_vectype, t); |
10406 | vec_step = vect_init_vector (loop_vinfo, stmt_info, |
10407 | new_vec, step_vectype, NULL); |
10408 | |
10409 | vec_def = induc_def; |
10410 | for (i = 1; i < ncopies + 1; i++) |
10411 | { |
10412 | /* vec_i = vec_prev + vec_step */ |
10413 | gimple_seq stmts = NULL; |
10414 | vec_def = gimple_convert (&stmts, step_vectype, vec_def); |
10415 | vec_def = gimple_build (&stmts, |
10416 | PLUS_EXPR, step_vectype, vec_def, vec_step); |
10417 | vec_def = gimple_convert (&stmts, vectype, vec_def); |
10418 | |
10419 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
10420 | if (i < ncopies) |
10421 | { |
10422 | new_stmt = SSA_NAME_DEF_STMT (vec_def); |
10423 | STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); |
10424 | } |
10425 | else |
10426 | { |
10427 | /* vec_1 = vec_iv + (VF/n * S) |
10428 | vec_2 = vec_1 + (VF/n * S) |
10429 | ... |
10430 | vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop |
10431 | |
10432 | vec_n is used as vec_loop to save the large step register and |
10433 | related operations. */ |
10434 | add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
10435 | UNKNOWN_LOCATION); |
10436 | } |
10437 | } |
10438 | } |
10439 | |
10440 | if (dump_enabled_p ()) |
10441 | dump_printf_loc (MSG_NOTE, vect_location, |
10442 | "transform induction: created def-use cycle: %G%G" , |
10443 | (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def)); |
10444 | |
10445 | return true; |
10446 | } |
10447 | |
10448 | /* Function vectorizable_live_operation. |
10449 | |
10450 | STMT_INFO computes a value that is used outside the loop. Check if |
10451 | it can be supported. */ |
10452 | |
10453 | bool |
10454 | vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, |
10455 | slp_tree slp_node, slp_instance slp_node_instance, |
10456 | int slp_index, bool vec_stmt_p, |
10457 | stmt_vector_for_cost *cost_vec) |
10458 | { |
10459 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); |
10460 | imm_use_iterator imm_iter; |
10461 | tree lhs, lhs_type, bitsize; |
10462 | tree vectype = (slp_node |
10463 | ? SLP_TREE_VECTYPE (slp_node) |
10464 | : STMT_VINFO_VECTYPE (stmt_info)); |
10465 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); |
10466 | int ncopies; |
10467 | gimple *use_stmt; |
10468 | auto_vec<tree> vec_oprnds; |
10469 | int vec_entry = 0; |
10470 | poly_uint64 vec_index = 0; |
10471 | |
10472 | gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); |
10473 | |
10474 | /* If a stmt of a reduction is live, vectorize it via |
10475 | vect_create_epilog_for_reduction. vectorizable_reduction assessed |
10476 | validity so just trigger the transform here. */ |
10477 | if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))) |
10478 | { |
10479 | if (!vec_stmt_p) |
10480 | return true; |
10481 | if (slp_node) |
10482 | { |
10483 | /* For reduction chains the meta-info is attached to |
10484 | the group leader. */ |
10485 | if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
10486 | stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); |
10487 | /* For SLP reductions we vectorize the epilogue for |
10488 | all involved stmts together. */ |
10489 | else if (slp_index != 0) |
10490 | return true; |
10491 | } |
10492 | stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); |
10493 | gcc_assert (reduc_info->is_reduc_info); |
10494 | if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION |
10495 | || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION) |
10496 | return true; |
10497 | vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node, |
10498 | slp_node_instance); |
10499 | return true; |
10500 | } |
10501 | |
10502 | /* If STMT is not relevant and it is a simple assignment and its inputs are |
10503 | invariant then it can remain in place, unvectorized. The original last |
10504 | scalar value that it computes will be used. */ |
10505 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
10506 | { |
10507 | gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo)); |
10508 | if (dump_enabled_p ()) |
10509 | dump_printf_loc (MSG_NOTE, vect_location, |
10510 | "statement is simple and uses invariant. Leaving in " |
10511 | "place.\n" ); |
10512 | return true; |
10513 | } |
10514 | |
10515 | if (slp_node) |
10516 | ncopies = 1; |
10517 | else |
10518 | ncopies = vect_get_num_copies (loop_vinfo, vectype); |
10519 | |
10520 | if (slp_node) |
10521 | { |
10522 | gcc_assert (slp_index >= 0); |
10523 | |
10524 | /* Get the last occurrence of the scalar index from the concatenation of |
10525 | all the slp vectors. Calculate which slp vector it is and the index |
10526 | within. */ |
10527 | int num_scalar = SLP_TREE_LANES (slp_node); |
10528 | int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); |
10529 | poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index; |
10530 | |
10531 | /* Calculate which vector contains the result, and which lane of |
10532 | that vector we need. */ |
10533 | if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index)) |
10534 | { |
10535 | if (dump_enabled_p ()) |
10536 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
10537 | "Cannot determine which vector holds the" |
10538 | " final result.\n" ); |
10539 | return false; |
10540 | } |
10541 | } |
10542 | |
10543 | if (!vec_stmt_p) |
10544 | { |
10545 | /* No transformation required. */ |
10546 | if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) |
10547 | { |
10548 | if (slp_node) |
10549 | { |
10550 | if (dump_enabled_p ()) |
10551 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
10552 | "can't operate on partial vectors " |
10553 | "because an SLP statement is live after " |
10554 | "the loop.\n" ); |
10555 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
10556 | } |
10557 | else if (ncopies > 1) |
10558 | { |
10559 | if (dump_enabled_p ()) |
10560 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
10561 | "can't operate on partial vectors " |
10562 | "because ncopies is greater than 1.\n" ); |
10563 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
10564 | } |
10565 | else |
10566 | { |
10567 | gcc_assert (ncopies == 1 && !slp_node); |
10568 | if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, |
10569 | OPTIMIZE_FOR_SPEED)) |
10570 | vect_record_loop_mask (loop_vinfo, |
10571 | &LOOP_VINFO_MASKS (loop_vinfo), |
10572 | 1, vectype, NULL); |
10573 | else if (can_vec_extract_var_idx_p ( |
10574 | TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) |
10575 | vect_record_loop_len (loop_vinfo, |
10576 | &LOOP_VINFO_LENS (loop_vinfo), |
10577 | 1, vectype, 1); |
10578 | else |
10579 | { |
10580 | if (dump_enabled_p ()) |
10581 | dump_printf_loc ( |
10582 | MSG_MISSED_OPTIMIZATION, vect_location, |
10583 | "can't operate on partial vectors " |
10584 | "because the target doesn't support extract " |
10585 | "last reduction.\n" ); |
10586 | LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; |
10587 | } |
10588 | } |
10589 | } |
10590 | /* ??? Enable for loop costing as well. */ |
10591 | if (!loop_vinfo) |
10592 | record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE, |
10593 | 0, vect_epilogue); |
10594 | return true; |
10595 | } |
10596 | |
10597 | /* Use the lhs of the original scalar statement. */ |
10598 | gimple *stmt = vect_orig_stmt (stmt_info)->stmt; |
10599 | if (dump_enabled_p ()) |
10600 | dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live " |
10601 | "stmt %G" , stmt); |
10602 | |
10603 | lhs = gimple_get_lhs (stmt); |
10604 | lhs_type = TREE_TYPE (lhs); |
10605 | |
10606 | bitsize = vector_element_bits_tree (vectype); |
10607 | |
10608 | /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */ |
10609 | tree vec_lhs, bitstart; |
10610 | gimple *vec_stmt; |
10611 | if (slp_node) |
10612 | { |
10613 | gcc_assert (!loop_vinfo |
10614 | || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) |
10615 | && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); |
10616 | |
10617 | /* Get the correct slp vectorized stmt. */ |
10618 | vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; |
10619 | vec_stmt = SSA_NAME_DEF_STMT (vec_lhs); |
10620 | |
10621 | /* Get entry to use. */ |
10622 | bitstart = bitsize_int (vec_index); |
10623 | bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart); |
10624 | } |
10625 | else |
10626 | { |
10627 | /* For multiple copies, get the last copy. */ |
10628 | vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last (); |
10629 | vec_lhs = gimple_get_lhs (vec_stmt); |
10630 | |
10631 | /* Get the last lane in the vector. */ |
10632 | bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1)); |
10633 | } |
10634 | |
10635 | if (loop_vinfo) |
10636 | { |
10637 | /* To ensure the VEC_LHS used by the lane-extraction stmts satisfies the |
10638 | loop-closed PHI requirement, insert one phi node for it. It looks like: |
10639 | loop; |
10640 | BB: |
10641 | # lhs' = PHI <lhs> |
10642 | ==> |
10643 | loop; |
10644 | BB: |
10645 | # vec_lhs' = PHI <vec_lhs> |
10646 | new_tree = lane_extract <vec_lhs', ...>; |
10647 | lhs' = new_tree; */ |
10648 | |
10649 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
10650 | basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest; |
10651 | gcc_assert (single_pred_p (exit_bb)); |
10652 | |
10653 | tree vec_lhs_phi = copy_ssa_name (vec_lhs); |
10654 | gimple *phi = create_phi_node (vec_lhs_phi, exit_bb); |
10655 | SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs); |
10656 | |
10657 | gimple_seq stmts = NULL; |
10658 | tree new_tree; |
10659 | if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) |
10660 | { |
10661 | /* Emit: |
10662 | |
10663 | SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1> |
10664 | |
10665 | where VEC_LHS is the vectorized live-out result and LEN is the |
10666 | length of the final (possibly partial) iteration. */ |
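| /* For example (illustrative): with LEN == 5 and a zero bias the |
| extracted lane is 5 + 0 - 1 == 4, i.e. the last element that the |
| partial vector actually processed.  */ |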
10667 | gcc_assert (ncopies == 1 && !slp_node); |
10668 | gimple_seq tem = NULL; |
10669 | gimple_stmt_iterator gsi = gsi_last (tem); |
10670 | tree len |
10671 | = vect_get_loop_len (loop_vinfo, &gsi, |
10672 | &LOOP_VINFO_LENS (loop_vinfo), |
10673 | 1, vectype, 0, 0); |
10674 | |
10675 | /* BIAS - 1. */ |
10676 | signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); |
10677 | tree bias_minus_one |
10678 | = int_const_binop (MINUS_EXPR, |
10679 | build_int_cst (TREE_TYPE (len), biasval), |
10680 | build_one_cst (TREE_TYPE (len))); |
10681 | |
10682 | /* LAST_INDEX = LEN + (BIAS - 1). */ |
10683 | tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), |
10684 | len, bias_minus_one); |
10685 | |
10686 | /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */ |
10687 | tree scalar_res |
10688 | = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), |
10689 | vec_lhs_phi, last_index); |
10690 | |
10691 | /* Convert the extracted vector element to the scalar type. */ |
10692 | new_tree = gimple_convert (&stmts, lhs_type, scalar_res); |
10693 | } |
10694 | else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) |
10695 | { |
10696 | /* Emit: |
10697 | |
10698 | SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> |
10699 | |
10700 | where VEC_LHS is the vectorized live-out result and MASK is |
10701 | the loop mask for the final iteration. */ |
10702 | gcc_assert (ncopies == 1 && !slp_node); |
10703 | tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); |
10704 | gimple_seq tem = NULL; |
10705 | gimple_stmt_iterator gsi = gsi_last (tem); |
10706 | tree mask = vect_get_loop_mask (loop_vinfo, &gsi, |
10707 | &LOOP_VINFO_MASKS (loop_vinfo), |
10708 | 1, vectype, 0); |
10709 | gimple_seq_add_seq (&stmts, tem); |
10710 | tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type, |
10711 | mask, vec_lhs_phi); |
10712 | |
10713 | /* Convert the extracted vector element to the scalar type. */ |
10714 | new_tree = gimple_convert (&stmts, lhs_type, scalar_res); |
10715 | } |
10716 | else |
10717 | { |
10718 | tree bftype = TREE_TYPE (vectype); |
10719 | if (VECTOR_BOOLEAN_TYPE_P (vectype)) |
10720 | bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); |
10721 | new_tree = build3 (BIT_FIELD_REF, bftype, |
10722 | vec_lhs_phi, bitsize, bitstart); |
10723 | new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), |
10724 | &stmts, true, NULL_TREE); |
10725 | } |
10726 | |
10727 | gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb); |
10728 | if (stmts) |
10729 | gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); |
10730 | |
10731 | /* Remove existing phis that copy from lhs and create copies |
10732 | from new_tree. */ |
10733 | gimple_stmt_iterator gsi; |
10734 | for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);) |
10735 | { |
10736 | gimple *phi = gsi_stmt (gsi); |
10737 | if ((gimple_phi_arg_def (phi, 0) == lhs)) |
10738 | { |
10739 | remove_phi_node (&gsi, false); |
10740 | tree lhs_phi = gimple_phi_result (phi); |
10741 | gimple *copy = gimple_build_assign (lhs_phi, new_tree); |
10742 | gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT); |
10743 | } |
10744 | else |
10745 | gsi_next (&gsi); |
10746 | } |
10747 | |
10748 | /* There are no further out-of-loop uses of lhs by LC-SSA construction. */ |
10749 | FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) |
10750 | gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))); |
10751 | } |
10752 | else |
10753 | { |
10754 | /* For basic-block vectorization simply insert the lane-extraction. */ |
10755 | tree bftype = TREE_TYPE (vectype); |
10756 | if (VECTOR_BOOLEAN_TYPE_P (vectype)) |
10757 | bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); |
10758 | tree new_tree = build3 (BIT_FIELD_REF, bftype, |
10759 | vec_lhs, bitsize, bitstart); |
10760 | gimple_seq stmts = NULL; |
10761 | new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), |
10762 | &stmts, true, NULL_TREE); |
10763 | if (TREE_CODE (new_tree) == SSA_NAME |
10764 | && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs)) |
10765 | SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1; |
10766 | if (is_a <gphi *> (vec_stmt)) |
10767 | { |
10768 | gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt)); |
10769 | gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); |
10770 | } |
10771 | else |
10772 | { |
10773 | gimple_stmt_iterator si = gsi_for_stmt (vec_stmt); |
10774 | gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT); |
10775 | } |
10776 | |
10777 | /* Replace use of lhs with newly computed result. If the use stmt is a |
10778 | single arg PHI, just replace all uses of the PHI result. This is necessary |
10779 | because the LC SSA PHI defining lhs may precede the newly inserted stmt. */ |
10780 | use_operand_p use_p; |
10781 | stmt_vec_info use_stmt_info; |
10782 | FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) |
10783 | if (!is_gimple_debug (use_stmt) |
10784 | && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt)) |
10785 | || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))) |
10786 | { |
10787 | /* ??? This can happen when the live lane ends up being |
10788 | rooted in a vector construction code-generated by an |
10789 | external SLP node (and code-generation for that already |
10790 | happened). See gcc.dg/vect/bb-slp-47.c. |
10791 | Doing this is what would happen if that vector CTOR |
10792 | were not code-generated yet so it is not too bad. |
10793 | ??? In fact we'd likely want to avoid this situation |
10794 | in the first place. */ |
10795 | if (TREE_CODE (new_tree) == SSA_NAME |
10796 | && !SSA_NAME_IS_DEFAULT_DEF (new_tree) |
10797 | && gimple_code (use_stmt) != GIMPLE_PHI |
10798 | && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree), |
10799 | use_stmt)) |
10800 | { |
10801 | if (dump_enabled_p ()) |
10802 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
10803 | "Using original scalar computation for " |
10804 | "live lane because use preceeds vector " |
10805 | "def\n" ); |
10806 | continue; |
10807 | } |
10808 | /* ??? It can also happen that we end up pulling a def into |
10809 | a loop where replacing out-of-loop uses would require |
10810 | a new LC SSA PHI node. Retain the original scalar in |
10811 | those cases as well. PR98064. */ |
10812 | if (TREE_CODE (new_tree) == SSA_NAME |
10813 | && !SSA_NAME_IS_DEFAULT_DEF (new_tree) |
10814 | && (gimple_bb (use_stmt)->loop_father |
10815 | != gimple_bb (vec_stmt)->loop_father) |
10816 | && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father, |
10817 | gimple_bb (use_stmt)->loop_father)) |
10818 | { |
10819 | if (dump_enabled_p ()) |
10820 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
10821 | "Using original scalar computation for " |
10822 | "live lane because there is an out-of-loop " |
10823 | "definition for it\n" ); |
10824 | continue; |
10825 | } |
10826 | FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) |
10827 | SET_USE (use_p, new_tree); |
10828 | update_stmt (use_stmt); |
10829 | } |
10830 | } |
10831 | |
10832 | return true; |
10833 | } |
10834 | |
10835 | /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */ |
10836 | |
10837 | static void |
10838 | vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info) |
10839 | { |
10840 | ssa_op_iter op_iter; |
10841 | imm_use_iterator imm_iter; |
10842 | def_operand_p def_p; |
10843 | gimple *ustmt; |
10844 | |
10845 | FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF) |
10846 | { |
10847 | FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p)) |
10848 | { |
10849 | basic_block bb; |
10850 | |
10851 | if (!is_gimple_debug (ustmt)) |
10852 | continue; |
10853 | |
10854 | bb = gimple_bb (ustmt); |
10855 | |
10856 | if (!flow_bb_inside_loop_p (loop, bb)) |
10857 | { |
10858 | if (gimple_debug_bind_p (ustmt)) |
10859 | { |
10860 | if (dump_enabled_p ()) |
10861 | dump_printf_loc (MSG_NOTE, vect_location, |
10862 | "killing debug use\n" ); |
10863 | |
10864 | gimple_debug_bind_reset_value (ustmt); |
10865 | update_stmt (ustmt); |
10866 | } |
10867 | else |
10868 | gcc_unreachable (); |
10869 | } |
10870 | } |
10871 | } |
10872 | } |
10873 | |
10874 | /* Given loop represented by LOOP_VINFO, return true if computation of |
10875 | LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false |
10876 | otherwise. */ |
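| /* For example (assuming a 32-bit unsigned NITERS type): if the latch is |
| known to execute at most 0xfffffffe times, then NITERSM1 + 1 cannot |
| wrap and the function returns true.  */ |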
10877 | |
10878 | static bool |
10879 | loop_niters_no_overflow (loop_vec_info loop_vinfo) |
10880 | { |
10881 | /* Constant case. */ |
10882 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) |
10883 | { |
10884 | tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo); |
10885 | tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); |
10886 | |
10887 | gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST); |
10888 | gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST); |
10889 | if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters)) |
10890 | return true; |
10891 | } |
10892 | |
10893 | widest_int max; |
10894 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
10895 | /* Check the upper bound of loop niters. */ |
10896 | if (get_max_loop_iterations (loop, &max)) |
10897 | { |
10898 | tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); |
10899 | signop sgn = TYPE_SIGN (type); |
10900 | widest_int type_max = widest_int::from (wi::max_value (type), sgn); |
10901 | if (max < type_max) |
10902 | return true; |
10903 | } |
10904 | return false; |
10905 | } |
10906 | |
10907 | /* Return a mask type with half the number of elements as OLD_TYPE, |
10908 | given that it should have mode NEW_MODE. */ |
10909 | |
10910 | tree |
10911 | vect_halve_mask_nunits (tree old_type, machine_mode new_mode) |
10912 | { |
10913 | poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2); |
10914 | return build_truth_vector_type_for_mode (nunits, new_mode); |
10915 | } |
10916 | |
10917 | /* Return a mask type with twice as many elements as OLD_TYPE, |
10918 | given that it should have mode NEW_MODE. */ |
10919 | |
10920 | tree |
10921 | vect_double_mask_nunits (tree old_type, machine_mode new_mode) |
10922 | { |
10923 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2; |
10924 | return build_truth_vector_type_for_mode (nunits, new_mode); |
10925 | } |
10926 | |
10927 | /* Record that a fully-masked version of LOOP_VINFO would need MASKS to |
10928 | contain a sequence of NVECTORS masks that each control a vector of type |
10929 | VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND |
10930 | these vector masks with the vector version of SCALAR_MASK. */ |
10931 | |
10932 | void |
10933 | vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, |
10934 | unsigned int nvectors, tree vectype, tree scalar_mask) |
10935 | { |
10936 | gcc_assert (nvectors != 0); |
10937 | |
10938 | if (scalar_mask) |
10939 | { |
10940 | scalar_cond_masked_key cond (scalar_mask, nvectors); |
10941 | loop_vinfo->scalar_cond_masked_set.add (cond); |
10942 | } |
10943 | |
10944 | masks->mask_set.add (std::make_pair (vectype, nvectors)); |
10945 | } |
10946 | |
10947 | /* Given a complete set of masks MASKS, extract mask number INDEX |
10948 | for an rgroup that operates on NVECTORS vectors of type VECTYPE, |
10949 | where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI. |
10950 | |
10951 | See the comment above vec_loop_masks for more details about the mask |
10952 | arrangement. */ |
10953 | |
10954 | tree |
10955 | vect_get_loop_mask (loop_vec_info loop_vinfo, |
10956 | gimple_stmt_iterator *gsi, vec_loop_masks *masks, |
10957 | unsigned int nvectors, tree vectype, unsigned int index) |
10958 | { |
10959 | if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) |
10960 | == vect_partial_vectors_while_ult) |
10961 | { |
10962 | rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1]; |
10963 | tree mask_type = rgm->type; |
10964 | |
10965 | /* Populate the rgroup's mask array, if this is the first time we've |
10966 | used it. */ |
10967 | if (rgm->controls.is_empty ()) |
10968 | { |
10969 | rgm->controls.safe_grow_cleared (nvectors, true); |
10970 | for (unsigned int i = 0; i < nvectors; ++i) |
10971 | { |
10972 | tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask"); |
10973 | /* Provide a dummy definition until the real one is available. */ |
10974 | SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); |
10975 | rgm->controls[i] = mask; |
10976 | } |
10977 | } |
10978 | |
10979 | tree mask = rgm->controls[index]; |
10980 | if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type), |
10981 | TYPE_VECTOR_SUBPARTS (vectype))) |
10982 | { |
10983 | /* A loop mask for data type X can be reused for data type Y |
10984 | if X has N times more elements than Y and if Y's elements |
10985 | are N times bigger than X's. In this case each sequence |
10986 | of N elements in the loop mask will be all-zero or all-one. |
10987 | We can then view-convert the mask so that each sequence of |
10988 | N elements is replaced by a single element. */ |
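| /* For example (assuming the target provides such modes): a mask |
| computed for sixteen QImode lanes can control eight HImode lanes, |
| because each adjacent pair of mask elements is all-zero or all-one |
| and view-converts to a single wider mask element.  */ |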
10989 | gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), |
10990 | TYPE_VECTOR_SUBPARTS (vectype))); |
10991 | gimple_seq seq = NULL; |
10992 | mask_type = truth_type_for (vectype); |
10993 | mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); |
10994 | if (seq) |
10995 | gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); |
10996 | } |
10997 | return mask; |
10998 | } |
10999 | else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) |
11000 | == vect_partial_vectors_avx512) |
11001 | { |
11002 | /* The number of scalars per iteration and the number of vectors are |
11003 | both compile-time constants. */ |
11004 | unsigned int nscalars_per_iter |
11005 | = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), |
11006 | LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); |
11007 | |
11008 | rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1]; |
11009 | |
11010 | /* The stored nV is dependent on the mask type produced. */ |
11011 | gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), |
11012 | TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant () |
11013 | == rgm->factor); |
11014 | nvectors = rgm->factor; |
11015 | |
11016 | /* Populate the rgroup's mask array, if this is the first time we've |
11017 | used it. */ |
11018 | if (rgm->controls.is_empty ()) |
11019 | { |
11020 | rgm->controls.safe_grow_cleared (nvectors, true); |
11021 | for (unsigned int i = 0; i < nvectors; ++i) |
11022 | { |
11023 | tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask"); |
11024 | /* Provide a dummy definition until the real one is available. */ |
11025 | SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); |
11026 | rgm->controls[i] = mask; |
11027 | } |
11028 | } |
11029 | if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type), |
11030 | TYPE_VECTOR_SUBPARTS (vectype))) |
11031 | return rgm->controls[index]; |
11032 | |
11033 | /* Split the vector if needed. Since we are dealing with integer mode |
11034 | masks with AVX512 we can operate on the integer representation, |
11035 | shifting the whole mask value at once. */ |
11036 | unsigned HOST_WIDE_INT factor; |
11037 | bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type), |
11038 | TYPE_VECTOR_SUBPARTS (vectype), &factor); |
11039 | gcc_assert (ok); |
11040 | gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT); |
11041 | tree mask_type = truth_type_for (vectype); |
11042 | gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT); |
11043 | unsigned vi = index / factor; |
11044 | unsigned vpart = index % factor; |
11045 | tree vec = rgm->controls[vi]; |
11046 | gimple_seq seq = NULL; |
11047 | vec = gimple_build (&seq, VIEW_CONVERT_EXPR, |
11048 | lang_hooks.types.type_for_mode |
11049 | (TYPE_MODE (rgm->type), 1), vec); |
11050 | /* For integer mode masks simply shift the right bits into position. */ |
11051 | if (vpart != 0) |
11052 | vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec, |
11053 | build_int_cst (integer_type_node, |
11054 | (TYPE_VECTOR_SUBPARTS (vectype) |
11055 | * vpart))); |
11056 | vec = gimple_convert (&seq, lang_hooks.types.type_for_mode |
11057 | (TYPE_MODE (mask_type), 1), vec); |
11058 | vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec); |
11059 | if (seq) |
11060 | gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); |
11061 | return vec; |
11062 | } |
11063 | else |
11064 | gcc_unreachable (); |
11065 | } |
11066 | |
11067 | /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS |
11068 | lengths for controlling an operation on VECTYPE. The operation splits |
11069 | each element of VECTYPE into FACTOR separate subelements, measuring the |
11070 | length as a number of these subelements. */ |
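| /* For instance (an assumed example, not tied to a particular target): |
| a V4SI access that has to be carried out as a byte-length operation |
| would be recorded with FACTOR == 4, and the resulting length then |
| counts QImode subelements rather than SImode elements.  */ |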
11071 | |
11072 | void |
11073 | vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens, |
11074 | unsigned int nvectors, tree vectype, unsigned int factor) |
11075 | { |
11076 | gcc_assert (nvectors != 0); |
11077 | if (lens->length () < nvectors) |
11078 | lens->safe_grow_cleared (nvectors, true); |
11079 | rgroup_controls *rgl = &(*lens)[nvectors - 1]; |
11080 | |
11081 | /* The number of scalars per iteration, the bytes each scalar occupies and |
11082 | the number of vectors are all compile-time constants. */ |
11083 | unsigned int nscalars_per_iter |
11084 | = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), |
11085 | LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); |
11086 | |
11087 | if (rgl->max_nscalars_per_iter < nscalars_per_iter) |
11088 | { |
11089 | /* For now, we only support cases in which all loads and stores fall back |
11090 | to VnQI or none do. */ |
11091 | gcc_assert (!rgl->max_nscalars_per_iter |
11092 | || (rgl->factor == 1 && factor == 1) |
11093 | || (rgl->max_nscalars_per_iter * rgl->factor |
11094 | == nscalars_per_iter * factor)); |
11095 | rgl->max_nscalars_per_iter = nscalars_per_iter; |
11096 | rgl->type = vectype; |
11097 | rgl->factor = factor; |
11098 | } |
11099 | } |
11100 | |
11101 | /* Given a complete set of lengths LENS, extract length number INDEX |
11102 | for an rgroup that operates on NVECTORS vectors of type VECTYPE, |
11103 | where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR |
11104 | multiplied by the number of elements that should be processed. |
11105 | Insert any set-up statements before GSI. */ |
11106 | |
11107 | tree |
11108 | vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi, |
11109 | vec_loop_lens *lens, unsigned int nvectors, tree vectype, |
11110 | unsigned int index, unsigned int factor) |
11111 | { |
11112 | rgroup_controls *rgl = &(*lens)[nvectors - 1]; |
11113 | bool use_bias_adjusted_len = |
11114 | LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0; |
11115 | |
11116 | /* Populate the rgroup's len array, if this is the first time we've |
11117 | used it. */ |
11118 | if (rgl->controls.is_empty ()) |
11119 | { |
11120 | rgl->controls.safe_grow_cleared (nvectors, true); |
11121 | for (unsigned int i = 0; i < nvectors; ++i) |
11122 | { |
11123 | tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); |
11124 | gcc_assert (len_type != NULL_TREE); |
11125 | |
11126 | tree len = make_temp_ssa_name (len_type, NULL, "loop_len"); |
11127 | |
11128 | /* Provide a dummy definition until the real one is available. */ |
11129 | SSA_NAME_DEF_STMT (len) = gimple_build_nop (); |
11130 | rgl->controls[i] = len; |
11131 | |
11132 | if (use_bias_adjusted_len) |
11133 | { |
11134 | gcc_assert (i == 0); |
11135 | tree adjusted_len = |
11136 | make_temp_ssa_name (len_type, NULL, "adjusted_loop_len"); |
11137 | SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop (); |
11138 | rgl->bias_adjusted_ctrl = adjusted_len; |
11139 | } |
11140 | } |
11141 | } |
11142 | |
11143 | if (use_bias_adjusted_len) |
11144 | return rgl->bias_adjusted_ctrl; |
11145 | |
11146 | tree loop_len = rgl->controls[index]; |
11147 | if (rgl->factor == 1 && factor == 1) |
11148 | { |
11149 | poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type); |
11150 | poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype); |
11151 | if (maybe_ne (nunits1, nunits2)) |
11152 | { |
11153 | /* A loop len for data type X can be reused for data type Y |
11154 | if X has N times more elements than Y and if Y's elements |
11155 | are N times bigger than X's. */ |
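| /* E.g. (illustrative only) a length computed for a sixteen-element |
| control type serves a four-element VECTYPE after dividing it by |
| factor == 4 below.  */ |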
11156 | gcc_assert (multiple_p (nunits1, nunits2)); |
11157 | factor = exact_div (nunits1, nunits2).to_constant (); |
11158 | tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); |
11159 | gimple_seq seq = NULL; |
11160 | loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len, |
11161 | build_int_cst (iv_type, factor)); |
11162 | if (seq) |
11163 | gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); |
11164 | } |
11165 | } |
11166 | return loop_len; |
11167 | } |
11168 | |
11169 | /* Scale profiling counters by estimation for LOOP which is vectorized |
11170 | by factor VF. |
11171 | If FLAT is true, the loop we started with had an unrealistically flat |
11172 | profile. */ |
11173 | |
11174 | static void |
11175 | scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat) |
11176 | { |
11177 | /* For flat profiles do not scale down proportionally by VF and only |
11178 | cap by known iteration count bounds. */ |
11179 | if (flat) |
11180 | { |
11181 | if (dump_file && (dump_flags & TDF_DETAILS)) |
11182 | fprintf (dump_file, |
11183 | "Vectorized loop profile seems flat; not scaling iteration " |
11184 | "count down by the vectorization factor %i\n", vf); |
11185 | scale_loop_profile (loop, profile_probability::always (), |
11186 | get_likely_max_loop_iterations_int (loop)); |
11187 | return; |
11188 | } |
11189 | /* Loop body executes VF fewer times and exit increases VF times. */ |
11190 | profile_count entry_count = loop_preheader_edge (loop)->count (); |
11191 | |
11192 | /* If we have an unreliable loop profile, avoid dropping the entry |
11193 | count below the header count. This can happen since the loop |
11194 | has an unrealistically low trip count. */ |
11195 | while (vf > 1 |
11196 | && loop->header->count > entry_count |
11197 | && loop->header->count < entry_count * vf) |
11198 | { |
11199 | if (dump_file && (dump_flags & TDF_DETAILS)) |
11200 | fprintf (dump_file, |
11201 | "Vectorization factor %i seems too large for profile " |
11202 | "previously believed to be consistent; reducing.\n", vf); |
11203 | vf /= 2; |
11204 | } |
11205 | |
11206 | if (entry_count.nonzero_p ()) |
11207 | set_edge_probability_and_rescale_others |
11208 | (exit_e, |
11209 | entry_count.probability_in (loop->header->count / vf)); |
11210 | /* Avoid producing very large exit probability when we do not have |
11211 | sensible profile. */ |
11212 | else if (exit_e->probability < profile_probability::always () / (vf * 2)) |
11213 | set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf); |
11214 | loop->latch->count = single_pred_edge (loop->latch)->count (); |
11215 | |
11216 | scale_loop_profile (loop, profile_probability::always () / vf, |
11217 | get_likely_max_loop_iterations_int (loop)); |
11218 | } |
11219 | |
11220 | /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI |
11221 | latch edge values originally defined by it. */ |
11222 | |
11223 | static void |
11224 | maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo, |
11225 | stmt_vec_info def_stmt_info) |
11226 | { |
11227 | tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt); |
11228 | if (!def || TREE_CODE (def) != SSA_NAME) |
11229 | return; |
11230 | stmt_vec_info phi_info; |
11231 | imm_use_iterator iter; |
11232 | use_operand_p use_p; |
11233 | FOR_EACH_IMM_USE_FAST (use_p, iter, def) |
11234 | { |
11235 | gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)); |
11236 | if (!phi) |
11237 | continue; |
11238 | if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi) |
11239 | && (phi_info = loop_vinfo->lookup_stmt (phi)) |
11240 | && STMT_VINFO_RELEVANT_P (phi_info))) |
11241 | continue; |
11242 | loop_p loop = gimple_bb (phi)->loop_father; |
11243 | edge e = loop_latch_edge (loop); |
11244 | if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def) |
11245 | continue; |
11246 | |
11247 | if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info)) |
11248 | && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION |
11249 | && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION) |
11250 | { |
11251 | vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info); |
11252 | vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info); |
11253 | gcc_assert (phi_defs.length () == latch_defs.length ()); |
11254 | for (unsigned i = 0; i < phi_defs.length (); ++i) |
11255 | add_phi_arg (as_a <gphi *> (p: phi_defs[i]), |
11256 | gimple_get_lhs (latch_defs[i]), e, |
11257 | gimple_phi_arg_location (phi, i: e->dest_idx)); |
11258 | } |
11259 | else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence) |
11260 | { |
11261 | /* For first order recurrences we have to update both uses of |
11262 | the latch definition, the one in the PHI node and the one |
11263 | in the generated VEC_PERM_EXPR. */ |
11264 | vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info); |
11265 | vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info); |
11266 | gcc_assert (phi_defs.length () == latch_defs.length ()); |
11267 | tree phidef = gimple_assign_rhs1 (gs: phi_defs[0]); |
11268 | gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef)); |
11269 | for (unsigned i = 0; i < phi_defs.length (); ++i) |
11270 | { |
11271 | gassign *perm = as_a <gassign *> (p: phi_defs[i]); |
11272 | if (i > 0) |
11273 | gimple_assign_set_rhs1 (gs: perm, rhs: gimple_get_lhs (latch_defs[i-1])); |
11274 | gimple_assign_set_rhs2 (gs: perm, rhs: gimple_get_lhs (latch_defs[i])); |
11275 | update_stmt (s: perm); |
11276 | } |
11277 | add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e, |
11278 | gimple_phi_arg_location (phi, i: e->dest_idx)); |
11279 | } |
11280 | } |
11281 | } |
11282 | |
11283 | /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI. |
11284 | When vectorizing STMT_INFO as a store, set *SEEN_STORE to its |
11285 | stmt_vec_info. */ |
11286 | |
11287 | static bool |
11288 | vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, |
11289 | gimple_stmt_iterator *gsi, stmt_vec_info *seen_store) |
11290 | { |
11291 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
11292 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
11293 | |
11294 | if (dump_enabled_p ()) |
11295 | dump_printf_loc (MSG_NOTE, vect_location, |
11296 | "------>vectorizing statement: %G" , stmt_info->stmt); |
11297 | |
11298 | if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) |
11299 | vect_loop_kill_debug_uses (loop, stmt_info); |
11300 | |
11301 | if (!STMT_VINFO_RELEVANT_P (stmt_info) |
11302 | && !STMT_VINFO_LIVE_P (stmt_info)) |
11303 | return false; |
11304 | |
11305 | if (STMT_VINFO_VECTYPE (stmt_info)) |
11306 | { |
11307 | poly_uint64 nunits |
11308 | = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); |
11309 | if (!STMT_SLP_TYPE (stmt_info) |
11310 | && maybe_ne (a: nunits, b: vf) |
11311 | && dump_enabled_p ()) |
11312 | /* For SLP VF is set according to unrolling factor, and not |
11313 | to vector size, hence for SLP this print is not valid. */ |
11314 | dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n" ); |
11315 | } |
11316 | |
11317 | /* Pure SLP statements have already been vectorized. We still need |
11318 | to apply loop vectorization to hybrid SLP statements. */ |
11319 | if (PURE_SLP_STMT (stmt_info)) |
11320 | return false; |
11321 | |
11322 | if (dump_enabled_p ()) |
11323 | dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n" ); |
11324 | |
11325 | if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL)) |
11326 | *seen_store = stmt_info; |
11327 | |
11328 | return true; |
11329 | } |
11330 | |
11331 | /* Helper function to pass to simplify_replace_tree to enable replacing tree's |
11332 | in the hash_map with its corresponding values. */ |
11333 | |
11334 | static tree |
11335 | find_in_mapping (tree t, void *context) |
11336 | { |
11337 | hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context; |
11338 | |
11339 | tree *value = mapping->get (k: t); |
11340 | return value ? *value : t; |
11341 | } |
11342 | |
11343 | /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the |
11344 | original loop that has now been vectorized. |
11345 | |
11346 | The inits of the data_references need to be advanced with the number of |
11347 | iterations of the main loop. This has been computed in vect_do_peeling and |
11348 | is stored in parameter ADVANCE. We first restore the data_references |
11349 | initial offset with the values recored in ORIG_DRS_INIT. |
11350 | |
11351 | Since the loop_vec_info of this EPILOGUE was constructed for the original |
11352 | loop, its stmt_vec_infos all point to the original statements. These need |
11353 | to be updated to point to their corresponding copies as well as the SSA_NAMES |
11354 | in their PATTERN_DEF_SEQs and RELATED_STMTs. |
11355 | |
11356 | The data_reference's connections also need to be updated. Their |
11357 | corresponding dr_vec_info need to be reconnected to the EPILOGUE's |
11358 | stmt_vec_infos, their statements need to point to their corresponding copy, |
11359 | if they are gather loads or scatter stores then their reference needs to be |
11360 | updated to point to its corresponding copy and finally we set |
11361 | 'base_misaligned' to false as we have already peeled for alignment in the |
11362 | prologue of the main loop. */ |
11363 | |
11364 | static void |
11365 | update_epilogue_loop_vinfo (class loop *epilogue, tree advance) |
11366 | { |
11367 | loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (loop: epilogue); |
11368 | auto_vec<gimple *> stmt_worklist; |
11369 | hash_map<tree,tree> mapping; |
11370 | gimple *orig_stmt, *new_stmt; |
11371 | gimple_stmt_iterator epilogue_gsi; |
11372 | gphi_iterator epilogue_phi_gsi; |
11373 | stmt_vec_info stmt_vinfo = NULL, related_vinfo; |
11374 | basic_block *epilogue_bbs = get_loop_body (epilogue); |
11375 | unsigned i; |
11376 | |
11377 | free (LOOP_VINFO_BBS (epilogue_vinfo)); |
11378 | LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs; |
11379 | |
11380 | /* Advance data_reference's with the number of iterations of the previous |
11381 | loop and its prologue. */ |
11382 | vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR); |
11383 | |
11384 | |
11385 | /* The EPILOGUE loop is a copy of the original loop so they share the same |
11386 | gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to |
11387 | point to the copied statements. We also create a mapping of all LHS' in |
11388 | the original loop and all the LHS' in the EPILOGUE and create worklists to |
11389 | update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */ |
11390 | for (unsigned i = 0; i < epilogue->num_nodes; ++i) |
11391 | { |
11392 | for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]); |
11393 | !gsi_end_p (i: epilogue_phi_gsi); gsi_next (i: &epilogue_phi_gsi)) |
11394 | { |
11395 | new_stmt = epilogue_phi_gsi.phi (); |
11396 | |
11397 | gcc_assert (gimple_uid (new_stmt) > 0); |
11398 | stmt_vinfo |
11399 | = epilogue_vinfo->stmt_vec_infos[gimple_uid (g: new_stmt) - 1]; |
11400 | |
11401 | orig_stmt = STMT_VINFO_STMT (stmt_vinfo); |
11402 | STMT_VINFO_STMT (stmt_vinfo) = new_stmt; |
11403 | |
11404 | mapping.put (k: gimple_phi_result (gs: orig_stmt), |
11405 | v: gimple_phi_result (gs: new_stmt)); |
11406 | /* PHI nodes can not have patterns or related statements. */ |
11407 | gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL |
11408 | && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL); |
11409 | } |
11410 | |
11411 | for (epilogue_gsi = gsi_start_bb (bb: epilogue_bbs[i]); |
11412 | !gsi_end_p (i: epilogue_gsi); gsi_next (i: &epilogue_gsi)) |
11413 | { |
11414 | new_stmt = gsi_stmt (i: epilogue_gsi); |
11415 | if (is_gimple_debug (gs: new_stmt)) |
11416 | continue; |
11417 | |
11418 | gcc_assert (gimple_uid (new_stmt) > 0); |
11419 | stmt_vinfo |
11420 | = epilogue_vinfo->stmt_vec_infos[gimple_uid (g: new_stmt) - 1]; |
11421 | |
11422 | orig_stmt = STMT_VINFO_STMT (stmt_vinfo); |
11423 | STMT_VINFO_STMT (stmt_vinfo) = new_stmt; |
11424 | |
11425 | if (tree old_lhs = gimple_get_lhs (orig_stmt)) |
11426 | mapping.put (k: old_lhs, v: gimple_get_lhs (new_stmt)); |
11427 | |
11428 | if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo)) |
11429 | { |
11430 | gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo); |
11431 | for (gimple_stmt_iterator gsi = gsi_start (seq); |
11432 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
11433 | stmt_worklist.safe_push (obj: gsi_stmt (i: gsi)); |
11434 | } |
11435 | |
11436 | related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo); |
11437 | if (related_vinfo != NULL && related_vinfo != stmt_vinfo) |
11438 | { |
11439 | gimple *stmt = STMT_VINFO_STMT (related_vinfo); |
11440 | stmt_worklist.safe_push (obj: stmt); |
11441 | /* Set BB such that the assert in |
11442 | 'get_initial_def_for_reduction' is able to determine that |
11443 | the BB of the related stmt is inside this loop. */ |
11444 | gimple_set_bb (stmt, |
11445 | gimple_bb (g: new_stmt)); |
11446 | related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo); |
11447 | gcc_assert (related_vinfo == NULL |
11448 | || related_vinfo == stmt_vinfo); |
11449 | } |
11450 | } |
11451 | } |
11452 | |
11453 | /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed |
11454 | using the original main loop and thus need to be updated to refer to the |
11455 | cloned variables used in the epilogue. */ |
11456 | for (unsigned i = 0; i < stmt_worklist.length (); ++i) |
11457 | { |
11458 | gimple *stmt = stmt_worklist[i]; |
11459 | tree *new_op; |
11460 | |
11461 | for (unsigned j = 1; j < gimple_num_ops (gs: stmt); ++j) |
11462 | { |
11463 | tree op = gimple_op (gs: stmt, i: j); |
11464 | if ((new_op = mapping.get(k: op))) |
11465 | gimple_set_op (gs: stmt, i: j, op: *new_op); |
11466 | else |
11467 | { |
11468 | /* PR92429: The last argument of simplify_replace_tree disables |
11469 | folding when replacing arguments. This is required as |
11470 | otherwise you might end up with different statements than the |
11471 | ones analyzed in vect_loop_analyze, leading to different |
11472 | vectorization. */ |
11473 | op = simplify_replace_tree (op, NULL_TREE, NULL_TREE, |
11474 | &find_in_mapping, &mapping, do_fold: false); |
11475 | gimple_set_op (gs: stmt, i: j, op); |
11476 | } |
11477 | } |
11478 | } |
11479 | |
11480 | struct data_reference *dr; |
11481 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo); |
11482 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
11483 | { |
11484 | orig_stmt = DR_STMT (dr); |
11485 | gcc_assert (gimple_uid (orig_stmt) > 0); |
11486 | stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (g: orig_stmt) - 1]; |
11487 | /* Data references for gather loads and scatter stores do not use the |
11488 | updated offset we set using ADVANCE. Instead we have to make sure the |
11489 | reference in the data references point to the corresponding copy of |
11490 | the original in the epilogue. Make sure to update both |
11491 | gather/scatters recognized by dataref analysis and also other |
11492 | refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */ |
11493 | auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_info: stmt_vinfo); |
11494 | if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER |
11495 | || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo)) |
11496 | { |
11497 | DR_REF (dr) |
11498 | = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE, |
11499 | &find_in_mapping, &mapping); |
11500 | DR_BASE_ADDRESS (dr) |
11501 | = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE, |
11502 | &find_in_mapping, &mapping); |
11503 | } |
11504 | DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo); |
11505 | stmt_vinfo->dr_aux.stmt = stmt_vinfo; |
11506 | /* The vector size of the epilogue is smaller than that of the main loop |
11507 | so the alignment is either the same or lower. This means the dr will |
11508 | thus by definition be aligned. */ |
11509 | STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false; |
11510 | } |
11511 | |
11512 | epilogue_vinfo->shared->datarefs_copy.release (); |
11513 | epilogue_vinfo->shared->save_datarefs (); |
11514 | } |
11515 | |
11516 | /* Function vect_transform_loop. |
11517 | |
11518 | The analysis phase has determined that the loop is vectorizable. |
11519 | Vectorize the loop - created vectorized stmts to replace the scalar |
11520 | stmts in the loop, and update the loop exit condition. |
11521 | Returns scalar epilogue loop if any. */ |
11522 | |
11523 | class loop * |
11524 | vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) |
11525 | { |
11526 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
11527 | class loop *epilogue = NULL; |
11528 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
11529 | int nbbs = loop->num_nodes; |
11530 | int i; |
11531 | tree niters_vector = NULL_TREE; |
11532 | tree step_vector = NULL_TREE; |
11533 | tree niters_vector_mult_vf = NULL_TREE; |
11534 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
11535 | unsigned int lowest_vf = constant_lower_bound (a: vf); |
11536 | gimple *stmt; |
11537 | bool check_profitability = false; |
11538 | unsigned int th; |
11539 | bool flat = maybe_flat_loop_profile (loop); |
11540 | |
11541 | DUMP_VECT_SCOPE ("vec_transform_loop" ); |
11542 | |
11543 | loop_vinfo->shared->check_datarefs (); |
11544 | |
11545 | /* Use the more conservative vectorization threshold. If the number |
11546 | of iterations is constant assume the cost check has been performed |
11547 | by our caller. If the threshold makes all loops profitable that |
11548 | run at least the (estimated) vectorization factor number of times |
11549 | checking is pointless, too. */ |
11550 | th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); |
11551 | if (vect_apply_runtime_profitability_check_p (loop_vinfo)) |
11552 | { |
11553 | if (dump_enabled_p ()) |
11554 | dump_printf_loc (MSG_NOTE, vect_location, |
11555 | "Profitability threshold is %d loop iterations.\n" , |
11556 | th); |
11557 | check_profitability = true; |
11558 | } |
11559 | |
11560 | /* Make sure there exists a single-predecessor exit bb. Do this before |
11561 | versioning. */ |
11562 | edge e = LOOP_VINFO_IV_EXIT (loop_vinfo); |
11563 | if (! single_pred_p (bb: e->dest)) |
11564 | { |
11565 | split_loop_exit_edge (e, true); |
11566 | if (dump_enabled_p ()) |
11567 | dump_printf (MSG_NOTE, "split exit edge\n" ); |
11568 | } |
11569 | |
11570 | /* Version the loop first, if required, so the profitability check |
11571 | comes first. */ |
11572 | |
11573 | if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) |
11574 | { |
11575 | class loop *sloop |
11576 | = vect_loop_versioning (loop_vinfo, loop_vectorized_call); |
11577 | sloop->force_vectorize = false; |
11578 | check_profitability = false; |
11579 | } |
11580 | |
11581 | /* Make sure there exists a single-predecessor exit bb also on the |
11582 | scalar loop copy. Do this after versioning but before peeling |
11583 | so CFG structure is fine for both scalar and if-converted loop |
11584 | to make slpeel_duplicate_current_defs_from_edges face matched |
11585 | loop closed PHI nodes on the exit. */ |
11586 | if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) |
11587 | { |
11588 | e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo); |
11589 | if (! single_pred_p (bb: e->dest)) |
11590 | { |
11591 | split_loop_exit_edge (e, true); |
11592 | if (dump_enabled_p ()) |
11593 | dump_printf (MSG_NOTE, "split exit edge of scalar loop\n" ); |
11594 | } |
11595 | } |
11596 | |
11597 | tree niters = vect_build_loop_niters (loop_vinfo); |
11598 | LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; |
11599 | tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); |
11600 | bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); |
11601 | tree advance; |
11602 | drs_init_vec orig_drs_init; |
11603 | |
11604 | epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, |
11605 | &step_vector, &niters_vector_mult_vf, th, |
11606 | check_profitability, niters_no_overflow, |
11607 | &advance); |
11608 | if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo) |
11609 | && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ()) |
11610 | { |
11611 | /* Ifcvt duplicates loop preheader, loop body and produces an basic |
11612 | block after loop exit. We need to scale all that. */ |
11613 | basic_block |
11614 | = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src; |
11615 | preheader->count |
11616 | = preheader->count.apply_probability |
11617 | (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); |
11618 | scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo), |
11619 | LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); |
11620 | single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count |
11621 | = preheader->count; |
11622 | } |
11623 | |
11624 | if (niters_vector == NULL_TREE) |
11625 | { |
11626 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
11627 | && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) |
11628 | && known_eq (lowest_vf, vf)) |
11629 | { |
11630 | niters_vector |
11631 | = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), |
11632 | LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf); |
11633 | step_vector = build_one_cst (TREE_TYPE (niters)); |
11634 | } |
11635 | else if (vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
11636 | vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector, |
11637 | &step_vector, niters_no_overflow); |
11638 | else |
11639 | /* vect_do_peeling subtracted the number of peeled prologue |
11640 | iterations from LOOP_VINFO_NITERS. */ |
11641 | vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo), |
11642 | &niters_vector, &step_vector, |
11643 | niters_no_overflow); |
11644 | } |
11645 | |
11646 | /* 1) Make sure the loop header has exactly two entries |
11647 | 2) Make sure we have a preheader basic block. */ |
11648 | |
11649 | gcc_assert (EDGE_COUNT (loop->header->preds) == 2); |
11650 | |
11651 | split_edge (loop_preheader_edge (loop)); |
11652 | |
11653 | if (vect_use_loop_mask_for_alignment_p (loop_vinfo)) |
11654 | /* This will deal with any possible peeling. */ |
11655 | vect_prepare_for_masked_peels (loop_vinfo); |
11656 | |
11657 | /* Schedule the SLP instances first, then handle loop vectorization |
11658 | below. */ |
11659 | if (!loop_vinfo->slp_instances.is_empty ()) |
11660 | { |
11661 | DUMP_VECT_SCOPE ("scheduling SLP instances" ); |
11662 | vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); |
11663 | } |
11664 | |
11665 | /* FORNOW: the vectorizer supports only loops which body consist |
11666 | of one basic block (header + empty latch). When the vectorizer will |
11667 | support more involved loop forms, the order by which the BBs are |
11668 | traversed need to be reconsidered. */ |
11669 | |
11670 | for (i = 0; i < nbbs; i++) |
11671 | { |
11672 | basic_block bb = bbs[i]; |
11673 | stmt_vec_info stmt_info; |
11674 | |
11675 | for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si); |
11676 | gsi_next (i: &si)) |
11677 | { |
11678 | gphi *phi = si.phi (); |
11679 | if (dump_enabled_p ()) |
11680 | dump_printf_loc (MSG_NOTE, vect_location, |
11681 | "------>vectorizing phi: %G" , (gimple *) phi); |
11682 | stmt_info = loop_vinfo->lookup_stmt (phi); |
11683 | if (!stmt_info) |
11684 | continue; |
11685 | |
11686 | if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) |
11687 | vect_loop_kill_debug_uses (loop, stmt_info); |
11688 | |
11689 | if (!STMT_VINFO_RELEVANT_P (stmt_info) |
11690 | && !STMT_VINFO_LIVE_P (stmt_info)) |
11691 | continue; |
11692 | |
11693 | if (STMT_VINFO_VECTYPE (stmt_info) |
11694 | && (maybe_ne |
11695 | (a: TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), b: vf)) |
11696 | && dump_enabled_p ()) |
11697 | dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n" ); |
11698 | |
11699 | if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def |
11700 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
11701 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def |
11702 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle |
11703 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence |
11704 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def) |
11705 | && ! PURE_SLP_STMT (stmt_info)) |
11706 | { |
11707 | if (dump_enabled_p ()) |
11708 | dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n" ); |
11709 | vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL); |
11710 | } |
11711 | } |
11712 | |
11713 | for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si); |
11714 | gsi_next (i: &si)) |
11715 | { |
11716 | gphi *phi = si.phi (); |
11717 | stmt_info = loop_vinfo->lookup_stmt (phi); |
11718 | if (!stmt_info) |
11719 | continue; |
11720 | |
11721 | if (!STMT_VINFO_RELEVANT_P (stmt_info) |
11722 | && !STMT_VINFO_LIVE_P (stmt_info)) |
11723 | continue; |
11724 | |
11725 | if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def |
11726 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def |
11727 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def |
11728 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle |
11729 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def |
11730 | || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence) |
11731 | && ! PURE_SLP_STMT (stmt_info)) |
11732 | maybe_set_vectorized_backedge_value (loop_vinfo, def_stmt_info: stmt_info); |
11733 | } |
11734 | |
11735 | for (gimple_stmt_iterator si = gsi_start_bb (bb); |
11736 | !gsi_end_p (i: si);) |
11737 | { |
11738 | stmt = gsi_stmt (i: si); |
11739 | /* During vectorization remove existing clobber stmts. */ |
11740 | if (gimple_clobber_p (s: stmt)) |
11741 | { |
11742 | unlink_stmt_vdef (stmt); |
11743 | gsi_remove (&si, true); |
11744 | release_defs (stmt); |
11745 | } |
11746 | else |
11747 | { |
11748 | /* Ignore vector stmts created in the outer loop. */ |
11749 | stmt_info = loop_vinfo->lookup_stmt (stmt); |
11750 | |
11751 | /* vector stmts created in the outer-loop during vectorization of |
11752 | stmts in an inner-loop may not have a stmt_info, and do not |
11753 | need to be vectorized. */ |
11754 | stmt_vec_info seen_store = NULL; |
11755 | if (stmt_info) |
11756 | { |
11757 | if (STMT_VINFO_IN_PATTERN_P (stmt_info)) |
11758 | { |
11759 | gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); |
11760 | for (gimple_stmt_iterator subsi = gsi_start (seq&: def_seq); |
11761 | !gsi_end_p (i: subsi); gsi_next (i: &subsi)) |
11762 | { |
11763 | stmt_vec_info pat_stmt_info |
11764 | = loop_vinfo->lookup_stmt (gsi_stmt (i: subsi)); |
11765 | vect_transform_loop_stmt (loop_vinfo, stmt_info: pat_stmt_info, |
11766 | gsi: &si, seen_store: &seen_store); |
11767 | } |
11768 | stmt_vec_info pat_stmt_info |
11769 | = STMT_VINFO_RELATED_STMT (stmt_info); |
11770 | if (vect_transform_loop_stmt (loop_vinfo, stmt_info: pat_stmt_info, |
11771 | gsi: &si, seen_store: &seen_store)) |
11772 | maybe_set_vectorized_backedge_value (loop_vinfo, |
11773 | def_stmt_info: pat_stmt_info); |
11774 | } |
11775 | else |
11776 | { |
11777 | if (vect_transform_loop_stmt (loop_vinfo, stmt_info, gsi: &si, |
11778 | seen_store: &seen_store)) |
11779 | maybe_set_vectorized_backedge_value (loop_vinfo, |
11780 | def_stmt_info: stmt_info); |
11781 | } |
11782 | } |
11783 | gsi_next (i: &si); |
11784 | if (seen_store) |
11785 | { |
11786 | if (STMT_VINFO_GROUPED_ACCESS (seen_store)) |
11787 | /* Interleaving. If IS_STORE is TRUE, the |
11788 | vectorization of the interleaving chain was |
11789 | completed - free all the stores in the chain. */ |
11790 | vect_remove_stores (loop_vinfo, |
11791 | DR_GROUP_FIRST_ELEMENT (seen_store)); |
11792 | else |
11793 | /* Free the attached stmt_vec_info and remove the stmt. */ |
11794 | loop_vinfo->remove_stmt (stmt_info); |
11795 | } |
11796 | } |
11797 | } |
11798 | |
11799 | /* Stub out scalar statements that must not survive vectorization. |
11800 | Doing this here helps with grouped statements, or statements that |
11801 | are involved in patterns. */ |
11802 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb); |
11803 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
11804 | { |
11805 | gcall *call = dyn_cast <gcall *> (p: gsi_stmt (i: gsi)); |
11806 | if (!call || !gimple_call_internal_p (gs: call)) |
11807 | continue; |
11808 | internal_fn ifn = gimple_call_internal_fn (gs: call); |
11809 | if (ifn == IFN_MASK_LOAD) |
11810 | { |
11811 | tree lhs = gimple_get_lhs (call); |
11812 | if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) |
11813 | { |
11814 | tree zero = build_zero_cst (TREE_TYPE (lhs)); |
11815 | gimple *new_stmt = gimple_build_assign (lhs, zero); |
11816 | gsi_replace (&gsi, new_stmt, true); |
11817 | } |
11818 | } |
11819 | else if (conditional_internal_fn_code (ifn) != ERROR_MARK) |
11820 | { |
11821 | tree lhs = gimple_get_lhs (call); |
11822 | if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) |
11823 | { |
11824 | tree else_arg |
11825 | = gimple_call_arg (gs: call, index: gimple_call_num_args (gs: call) - 1); |
11826 | gimple *new_stmt = gimple_build_assign (lhs, else_arg); |
11827 | gsi_replace (&gsi, new_stmt, true); |
11828 | } |
11829 | } |
11830 | } |
11831 | } /* BBs in loop */ |
11832 | |
11833 | /* The vectorization factor is always > 1, so if we use an IV increment of 1. |
11834 | a zero NITERS becomes a nonzero NITERS_VECTOR. */ |
11835 | if (integer_onep (step_vector)) |
11836 | niters_no_overflow = true; |
11837 | vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo, |
11838 | niters_vector, step_vector, niters_vector_mult_vf, |
11839 | !niters_no_overflow); |
11840 | |
11841 | unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
11842 | |
11843 | /* True if the final iteration might not handle a full vector's |
11844 | worth of scalar iterations. */ |
11845 | bool final_iter_may_be_partial |
11846 | = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo); |
11847 | /* The minimum number of iterations performed by the epilogue. This |
11848 | is 1 when peeling for gaps because we always need a final scalar |
11849 | iteration. */ |
11850 | int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0; |
11851 | /* +1 to convert latch counts to loop iteration counts, |
11852 | -min_epilogue_iters to remove iterations that cannot be performed |
11853 | by the vector code. */ |
11854 | int bias_for_lowest = 1 - min_epilogue_iters; |
11855 | int bias_for_assumed = bias_for_lowest; |
11856 | int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); |
11857 | if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) |
11858 | { |
11859 | /* When the amount of peeling is known at compile time, the first |
11860 | iteration will have exactly alignment_npeels active elements. |
11861 | In the worst case it will have at least one. */ |
11862 | int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1); |
11863 | bias_for_lowest += lowest_vf - min_first_active; |
11864 | bias_for_assumed += assumed_vf - min_first_active; |
11865 | } |
11866 | /* In these calculations the "- 1" converts loop iteration counts |
11867 | back to latch counts. */ |
11868 | if (loop->any_upper_bound) |
11869 | { |
11870 | loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); |
11871 | loop->nb_iterations_upper_bound |
11872 | = (final_iter_may_be_partial |
11873 | ? wi::udiv_ceil (x: loop->nb_iterations_upper_bound + bias_for_lowest, |
11874 | y: lowest_vf) - 1 |
11875 | : wi::udiv_floor (x: loop->nb_iterations_upper_bound + bias_for_lowest, |
11876 | y: lowest_vf) - 1); |
11877 | if (main_vinfo |
11878 | /* Both peeling for alignment and peeling for gaps can end up |
11879 | with the scalar epilogue running for more than VF-1 iterations. */ |
11880 | && !main_vinfo->peeling_for_alignment |
11881 | && !main_vinfo->peeling_for_gaps) |
11882 | { |
11883 | unsigned int bound; |
11884 | poly_uint64 main_iters |
11885 | = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo), |
11886 | LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo)); |
11887 | main_iters |
11888 | = upper_bound (a: main_iters, |
11889 | LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo)); |
11890 | if (can_div_away_from_zero_p (a: main_iters, |
11891 | LOOP_VINFO_VECT_FACTOR (loop_vinfo), |
11892 | quotient: &bound)) |
11893 | loop->nb_iterations_upper_bound |
11894 | = wi::umin (x: (bound_wide_int) (bound - 1), |
11895 | y: loop->nb_iterations_upper_bound); |
11896 | } |
11897 | } |
11898 | if (loop->any_likely_upper_bound) |
11899 | loop->nb_iterations_likely_upper_bound |
11900 | = (final_iter_may_be_partial |
11901 | ? wi::udiv_ceil (x: loop->nb_iterations_likely_upper_bound |
11902 | + bias_for_lowest, y: lowest_vf) - 1 |
11903 | : wi::udiv_floor (x: loop->nb_iterations_likely_upper_bound |
11904 | + bias_for_lowest, y: lowest_vf) - 1); |
11905 | if (loop->any_estimate) |
11906 | loop->nb_iterations_estimate |
11907 | = (final_iter_may_be_partial |
11908 | ? wi::udiv_ceil (x: loop->nb_iterations_estimate + bias_for_assumed, |
11909 | y: assumed_vf) - 1 |
11910 | : wi::udiv_floor (x: loop->nb_iterations_estimate + bias_for_assumed, |
11911 | y: assumed_vf) - 1); |
11912 | scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), |
11913 | vf: assumed_vf, flat); |
11914 | |
11915 | if (dump_enabled_p ()) |
11916 | { |
11917 | if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) |
11918 | { |
11919 | dump_printf_loc (MSG_NOTE, vect_location, |
11920 | "LOOP VECTORIZED\n" ); |
11921 | if (loop->inner) |
11922 | dump_printf_loc (MSG_NOTE, vect_location, |
11923 | "OUTER LOOP VECTORIZED\n" ); |
11924 | dump_printf (MSG_NOTE, "\n" ); |
11925 | } |
11926 | else |
11927 | dump_printf_loc (MSG_NOTE, vect_location, |
11928 | "LOOP EPILOGUE VECTORIZED (MODE=%s)\n" , |
11929 | GET_MODE_NAME (loop_vinfo->vector_mode)); |
11930 | } |
11931 | |
11932 | /* Loops vectorized with a variable factor won't benefit from |
11933 | unrolling/peeling. */ |
11934 | if (!vf.is_constant ()) |
11935 | { |
11936 | loop->unroll = 1; |
11937 | if (dump_enabled_p ()) |
11938 | dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to" |
11939 | " variable-length vectorization factor\n" ); |
11940 | } |
11941 | /* Free SLP instances here because otherwise stmt reference counting |
11942 | won't work. */ |
11943 | slp_instance instance; |
11944 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) |
11945 | vect_free_slp_instance (instance); |
11946 | LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); |
11947 | /* Clear-up safelen field since its value is invalid after vectorization |
11948 | since vectorized loop can have loop-carried dependencies. */ |
11949 | loop->safelen = 0; |
11950 | |
11951 | if (epilogue) |
11952 | { |
11953 | update_epilogue_loop_vinfo (epilogue, advance); |
11954 | |
11955 | epilogue->simduid = loop->simduid; |
11956 | epilogue->force_vectorize = loop->force_vectorize; |
11957 | epilogue->dont_vectorize = false; |
11958 | } |
11959 | |
11960 | return epilogue; |
11961 | } |
11962 | |
11963 | /* The code below is trying to perform simple optimization - revert |
11964 | if-conversion for masked stores, i.e. if the mask of a store is zero |
11965 | do not perform it and all stored value producers also if possible. |
11966 | For example, |
11967 | for (i=0; i<n; i++) |
11968 | if (c[i]) |
11969 | { |
11970 | p1[i] += 1; |
11971 | p2[i] = p3[i] +2; |
11972 | } |
11973 | this transformation will produce the following semi-hammock: |
11974 | |
11975 | if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 }) |
11976 | { |
11977 | vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165); |
11978 | vect__12.22_172 = vect__11.19_170 + vect_cst__171; |
11979 | MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172); |
11980 | vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165); |
11981 | vect__19.28_184 = vect__18.25_182 + vect_cst__183; |
11982 | MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184); |
11983 | } |
11984 | */ |
11985 | |
11986 | void |
11987 | optimize_mask_stores (class loop *loop) |
11988 | { |
11989 | basic_block *bbs = get_loop_body (loop); |
11990 | unsigned nbbs = loop->num_nodes; |
11991 | unsigned i; |
11992 | basic_block bb; |
11993 | class loop *bb_loop; |
11994 | gimple_stmt_iterator gsi; |
11995 | gimple *stmt; |
11996 | auto_vec<gimple *> worklist; |
11997 | auto_purge_vect_location sentinel; |
11998 | |
11999 | vect_location = find_loop_location (loop); |
12000 | /* Pick up all masked stores in loop if any. */ |
12001 | for (i = 0; i < nbbs; i++) |
12002 | { |
12003 | bb = bbs[i]; |
12004 | for (gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi); |
12005 | gsi_next (i: &gsi)) |
12006 | { |
12007 | stmt = gsi_stmt (i: gsi); |
12008 | if (gimple_call_internal_p (gs: stmt, fn: IFN_MASK_STORE)) |
12009 | worklist.safe_push (obj: stmt); |
12010 | } |
12011 | } |
12012 | |
12013 | free (ptr: bbs); |
12014 | if (worklist.is_empty ()) |
12015 | return; |
12016 | |
12017 | /* Loop has masked stores. */ |
12018 | while (!worklist.is_empty ()) |
12019 | { |
12020 | gimple *last, *last_store; |
12021 | edge e, efalse; |
12022 | tree mask; |
12023 | basic_block store_bb, join_bb; |
12024 | gimple_stmt_iterator gsi_to; |
12025 | tree vdef, new_vdef; |
12026 | gphi *phi; |
12027 | tree vectype; |
12028 | tree zero; |
12029 | |
12030 | last = worklist.pop (); |
12031 | mask = gimple_call_arg (gs: last, index: 2); |
12032 | bb = gimple_bb (g: last); |
12033 | /* Create then_bb and if-then structure in CFG, then_bb belongs to |
12034 | the same loop as if_bb. It could be different to LOOP when two |
12035 | level loop-nest is vectorized and mask_store belongs to the inner |
12036 | one. */ |
12037 | e = split_block (bb, last); |
12038 | bb_loop = bb->loop_father; |
12039 | gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop)); |
12040 | join_bb = e->dest; |
12041 | store_bb = create_empty_bb (bb); |
12042 | add_bb_to_loop (store_bb, bb_loop); |
12043 | e->flags = EDGE_TRUE_VALUE; |
12044 | efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE); |
12045 | /* Put STORE_BB to likely part. */ |
12046 | efalse->probability = profile_probability::likely (); |
12047 | e->probability = efalse->probability.invert (); |
12048 | store_bb->count = efalse->count (); |
12049 | make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU); |
12050 | if (dom_info_available_p (CDI_DOMINATORS)) |
12051 | set_immediate_dominator (CDI_DOMINATORS, store_bb, bb); |
12052 | if (dump_enabled_p ()) |
12053 | dump_printf_loc (MSG_NOTE, vect_location, |
12054 | "Create new block %d to sink mask stores." , |
12055 | store_bb->index); |
12056 | /* Create vector comparison with boolean result. */ |
12057 | vectype = TREE_TYPE (mask); |
12058 | zero = build_zero_cst (vectype); |
12059 | stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE); |
12060 | gsi = gsi_last_bb (bb); |
12061 | gsi_insert_after (&gsi, stmt, GSI_SAME_STMT); |
12062 | /* Create new PHI node for vdef of the last masked store: |
12063 | .MEM_2 = VDEF <.MEM_1> |
12064 | will be converted to |
12065 | .MEM.3 = VDEF <.MEM_1> |
12066 | and new PHI node will be created in join bb |
12067 | .MEM_2 = PHI <.MEM_1, .MEM_3> |
12068 | */ |
12069 | vdef = gimple_vdef (g: last); |
12070 | new_vdef = make_ssa_name (var: gimple_vop (cfun), stmt: last); |
12071 | gimple_set_vdef (g: last, vdef: new_vdef); |
12072 | phi = create_phi_node (vdef, join_bb); |
12073 | add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION); |
12074 | |
12075 | /* Put all masked stores with the same mask to STORE_BB if possible. */ |
12076 | while (true) |
12077 | { |
12078 | gimple_stmt_iterator gsi_from; |
12079 | gimple *stmt1 = NULL; |
12080 | |
12081 | /* Move masked store to STORE_BB. */ |
12082 | last_store = last; |
12083 | gsi = gsi_for_stmt (last); |
12084 | gsi_from = gsi; |
12085 | /* Shift GSI to the previous stmt for further traversal. */ |
12086 | gsi_prev (i: &gsi); |
12087 | gsi_to = gsi_start_bb (bb: store_bb); |
12088 | gsi_move_before (&gsi_from, &gsi_to); |
12089 | /* Setup GSI_TO to the non-empty block start. */ |
12090 | gsi_to = gsi_start_bb (bb: store_bb); |
12091 | if (dump_enabled_p ()) |
12092 | dump_printf_loc (MSG_NOTE, vect_location, |
12093 | "Move stmt to created bb\n%G" , last); |
12094 | /* Move all stored value producers if possible. */ |
12095 | while (!gsi_end_p (i: gsi)) |
12096 | { |
12097 | tree lhs; |
12098 | imm_use_iterator imm_iter; |
12099 | use_operand_p use_p; |
12100 | bool res; |
12101 | |
12102 | /* Skip debug statements. */ |
12103 | if (is_gimple_debug (gs: gsi_stmt (i: gsi))) |
12104 | { |
12105 | gsi_prev (i: &gsi); |
12106 | continue; |
12107 | } |
12108 | stmt1 = gsi_stmt (i: gsi); |
12109 | /* Do not consider statements writing to memory or having |
12110 | volatile operand. */ |
12111 | if (gimple_vdef (g: stmt1) |
12112 | || gimple_has_volatile_ops (stmt: stmt1)) |
12113 | break; |
12114 | gsi_from = gsi; |
12115 | gsi_prev (i: &gsi); |
12116 | lhs = gimple_get_lhs (stmt1); |
12117 | if (!lhs) |
12118 | break; |
12119 | |
12120 | /* LHS of vectorized stmt must be SSA_NAME. */ |
12121 | if (TREE_CODE (lhs) != SSA_NAME) |
12122 | break; |
12123 | |
12124 | if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) |
12125 | { |
12126 | /* Remove dead scalar statement. */ |
12127 | if (has_zero_uses (var: lhs)) |
12128 | { |
12129 | gsi_remove (&gsi_from, true); |
12130 | continue; |
12131 | } |
12132 | } |
12133 | |
12134 | /* Check that LHS does not have uses outside of STORE_BB. */ |
12135 | res = true; |
12136 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) |
12137 | { |
12138 | gimple *use_stmt; |
12139 | use_stmt = USE_STMT (use_p); |
12140 | if (is_gimple_debug (gs: use_stmt)) |
12141 | continue; |
12142 | if (gimple_bb (g: use_stmt) != store_bb) |
12143 | { |
12144 | res = false; |
12145 | break; |
12146 | } |
12147 | } |
12148 | if (!res) |
12149 | break; |
12150 | |
12151 | if (gimple_vuse (g: stmt1) |
12152 | && gimple_vuse (g: stmt1) != gimple_vuse (g: last_store)) |
12153 | break; |
12154 | |
12155 | /* Can move STMT1 to STORE_BB. */ |
12156 | if (dump_enabled_p ()) |
12157 | dump_printf_loc (MSG_NOTE, vect_location, |
12158 | "Move stmt to created bb\n%G" , stmt1); |
12159 | gsi_move_before (&gsi_from, &gsi_to); |
12160 | /* Shift GSI_TO for further insertion. */ |
12161 | gsi_prev (i: &gsi_to); |
12162 | } |
12163 | /* Put other masked stores with the same mask to STORE_BB. */ |
12164 | if (worklist.is_empty () |
12165 | || gimple_call_arg (gs: worklist.last (), index: 2) != mask |
12166 | || worklist.last () != stmt1) |
12167 | break; |
12168 | last = worklist.pop (); |
12169 | } |
12170 | add_phi_arg (phi, gimple_vuse (g: last_store), e, UNKNOWN_LOCATION); |
12171 | } |
12172 | } |
12173 | |
12174 | /* Decide whether it is possible to use a zero-based induction variable |
12175 | when vectorizing LOOP_VINFO with partial vectors. If it is, return |
12176 | the value that the induction variable must be able to hold in order |
12177 | to ensure that the rgroups eventually have no active vector elements. |
12178 | Return -1 otherwise. */ |
12179 | |
12180 | widest_int |
12181 | vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo) |
12182 | { |
12183 | tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); |
12184 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
12185 | unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo); |
12186 | |
12187 | /* Calculate the value that the induction variable must be able |
12188 | to hit in order to ensure that we end the loop with an all-false mask. |
12189 | This involves adding the maximum number of inactive trailing scalar |
12190 | iterations. */ |
12191 | widest_int iv_limit = -1; |
12192 | if (max_loop_iterations (loop, &iv_limit)) |
12193 | { |
12194 | if (niters_skip) |
12195 | { |
12196 | /* Add the maximum number of skipped iterations to the |
12197 | maximum iteration count. */ |
12198 | if (TREE_CODE (niters_skip) == INTEGER_CST) |
12199 | iv_limit += wi::to_widest (t: niters_skip); |
12200 | else |
12201 | iv_limit += max_vf - 1; |
12202 | } |
12203 | else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) |
12204 | /* Make a conservatively-correct assumption. */ |
12205 | iv_limit += max_vf - 1; |
12206 | |
12207 | /* IV_LIMIT is the maximum number of latch iterations, which is also |
12208 | the maximum in-range IV value. Round this value down to the previous |
12209 | vector alignment boundary and then add an extra full iteration. */ |
12210 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
12211 | iv_limit = (iv_limit & -(int) known_alignment (a: vf)) + max_vf; |
12212 | } |
12213 | return iv_limit; |
12214 | } |
12215 | |
12216 | /* For the given rgroup_controls RGC, check whether an induction variable |
12217 | would ever hit a value that produces a set of all-false masks or zero |
12218 | lengths before wrapping around. Return true if it's possible to wrap |
12219 | around before hitting the desirable value, otherwise return false. */ |
12220 | |
12221 | bool |
12222 | vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc) |
12223 | { |
12224 | widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo); |
12225 | |
12226 | if (iv_limit == -1) |
12227 | return true; |
12228 | |
12229 | tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); |
12230 | unsigned int compare_precision = TYPE_PRECISION (compare_type); |
12231 | unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor; |
12232 | |
12233 | if (wi::min_precision (x: iv_limit * nitems, sgn: UNSIGNED) > compare_precision) |
12234 | return true; |
12235 | |
12236 | return false; |
12237 | } |
12238 | |