1/* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#define INCLUDE_ALGORITHM
23#include "config.h"
24#include "system.h"
25#include "coretypes.h"
26#include "backend.h"
27#include "target.h"
28#include "rtl.h"
29#include "tree.h"
30#include "gimple.h"
31#include "cfghooks.h"
32#include "tree-pass.h"
33#include "ssa.h"
34#include "optabs-tree.h"
35#include "memmodel.h"
36#include "optabs.h"
37#include "diagnostic-core.h"
38#include "fold-const.h"
39#include "stor-layout.h"
40#include "cfganal.h"
41#include "gimplify.h"
42#include "gimple-iterator.h"
43#include "gimplify-me.h"
44#include "tree-ssa-loop-ivopts.h"
45#include "tree-ssa-loop-manip.h"
46#include "tree-ssa-loop-niter.h"
47#include "tree-ssa-loop.h"
48#include "cfgloop.h"
49#include "tree-scalar-evolution.h"
50#include "tree-vectorizer.h"
51#include "gimple-fold.h"
52#include "cgraph.h"
53#include "tree-cfg.h"
54#include "tree-if-conv.h"
55#include "internal-fn.h"
56#include "tree-vector-builder.h"
57#include "vec-perm-indices.h"
58#include "tree-eh.h"
59#include "case-cfn-macros.h"
60#include "langhooks.h"
61
62/* Loop Vectorization Pass.
63
64 This pass tries to vectorize loops.
65
66 For example, the vectorizer transforms the following simple loop:
67
68 short a[N]; short b[N]; short c[N]; int i;
69
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
72 }
73
   as if it were manually vectorized by rewriting the source code into:
75
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
80
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
86 }
87
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
99
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
105
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
110
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
121
122 For example, say stmt S1 was vectorized into stmt VS1:
123
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
127
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
132
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
137
   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
140
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different vector sizes will, for now, need to
   specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.
148
   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
155
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
158*/
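
/* As a small illustration of the optab check described above (a sketch
   using the V8HI addition from the example; real callers differ in how
   they report the failure):

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       {
	 /* No target support - the stmt cannot be vectorized.  */
       }
*/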
159
160static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
164
165/* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
168
169static opt_result
170vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
173{
174 gimple *stmt = stmt_info->stmt;
175
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (s: stmt))
179 {
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
183 }
184
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
191
192 if (stmt_vectype)
193 {
194 if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype has already been set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
203 }
204
205 if (nunits_vectype)
206 vect_update_max_nunits (max_nunits: vf, vectype: nunits_vectype);
207
208 return opt_result::success ();
209}
210
211/* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
215
216static opt_result
217vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
219{
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, vectype_maybe_set_p: false, vf);
224 if (!res)
225 return res;
226
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
229 {
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (seq&: pattern_def_seq);
235 !gsi_end_p (i: si); gsi_next (i: &si))
236 {
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (i: si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info: def_stmt_info, vectype_maybe_set_p: true, vf);
243 if (!res)
244 return res;
245 }
246
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, vectype_maybe_set_p: true, vf);
252 if (!res)
253 return res;
254 }
255
256 return opt_result::success ();
257}
258
259/* Function vect_determine_vectorization_factor
260
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.
266
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
271
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
276 }
277
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
281 }
282*/
283
284static opt_result
285vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
286{
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
296
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
298
299 for (i = 0; i < nbbs; i++)
300 {
301 basic_block bb = bbs[i];
302
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si);
304 gsi_next (i: &si))
305 {
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
311
312 gcc_assert (stmt_info);
313
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
316 {
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
319
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
324
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (loc: phi,
328 fmt: "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
332
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
336
337 if (dump_enabled_p ())
338 {
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (node: vectype));
341 dump_printf (MSG_NOTE, "\n");
342 }
343
344 vect_update_max_nunits (max_nunits: &vectorization_factor, vectype);
345 }
346 }
347
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (i: si);
349 gsi_next (i: &si))
350 {
351 if (is_gimple_debug (gs: gsi_stmt (i: si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (i: si));
354 opt_result res
355 = vect_determine_vf_for_stmt (vinfo: loop_vinfo,
356 stmt_info, vf: &vectorization_factor);
357 if (!res)
358 return res;
359 }
360 }
361
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
364 {
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
368 }
369
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (loc: vect_location,
372 fmt: "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
375}
376
377
378/* Function vect_is_simple_iv_evolution.
379
   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
382
383static bool
384vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
386{
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
391
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
396
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (expr: evolution_part))
400 return false;
401
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
404
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
408
409 *init = init_expr;
410 *step = step_expr;
411
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, num: loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
421 {
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
426 }
427
428 return true;
429}
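
/* A small worked example of the "simple" IV evolutions handled above,
   assuming scev has already analyzed the loop.  For

     for (i = 0; i < n; i++)
       p += 4;

   the access function of 'p' is the polynomial chrec {p_0, +, 4}_loop, so
   vect_is_simple_iv_evolution returns true with *INIT = p_0 and *STEP = 4.
   A step that is itself defined inside the loop, or a chrec of degree >= 2,
   is rejected as "not simple".  */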
430
431/* Function vect_is_nonlinear_iv_evolution
432
   Only support nonlinear induction for integer types, in one of the
   following forms:
   1. neg
   2. mul by constant
   3. lshift/rshift by constant.

   For neg induction, return a fake step of integer -1.  */
439static bool
440vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
442{
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
445
446 if (gimple_phi_num_args (gs: loop_phi_node) != 2)
447 return false;
448
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
451
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
455
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
458
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (gs: def))
462 return false;
463
464 enum tree_code t_code = gimple_assign_rhs_code (gs: def);
465 switch (t_code)
466 {
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (gs: def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
473
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (gs: def);
478 op2 = gimple_assign_rhs2 (gs: def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
491
492 default:
493 return false;
494 }
495
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
498
499 return true;
500}
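
/* Illustrative scalar forms for the nonlinear IVs recognized above
   (a sketch; the variable names are made up):

     x = -x;        neg    -> vect_step_op_neg, fake step -1
     x = x * 3;     mul    -> vect_step_op_mul, step 3
     x = x << 2;    lshift -> vect_step_op_shl, step 2
     x = x >> 2;    rshift -> vect_step_op_shr, step 2

   where each statement is the sole update of the loop-header PHI 'x'
   along the latch edge.  */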
501
502/* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
505
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
508 ...
509
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
512 ...
513 x_3 = ...;
514 ...
515
516 outer2:
517 x_4 = PHI <x_3(inner)>;
518 ...
519
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
522
523static bool
524vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
525{
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
533}
534
535/* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
539
540static bool
541vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
543{
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
547
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
556
557 tree def = gimple_phi_result (gs: phi);
558
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
569
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
575
576 return true;
577}
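
/* A sketch of a typical first-order recurrence accepted above:

     for (i = 0; i < n; i++)
       {
	 b[i] = a[i] - prev;
	 prev = a[i];
       }

   The loop-header PHI for 'prev' carries the value of a[i] from the
   previous iteration.  Assuming the two reads of a[i] have been CSEd,
   the latch definition (the single load of a[i]) dominates the only use
   of the PHI, so the recurrence can be vectorized using a permute of the
   previous and current vectors, as required by the checks above.  */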
578
579/* Function vect_analyze_scalar_cycles_1.
580
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  SLP indicates whether there will be any subsequent
   SLP analyses.  */
586
587static void
588vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
590{
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
596
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
598
599 /* First - identify all inductions. Reduction detection assumes that all the
     inductions have been identified; therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
603 {
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
608
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
612
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (op: def))
616 continue;
617
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
619
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
623 {
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
632 }
633
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop_nb: loop->num, access_fn,
637 init: &init, step: &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_info: stmt_vinfo,
643 loop_phi_node: phi, init: &init, step: &step)))
644 {
645 worklist.safe_push (obj: stmt_vinfo);
646 continue;
647 }
648
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
652
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
656 }
657
658
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
661 {
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (p: stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
665
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
669
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
672
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
677 {
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
681 {
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
685
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
688 }
689 else
690 {
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
692 {
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
696
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
698 }
699 else
700 {
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
704
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (obj: reduc_stmt_info);
713 }
714 }
715 }
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
722 }
723}
724
725
726/* Function vect_analyze_scalar_cycles.
727
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
733 Examples for scalar cycles:
734
735 Example1: reduction:
736
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
740
741 Example2: induction:
742
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
746
747static void
748vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
749{
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
751
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
753
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
762
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop: loop->inner, slp);
765}
766
767/* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
769
770static void
771vect_fixup_reduc_chain (stmt_vec_info stmt_info)
772{
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
778 do
779 {
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
788 }
789 while (stmt_info);
790}
791
792/* Fixup scalar cycles that now have their stmts detected as patterns. */
793
794static void
795vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
796{
797 stmt_vec_info first;
798 unsigned i;
799
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
801 {
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
804 {
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
810 }
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
815 {
816 if (STMT_VINFO_IN_PATTERN_P (first))
817 {
818 vect_fixup_reduc_chain (stmt_info: first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
821 }
822 }
      /* If not all stmts in the chain are patterns, or if we failed
	 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
	 it as a regular reduction instead.  */
826 else
827 {
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
831 {
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
837 }
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (obj: vect_stmt_to_vectorize (stmt_info: last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (ix: i);
842 --i;
843 }
844 }
845}
846
847/* Function vect_get_loop_niters.
848
   Determine the number of iterations the loop executes and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
853
854 Return the loop exit conditions. */
855
856
857static vec<gcond *>
858vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
860{
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (nelems: exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
866
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
870
871 DUMP_VECT_SCOPE ("get_loop_niters");
872
873 if (exits.is_empty ())
874 return conds;
875
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
879
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
883 {
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (obj: cond);
887
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
890
891 if (exit != main_exit)
892 continue;
893
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
898
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
902
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
905
906 if (may_be_zero)
907 {
908 if (COMPARISON_CLASS_P (may_be_zero))
909 {
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
922
923 may_be_zero = NULL_TREE;
924 }
925 else if (integer_nonzerop (may_be_zero))
926 {
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
930 }
931 else
932 continue;
933 }
934
      /* Loop assumptions are based on the normal exit.  */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
938
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
948 }
949
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
952
953 return conds;
954}
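
/* Worked example for the niters bookkeeping above.  Roughly, for a loop
   in the usual rotated form that the vectorizer analyzes,

     for (i = 0; i < n; i++)   /* with n > 0 */

   the body executes n times and the latch n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS (the number of
   header executions) is n.  As noted above, the "+ 1" can wrap to zero
   when the latch executes UINT_MAX times.  */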
955
956/* Determine the main loop exit for the vectorizer. */
957
958edge
959vec_init_loop_exit_info (class loop *loop)
960{
  /* Before we begin we must first determine which exit is the main one and
     which are auxiliary exits.  */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
966
  /* If we have multiple exits we only support a counting IV at the moment.
     Analyze all exits and return one.  */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
972 {
973 if (!get_loop_exit_condition (exit))
974 continue;
975
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
978 {
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
981 }
982 }
983
984 return candidate;
985}
986
987/* Function bb_in_loop_p
988
989 Used as predicate for dfs order traversal of the loop bbs. */
990
991static bool
992bb_in_loop_p (const_basic_block bb, const void *data)
993{
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
998}
999
1000
1001/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1003
1004_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
1051{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */
1056
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1060
1061 for (unsigned int i = 0; i < nbbs; i++)
1062 {
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1065
1066 for (si = gsi_start_phis (bb); !gsi_end_p (i: si); gsi_next (i: &si))
1067 {
1068 gimple *phi = gsi_stmt (i: si);
1069 gimple_set_uid (g: phi, uid: 0);
1070 add_stmt (phi);
1071 }
1072
1073 for (si = gsi_start_bb (bb); !gsi_end_p (i: si); gsi_next (i: &si))
1074 {
1075 gimple *stmt = gsi_stmt (i: si);
1076 gimple_set_uid (g: stmt, uid: 0);
1077 if (is_gimple_debug (gs: stmt))
1078 continue;
1079 add_stmt (stmt);
	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
	     the third argument is the #pragma omp simd if (x) condition: when
	     it is 0, the loop shouldn't be vectorized; when it is a non-zero
	     constant, it should be vectorized normally; otherwise the loop is
	     versioned, and the vectorized loop is executed only if the
	     condition is non-zero at runtime.  */
1085 if (loop_in->simduid
1086 && is_gimple_call (gs: stmt)
1087 && gimple_call_internal_p (gs: stmt)
1088 && gimple_call_internal_fn (gs: stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (gs: stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1093 {
1094 tree arg = gimple_call_arg (gs: stmt, index: 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1099 }
1100 }
1101 }
1102
1103 epilogue_vinfos.create (nelems: 6);
1104}
1105
1106/* Free all levels of rgroup CONTROLS. */
1107
1108void
1109release_vec_loop_controls (vec<rgroup_controls> *controls)
1110{
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1116}
1117
1118/* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1120
1121_loop_vec_info::~_loop_vec_info ()
1122{
1123 free (ptr: bbs);
1124
1125 release_vec_loop_controls (controls: &masks.rgc_vec);
1126 release_vec_loop_controls (controls: &lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
1132
  /* When we release an epilogue vinfo that we do not intend to use,
     avoid clearing AUX of the main loop, which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1138}
1139
1140/* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1142
1143tree
1144cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1145{
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1149
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (k: expr);
1153 if (! cached)
1154 {
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1159 {
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1162 }
1163 }
1164 return cached;
1165}
1166
1167/* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1169
1170static bool
1171can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1172{
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (fn: IFN_WHILE_ULT,
1178 type0: cmp_type, type1: rgm->type,
1179 opt_type: OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1182}
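
/* For reference, IFN_WHILE_ULT as used above produces a mask whose I-th
   element is set iff BASE + I < LIMIT, i.e. roughly

     mask[i] = (base + i < limit);   /* for each element of the mask */

   so asking for WHILE_ULT support at (CMP_TYPE, RGM->TYPE) asks whether
   the target can compute such a mask directly for that mask type.  */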
1183
1184/* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1186
1187static unsigned int
1188vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1189{
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1196}
1197
1198/* Calculate the minimum precision necessary to represent:
1199
1200 MAX_NITERS * FACTOR
1201
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
1204
1205static unsigned
1206vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1207{
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1209
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1214
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (x: max_ni, y: max_back_edges + 1);
1219
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (x: max_ni * factor, sgn: UNSIGNED);
1222}
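
/* A small worked example of the computation above: if the niter type is a
   32-bit unsigned type and max_loop_iterations bounds the back-edge count
   by 999, then MAX_NI = MIN (2^32, 1000) = 1000; with FACTOR = 4 we need
   to represent 4000, so the result is 12 bits.  */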
1223
1224/* True if the loop needs peeling or partial vectors when vectorized. */
1225
1226static bool
1227vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1228{
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1232
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1237
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1240 {
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1249 }
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (const_value: &const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (x: const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1265
1266 return false;
1267}
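
/* Example for the constant-niters case above: with 100 known iterations,
   no peeling for alignment or gaps, and VF = 8, 100 is not a multiple of
   8, so 4 scalar iterations remain and the function returns true (an
   epilogue or partial vectors are needed).  With 96 iterations it would
   return false.  */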
1268
1269/* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1272
1273static bool
1274vect_verify_full_masking (loop_vec_info loop_vinfo)
1275{
1276 unsigned int min_ni_width;
1277
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1283
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1286 {
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1290
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (len: nvectors, exact: true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (a: nvectors * TYPE_VECTOR_SUBPARTS (node: vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1299
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1301 {
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1305 }
1306 }
1307
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1310
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, factor: max_nscalars_per_iter);
1314
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1321
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (x: iv_limit * max_nscalars_per_iter,
1324 sgn: UNSIGNED);
1325
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1327 {
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (mode: cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1331 {
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, cmp_type: this_type))
1335 {
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1339
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1343
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1347
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1352
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1357
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1365 }
1366 }
1367 }
1368
1369 if (!cmp_type)
1370 {
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1373 }
1374
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
1379}
1380
1381/* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1384
1385static bool
1386vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1387{
  /* Produce a differently organized rgc_vec and check in a different way
     whether we can produce the required masks.  */
1390
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1396
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (x: iv_limit, sgn: UNSIGNED);
1405
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1410 {
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (mode: cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1414 {
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1418 }
1419 }
1420 if (!iv_type)
1421 return false;
1422
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1425 {
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1429
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (a: nvectors * TYPE_VECTOR_SUBPARTS (node: vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1435
      /* We index the rgroup_controls vector with nscalars_per_iter, which
	 we keep constant, and instead have a varying nvectors, remembering
	 the vector mask with the fewest nV.  */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (len: nscalars_per_iter, exact: true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1442
1443 if (!rgm->type || rgm->factor > nvectors)
1444 {
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1450 }
1451 }
1452
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (x: vect_max_vf (loop_vinfo), sgn: UNSIGNED);
1457
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1460 {
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1464
1465 /* For now vect_get_loop_mask only supports integer mode masks
1466 when we need to split it. */
1467 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1468 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1469 {
1470 ok = false;
1471 break;
1472 }
1473
1474 /* If iv_type is usable as compare type use that - we can elide the
1475 saturation in that case. */
1476 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1477 {
1478 tree cmp_vectype
1479 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (node: mask_type));
1480 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1481 rgc.compare_type = cmp_vectype;
1482 }
1483 if (!rgc.compare_type)
1484 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1485 {
1486 unsigned int cmp_bits = GET_MODE_BITSIZE (mode: cmp_mode_iter.require ());
1487 if (cmp_bits >= min_ni_width
1488 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1489 {
1490 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1491 if (!cmp_type)
1492 continue;
1493
1494 /* Check whether we can produce the mask with cmp_type. */
1495 tree cmp_vectype
1496 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (node: mask_type));
1497 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1498 {
1499 rgc.compare_type = cmp_vectype;
1500 break;
1501 }
1502 }
1503 }
1504 if (!rgc.compare_type)
1505 {
1506 ok = false;
1507 break;
1508 }
1509 }
1510 if (!ok)
1511 {
1512 release_vec_loop_controls (controls: &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1513 return false;
1514 }
1515
1516 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1517 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1518 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1519 return true;
1520}
1521
/* Check whether we can use vector accesses with length, based on a precision
   comparison.  So far, to keep it simple, we only allow the case in which the
   precision of the target-supported length is larger than the precision
   required by the loop niters.  */
1526
1527static bool
1528vect_verify_loop_lens (loop_vec_info loop_vinfo)
1529{
1530 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1531 return false;
1532
1533 machine_mode len_load_mode, len_store_mode;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1535 .exists (mode: &len_load_mode))
1536 return false;
1537 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1538 .exists (mode: &len_store_mode))
1539 return false;
1540
1541 signed char partial_load_bias = internal_len_load_store_bias
1542 (ifn: IFN_LEN_LOAD, len_load_mode);
1543
1544 signed char partial_store_bias = internal_len_load_store_bias
1545 (ifn: IFN_LEN_STORE, len_store_mode);
1546
1547 gcc_assert (partial_load_bias == partial_store_bias);
1548
1549 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1550 return false;
1551
1552 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1553 len_loads with a length of zero. In order to avoid that we prohibit
1554 more than one loop length here. */
1555 if (partial_load_bias == -1
1556 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1557 return false;
1558
1559 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1560
1561 unsigned int max_nitems_per_iter = 1;
1562 unsigned int i;
1563 rgroup_controls *rgl;
1564 /* Find the maximum number of items per iteration for every rgroup. */
1565 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1566 {
1567 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1568 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1569 }
1570
1571 /* Work out how many bits we need to represent the length limit. */
1572 unsigned int min_ni_prec
1573 = vect_min_prec_for_max_niters (loop_vinfo, factor: max_nitems_per_iter);
1574
  /* Now use the maximum of the precisions below for one suitable IV type:
1576 - the IV's natural precision
1577 - the precision needed to hold: the maximum number of scalar
1578 iterations multiplied by the scale factor (min_ni_prec above)
1579 - the Pmode precision
1580
1581 If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
1583 wider IV to avoid narrow conversions. */
1584
1585 unsigned int ni_prec
1586 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1587 min_ni_prec = MAX (min_ni_prec, ni_prec);
1588 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1589
1590 tree iv_type = NULL_TREE;
1591 opt_scalar_int_mode tmode_iter;
1592 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1593 {
1594 scalar_mode tmode = tmode_iter.require ();
1595 unsigned int tbits = GET_MODE_BITSIZE (mode: tmode);
1596
1597 /* ??? Do we really want to construct one IV whose precision exceeds
1598 BITS_PER_WORD? */
1599 if (tbits > BITS_PER_WORD)
1600 break;
1601
1602 /* Find the first available standard integral type. */
1603 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1604 {
1605 iv_type = build_nonstandard_integer_type (tbits, true);
1606 break;
1607 }
1608 }
1609
1610 if (!iv_type)
1611 {
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614 "can't vectorize with length-based partial vectors"
1615 " because there is no suitable iv type.\n");
1616 return false;
1617 }
1618
1619 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1620 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1621 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1622
1623 return true;
1624}
1625
1626/* Calculate the cost of one scalar iteration of the loop. */
1627static void
1628vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1629{
1630 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632 int nbbs = loop->num_nodes, factor;
1633 int innerloop_iters, i;
1634
1635 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1636
1637 /* Gather costs for statements in the scalar loop. */
1638
1639 /* FORNOW. */
1640 innerloop_iters = 1;
1641 if (loop->inner)
1642 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1643
1644 for (i = 0; i < nbbs; i++)
1645 {
1646 gimple_stmt_iterator si;
1647 basic_block bb = bbs[i];
1648
1649 if (bb->loop_father == loop->inner)
1650 factor = innerloop_iters;
1651 else
1652 factor = 1;
1653
1654 for (si = gsi_start_bb (bb); !gsi_end_p (i: si); gsi_next (i: &si))
1655 {
1656 gimple *stmt = gsi_stmt (i: si);
1657 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1658
1659 if (!is_gimple_assign (gs: stmt) && !is_gimple_call (gs: stmt))
1660 continue;
1661
1662 /* Skip stmts that are not vectorized inside the loop. */
1663 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1664 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1665 && (!STMT_VINFO_LIVE_P (vstmt_info)
1666 || !VECTORIZABLE_CYCLE_DEF
1667 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1668 continue;
1669
1670 vect_cost_for_stmt kind;
1671 if (STMT_VINFO_DATA_REF (stmt_info))
1672 {
1673 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1674 kind = scalar_load;
1675 else
1676 kind = scalar_store;
1677 }
1678 else if (vect_nop_conversion_p (stmt_info))
1679 continue;
1680 else
1681 kind = scalar_stmt;
1682
1683 /* We are using vect_prologue here to avoid scaling twice
1684 by the inner loop factor. */
1685 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1686 			    factor, kind, stmt_info, 0, vect_prologue);
1687 }
1688 }
1689
1690 /* Now accumulate cost. */
1691   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1692   add_stmt_costs (loop_vinfo->scalar_costs,
1693 		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1694   loop_vinfo->scalar_costs->finish_cost (nullptr);
1695}
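
/* Illustrative example (not part of the original sources): for a loop body
   such as a[i] = b[i] + c[i], the function above records two scalar_load
   costs (for b[i] and c[i]), one scalar_stmt cost (for the addition) and one
   scalar_store cost (for a[i]), each scaled by FACTOR when the statement
   sits in the inner loop of an outer-loop candidate.  */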
1696
1697
1698/* Function vect_analyze_loop_form.
1699
1700 Verify that certain CFG restrictions hold, including:
1701 - the loop has a pre-header
1702 - the loop has a single entry and exit
1703 - the loop exit condition is simple enough
1704    - the number of iterations can be analyzed, i.e., it is a countable loop.
1705      The number of iterations may be analyzed under some assumptions.  */
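
/* Illustrative example (not part of the original sources): a loop such as

       for (i = 0; i < n; i++)
	 a[i] = b[i];

   satisfies these restrictions, whereas a loop whose body contains an early
   "break" has multiple exits and extra control flow, so it would be rejected
   by the checks below.  */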
1706
1707opt_result
1708vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1709{
1710 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1711
1712 edge exit_e = vec_init_loop_exit_info (loop);
1713 if (!exit_e)
1714     return opt_result::failure_at (vect_location,
1715 				   "not vectorized:"
1716 " could not determine main exit from"
1717 " loop with multiple exits.\n");
1718 info->loop_exit = exit_e;
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE, vect_location,
1721 "using as main loop exit: %d -> %d [AUX: %p]\n",
1722 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1723
1724 /* Different restrictions apply when we are considering an inner-most loop,
1725 vs. an outer (nested) loop.
1726 (FORNOW. May want to relax some of these restrictions in the future). */
1727
1728 info->inner_loop_cond = NULL;
1729 if (!loop->inner)
1730 {
1731 /* Inner-most loop. We currently require that the number of BBs is
1732 exactly 2 (the header and latch). Vectorizable inner-most loops
1733 look like this:
1734
1735 (pre-header)
1736 |
1737 header <--------+
1738 | | |
1739 | +--> latch --+
1740 |
1741 (exit-bb) */
1742
1743 if (loop->num_nodes != 2)
1744 	return opt_result::failure_at (vect_location,
1745 				       "not vectorized:"
1746 " control flow in loop.\n");
1747
1748 if (empty_block_p (loop->header))
1749 	return opt_result::failure_at (vect_location,
1750 				       "not vectorized: empty loop.\n");
1751 }
1752 else
1753 {
1754 class loop *innerloop = loop->inner;
1755 edge entryedge;
1756
1757 /* Nested loop. We currently require that the loop is doubly-nested,
1758 contains a single inner loop, and the number of BBs is exactly 5.
1759 Vectorizable outer-loops look like this:
1760
1761 (pre-header)
1762 |
1763 header <---+
1764 | |
1765 inner-loop |
1766 | |
1767 tail ------+
1768 |
1769 (exit-bb)
1770
1771 The inner-loop has the properties expected of inner-most loops
1772 as described above. */
1773
1774 if ((loop->inner)->inner || (loop->inner)->next)
1775 	return opt_result::failure_at (vect_location,
1776 				       "not vectorized:"
1777 " multiple nested loops.\n");
1778
1779 if (loop->num_nodes != 5)
1780 	return opt_result::failure_at (vect_location,
1781 				       "not vectorized:"
1782 " control flow in loop.\n");
1783
1784 entryedge = loop_preheader_edge (innerloop);
1785 if (entryedge->src != loop->header
1786 || !single_exit (innerloop)
1787 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1788 	return opt_result::failure_at (vect_location,
1789 				       "not vectorized:"
1790 " unsupported outerloop form.\n");
1791
1792 /* Analyze the inner-loop. */
1793 vect_loop_form_info inner;
1794       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1795 if (!res)
1796 {
1797 if (dump_enabled_p ())
1798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1799 "not vectorized: Bad inner loop.\n");
1800 return res;
1801 }
1802
1803 /* Don't support analyzing niter under assumptions for inner
1804 loop. */
1805 if (!integer_onep (inner.assumptions))
1806 	return opt_result::failure_at (vect_location,
1807 				       "not vectorized: Bad inner loop.\n");
1808
1809 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1810 	return opt_result::failure_at (vect_location,
1811 				       "not vectorized: inner-loop count not"
1812 " invariant.\n");
1813
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_NOTE, vect_location,
1816 "Considering outer-loop vectorization.\n");
1817 info->inner_loop_cond = inner.conds[0];
1818 }
1819
1820 if (!single_exit (loop))
1821     return opt_result::failure_at (vect_location,
1822 				   "not vectorized: multiple exits.\n");
1823 if (EDGE_COUNT (loop->header->preds) != 2)
1824     return opt_result::failure_at (vect_location,
1825 				   "not vectorized:"
1826 " too many incoming edges.\n");
1827
1828   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1829 that the loop is represented as a do-while (with a proper if-guard
1830 before the loop if needed), where the loop header contains all the
1831 executable statements, and the latch is empty. */
1832 if (!empty_block_p (loop->latch)
1833       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1834     return opt_result::failure_at (vect_location,
1835 				   "not vectorized: latch block not empty.\n");
1836
1837 /* Make sure the exit is not abnormal. */
1838 if (exit_e->flags & EDGE_ABNORMAL)
1839     return opt_result::failure_at (vect_location,
1840 				   "not vectorized:"
1841 " abnormal loop exit edge.\n");
1842
1843 info->conds
1844     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1845 			    &info->number_of_iterations,
1846 			    &info->number_of_iterationsm1);
1847
1848 if (info->conds.is_empty ())
1849 return opt_result::failure_at
1850       (vect_location,
1851        "not vectorized: complicated exit condition.\n");
1852
1853 /* Determine what the primary and alternate exit conds are. */
1854 for (unsigned i = 0; i < info->conds.length (); i++)
1855 {
1856 gcond *cond = info->conds[i];
1857       if (exit_e->src == gimple_bb (cond))
1858 	std::swap (info->conds[0], info->conds[i]);
1859 }
1860
1861 if (integer_zerop (info->assumptions)
1862 || !info->number_of_iterations
1863 || chrec_contains_undetermined (info->number_of_iterations))
1864 return opt_result::failure_at
1865       (info->conds[0],
1866        "not vectorized: number of iterations cannot be computed.\n");
1867
1868 if (integer_zerop (info->number_of_iterations))
1869 return opt_result::failure_at
1870       (info->conds[0],
1871        "not vectorized: number of iterations = 0.\n");
1872
1873 if (!(tree_fits_shwi_p (info->number_of_iterations)
1874 && tree_to_shwi (info->number_of_iterations) > 0))
1875 {
1876 if (dump_enabled_p ())
1877 {
1878 dump_printf_loc (MSG_NOTE, vect_location,
1879 "Symbolic number of iterations is ");
1880 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1881 dump_printf (MSG_NOTE, "\n");
1882 }
1883 }
1884
1885 return opt_result::success ();
1886}
1887
1888/* Create a loop_vec_info for LOOP with SHARED and the
1889 vect_analyze_loop_form result. */
1890
1891loop_vec_info
1892vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1893 const vect_loop_form_info *info,
1894 loop_vec_info main_loop_info)
1895{
1896 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1897 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1898 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1899 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1900 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1901 /* Also record the assumptions for versioning. */
1902 if (!integer_onep (info->assumptions) && !main_loop_info)
1903 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1904
1905 for (gcond *cond : info->conds)
1906 {
1907 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1908 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1909 }
1910
1911 for (unsigned i = 1; i < info->conds.length (); i ++)
1912     LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1913 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1914
1915 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1916
1917 if (info->inner_loop_cond)
1918 {
1919 stmt_vec_info inner_loop_cond_info
1920 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1921 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1922 /* If we have an estimate on the number of iterations of the inner
1923 loop use that to limit the scale for costing, otherwise use
1924 --param vect-inner-loop-cost-factor literally. */
1925 widest_int nit;
1926 if (estimated_stmt_executions (loop->inner, &nit))
1927 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1928 	= wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1929 }
1930
1931 return loop_vinfo;
1932}
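
/* Illustrative example (not part of the original sources): if profile data
   estimates the inner loop at 8 iterations and --param
   vect-inner-loop-cost-factor is set to 50, the code above records
   min (8, 50) = 8 as the inner-loop cost factor, so inner-loop statements
   are weighted 8x rather than 50x during costing.  */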
1933
1934
1935
1936/* Scan the loop stmts and, depending on whether there are any (non-)SLP
1937   statements, update the vectorization factor.  */
1938
1939static void
1940vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1941{
1942 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1944 int nbbs = loop->num_nodes;
1945 poly_uint64 vectorization_factor;
1946 int i;
1947
1948 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1949
1950 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1951 gcc_assert (known_ne (vectorization_factor, 0U));
1952
1953   /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1954      vectorization factor of the loop is the unrolling factor required by
1955      the SLP instances.  If that unrolling factor is 1, we say that we
1956      perform pure SLP on the loop - cross-iteration parallelism is not
1957      exploited.  */
1958 bool only_slp_in_loop = true;
1959 for (i = 0; i < nbbs; i++)
1960 {
1961 basic_block bb = bbs[i];
1962       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1963 	   gsi_next (&si))
1964 {
1965 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1966 if (!stmt_info)
1967 continue;
1968 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1969 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1970 && !PURE_SLP_STMT (stmt_info))
1971 /* STMT needs both SLP and loop-based vectorization. */
1972 only_slp_in_loop = false;
1973 }
1974       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1975 	   gsi_next (&si))
1976 	{
1977 	  if (is_gimple_debug (gsi_stmt (si)))
1978 	    continue;
1979 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1980 stmt_info = vect_stmt_to_vectorize (stmt_info);
1981 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1982 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1983 && !PURE_SLP_STMT (stmt_info))
1984 /* STMT needs both SLP and loop-based vectorization. */
1985 only_slp_in_loop = false;
1986 }
1987 }
1988
1989 if (only_slp_in_loop)
1990 {
1991 if (dump_enabled_p ())
1992 dump_printf_loc (MSG_NOTE, vect_location,
1993 "Loop contains only SLP stmts\n");
1994 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1995 }
1996 else
1997 {
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_NOTE, vect_location,
2000 "Loop contains SLP and non-SLP stmts\n");
2001 /* Both the vectorization factor and unroll factor have the form
2002 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2003 so they must have a common multiple. */
2004 vectorization_factor
2005 	= force_common_multiple (vectorization_factor,
2006 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2007 }
2008
2009 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2010 if (dump_enabled_p ())
2011 {
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "Updating vectorization factor to ");
2014 dump_dec (MSG_NOTE, vectorization_factor);
2015 dump_printf (MSG_NOTE, ".\n");
2016 }
2017}
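
/* Illustrative example (not part of the original sources): if the loop-based
   vectorization factor is 4 but an SLP instance requires an unrolling factor
   of 6, the hybrid case above settles on their least common multiple, 12, so
   that both the SLP and the non-SLP statements are covered by the same
   number of scalar iterations per vector iteration.  */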
2018
2019/* Return true if STMT_INFO describes a double reduction phi and if
2020 the other phi in the reduction is also relevant for vectorization.
2021 This rejects cases such as:
2022
2023 outer1:
2024 x_1 = PHI <x_3(outer2), ...>;
2025 ...
2026
2027 inner:
2028 x_2 = ...;
2029 ...
2030
2031 outer2:
2032 x_3 = PHI <x_2(inner)>;
2033
2034 if nothing in x_2 or elsewhere makes x_1 relevant. */
2035
2036static bool
2037vect_active_double_reduction_p (stmt_vec_info stmt_info)
2038{
2039 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2040 return false;
2041
2042 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2043}
2044
2045/* Function vect_analyze_loop_operations.
2046
2047 Scan the loop stmts and make sure they are all vectorizable. */
2048
2049static opt_result
2050vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2051{
2052 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2053 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2054 int nbbs = loop->num_nodes;
2055 int i;
2056 stmt_vec_info stmt_info;
2057 bool need_to_vectorize = false;
2058 bool ok;
2059
2060 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2061
2062 auto_vec<stmt_info_for_cost> cost_vec;
2063
2064 for (i = 0; i < nbbs; i++)
2065 {
2066 basic_block bb = bbs[i];
2067
2068       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2069 	   gsi_next (&si))
2070 {
2071 gphi *phi = si.phi ();
2072 ok = true;
2073
2074 stmt_info = loop_vinfo->lookup_stmt (phi);
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2077 (gimple *) phi);
2078 	  if (virtual_operand_p (gimple_phi_result (phi)))
2079 continue;
2080
2081 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2082 (i.e., a phi in the tail of the outer-loop). */
2083 if (! is_loop_header_bb_p (bb))
2084 {
2085 	      /* FORNOW: we currently don't support the case that these phis
2086 		 are not used in the outer loop (unless it is double reduction,
2087 		 i.e., this phi is vect_reduction_def), because that case
2088 		 would require us to actually do something here.  */
2089 if (STMT_VINFO_LIVE_P (stmt_info)
2090 && !vect_active_double_reduction_p (stmt_info))
2091 		return opt_result::failure_at (phi,
2092 					       "Unsupported loop-closed phi"
2093 " in outer-loop.\n");
2094
2095 /* If PHI is used in the outer loop, we check that its operand
2096 is defined in the inner loop. */
2097 if (STMT_VINFO_RELEVANT_P (stmt_info))
2098 {
2099 tree phi_op;
2100
2101 	      if (gimple_phi_num_args (phi) != 1)
2102 		return opt_result::failure_at (phi, "unsupported phi");
2103
2104 phi_op = PHI_ARG_DEF (phi, 0);
2105 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2106 if (!op_def_info)
2107 		return opt_result::failure_at (phi, "unsupported phi\n");
2108
2109 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2110 && (STMT_VINFO_RELEVANT (op_def_info)
2111 != vect_used_in_outer_by_reduction))
2112 		return opt_result::failure_at (phi, "unsupported phi\n");
2113
2114 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2115 || (STMT_VINFO_DEF_TYPE (stmt_info)
2116 == vect_double_reduction_def))
2117 && !vectorizable_lc_phi (loop_vinfo,
2118 stmt_info, NULL, NULL))
2119 		return opt_result::failure_at (phi, "unsupported phi\n");
2120 }
2121
2122 continue;
2123 }
2124
2125 gcc_assert (stmt_info);
2126
2127 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2128 || STMT_VINFO_LIVE_P (stmt_info))
2129 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2130 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2131 /* A scalar-dependence cycle that we don't support. */
2132 	    return opt_result::failure_at (phi,
2133 					   "not vectorized:"
2134 " scalar dependence cycle.\n");
2135
2136 if (STMT_VINFO_RELEVANT_P (stmt_info))
2137 {
2138 need_to_vectorize = true;
2139 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2140 && ! PURE_SLP_STMT (stmt_info))
2141 ok = vectorizable_induction (loop_vinfo,
2142 stmt_info, NULL, NULL,
2143 &cost_vec);
2144 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2145 || (STMT_VINFO_DEF_TYPE (stmt_info)
2146 == vect_double_reduction_def)
2147 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2148 && ! PURE_SLP_STMT (stmt_info))
2149 ok = vectorizable_reduction (loop_vinfo,
2150 stmt_info, NULL, NULL, &cost_vec);
2151 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2152 == vect_first_order_recurrence)
2153 && ! PURE_SLP_STMT (stmt_info))
2154 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2155 &cost_vec);
2156 }
2157
2158 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2159 if (ok
2160 && STMT_VINFO_LIVE_P (stmt_info)
2161 && !PURE_SLP_STMT (stmt_info))
2162 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2163 -1, false, &cost_vec);
2164
2165 if (!ok)
2166 	    return opt_result::failure_at (phi,
2167 					   "not vectorized: relevant phi not "
2168 "supported: %G",
2169 static_cast <gimple *> (phi));
2170 }
2171
2172       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2173 	   gsi_next (&si))
2174 	{
2175 	  gimple *stmt = gsi_stmt (si);
2176 	  if (!gimple_clobber_p (stmt)
2177 	      && !is_gimple_debug (stmt))
2178 {
2179 opt_result res
2180 = vect_analyze_stmt (loop_vinfo,
2181 loop_vinfo->lookup_stmt (stmt),
2182 &need_to_vectorize,
2183 NULL, NULL, &cost_vec);
2184 if (!res)
2185 return res;
2186 }
2187 }
2188 } /* bbs */
2189
2190   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2191
2192 /* All operations in the loop are either irrelevant (deal with loop
2193 control, or dead), or only used outside the loop and can be moved
2194 out of the loop (e.g. invariants, inductions). The loop can be
2195 optimized away by scalar optimizations. We're better off not
2196 touching this loop. */
2197 if (!need_to_vectorize)
2198 {
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_NOTE, vect_location,
2201 "All the computation can be taken out of the loop.\n");
2202 return opt_result::failure_at
2203 	(vect_location,
2204 	 "not vectorized: redundant loop. no profit to vectorize.\n");
2205 }
2206
2207 return opt_result::success ();
2208}
2209
2210/* Return true if we know that the iteration count is smaller than the
2211 vectorization factor. Return false if it isn't, or if we can't be sure
2212 either way. */
2213
2214static bool
2215vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2216{
2217 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2218
2219 HOST_WIDE_INT max_niter;
2220 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2221 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2222 else
2223 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2224
2225 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2226 return true;
2227
2228 return false;
2229}
2230
2231/* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2232 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2233 definitely no, or -1 if it's worth retrying. */
2234
2235static int
2236vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2237 unsigned *suggested_unroll_factor)
2238{
2239 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2240 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2241
2242 /* Only loops that can handle partially-populated vectors can have iteration
2243 counts less than the vectorization factor. */
2244 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2245 && vect_known_niters_smaller_than_vf (loop_vinfo))
2246 {
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "not vectorized: iteration count smaller than "
2250 "vectorization factor.\n");
2251 return 0;
2252 }
2253
2254   /* If we know the number of iterations we can do better: for the
2255      epilogue we can also decide whether the main loop leaves us
2256      with enough iterations, preferring a smaller vector epilogue that is
2257      then also possibly used for the case in which we skip the vector loop.  */
2258 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2259 {
2260 widest_int scalar_niters
2261 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2262 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2263 {
2264 loop_vec_info orig_loop_vinfo
2265 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2266 unsigned lowest_vf
2267 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2268 int prolog_peeling = 0;
2269 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2270 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2271 if (prolog_peeling >= 0
2272 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2273 lowest_vf))
2274 {
2275 unsigned gap
2276 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2277 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2278 % lowest_vf + gap);
2279 }
2280 }
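      /* Illustrative example (not part of the original sources): with 103
	 scalar iterations, a main-loop VF of 16, 3 iterations peeled into
	 the prolog and no gap, the epilogue analyzed here would see
	 (103 - 0 - 3) % 16 + 0 = 4 remaining scalar iterations.  */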
2281 /* Reject vectorizing for a single scalar iteration, even if
2282 we could in principle implement that using partial vectors. */
2283 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2284 if (scalar_niters <= peeling_gap + 1)
2285 {
2286 if (dump_enabled_p ())
2287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288 "not vectorized: loop only has a single "
2289 "scalar iteration.\n");
2290 return 0;
2291 }
2292
2293 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2294 {
2295 /* Check that the loop processes at least one full vector. */
2296 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2297 if (known_lt (scalar_niters, vf))
2298 {
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "loop does not have enough iterations "
2302 "to support vectorization.\n");
2303 return 0;
2304 }
2305
2306 /* If we need to peel an extra epilogue iteration to handle data
2307 accesses with gaps, check that there are enough scalar iterations
2308 available.
2309
2310 The check above is redundant with this one when peeling for gaps,
2311 but the distinction is useful for diagnostics. */
2312 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2313 && known_le (scalar_niters, vf))
2314 {
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "loop does not have enough iterations "
2318 "to support peeling for gaps.\n");
2319 return 0;
2320 }
2321 }
2322 }
2323
2324   /* If using the "very cheap" model, reject cases in which we'd keep
2325 a copy of the scalar code (even if we might be able to vectorize it). */
2326 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2327 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2328 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2329 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2330 {
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "some scalar iterations would need to be peeled\n");
2334 return 0;
2335 }
2336
2337 int min_profitable_iters, min_profitable_estimate;
2338 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2339 &min_profitable_estimate,
2340 suggested_unroll_factor);
2341
2342 if (min_profitable_iters < 0)
2343 {
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vectorization not profitable.\n");
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "not vectorized: vector version will never be "
2350 "profitable.\n");
2351 return -1;
2352 }
2353
2354 int min_scalar_loop_bound = (param_min_vect_loop_bound
2355 * assumed_vf);
2356
2357   /* Use the cost model only if it is more conservative than the
2358      user-specified threshold.  */
2359 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2360 min_profitable_iters);
2361
2362 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2363
2364 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2365 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2366 {
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "not vectorized: vectorization not profitable.\n");
2370 if (dump_enabled_p ())
2371 dump_printf_loc (MSG_NOTE, vect_location,
2372 "not vectorized: iteration count smaller than user "
2373 "specified loop bound parameter or minimum profitable "
2374 "iterations (whichever is more conservative).\n");
2375 return 0;
2376 }
2377
2378   /* The static profitability threshold min_profitable_estimate includes
2379 the cost of having to check at runtime whether the scalar loop
2380 should be used instead. If it turns out that we don't need or want
2381 such a check, the threshold we should use for the static estimate
2382 is simply the point at which the vector loop becomes more profitable
2383 than the scalar loop. */
2384 if (min_profitable_estimate > min_profitable_iters
2385 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2386 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2387 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2388 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2389 {
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2392 " choice between the scalar and vector loops\n");
2393 min_profitable_estimate = min_profitable_iters;
2394 }
2395
2396 /* If the vector loop needs multiple iterations to be beneficial then
2397 things are probably too close to call, and the conservative thing
2398 would be to stick with the scalar code. */
2399 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2400 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2401 {
2402 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2404 "one iteration of the vector loop would be"
2405 " more expensive than the equivalent number of"
2406 " iterations of the scalar loop\n");
2407 return 0;
2408 }
2409
2410 HOST_WIDE_INT estimated_niter;
2411
2412 /* If we are vectorizing an epilogue then we know the maximum number of
2413 scalar iterations it will cover is at least one lower than the
2414 vectorization factor of the main loop. */
2415 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2416 estimated_niter
2417 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2418 else
2419 {
2420 estimated_niter = estimated_stmt_executions_int (loop);
2421 if (estimated_niter == -1)
2422 estimated_niter = likely_max_stmt_executions_int (loop);
2423 }
2424 if (estimated_niter != -1
2425 && ((unsigned HOST_WIDE_INT) estimated_niter
2426 < MAX (th, (unsigned) min_profitable_estimate)))
2427 {
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430 "not vectorized: estimated iteration count too "
2431 "small.\n");
2432 if (dump_enabled_p ())
2433 dump_printf_loc (MSG_NOTE, vect_location,
2434 "not vectorized: estimated iteration count smaller "
2435 "than specified loop bound parameter or minimum "
2436 "profitable iterations (whichever is more "
2437 "conservative).\n");
2438 return -1;
2439 }
2440
2441 return 1;
2442}
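
/* Illustrative example (not part of the original sources): with an assumed
   VF of 4, --param min-vect-loop-bound set to 2 and a computed
   min_profitable_iters of 12, vect_analyze_loop_costing above takes
   th = MAX (2 * 4, 12) = 12, so a loop whose iteration count is known to be
   8 would be rejected as not profitable to vectorize.  */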
2443
2444static opt_result
2445vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2446 vec<data_reference_p> *datarefs,
2447 unsigned int *n_stmts)
2448{
2449 *n_stmts = 0;
2450 for (unsigned i = 0; i < loop->num_nodes; i++)
2451     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2452 	 !gsi_end_p (gsi); gsi_next (&gsi))
2453       {
2454 	gimple *stmt = gsi_stmt (gsi);
2455 	if (is_gimple_debug (stmt))
2456 continue;
2457 ++(*n_stmts);
2458 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2459 NULL, 0);
2460 if (!res)
2461 {
2462 	    if (is_gimple_call (stmt) && loop->safelen)
2463 	      {
2464 		tree fndecl = gimple_call_fndecl (stmt), op;
2465 		if (fndecl == NULL_TREE
2466 		    && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2467 		  {
2468 		    fndecl = gimple_call_arg (stmt, 0);
2469 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2470 fndecl = TREE_OPERAND (fndecl, 0);
2471 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2472 }
2473 if (fndecl != NULL_TREE)
2474 {
2475 		    cgraph_node *node = cgraph_node::get (fndecl);
2476 if (node != NULL && node->simd_clones != NULL)
2477 {
2478 			unsigned int j, n = gimple_call_num_args (stmt);
2479 			for (j = 0; j < n; j++)
2480 			  {
2481 			    op = gimple_call_arg (stmt, j);
2482 			    if (DECL_P (op)
2483 				|| (REFERENCE_CLASS_P (op)
2484 				    && get_base_address (op)))
2485 			      break;
2486 			  }
2487 			op = gimple_call_lhs (stmt);
2488 /* Ignore #pragma omp declare simd functions
2489 if they don't have data references in the
2490 call stmt itself. */
2491 if (j == n
2492 && !(op
2493 && (DECL_P (op)
2494 || (REFERENCE_CLASS_P (op)
2495 				    && get_base_address (op)))))
2496 continue;
2497 }
2498 }
2499 }
2500 return res;
2501 }
2502 /* If dependence analysis will give up due to the limit on the
2503 number of datarefs stop here and fail fatally. */
2504 if (datarefs->length ()
2505 > (unsigned)param_loop_max_datarefs_for_datadeps)
2506 	    return opt_result::failure_at (stmt, "exceeded param "
2507 "loop-max-datarefs-for-datadeps\n");
2508 }
2509 return opt_result::success ();
2510}
2511
2512/* Look for SLP-only access groups and turn each individual access into its own
2513 group. */
2514static void
2515vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2516{
2517 unsigned int i;
2518 struct data_reference *dr;
2519
2520 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2521
2522 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2523 FOR_EACH_VEC_ELT (datarefs, i, dr)
2524 {
2525 gcc_assert (DR_REF (dr));
2526 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2527
2528 /* Check if the load is a part of an interleaving chain. */
2529 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2530 {
2531 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2532 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2533 unsigned int group_size = DR_GROUP_SIZE (first_element);
2534
2535 /* Check if SLP-only groups. */
2536 if (!STMT_SLP_TYPE (stmt_info)
2537 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2538 {
2539 /* Dissolve the group. */
2540 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2541
2542 stmt_vec_info vinfo = first_element;
2543 while (vinfo)
2544 {
2545 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2546 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2547 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2548 DR_GROUP_SIZE (vinfo) = 1;
2549 if (STMT_VINFO_STRIDED_P (first_element)
2550 /* We cannot handle stores with gaps. */
2551 || DR_IS_WRITE (dr_info->dr))
2552 {
2553 STMT_VINFO_STRIDED_P (vinfo) = true;
2554 DR_GROUP_GAP (vinfo) = 0;
2555 }
2556 else
2557 DR_GROUP_GAP (vinfo) = group_size - 1;
2558 /* Duplicate and adjust alignment info, it needs to
2559 be present on each group leader, see dr_misalignment. */
2560 if (vinfo != first_element)
2561 {
2562 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2563 dr_info2->target_alignment = dr_info->target_alignment;
2564 int misalignment = dr_info->misalignment;
2565 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2566 {
2567 HOST_WIDE_INT diff
2568 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2569 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2570 unsigned HOST_WIDE_INT align_c
2571 = dr_info->target_alignment.to_constant ();
2572 misalignment = (misalignment + diff) % align_c;
2573 }
2574 dr_info2->misalignment = misalignment;
2575 }
2576 vinfo = next;
2577 }
2578 }
2579 }
2580 }
2581}
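
/* Illustrative example (not part of the original sources): when the group
   leader above has a known misalignment of 8 against a 16-byte target
   alignment and a dissolved member starts 4 bytes further into the object
   (a DR_INIT difference of 4), the member inherits (8 + 4) % 16 = 12 as its
   own misalignment.  */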
2582
2583/* Determine if operating on full vectors for LOOP_VINFO might leave
2584 some scalar iterations still to do. If so, decide how we should
2585 handle those scalar iterations. The possibilities are:
2586
2587 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2588 In this case:
2589
2590 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2591 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2592 LOOP_VINFO_PEELING_FOR_NITER == false
2593
2594 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2595 to handle the remaining scalar iterations. In this case:
2596
2597 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2598 LOOP_VINFO_PEELING_FOR_NITER == true
2599
2600 There are two choices:
2601
2602 (2a) Consider vectorizing the epilogue loop at the same VF as the
2603 main loop, but using partial vectors instead of full vectors.
2604 In this case:
2605
2606 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2607
2608 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2609 In this case:
2610
2611 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2612 */
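
/* Illustrative example (not part of the original sources): for a loop with
   1003 scalar iterations and a vectorization factor of 4, option (1) would
   run 251 partially-populated vector iterations with the last one masked
   down to 3 active lanes, while option (2) would run 250 full-vector
   iterations and leave 3 scalar iterations to an epilogue loop.  */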
2613
2614opt_result
2615vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2616{
2617 /* Determine whether there would be any scalar iterations left over. */
2618 bool need_peeling_or_partial_vectors_p
2619 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2620
2621 /* Decide whether to vectorize the loop with partial vectors. */
2622 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2623 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2624 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2625 && need_peeling_or_partial_vectors_p)
2626 {
2627 /* For partial-vector-usage=1, try to push the handling of partial
2628 vectors to the epilogue, with the main loop continuing to operate
2629 on full vectors.
2630
2631 If we are unrolling we also do not want to use partial vectors. This
2632 is to avoid the overhead of generating multiple masks and also to
2633 avoid having to execute entire iterations of FALSE masked instructions
2634      when dealing with one or fewer full iterations.
2635
2636 ??? We could then end up failing to use partial vectors if we
2637 decide to peel iterations into a prologue, and if the main loop
2638 then ends up processing fewer than VF iterations. */
2639 if ((param_vect_partial_vector_usage == 1
2640 || loop_vinfo->suggested_unroll_factor > 1)
2641 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2642 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2643 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2644 else
2645 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2646 }
2647
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "operating on %s vectors%s.\n",
2651 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2652 ? "partial" : "full",
2653 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2654 ? " for epilogue loop" : "");
2655
2656 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2657 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2658 && need_peeling_or_partial_vectors_p);
2659
2660 return opt_result::success ();
2661}
2662
2663/* Function vect_analyze_loop_2.
2664
2665   Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2666   analyses will record information in some members of LOOP_VINFO.  FATAL
2667   indicates whether some analysis hit a fatal error.  If a non-NULL pointer
2668   SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
2669   worked-out suggested unroll factor, while a NULL pointer indicates that we
2670   are applying the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2671   holds the SLP decision made when the suggested unroll factor was worked
2672   out.  */
2673static opt_result
2674vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2675 unsigned *suggested_unroll_factor,
2676 bool& slp_done_for_suggested_uf)
2677{
2678 opt_result ok = opt_result::success ();
2679 int res;
2680 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2681 poly_uint64 min_vf = 2;
2682 loop_vec_info orig_loop_vinfo = NULL;
2683
2684 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2685 loop_vec_info of the first vectorized loop. */
2686 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2687 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2688 else
2689 orig_loop_vinfo = loop_vinfo;
2690 gcc_assert (orig_loop_vinfo);
2691
2692 /* The first group of checks is independent of the vector size. */
2693 fatal = true;
2694
2695 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2696 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2697     return opt_result::failure_at (vect_location,
2698 				   "not vectorized: simd if(0)\n");
2699
2700 /* Find all data references in the loop (which correspond to vdefs/vuses)
2701 and analyze their evolution in the loop. */
2702
2703 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2704
2705 /* Gather the data references and count stmts in the loop. */
2706 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2707 {
2708 opt_result res
2709 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2710 				   &LOOP_VINFO_DATAREFS (loop_vinfo),
2711 				   &LOOP_VINFO_N_STMTS (loop_vinfo));
2712 if (!res)
2713 {
2714 if (dump_enabled_p ())
2715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2716 "not vectorized: loop contains function "
2717 "calls or data references that cannot "
2718 "be analyzed\n");
2719 return res;
2720 }
2721 loop_vinfo->shared->save_datarefs ();
2722 }
2723 else
2724 loop_vinfo->shared->check_datarefs ();
2725
2726 /* Analyze the data references and also adjust the minimal
2727 vectorization factor according to the loads and stores. */
2728
2729 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2730 if (!ok)
2731 {
2732 if (dump_enabled_p ())
2733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2734 "bad data references.\n");
2735 return ok;
2736 }
2737
2738 /* Check if we are applying unroll factor now. */
2739 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2740 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2741
2742   /* If the SLP decision was false when the suggested unroll factor was
2743      worked out, and we are now applying that suggested unroll factor, we
2744      can simply skip all SLP-related analyses this time.  */
2745 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2746
2747 /* Classify all cross-iteration scalar data-flow cycles.
2748 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2749 vect_analyze_scalar_cycles (loop_vinfo, slp);
2750
2751 vect_pattern_recog (loop_vinfo);
2752
2753 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2754
2755 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2756 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2757
2758 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2759 if (!ok)
2760 {
2761 if (dump_enabled_p ())
2762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2763 "bad data access.\n");
2764 return ok;
2765 }
2766
2767 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2768
2769 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2770 if (!ok)
2771 {
2772 if (dump_enabled_p ())
2773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2774 "unexpected pattern.\n");
2775 return ok;
2776 }
2777
2778 /* While the rest of the analysis below depends on it in some way. */
2779 fatal = false;
2780
2781 /* Analyze data dependences between the data-refs in the loop
2782 and adjust the maximum vectorization factor according to
2783 the dependences.
2784 FORNOW: fail at the first data dependence that we encounter. */
2785
2786 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2787 if (!ok)
2788 {
2789 if (dump_enabled_p ())
2790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2791 "bad data dependence.\n");
2792 return ok;
2793 }
2794 if (max_vf != MAX_VECTORIZATION_FACTOR
2795       && maybe_lt (max_vf, min_vf))
2796     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2797 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2798
2799 ok = vect_determine_vectorization_factor (loop_vinfo);
2800 if (!ok)
2801 {
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2804 "can't determine vectorization factor.\n");
2805 return ok;
2806 }
2807 if (max_vf != MAX_VECTORIZATION_FACTOR
2808       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2809     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2810
2811 /* Compute the scalar iteration cost. */
2812 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2813
2814 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2815
2816 if (slp)
2817 {
2818 /* Check the SLP opportunities in the loop, analyze and build
2819 SLP trees. */
2820 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2821 if (!ok)
2822 return ok;
2823
2824 /* If there are any SLP instances mark them as pure_slp. */
2825 slp = vect_make_slp_decision (loop_vinfo);
2826 if (slp)
2827 {
2828 /* Find stmts that need to be both vectorized and SLPed. */
2829 vect_detect_hybrid_slp (loop_vinfo);
2830
2831 /* Update the vectorization factor based on the SLP decision. */
2832 vect_update_vf_for_slp (loop_vinfo);
2833
2834 /* Optimize the SLP graph with the vectorization factor fixed. */
2835 vect_optimize_slp (loop_vinfo);
2836
2837 /* Gather the loads reachable from the SLP graph entries. */
2838 vect_gather_slp_loads (loop_vinfo);
2839 }
2840 }
2841
2842 bool saved_can_use_partial_vectors_p
2843 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2844
2845 /* We don't expect to have to roll back to anything other than an empty
2846 set of rgroups. */
2847 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2848
2849 /* This is the point where we can re-start analysis with SLP forced off. */
2850start_over:
2851
2852   /* Apply the suggested unrolling factor; this was determined by the backend
2853      during finish_cost the first time we ran the analysis for this
2854      vector mode.  */
2855 if (applying_suggested_uf)
2856 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2857
2858 /* Now the vectorization factor is final. */
2859 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2860 gcc_assert (known_ne (vectorization_factor, 0U));
2861
2862 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2863 {
2864 dump_printf_loc (MSG_NOTE, vect_location,
2865 "vectorization_factor = ");
2866 dump_dec (MSG_NOTE, vectorization_factor);
2867 dump_printf (MSG_NOTE, ", niters = %wd\n",
2868 LOOP_VINFO_INT_NITERS (loop_vinfo));
2869 }
2870
2871   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2872
2873 /* Analyze the alignment of the data-refs in the loop.
2874 Fail if a data reference is found that cannot be vectorized. */
2875
2876 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2877 if (!ok)
2878 {
2879 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2881 "bad data alignment.\n");
2882 return ok;
2883 }
2884
2885 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2886 It is important to call pruning after vect_analyze_data_ref_accesses,
2887 since we use grouping information gathered by interleaving analysis. */
2888 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2889 if (!ok)
2890 return ok;
2891
2892 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2893 vectorization, since we do not want to add extra peeling or
2894 add versioning for alignment. */
2895 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2896 /* This pass will decide on using loop versioning and/or loop peeling in
2897 order to enhance the alignment of data references in the loop. */
2898 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2899 if (!ok)
2900 return ok;
2901
2902 if (slp)
2903 {
2904 /* Analyze operations in the SLP instances. Note this may
2905 remove unsupported SLP instances which makes the above
2906 SLP kind detection invalid. */
2907 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2908 vect_slp_analyze_operations (loop_vinfo);
2909 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2910 {
2911 	  ok = opt_result::failure_at (vect_location,
2912 				       "unsupported SLP instances\n");
2913 goto again;
2914 }
2915
2916 /* Check whether any load in ALL SLP instances is possibly permuted. */
2917 slp_tree load_node, slp_root;
2918 unsigned i, x;
2919 slp_instance instance;
2920 bool can_use_lanes = true;
2921 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2922 {
2923 slp_root = SLP_INSTANCE_TREE (instance);
2924 int group_size = SLP_TREE_LANES (slp_root);
2925 tree vectype = SLP_TREE_VECTYPE (slp_root);
2926 bool loads_permuted = false;
2927 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2928 {
2929 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2930 continue;
2931 unsigned j;
2932 stmt_vec_info load_info;
2933 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2934 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2935 {
2936 loads_permuted = true;
2937 break;
2938 }
2939 }
2940
2941 /* If the loads and stores can be handled with load/store-lane
2942 instructions record it and move on to the next instance. */
2943 if (loads_permuted
2944 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2945 && vect_store_lanes_supported (vectype, group_size, false)
2946 != IFN_LAST)
2947 {
2948 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2949 if (STMT_VINFO_GROUPED_ACCESS
2950 (SLP_TREE_REPRESENTATIVE (load_node)))
2951 {
2952 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2953 (SLP_TREE_REPRESENTATIVE (load_node));
2954 /* Use SLP for strided accesses (or if we can't
2955 load-lanes). */
2956 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2957 || vect_load_lanes_supported
2958 (STMT_VINFO_VECTYPE (stmt_vinfo),
2959 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2960 break;
2961 }
2962
2963 can_use_lanes
2964 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2965
2966 if (can_use_lanes && dump_enabled_p ())
2967 dump_printf_loc (MSG_NOTE, vect_location,
2968 "SLP instance %p can use load/store-lanes\n",
2969 (void *) instance);
2970 }
2971 else
2972 {
2973 can_use_lanes = false;
2974 break;
2975 }
2976 }
2977
2978 /* If all SLP instances can use load/store-lanes abort SLP and try again
2979 with SLP disabled. */
2980 if (can_use_lanes)
2981 {
2982 	  ok = opt_result::failure_at (vect_location,
2983 				       "Built SLP cancelled: can use "
2984 "load/store-lanes\n");
2985 if (dump_enabled_p ())
2986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2987 "Built SLP cancelled: all SLP instances support "
2988 "load/store-lanes\n");
2989 goto again;
2990 }
2991 }
2992
2993 /* Dissolve SLP-only groups. */
2994 vect_dissolve_slp_only_groups (loop_vinfo);
2995
2996 /* Scan all the remaining operations in the loop that are not subject
2997 to SLP and make sure they are vectorizable. */
2998 ok = vect_analyze_loop_operations (loop_vinfo);
2999 if (!ok)
3000 {
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3003 "bad operation or unsupported loop bound.\n");
3004 return ok;
3005 }
3006
3007   /* For now, we don't expect to mix both masking and length approaches for
3008      one loop; disable partial vectors if both are recorded.  */
3009 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3010 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3011 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3012 {
3013 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3015 "can't vectorize a loop with partial vectors"
3016 " because we don't expect to mix different"
3017 " approaches with partial vectors for the"
3018 " same loop.\n");
3019 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3020 }
3021
3022 /* If we still have the option of using partial vectors,
3023 check whether we can generate the necessary loop controls. */
3024 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3025 {
3026 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3027 {
3028 if (!vect_verify_full_masking (loop_vinfo)
3029 && !vect_verify_full_masking_avx512 (loop_vinfo))
3030 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3031 }
3032 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3033 if (!vect_verify_loop_lens (loop_vinfo))
3034 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3035 }
3036
3037   /* If we're vectorizing a loop that uses length "controls" and
3038      can iterate more than once, we apply the decrementing IV approach
3039      in the loop control.  */
3040 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3041 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3042 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3043 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3044 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3045 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3046 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3047
3048 /* If a loop uses length controls and has a decrementing loop control IV,
3049      we will normally pass that IV through a MIN_EXPR to calculate the
3050 basis for the length controls. E.g. in a loop that processes one
3051 element per scalar iteration, the number of elements would be
3052 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3053
3054 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3055 step, since only the final iteration of the vector loop can have
3056 inactive lanes.
3057
3058 However, some targets have a dedicated instruction for calculating the
3059 preferred length, given the total number of elements that still need to
3060 be processed. This is encapsulated in the SELECT_VL internal function.
3061
3062 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3063 to determine the basis for the length controls. However, unlike the
3064 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3065 lanes inactive in any iteration of the vector loop, not just the last
3066 iteration. This SELECT_VL approach therefore requires us to use pointer
3067 IVs with variable steps.
3068
3069 Once we've decided how many elements should be processed by one
3070 iteration of the vector loop, we need to populate the rgroup controls.
3071 If a loop has multiple rgroups, we need to make sure that those rgroups
3072 "line up" (that is, they must be consistent about which elements are
3073 active and which aren't). This is done by vect_adjust_loop_lens_control.
3074
3075 In principle, it would be possible to use vect_adjust_loop_lens_control
3076 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3077 However:
3078
3079 (1) In practice, it only makes sense to use SELECT_VL when a vector
3080 operation will be controlled directly by the result. It is not
3081 worth using SELECT_VL if it would only be the input to other
3082 calculations.
3083
3084 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3085 pointer IV will need N updates by a variable amount (N-1 updates
3086 within the iteration and 1 update to move to the next iteration).
3087
3088 Because of this, we prefer to use the MIN_EXPR approach whenever there
3089 is more than one length control.
3090
3091 In addition, SELECT_VL always operates to a granularity of 1 unit.
3092 If we wanted to use it to control an SLP operation on N consecutive
3093 elements, we would need to make the SELECT_VL inputs measure scalar
3094 iterations (rather than elements) and then multiply the SELECT_VL
3095 result by N. But using SELECT_VL this way is inefficient because
3096 of (1) above.
3097
3098      We also don't apply SELECT_VL to a single-rgroup loop when both of the
3099      following are satisfied:
3100 
3101      (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3102      (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3103 
3104      Since SELECT_VL (with its variable step) would make SCEV analysis fail,
3105      we would lose the benefit of subsequent unroll optimizations.  We prefer
3106      using the MIN_EXPR approach in this situation.  */
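  /* Illustrative example (not part of the original sources): with the
     MIN_EXPR scheme and a loop processing one element per scalar iteration
     at VF = 4, the per-iteration length would be computed roughly as

	 len = MIN_EXPR <remaining, 4>;
	 remaining = remaining - len;

     so only the final vector iteration can have inactive lanes, whereas a
     SELECT_VL target may legally return fewer than 4 active lanes in any
     iteration.  */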
3107 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3108 {
3109 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3110 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3111 OPTIMIZE_FOR_SPEED)
3112 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3113 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3114 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3115 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3116 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3117 }
3118
3119 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3120 assuming that the loop will be used as a main loop. We will redo
3121 this analysis later if we instead decide to use the loop as an
3122 epilogue loop. */
3123 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3124 if (!ok)
3125 return ok;
3126
3127 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3128 to be able to handle fewer than VF scalars, or needs to have a lower VF
3129 than the main loop. */
3130 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3131 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3132 {
3133 poly_uint64 unscaled_vf
3134 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3135 		     orig_loop_vinfo->suggested_unroll_factor);
3136 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3137       return opt_result::failure_at (vect_location,
3138 				     "Vectorization factor too high for"
3139 " epilogue loop.\n");
3140 }
3141
3142 /* Check the costings of the loop make vectorizing worthwhile. */
3143 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3144 if (res < 0)
3145 {
3146       ok = opt_result::failure_at (vect_location,
3147 				   "Loop costings may not be worthwhile.\n");
3148 goto again;
3149 }
3150 if (!res)
3151     return opt_result::failure_at (vect_location,
3152 				   "Loop costings not worthwhile.\n");
3153
3154 /* If an epilogue loop is required make sure we can create one. */
3155 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3156 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3157 {
3158 if (dump_enabled_p ())
3159 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3160 if (!vect_can_advance_ivs_p (loop_vinfo)
3161 || !slpeel_can_duplicate_loop_p (loop,
3162 LOOP_VINFO_IV_EXIT (loop_vinfo),
3163 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3164 {
3165 	  ok = opt_result::failure_at (vect_location,
3166 				       "not vectorized: can't create required "
3167 "epilog loop\n");
3168 goto again;
3169 }
3170 }
3171
3172   /* During peeling, we need to check whether the number of loop iterations
3173      is enough for both the peeled prolog loop and the vector loop.  This check
3174 can be merged along with threshold check of loop versioning, so
3175 increase threshold for this case if necessary.
3176
3177 If we are analyzing an epilogue we still want to check what its
3178 versioning threshold would be. If we decide to vectorize the epilogues we
3179 will want to use the lowest versioning threshold of all epilogues and main
3180 loop. This will enable us to enter a vectorized epilogue even when
3181 versioning the loop. We can't simply check whether the epilogue requires
3182 versioning though since we may have skipped some versioning checks when
3183 analyzing the epilogue. For instance, checks for alias versioning will be
3184 skipped when dealing with epilogues as we assume we already checked them
3185 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3186 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3187 {
3188 poly_uint64 niters_th = 0;
3189 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3190
3191 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3192 {
3193 /* Niters for peeled prolog loop. */
3194 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3195 {
3196 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3197 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3198 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3199 }
3200 else
3201 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3202 }
3203
3204 /* Niters for at least one iteration of vectorized loop. */
3205 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3206 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3207 /* One additional iteration because of peeling for gap. */
3208 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3209 niters_th += 1;
3210
3211 /* Use the same condition as vect_transform_loop to decide when to use
3212 the cost to determine a versioning threshold. */
3213 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3214 	  && ordered_p (th, niters_th))
3215 	niters_th = ordered_max (poly_uint64 (th), niters_th);
3216
3217 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3218 }
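  /* As a rough illustration of the threshold computation above (made-up
     numbers): with a vector type of 8 elements and an unknown amount of
     peeling for alignment, niters_th gains 8 - 1 = 7; requiring at least
     one full vector iteration adds another 8 and peeling for gaps adds 1,
     giving 16.  If the runtime profitability check applies and the cost
     model threshold is, say, 20, the versioning threshold becomes
     MAX (20, 16) = 20.  */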
3219
3220 gcc_assert (known_eq (vectorization_factor,
3221 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3222
3223 slp_done_for_suggested_uf = slp;
3224
3225 /* Ok to vectorize! */
3226 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3227 return opt_result::success ();
3228
3229again:
3230 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3231 gcc_assert (!ok);
3232
3233 /* Try again with SLP forced off but if we didn't do any SLP there is
3234 no point in re-trying. */
3235 if (!slp)
3236 return ok;
3237
3238 /* If the SLP decision was true when the suggested unroll factor was
3239 worked out, and we are now applying that unroll factor, there is no
3240 point in re-trying. */
3241 if (applying_suggested_uf && slp_done_for_suggested_uf)
3242 return ok;
3243
3244 /* If there are reduction chains re-trying will fail anyway. */
3245 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3246 return ok;
3247
3248 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3249 via interleaving or lane instructions. */
3250 slp_instance instance;
3251 slp_tree node;
3252 unsigned i, j;
3253 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3254 {
3255 stmt_vec_info vinfo;
3256 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3257 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3258 continue;
3259 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3260 unsigned int size = DR_GROUP_SIZE (vinfo);
3261 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3262 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3263 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3264 && ! vect_grouped_store_supported (vectype, size))
3265 return opt_result::failure_at (loc: vinfo->stmt,
3266 fmt: "unsupported grouped store\n");
3267 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3268 {
3269 vinfo = SLP_TREE_REPRESENTATIVE (node);
3270 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3271 {
3272 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3273 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3274 size = DR_GROUP_SIZE (vinfo);
3275 vectype = STMT_VINFO_VECTYPE (vinfo);
3276 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3277 && ! vect_grouped_load_supported (vectype, single_element_p,
3278 size))
3279 return opt_result::failure_at (loc: vinfo->stmt,
3280 fmt: "unsupported grouped load\n");
3281 }
3282 }
3283 }
3284
3285 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_NOTE, vect_location,
3287 "re-trying with SLP disabled\n");
3288
3289 /* Roll back state appropriately. No SLP this time. */
3290 slp = false;
3291 /* Restore the vectorization factor as it was without SLP. */
3292 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3293 /* Free the SLP instances. */
3294 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3295 vect_free_slp_instance (instance);
3296 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3297 /* Reset SLP type to loop_vect on all stmts. */
3298 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3299 {
3300 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3301 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3302 !gsi_end_p (i: si); gsi_next (i: &si))
3303 {
3304 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (i: si));
3305 STMT_SLP_TYPE (stmt_info) = loop_vect;
3306 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3307 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3308 {
3309 /* vectorizable_reduction adjusts reduction stmt def-types,
3310 restore them to that of the PHI. */
3311 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3312 = STMT_VINFO_DEF_TYPE (stmt_info);
3313 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3314 (STMT_VINFO_REDUC_DEF (stmt_info)))
3315 = STMT_VINFO_DEF_TYPE (stmt_info);
3316 }
3317 }
3318 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3319 !gsi_end_p (i: si); gsi_next (i: &si))
3320 {
3321 if (is_gimple_debug (gs: gsi_stmt (i: si)))
3322 continue;
3323 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (i: si));
3324 STMT_SLP_TYPE (stmt_info) = loop_vect;
3325 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3326 {
3327 stmt_vec_info pattern_stmt_info
3328 = STMT_VINFO_RELATED_STMT (stmt_info);
3329 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3330 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3331
3332 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3333 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3334 for (gimple_stmt_iterator pi = gsi_start (seq&: pattern_def_seq);
3335 !gsi_end_p (i: pi); gsi_next (i: &pi))
3336 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3337 = loop_vect;
3338 }
3339 }
3340 }
3341 /* Free optimized alias test DDRS. */
3342 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (size: 0);
3343 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3344 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3345 /* Reset target cost data. */
3346 delete loop_vinfo->vector_costs;
3347 loop_vinfo->vector_costs = nullptr;
3348 /* Reset accumulated rgroup information. */
3349 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3350 release_vec_loop_controls (controls: &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3351 release_vec_loop_controls (controls: &LOOP_VINFO_LENS (loop_vinfo));
3352 /* Reset assorted flags. */
3353 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3354 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3355 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3356 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3357 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3358 = saved_can_use_partial_vectors_p;
3359
3360 goto start_over;
3361}
3362
3363/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3364 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3365 OLD_LOOP_VINFO is better unless something specifically indicates
3366 otherwise.
3367
3368 Note that this deliberately isn't a partial order. */
3369
3370static bool
3371vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3372 loop_vec_info old_loop_vinfo)
3373{
3374 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3375 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3376
3377 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3378 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3379
3380 /* Always prefer a VF of loop->simdlen over any other VF. */
3381 if (loop->simdlen)
3382 {
3383 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3384 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3385 if (new_simdlen_p != old_simdlen_p)
3386 return new_simdlen_p;
3387 }
3388
3389 const auto *old_costs = old_loop_vinfo->vector_costs;
3390 const auto *new_costs = new_loop_vinfo->vector_costs;
3391 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3392 return new_costs->better_epilogue_loop_than_p (other: old_costs, main_loop);
3393
3394 return new_costs->better_main_loop_than_p (other: old_costs);
3395}
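/* Roughly, the preference above works as follows: if the user requested a
   specific length, e.g. via

     #pragma omp simd simdlen(8)

   then a candidate whose VF is 8 wins over any other candidate regardless
   of cost; otherwise the decision is delegated to the target's cost hooks,
   with epilogue candidates compared via better_epilogue_loop_than_p when
   OLD_LOOP_VINFO is itself the epilogue of a main loop.  */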
3396
3397/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3398 true if we should. */
3399
3400static bool
3401vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3402 loop_vec_info old_loop_vinfo)
3403{
3404 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3405 return false;
3406
3407 if (dump_enabled_p ())
3408 dump_printf_loc (MSG_NOTE, vect_location,
3409 "***** Preferring vector mode %s to vector mode %s\n",
3410 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3411 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3412 return true;
3413}
3414
3415/* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3416 not NULL. Set AUTODETECTED_VECTOR_MODE when VECTOR_MODES[MODE_I] is
3417 VOIDmode, and advance MODE_I to the next mode worth analyzing.
3418 Return the loop_vinfo on success and wrapped null on failure. */
3419
3420static opt_loop_vec_info
3421vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3422 const vect_loop_form_info *loop_form_info,
3423 loop_vec_info main_loop_vinfo,
3424 const vector_modes &vector_modes, unsigned &mode_i,
3425 machine_mode &autodetected_vector_mode,
3426 bool &fatal)
3427{
3428 loop_vec_info loop_vinfo
3429 = vect_create_loop_vinfo (loop, shared, info: loop_form_info, main_loop_info: main_loop_vinfo);
3430
3431 machine_mode vector_mode = vector_modes[mode_i];
3432 loop_vinfo->vector_mode = vector_mode;
3433 unsigned int suggested_unroll_factor = 1;
3434 bool slp_done_for_suggested_uf = false;
3435
3436 /* Run the main analysis. */
3437 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3438 suggested_unroll_factor: &suggested_unroll_factor,
3439 slp_done_for_suggested_uf);
3440 if (dump_enabled_p ())
3441 dump_printf_loc (MSG_NOTE, vect_location,
3442 "***** Analysis %s with vector mode %s\n",
3443 res ? "succeeded" : " failed",
3444 GET_MODE_NAME (loop_vinfo->vector_mode));
3445
3446 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3447 {
3448 if (dump_enabled_p ())
3449 dump_printf_loc (MSG_NOTE, vect_location,
3450 "***** Re-trying analysis for unrolling"
3451 " with unroll factor %d and slp %s.\n",
3452 suggested_unroll_factor,
3453 slp_done_for_suggested_uf ? "on" : "off");
3454 loop_vec_info unroll_vinfo
3455 = vect_create_loop_vinfo (loop, shared, info: loop_form_info, main_loop_info: main_loop_vinfo);
3456 unroll_vinfo->vector_mode = vector_mode;
3457 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3458 opt_result new_res = vect_analyze_loop_2 (loop_vinfo: unroll_vinfo, fatal, NULL,
3459 slp_done_for_suggested_uf);
3460 if (new_res)
3461 {
3462 delete loop_vinfo;
3463 loop_vinfo = unroll_vinfo;
3464 }
3465 else
3466 delete unroll_vinfo;
3467 }
3468
3469 /* Remember the autodetected vector mode. */
3470 if (vector_mode == VOIDmode)
3471 autodetected_vector_mode = loop_vinfo->vector_mode;
3472
3473 /* Advance mode_i, first skipping modes that would result in the
3474 same analysis result. */
3475 while (mode_i + 1 < vector_modes.length ()
3476 && vect_chooses_same_modes_p (loop_vinfo,
3477 vector_modes[mode_i + 1]))
3478 {
3479 if (dump_enabled_p ())
3480 dump_printf_loc (MSG_NOTE, vect_location,
3481 "***** The result for vector mode %s would"
3482 " be the same\n",
3483 GET_MODE_NAME (vector_modes[mode_i + 1]));
3484 mode_i += 1;
3485 }
3486 if (mode_i + 1 < vector_modes.length ()
3487 && VECTOR_MODE_P (autodetected_vector_mode)
3488 && (related_vector_mode (vector_modes[mode_i + 1],
3489 GET_MODE_INNER (autodetected_vector_mode))
3490 == autodetected_vector_mode)
3491 && (related_vector_mode (autodetected_vector_mode,
3492 GET_MODE_INNER (vector_modes[mode_i + 1]))
3493 == vector_modes[mode_i + 1]))
3494 {
3495 if (dump_enabled_p ())
3496 dump_printf_loc (MSG_NOTE, vect_location,
3497 "***** Skipping vector mode %s, which would"
3498 " repeat the analysis for %s\n",
3499 GET_MODE_NAME (vector_modes[mode_i + 1]),
3500 GET_MODE_NAME (autodetected_vector_mode));
3501 mode_i += 1;
3502 }
3503 mode_i++;
3504
3505 if (!res)
3506 {
3507 delete loop_vinfo;
3508 if (fatal)
3509 gcc_checking_assert (main_loop_vinfo == NULL);
3510 return opt_loop_vec_info::propagate_failure (other: res);
3511 }
3512
3513 return opt_loop_vec_info::success (ptr: loop_vinfo);
3514}
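/* For instance, on a target where 128-bit vectors are autodetected,
   related_vector_mode (V8HImode, QImode) is V16QImode and
   related_vector_mode (V16QImode, HImode) is V8HImode, so once V16QImode
   has been analyzed the V8HImode entry would only repeat the same analysis
   and MODE_I is advanced past it.  */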
3515
3516/* Function vect_analyze_loop.
3517
3518 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3519 for it. The different analyses will record information in the
3520 loop_vec_info struct. */
3521opt_loop_vec_info
3522vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3523{
3524 DUMP_VECT_SCOPE ("analyze_loop_nest");
3525
3526 if (loop_outer (loop)
3527 && loop_vec_info_for_loop (loop: loop_outer (loop))
3528 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3529 return opt_loop_vec_info::failure_at (loc: vect_location,
3530 fmt: "outer-loop already vectorized.\n");
3531
3532 if (!find_loop_nest (loop, &shared->loop_nest))
3533 return opt_loop_vec_info::failure_at
3534 (loc: vect_location,
3535 fmt: "not vectorized: loop nest containing two or more consecutive inner"
3536 " loops cannot be vectorized\n");
3537
3538 /* Analyze the loop form. */
3539 vect_loop_form_info loop_form_info;
3540 opt_result res = vect_analyze_loop_form (loop, info: &loop_form_info);
3541 if (!res)
3542 {
3543 if (dump_enabled_p ())
3544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3545 "bad loop form.\n");
3546 return opt_loop_vec_info::propagate_failure (other: res);
3547 }
3548 if (!integer_onep (loop_form_info.assumptions))
3549 {
3550 /* We consider vectorizing this loop by versioning it under
3551 some assumptions. In order to do this, we need to clear
3552 existing information computed by scev and niter analyzer. */
3553 scev_reset_htab ();
3554 free_numbers_of_iterations_estimates (loop);
3555 /* Also set a flag for this loop so that subsequent scev and niter
3556 analyses are done under the assumptions. */
3557 loop_constraint_set (loop, LOOP_C_FINITE);
3558 }
3559
3560 auto_vector_modes vector_modes;
3561 /* Autodetect first vector size we try. */
3562 vector_modes.safe_push (VOIDmode);
3563 unsigned int autovec_flags
3564 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3565 loop->simdlen != 0);
3566 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3567 && !unlimited_cost_model (loop));
3568 machine_mode autodetected_vector_mode = VOIDmode;
3569 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3570 unsigned int mode_i = 0;
3571 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3572
3573 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3574 a mode has not been analyzed. */
3575 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3576 for (unsigned i = 0; i < vector_modes.length (); ++i)
3577 cached_vf_per_mode.safe_push (obj: 0);
3578
3579 /* First determine the main loop vectorization mode, either the first
3580 one that works, starting with auto-detecting the vector mode and then
3581 following the target's order of preference, or the one with the
3582 lowest cost if pick_lowest_cost_p. */
3583 while (1)
3584 {
3585 bool fatal;
3586 unsigned int last_mode_i = mode_i;
3587 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3588 failed. */
3589 cached_vf_per_mode[last_mode_i] = -1;
3590 opt_loop_vec_info loop_vinfo
3591 = vect_analyze_loop_1 (loop, shared, loop_form_info: &loop_form_info,
3592 NULL, vector_modes, mode_i,
3593 autodetected_vector_mode, fatal);
3594 if (fatal)
3595 break;
3596
3597 if (loop_vinfo)
3598 {
3599 /* Analysis has been successful, so update the VF value. The
3600 VF should always be a multiple of unroll_factor and we want to
3601 capture the original VF here. */
3602 cached_vf_per_mode[last_mode_i]
3603 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3604 b: loop_vinfo->suggested_unroll_factor);
3605 /* Once we hit the desired simdlen for the first time,
3606 discard any previous attempts. */
3607 if (simdlen
3608 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3609 {
3610 delete first_loop_vinfo;
3611 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3612 simdlen = 0;
3613 }
3614 else if (pick_lowest_cost_p
3615 && first_loop_vinfo
3616 && vect_joust_loop_vinfos (new_loop_vinfo: loop_vinfo, old_loop_vinfo: first_loop_vinfo))
3617 {
3618 /* Pick loop_vinfo over first_loop_vinfo. */
3619 delete first_loop_vinfo;
3620 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3621 }
3622 if (first_loop_vinfo == NULL)
3623 first_loop_vinfo = loop_vinfo;
3624 else
3625 {
3626 delete loop_vinfo;
3627 loop_vinfo = opt_loop_vec_info::success (NULL);
3628 }
3629
3630 /* Commit to first_loop_vinfo if we have no reason to try
3631 alternatives. */
3632 if (!simdlen && !pick_lowest_cost_p)
3633 break;
3634 }
3635 if (mode_i == vector_modes.length ()
3636 || autodetected_vector_mode == VOIDmode)
3637 break;
3638
3639 /* Try the next biggest vector size. */
3640 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location,
3642 "***** Re-trying analysis with vector mode %s\n",
3643 GET_MODE_NAME (vector_modes[mode_i]));
3644 }
3645 if (!first_loop_vinfo)
3646 return opt_loop_vec_info::propagate_failure (other: res);
3647
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_NOTE, vect_location,
3650 "***** Choosing vector mode %s\n",
3651 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3652
3653 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3654 enabled, SIMDUID is not set, it is the innermost loop, the main loop
3655 actually needs an epilogue (PEELING_FOR_NITER) and we have either
3656 already found the loop's SIMDLEN or there was no SIMDLEN to begin with.
3657 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3658 bool vect_epilogues = (!simdlen
3659 && loop->inner == NULL
3660 && param_vect_epilogues_nomask
3661 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3662 && !loop->simduid);
3663 if (!vect_epilogues)
3664 return first_loop_vinfo;
3665
3666 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3667 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3668
3669 /* For epilogues start the analysis from the first mode. The motivation
3670 behind starting from the beginning comes from cases where the VECTOR_MODES
3671 array may contain length-agnostic and length-specific modes. Their
3672 ordering is not guaranteed, so we could end up picking a mode for the main
3673 loop that is after the epilogue's optimal mode. */
3674 vector_modes[0] = autodetected_vector_mode;
3675 mode_i = 0;
3676
3677 bool supports_partial_vectors =
3678 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3679 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3680
3681 while (1)
3682 {
3683 /* If the target does not support partial vectors we can shorten the
3684 number of modes to analyze for the epilogue, since there is no point
3685 in picking a mode that would lead to a VF at least as big as the
3686 FIRST_VINFO_VF. */
3687 if (!supports_partial_vectors
3688 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3689 {
3690 mode_i++;
3691 if (mode_i == vector_modes.length ())
3692 break;
3693 continue;
3694 }
3695
3696 if (dump_enabled_p ())
3697 dump_printf_loc (MSG_NOTE, vect_location,
3698 "***** Re-trying epilogue analysis with vector "
3699 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3700
3701 bool fatal;
3702 opt_loop_vec_info loop_vinfo
3703 = vect_analyze_loop_1 (loop, shared, loop_form_info: &loop_form_info,
3704 main_loop_vinfo: first_loop_vinfo,
3705 vector_modes, mode_i,
3706 autodetected_vector_mode, fatal);
3707 if (fatal)
3708 break;
3709
3710 if (loop_vinfo)
3711 {
3712 if (pick_lowest_cost_p)
3713 {
3714 /* Keep trying to roll back vectorization attempts while the
3715 loop_vec_infos they produced were worse than this one. */
3716 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3717 while (!vinfos.is_empty ()
3718 && vect_joust_loop_vinfos (new_loop_vinfo: loop_vinfo, old_loop_vinfo: vinfos.last ()))
3719 {
3720 gcc_assert (vect_epilogues);
3721 delete vinfos.pop ();
3722 }
3723 }
3724 /* For now only allow one epilogue loop. */
3725 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3726 {
3727 first_loop_vinfo->epilogue_vinfos.safe_push (obj: loop_vinfo);
3728 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3729 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3730 || maybe_ne (lowest_th, 0U));
3731 /* Keep track of the known smallest versioning
3732 threshold. */
3733 if (ordered_p (a: lowest_th, b: th))
3734 lowest_th = ordered_min (a: lowest_th, b: th);
3735 }
3736 else
3737 {
3738 delete loop_vinfo;
3739 loop_vinfo = opt_loop_vec_info::success (NULL);
3740 }
3741
3742 /* For now only allow one epilogue loop, but allow
3743 pick_lowest_cost_p to replace it, so commit to the
3744 first epilogue if we have no reason to try alternatives. */
3745 if (!pick_lowest_cost_p)
3746 break;
3747 }
3748
3749 if (mode_i == vector_modes.length ())
3750 break;
3751
3752 }
3753
3754 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3755 {
3756 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3757 if (dump_enabled_p ())
3758 dump_printf_loc (MSG_NOTE, vect_location,
3759 "***** Choosing epilogue vector mode %s\n",
3760 GET_MODE_NAME
3761 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3762 }
3763
3764 return first_loop_vinfo;
3765}
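/* A rough end-to-end picture of the driver above: on a target offering both
   512-bit and 128-bit vectors, the main loop might be chosen with the
   512-bit mode (say VF 16 for int elements), while the epilogue, which only
   has to handle the remaining < 16 iterations, is re-analyzed starting from
   the autodetected mode and may end up with the 128-bit mode and VF 4.  */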
3766
3767/* Return true if there is an in-order reduction function for CODE, storing
3768 it in *REDUC_FN if so. */
3769
3770static bool
3771fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3772{
3773 /* We support MINUS_EXPR by negating the operand. This also preserves an
3774 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3775 (-0.0) = -0.0. */
3776 if (code == PLUS_EXPR || code == MINUS_EXPR)
3777 {
3778 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3779 return true;
3780 }
3781 return false;
3782}
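/* For instance, an in-order reduction such as

     double r = init;
     for (i = 0; i < n; i++)
       r -= a[i];

   can be handled by IFN_FOLD_LEFT_PLUS after negating the loop input,
   i.e. by accumulating r += -a[i], which is why MINUS_EXPR maps to the
   same internal function as PLUS_EXPR above.  */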
3783
3784/* Function reduction_fn_for_scalar_code
3785
3786 Input:
3787 CODE - tree_code of a reduction operation.
3788
3789 Output:
3790 REDUC_FN - the corresponding internal function to be used to reduce the
3791 vector of partial results into a single scalar result, or IFN_LAST
3792 if the operation is a supported reduction operation, but does not have
3793 such an internal function.
3794
3795 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3796
3797bool
3798reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3799{
3800 if (code.is_tree_code ())
3801 switch (tree_code (code))
3802 {
3803 case MAX_EXPR:
3804 *reduc_fn = IFN_REDUC_MAX;
3805 return true;
3806
3807 case MIN_EXPR:
3808 *reduc_fn = IFN_REDUC_MIN;
3809 return true;
3810
3811 case PLUS_EXPR:
3812 *reduc_fn = IFN_REDUC_PLUS;
3813 return true;
3814
3815 case BIT_AND_EXPR:
3816 *reduc_fn = IFN_REDUC_AND;
3817 return true;
3818
3819 case BIT_IOR_EXPR:
3820 *reduc_fn = IFN_REDUC_IOR;
3821 return true;
3822
3823 case BIT_XOR_EXPR:
3824 *reduc_fn = IFN_REDUC_XOR;
3825 return true;
3826
3827 case MULT_EXPR:
3828 case MINUS_EXPR:
3829 *reduc_fn = IFN_LAST;
3830 return true;
3831
3832 default:
3833 return false;
3834 }
3835 else
3836 switch (combined_fn (code))
3837 {
3838 CASE_CFN_FMAX:
3839 *reduc_fn = IFN_REDUC_FMAX;
3840 return true;
3841
3842 CASE_CFN_FMIN:
3843 *reduc_fn = IFN_REDUC_FMIN;
3844 return true;
3845
3846 default:
3847 return false;
3848 }
3849}
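/* For example, for a PLUS_EXPR reduction the vector loop accumulates one
   partial sum per lane, say {s0, s1, s2, s3}, and IFN_REDUC_PLUS is then
   used in the epilogue to fold the lanes into the single scalar
   s0 + s1 + s2 + s3.  MULT_EXPR and MINUS_EXPR are supported as reductions
   but have no such lane-folding internal function, hence IFN_LAST.  */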
3850
3851/* If there is a neutral value X such that a reduction would not be affected
3852 by the introduction of additional X elements, return that X, otherwise
3853 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3854 of the scalar elements. If the reduction has just a single initial value
3855 then INITIAL_VALUE is that value, otherwise it is null.
3856 If AS_INITIAL is TRUE the value is to be used as the initial value;
3857 in that case a negative zero is never returned. */
3858
3859tree
3860neutral_op_for_reduction (tree scalar_type, code_helper code,
3861 tree initial_value, bool as_initial)
3862{
3863 if (code.is_tree_code ())
3864 switch (tree_code (code))
3865 {
3866 case DOT_PROD_EXPR:
3867 case SAD_EXPR:
3868 case MINUS_EXPR:
3869 case BIT_IOR_EXPR:
3870 case BIT_XOR_EXPR:
3871 return build_zero_cst (scalar_type);
3872 case WIDEN_SUM_EXPR:
3873 case PLUS_EXPR:
3874 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3875 return build_real (scalar_type, dconstm0);
3876 else
3877 return build_zero_cst (scalar_type);
3878
3879 case MULT_EXPR:
3880 return build_one_cst (scalar_type);
3881
3882 case BIT_AND_EXPR:
3883 return build_all_ones_cst (scalar_type);
3884
3885 case MAX_EXPR:
3886 case MIN_EXPR:
3887 return initial_value;
3888
3889 default:
3890 return NULL_TREE;
3891 }
3892 else
3893 switch (combined_fn (code))
3894 {
3895 CASE_CFN_FMIN:
3896 CASE_CFN_FMAX:
3897 return initial_value;
3898
3899 default:
3900 return NULL_TREE;
3901 }
3902}
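/* Illustrative neutral elements implied by the cases above: 0 for PLUS_EXPR,
   MINUS_EXPR, BIT_IOR_EXPR and BIT_XOR_EXPR; 1 for MULT_EXPR; an all-ones
   value for BIT_AND_EXPR; and for MIN_EXPR/MAX_EXPR (and FMIN/FMAX) only
   the initial value itself is safe, since extra copies of it cannot change
   the result.  For a PLUS_EXPR used mid-reduction with signed zeros
   honored, -0.0 is used rather than 0.0 because x + -0.0 == x for every x,
   whereas -0.0 + 0.0 == +0.0 would lose a negative-zero result.  */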
3903
3904/* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3905 STMT is printed with a message MSG. */
3906
3907static void
3908report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3909{
3910 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3911}
3912
3913 /* Return true if we need an in-order reduction for operation CODE
3914 on type TYPE, i.e. if reassociating the operation could change the
3915 result or trap, so that a fold-left reduction must be used. */
3916
3917bool
3918needs_fold_left_reduction_p (tree type, code_helper code)
3919{
3920 /* CHECKME: check for !flag_finite_math_only too? */
3921 if (SCALAR_FLOAT_TYPE_P (type))
3922 {
3923 if (code.is_tree_code ())
3924 switch (tree_code (code))
3925 {
3926 case MIN_EXPR:
3927 case MAX_EXPR:
3928 return false;
3929
3930 default:
3931 return !flag_associative_math;
3932 }
3933 else
3934 switch (combined_fn (code))
3935 {
3936 CASE_CFN_FMIN:
3937 CASE_CFN_FMAX:
3938 return false;
3939
3940 default:
3941 return !flag_associative_math;
3942 }
3943 }
3944
3945 if (INTEGRAL_TYPE_P (type))
3946 return (!code.is_tree_code ()
3947 || !operation_no_trapping_overflow (type, tree_code (code)));
3948
3949 if (SAT_FIXED_POINT_TYPE_P (type))
3950 return true;
3951
3952 return false;
3953}
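/* For example, without -fassociative-math the float accumulation

     float s = 0.0f;
     for (i = 0; i < n; i++)
       s += a[i];

   must be evaluated in the original left-to-right order and therefore needs
   a fold-left reduction, whereas fmin/fmax reductions (and plain sums under
   -fassociative-math) may be reassociated into partial per-lane reductions.
   Signed additions whose overflow traps, e.g. with -ftrapv, likewise force
   the in-order form.  */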
3954
3955 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3956 has a handled computation expression. Store the main reduction
3957 operation in *CODE. */
3958
3959static bool
3960check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3961 tree loop_arg, code_helper *code,
3962 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3963{
3964 auto_bitmap visited;
3965 tree lookfor = PHI_RESULT (phi);
3966 ssa_op_iter curri;
3967 use_operand_p curr = op_iter_init_phiuse (ptr: &curri, phi, SSA_OP_USE);
3968 while (USE_FROM_PTR (curr) != loop_arg)
3969 curr = op_iter_next_use (ptr: &curri);
3970 curri.i = curri.numops;
3971 do
3972 {
3973 path.safe_push (obj: std::make_pair (x&: curri, y&: curr));
3974 tree use = USE_FROM_PTR (curr);
3975 if (use == lookfor)
3976 break;
3977 gimple *def = SSA_NAME_DEF_STMT (use);
3978 if (gimple_nop_p (g: def)
3979 || ! flow_bb_inside_loop_p (loop, gimple_bb (g: def)))
3980 {
3981pop:
3982 do
3983 {
3984 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3985 curri = x.first;
3986 curr = x.second;
3987 do
3988 curr = op_iter_next_use (ptr: &curri);
3989 /* Skip already visited or non-SSA operands (from iterating
3990 over PHI args). */
3991 while (curr != NULL_USE_OPERAND_P
3992 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3993 || ! bitmap_set_bit (visited,
3994 SSA_NAME_VERSION
3995 (USE_FROM_PTR (curr)))));
3996 }
3997 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3998 if (curr == NULL_USE_OPERAND_P)
3999 break;
4000 }
4001 else
4002 {
4003 if (gimple_code (g: def) == GIMPLE_PHI)
4004 curr = op_iter_init_phiuse (ptr: &curri, phi: as_a <gphi *>(p: def), SSA_OP_USE);
4005 else
4006 curr = op_iter_init_use (ptr: &curri, stmt: def, SSA_OP_USE);
4007 while (curr != NULL_USE_OPERAND_P
4008 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4009 || ! bitmap_set_bit (visited,
4010 SSA_NAME_VERSION
4011 (USE_FROM_PTR (curr)))))
4012 curr = op_iter_next_use (ptr: &curri);
4013 if (curr == NULL_USE_OPERAND_P)
4014 goto pop;
4015 }
4016 }
4017 while (1);
4018 if (dump_file && (dump_flags & TDF_DETAILS))
4019 {
4020 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4021 unsigned i;
4022 std::pair<ssa_op_iter, use_operand_p> *x;
4023 FOR_EACH_VEC_ELT (path, i, x)
4024 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4025 dump_printf (MSG_NOTE, "\n");
4026 }
4027
4028 /* Check whether the reduction path detected is valid. */
4029 bool fail = path.length () == 0;
4030 bool neg = false;
4031 int sign = -1;
4032 *code = ERROR_MARK;
4033 for (unsigned i = 1; i < path.length (); ++i)
4034 {
4035 gimple *use_stmt = USE_STMT (path[i].second);
4036 gimple_match_op op;
4037 if (!gimple_extract_op (use_stmt, &op))
4038 {
4039 fail = true;
4040 break;
4041 }
4042 unsigned int opi = op.num_ops;
4043 if (gassign *assign = dyn_cast<gassign *> (p: use_stmt))
4044 {
4045 /* The following makes sure we can compute the operand index
4046 easily; it also mostly disallows chaining via COND_EXPR condition
4047 operands. */
4048 for (opi = 0; opi < op.num_ops; ++opi)
4049 if (gimple_assign_rhs1_ptr (gs: assign) + opi == path[i].second->use)
4050 break;
4051 }
4052 else if (gcall *call = dyn_cast<gcall *> (p: use_stmt))
4053 {
4054 for (opi = 0; opi < op.num_ops; ++opi)
4055 if (gimple_call_arg_ptr (gs: call, index: opi) == path[i].second->use)
4056 break;
4057 }
4058 if (opi == op.num_ops)
4059 {
4060 fail = true;
4061 break;
4062 }
4063 op.code = canonicalize_code (op.code, op.type);
4064 if (op.code == MINUS_EXPR)
4065 {
4066 op.code = PLUS_EXPR;
4067 /* Track whether we negate the reduction value each iteration. */
4068 if (op.ops[1] == op.ops[opi])
4069 neg = ! neg;
4070 }
4071 if (CONVERT_EXPR_CODE_P (op.code)
4072 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4073 ;
4074 else if (*code == ERROR_MARK)
4075 {
4076 *code = op.code;
4077 sign = TYPE_SIGN (op.type);
4078 }
4079 else if (op.code != *code)
4080 {
4081 fail = true;
4082 break;
4083 }
4084 else if ((op.code == MIN_EXPR
4085 || op.code == MAX_EXPR)
4086 && sign != TYPE_SIGN (op.type))
4087 {
4088 fail = true;
4089 break;
4090 }
4091 /* Check that the op is used on only a single stmt. For the
4092 non-value-changing tail and the last stmt allow out-of-loop uses.
4093 ??? We could relax this and handle arbitrary live stmts by
4094 forcing a scalar epilogue for example. */
4095 imm_use_iterator imm_iter;
4096 use_operand_p use_p;
4097 gimple *op_use_stmt;
4098 unsigned cnt = 0;
4099 bool cond_fn_p = op.code.is_internal_fn ()
4100 && (conditional_internal_fn_code (internal_fn (op.code))
4101 != ERROR_MARK);
4102
4103 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4104 {
4105 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4106 op1 twice (once as definition, once as else) in the same operation.
4107 Allow this. */
4108 if (cond_fn_p)
4109 {
4110 gcall *call = dyn_cast<gcall *> (p: use_stmt);
4111 unsigned else_pos
4112 = internal_fn_else_index (internal_fn (op.code));
4113
4114 for (unsigned int j = 0; j < gimple_call_num_args (gs: call); ++j)
4115 {
4116 if (j == else_pos)
4117 continue;
4118 if (gimple_call_arg (gs: call, index: j) == op.ops[opi])
4119 cnt++;
4120 }
4121 }
4122 else if (!is_gimple_debug (gs: op_use_stmt)
4123 && (*code != ERROR_MARK
4124 || flow_bb_inside_loop_p (loop,
4125 gimple_bb (g: op_use_stmt))))
4126 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4127 cnt++;
4128 }
4129
4130 if (cnt != 1)
4131 {
4132 fail = true;
4133 break;
4134 }
4135 }
4136 return ! fail && ! neg && *code != ERROR_MARK;
4137}
4138
4139bool
4140check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4141 tree loop_arg, enum tree_code code)
4142{
4143 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4144 code_helper code_;
4145 return (check_reduction_path (loc, loop, phi, loop_arg, code: &code_, path)
4146 && code_ == code);
4147}
4148
4149
4150
4151/* Function vect_is_simple_reduction
4152
4153 (1) Detect a cross-iteration def-use cycle that represents a simple
4154 reduction computation. We look for the following pattern:
4155
4156 loop_header:
4157 a1 = phi < a0, a2 >
4158 a3 = ...
4159 a2 = operation (a3, a1)
4160
4161 or
4162
4163 a3 = ...
4164 loop_header:
4165 a1 = phi < a0, a2 >
4166 a2 = operation (a3, a1)
4167
4168 such that:
4169 1. operation is commutative and associative and it is safe to
4170 change the order of the computation
4171 2. no uses for a2 in the loop (a2 is used out of the loop)
4172 3. no uses of a1 in the loop besides the reduction operation
4173 4. no uses of a1 outside the loop.
4174
4175 Conditions 1,4 are tested here.
4176 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4177
4178 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4179 nested cycles.
4180
4181 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4182 reductions:
4183
4184 a1 = phi < a0, a2 >
4185 inner loop (def of a3)
4186 a2 = phi < a3 >
4187
4188 (4) Detect condition expressions, i.e.:
4189 for (int i = 0; i < N; i++)
4190 if (a[i] < val)
4191 ret_val = a[i];
4192
4193*/
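/* As a concrete illustration of pattern (1) above, the loop

     int sum = init;
     for (i = 0; i < N; i++)
       sum += a[i];

   gives rise to the cycle

     loop_header:
       sum_1 = PHI <init (preheader), sum_2 (latch)>
       ...
       sum_2 = sum_1 + a[i];

   while pattern (4) boils down to a COND_EXPR that each iteration either
   keeps the PHI result or replaces it with the new candidate value.  */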
4194
4195static stmt_vec_info
4196vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4197 bool *double_reduc, bool *reduc_chain_p, bool slp)
4198{
4199 gphi *phi = as_a <gphi *> (p: phi_info->stmt);
4200 gimple *phi_use_stmt = NULL;
4201 imm_use_iterator imm_iter;
4202 use_operand_p use_p;
4203
4204 *double_reduc = false;
4205 *reduc_chain_p = false;
4206 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4207
4208 tree phi_name = PHI_RESULT (phi);
4209 /* ??? If there are no uses of the PHI result the inner loop reduction
4210 won't be detected as possibly double-reduction by vectorizable_reduction
4211 because that tries to walk the PHI arg from the preheader edge which
4212 can be constant. See PR60382. */
4213 if (has_zero_uses (var: phi_name))
4214 return NULL;
4215 class loop *loop = (gimple_bb (g: phi))->loop_father;
4216 unsigned nphi_def_loop_uses = 0;
4217 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4218 {
4219 gimple *use_stmt = USE_STMT (use_p);
4220 if (is_gimple_debug (gs: use_stmt))
4221 continue;
4222
4223 if (!flow_bb_inside_loop_p (loop, gimple_bb (g: use_stmt)))
4224 {
4225 if (dump_enabled_p ())
4226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4227 "intermediate value used outside loop.\n");
4228
4229 return NULL;
4230 }
4231
4232 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4233 op1 twice (once as definition, once as else) in the same operation.
4234 Only count it as one. */
4235 if (use_stmt != phi_use_stmt)
4236 {
4237 nphi_def_loop_uses++;
4238 phi_use_stmt = use_stmt;
4239 }
4240 }
4241
4242 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4243 if (TREE_CODE (latch_def) != SSA_NAME)
4244 {
4245 if (dump_enabled_p ())
4246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4247 "reduction: not ssa_name: %T\n", latch_def);
4248 return NULL;
4249 }
4250
4251 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4252 if (!def_stmt_info
4253 || !flow_bb_inside_loop_p (loop, gimple_bb (g: def_stmt_info->stmt)))
4254 return NULL;
4255
4256 bool nested_in_vect_loop
4257 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4258 unsigned nlatch_def_loop_uses = 0;
4259 auto_vec<gphi *, 3> lcphis;
4260 bool inner_loop_of_double_reduc = false;
4261 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4262 {
4263 gimple *use_stmt = USE_STMT (use_p);
4264 if (is_gimple_debug (gs: use_stmt))
4265 continue;
4266 if (flow_bb_inside_loop_p (loop, gimple_bb (g: use_stmt)))
4267 nlatch_def_loop_uses++;
4268 else
4269 {
4270 /* We can have more than one loop-closed PHI. */
4271 lcphis.safe_push (obj: as_a <gphi *> (p: use_stmt));
4272 if (nested_in_vect_loop
4273 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4274 == vect_double_reduction_def))
4275 inner_loop_of_double_reduc = true;
4276 }
4277 }
4278
4279 /* If we are vectorizing an inner reduction, it is executed in the
4280 original order only when we are not dealing with a double
4281 reduction. */
4282 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4283 {
4284 if (dump_enabled_p ())
4285 report_vect_op (msg_type: MSG_NOTE, stmt: def_stmt_info->stmt,
4286 msg: "detected nested cycle: ");
4287 return def_stmt_info;
4288 }
4289
4290 /* When the inner loop of a double reduction ends up with more than
4291 one loop-closed PHI we have failed to classify alternate such
4292 PHIs as double reduction, leading to wrong code. See PR103237. */
4293 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4294 {
4295 if (dump_enabled_p ())
4296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4297 "unhandle double reduction\n");
4298 return NULL;
4299 }
4300
4301 /* If this isn't a nested cycle or if the nested cycle reduction value
4302 is used outside of the inner loop we cannot handle uses of the reduction
4303 value. */
4304 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4305 {
4306 if (dump_enabled_p ())
4307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4308 "reduction used in loop.\n");
4309 return NULL;
4310 }
4311
4312 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4313 defined in the inner loop. */
4314 if (gphi *def_stmt = dyn_cast <gphi *> (p: def_stmt_info->stmt))
4315 {
4316 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4317 if (gimple_phi_num_args (gs: def_stmt) != 1
4318 || TREE_CODE (op1) != SSA_NAME)
4319 {
4320 if (dump_enabled_p ())
4321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4322 "unsupported phi node definition.\n");
4323
4324 return NULL;
4325 }
4326
4327 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4328 and the latch definition op1. */
4329 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4330 if (gimple_bb (g: def1)
4331 && flow_bb_inside_loop_p (loop, gimple_bb (g: def_stmt))
4332 && loop->inner
4333 && flow_bb_inside_loop_p (loop->inner, gimple_bb (g: def1))
4334 && (is_gimple_assign (gs: def1) || is_gimple_call (gs: def1))
4335 && is_a <gphi *> (p: phi_use_stmt)
4336 && flow_bb_inside_loop_p (loop->inner, gimple_bb (g: phi_use_stmt))
4337 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4338 loop_latch_edge (loop->inner))))
4339 {
4340 if (dump_enabled_p ())
4341 report_vect_op (msg_type: MSG_NOTE, stmt: def_stmt,
4342 msg: "detected double reduction: ");
4343
4344 *double_reduc = true;
4345 return def_stmt_info;
4346 }
4347
4348 return NULL;
4349 }
4350
4351 /* Look for the expression computing latch_def from the loop PHI result. */
4352 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4353 code_helper code;
4354 if (check_reduction_path (loc: vect_location, loop, phi, loop_arg: latch_def, code: &code,
4355 path))
4356 {
4357 STMT_VINFO_REDUC_CODE (phi_info) = code;
4358 if (code == COND_EXPR && !nested_in_vect_loop)
4359 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4360
4361 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4362 reduction chain for which the additional restriction is that
4363 all operations in the chain are the same. */
4364 auto_vec<stmt_vec_info, 8> reduc_chain;
4365 unsigned i;
4366 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4367 for (i = path.length () - 1; i >= 1; --i)
4368 {
4369 gimple *stmt = USE_STMT (path[i].second);
4370 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4371 gimple_match_op op;
4372 if (!gimple_extract_op (stmt, &op))
4373 gcc_unreachable ();
4374 if (gassign *assign = dyn_cast<gassign *> (p: stmt))
4375 STMT_VINFO_REDUC_IDX (stmt_info)
4376 = path[i].second->use - gimple_assign_rhs1_ptr (gs: assign);
4377 else
4378 {
4379 gcall *call = as_a<gcall *> (p: stmt);
4380 STMT_VINFO_REDUC_IDX (stmt_info)
4381 = path[i].second->use - gimple_call_arg_ptr (gs: call, index: 0);
4382 }
4383 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4384 && (i == 1 || i == path.length () - 1));
4385 if ((op.code != code && !leading_conversion)
4386 /* We can only handle the final value in epilogue
4387 generation for reduction chains. */
4388 || (i != 1 && !has_single_use (var: gimple_get_lhs (stmt))))
4389 is_slp_reduc = false;
4390 /* For reduction chains we support trailing/leading
4391 conversions. We do not store those in the actual chain. */
4392 if (leading_conversion)
4393 continue;
4394 reduc_chain.safe_push (obj: stmt_info);
4395 }
4396 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4397 {
4398 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4399 {
4400 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4401 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4402 }
4403 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4404 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4405
4406 /* Save the chain for further analysis in SLP detection. */
4407 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (obj: reduc_chain[0]);
4408 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4409
4410 *reduc_chain_p = true;
4411 if (dump_enabled_p ())
4412 dump_printf_loc (MSG_NOTE, vect_location,
4413 "reduction: detected reduction chain\n");
4414 }
4415 else if (dump_enabled_p ())
4416 dump_printf_loc (MSG_NOTE, vect_location,
4417 "reduction: detected reduction\n");
4418
4419 return def_stmt_info;
4420 }
4421
4422 if (dump_enabled_p ())
4423 dump_printf_loc (MSG_NOTE, vect_location,
4424 "reduction: unknown pattern\n");
4425
4426 return NULL;
4427}
4428
4429/* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4430 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4431 or -1 if not known. */
4432
4433static int
4434vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4435{
4436 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4437 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4438 {
4439 if (dump_enabled_p ())
4440 dump_printf_loc (MSG_NOTE, vect_location,
4441 "cost model: epilogue peel iters set to vf/2 "
4442 "because loop iterations are unknown .\n");
4443 return assumed_vf / 2;
4444 }
4445 else
4446 {
4447 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4448 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4449 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4450 /* If we need to peel for gaps but the epilogue would otherwise peel
4451 nothing, we have to peel a full VF's worth of iterations. */
4452 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4453 peel_iters_epilogue = assumed_vf;
4454 return peel_iters_epilogue;
4455 }
4456}
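/* For instance, with an assumed VF of 8, a known iteration count of 103 and
   3 peeled prologue iterations, the epilogue peels (103 - 3) % 8 = 4
   iterations; if that remainder were 0 but peeling for gaps is required,
   a full 8 iterations would be peeled instead.  With an unknown iteration
   count the estimate falls back to VF / 2 = 4.  */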
4457
4458/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4459int
4460vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4461 int *peel_iters_epilogue,
4462 stmt_vector_for_cost *scalar_cost_vec,
4463 stmt_vector_for_cost *prologue_cost_vec,
4464 stmt_vector_for_cost *epilogue_cost_vec)
4465{
4466 int retval = 0;
4467
4468 *peel_iters_epilogue
4469 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4470
4471 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4472 {
4473 /* If peeled iterations are known but the number of scalar loop
4474 iterations is unknown, count a taken branch per peeled loop. */
4475 if (peel_iters_prologue > 0)
4476 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4477 vect_prologue);
4478 if (*peel_iters_epilogue > 0)
4479 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4480 vect_epilogue);
4481 }
4482
4483 stmt_info_for_cost *si;
4484 int j;
4485 if (peel_iters_prologue)
4486 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4487 retval += record_stmt_cost (body_cost_vec: prologue_cost_vec,
4488 count: si->count * peel_iters_prologue,
4489 kind: si->kind, stmt_info: si->stmt_info, misalign: si->misalign,
4490 where: vect_prologue);
4491 if (*peel_iters_epilogue)
4492 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4493 retval += record_stmt_cost (body_cost_vec: epilogue_cost_vec,
4494 count: si->count * *peel_iters_epilogue,
4495 kind: si->kind, stmt_info: si->stmt_info, misalign: si->misalign,
4496 where: vect_epilogue);
4497
4498 return retval;
4499}
4500
4501/* Function vect_estimate_min_profitable_iters
4502
4503 Return the number of iterations required for the vector version of the
4504 loop to be profitable relative to the cost of the scalar version of the
4505 loop.
4506
4507 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4508 of iterations for vectorization. A value of -1 means loop vectorization
4509 is not profitable. This returned value may be used for a dynamic
4510 profitability check.
4511
4512 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4513 for static check against estimated number of iterations. */
4514
4515static void
4516vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4517 int *ret_min_profitable_niters,
4518 int *ret_min_profitable_estimate,
4519 unsigned *suggested_unroll_factor)
4520{
4521 int min_profitable_iters;
4522 int min_profitable_estimate;
4523 int peel_iters_prologue;
4524 int peel_iters_epilogue;
4525 unsigned vec_inside_cost = 0;
4526 int vec_outside_cost = 0;
4527 unsigned vec_prologue_cost = 0;
4528 unsigned vec_epilogue_cost = 0;
4529 int scalar_single_iter_cost = 0;
4530 int scalar_outside_cost = 0;
4531 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4532 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4533 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4534
4535 /* Cost model disabled. */
4536 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4537 {
4538 if (dump_enabled_p ())
4539 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4540 *ret_min_profitable_niters = 0;
4541 *ret_min_profitable_estimate = 0;
4542 return;
4543 }
4544
4545 /* Requires loop versioning tests to handle misalignment. */
4546 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4547 {
4548 /* FIXME: Make cost depend on complexity of individual check. */
4549 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4550 (void) add_stmt_cost (costs: target_cost_data, count: len, kind: scalar_stmt, where: vect_prologue);
4551 if (dump_enabled_p ())
4552 dump_printf (MSG_NOTE,
4553 "cost model: Adding cost of checks for loop "
4554 "versioning to treat misalignment.\n");
4555 }
4556
4557 /* Requires loop versioning with alias checks. */
4558 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4559 {
4560 /* FIXME: Make cost depend on complexity of individual check. */
4561 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4562 (void) add_stmt_cost (costs: target_cost_data, count: len, kind: scalar_stmt, where: vect_prologue);
4563 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4564 if (len)
4565 /* Count LEN - 1 ANDs and LEN comparisons. */
4566 (void) add_stmt_cost (costs: target_cost_data, count: len * 2 - 1,
4567 kind: scalar_stmt, where: vect_prologue);
4568 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4569 if (len)
4570 {
4571 /* Count LEN - 1 ANDs and LEN comparisons. */
4572 unsigned int nstmts = len * 2 - 1;
4573 /* +1 for each bias that needs adding. */
4574 for (unsigned int i = 0; i < len; ++i)
4575 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4576 nstmts += 1;
4577 (void) add_stmt_cost (costs: target_cost_data, count: nstmts,
4578 kind: scalar_stmt, where: vect_prologue);
4579 }
4580 if (dump_enabled_p ())
4581 dump_printf (MSG_NOTE,
4582 "cost model: Adding cost of checks for loop "
4583 "versioning aliasing.\n");
4584 }
4585
4586 /* Requires loop versioning with niter checks. */
4587 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4588 {
4589 /* FIXME: Make cost depend on complexity of individual check. */
4590 (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: vector_stmt,
4591 NULL, NULL, NULL_TREE, misalign: 0, where: vect_prologue);
4592 if (dump_enabled_p ())
4593 dump_printf (MSG_NOTE,
4594 "cost model: Adding cost of checks for loop "
4595 "versioning niters.\n");
4596 }
4597
4598 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4599 (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: cond_branch_taken,
4600 where: vect_prologue);
4601
4602 /* Count statements in scalar loop. Using this as scalar cost for a single
4603 iteration for now.
4604
4605 TODO: Add outer loop support.
4606
4607 TODO: Consider assigning different costs to different scalar
4608 statements. */
4609
4610 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4611
4612 /* Add additional cost for the peeled instructions in prologue and epilogue
4613 loop. (For fully-masked loops there will be no peeling.)
4614
4615 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4616 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4617
4618 TODO: Build an expression that represents peel_iters for prologue and
4619 epilogue to be used in a run-time test. */
4620
4621 bool prologue_need_br_taken_cost = false;
4622 bool prologue_need_br_not_taken_cost = false;
4623
4624 /* Calculate peel_iters_prologue. */
4625 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4626 peel_iters_prologue = 0;
4627 else if (npeel < 0)
4628 {
4629 peel_iters_prologue = assumed_vf / 2;
4630 if (dump_enabled_p ())
4631 dump_printf (MSG_NOTE, "cost model: "
4632 "prologue peel iters set to vf/2.\n");
4633
4634 /* If peeled iterations are unknown, count a taken branch and a not taken
4635 branch per peeled loop. Even if scalar loop iterations are known,
4636 vector iterations are not known since peeled prologue iterations are
4637 not known. Hence guards remain the same. */
4638 prologue_need_br_taken_cost = true;
4639 prologue_need_br_not_taken_cost = true;
4640 }
4641 else
4642 {
4643 peel_iters_prologue = npeel;
4644 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4645 /* If peeled iterations are known but the number of scalar loop
4646 iterations is unknown, count a taken branch per peeled loop. */
4647 prologue_need_br_taken_cost = true;
4648 }
4649
4650 bool epilogue_need_br_taken_cost = false;
4651 bool epilogue_need_br_not_taken_cost = false;
4652
4653 /* Calculate peel_iters_epilogue. */
4654 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4655 /* We need to peel exactly one iteration for gaps. */
4656 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4657 else if (npeel < 0)
4658 {
4659 /* If the peeling for alignment is unknown, the loop bound of the
4660 main loop becomes unknown. */
4661 peel_iters_epilogue = assumed_vf / 2;
4662 if (dump_enabled_p ())
4663 dump_printf (MSG_NOTE, "cost model: "
4664 "epilogue peel iters set to vf/2 because "
4665 "peeling for alignment is unknown.\n");
4666
4667 /* See the same reason above in peel_iters_prologue calculation. */
4668 epilogue_need_br_taken_cost = true;
4669 epilogue_need_br_not_taken_cost = true;
4670 }
4671 else
4672 {
4673 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue: npeel);
4674 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4675 /* If peeled iterations are known but the number of scalar loop
4676 iterations is unknown, count a taken branch per peeled loop. */
4677 epilogue_need_br_taken_cost = true;
4678 }
4679
4680 stmt_info_for_cost *si;
4681 int j;
4682 /* Add costs associated with peel_iters_prologue. */
4683 if (peel_iters_prologue)
4684 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4685 {
4686 (void) add_stmt_cost (costs: target_cost_data,
4687 count: si->count * peel_iters_prologue, kind: si->kind,
4688 stmt_info: si->stmt_info, node: si->node, vectype: si->vectype,
4689 misalign: si->misalign, where: vect_prologue);
4690 }
4691
4692 /* Add costs associated with peel_iters_epilogue. */
4693 if (peel_iters_epilogue)
4694 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4695 {
4696 (void) add_stmt_cost (costs: target_cost_data,
4697 count: si->count * peel_iters_epilogue, kind: si->kind,
4698 stmt_info: si->stmt_info, node: si->node, vectype: si->vectype,
4699 misalign: si->misalign, where: vect_epilogue);
4700 }
4701
4702 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4703
4704 if (prologue_need_br_taken_cost)
4705 (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: cond_branch_taken,
4706 where: vect_prologue);
4707
4708 if (prologue_need_br_not_taken_cost)
4709 (void) add_stmt_cost (costs: target_cost_data, count: 1,
4710 kind: cond_branch_not_taken, where: vect_prologue);
4711
4712 if (epilogue_need_br_taken_cost)
4713 (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: cond_branch_taken,
4714 where: vect_epilogue);
4715
4716 if (epilogue_need_br_not_taken_cost)
4717 (void) add_stmt_cost (costs: target_cost_data, count: 1,
4718 kind: cond_branch_not_taken, where: vect_epilogue);
4719
4720 /* Take care of special costs for rgroup controls of partial vectors. */
4721 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4722 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4723 == vect_partial_vectors_avx512))
4724 {
4725 /* Calculate how many masks we need to generate. */
4726 unsigned int num_masks = 0;
4727 bool need_saturation = false;
4728 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4729 if (rgm.type)
4730 {
4731 unsigned nvectors = rgm.factor;
4732 num_masks += nvectors;
4733 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4734 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4735 need_saturation = true;
4736 }
4737
4738 /* ??? The target isn't able to identify the costs below as
4739 producing masks so it cannot penalize cases where we'd run
4740 out of mask registers for example. */
4741
4742 /* ??? We are also failing to account for smaller vector masks
4743 we generate by splitting larger masks in vect_get_loop_mask. */
4744
4745 /* In the worst case, we need to generate each mask in the prologue
4746 and in the loop body. We need one splat per group and one
4747 compare per mask.
4748
4749 Sometimes the prologue mask will fold to a constant,
4750 so the actual prologue cost might be smaller. However, it's
4751 simpler and safer to use the worst-case cost; if this ends up
4752 being the tie-breaker between vectorizing or not, then it's
4753 probably better not to vectorize. */
4754 (void) add_stmt_cost (costs: target_cost_data,
4755 count: num_masks
4756 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4757 kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0,
4758 where: vect_prologue);
4759 (void) add_stmt_cost (costs: target_cost_data,
4760 count: num_masks
4761 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4762 kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0, where: vect_body);
4763
4764 /* When we need saturation we need it both in the prologue and
4765 the epilogue. */
4766 if (need_saturation)
4767 {
4768 (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: scalar_stmt,
4769 NULL, NULL, NULL_TREE, misalign: 0, where: vect_prologue);
4770 (void) add_stmt_cost (costs: target_cost_data, count: 1, kind: scalar_stmt,
4771 NULL, NULL, NULL_TREE, misalign: 0, where: vect_body);
4772 }
4773 }
4774 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4775 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4776 == vect_partial_vectors_while_ult))
4777 {
4778 /* Calculate how many masks we need to generate. */
4779 unsigned int num_masks = 0;
4780 rgroup_controls *rgm;
4781 unsigned int num_vectors_m1;
4782 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4783 num_vectors_m1, rgm)
4784 if (rgm->type)
4785 num_masks += num_vectors_m1 + 1;
4786 gcc_assert (num_masks > 0);
4787
4788 /* In the worst case, we need to generate each mask in the prologue
4789 and in the loop body. One of the loop body mask instructions
4790 replaces the comparison in the scalar loop, and since we don't
4791 count the scalar comparison against the scalar body, we shouldn't
4792 count that vector instruction against the vector body either.
4793
4794 Sometimes we can use unpacks instead of generating prologue
4795 masks and sometimes the prologue mask will fold to a constant,
4796 so the actual prologue cost might be smaller. However, it's
4797 simpler and safer to use the worst-case cost; if this ends up
4798 being the tie-breaker between vectorizing or not, then it's
4799 probably better not to vectorize. */
4800 (void) add_stmt_cost (costs: target_cost_data, count: num_masks,
4801 kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0,
4802 where: vect_prologue);
4803 (void) add_stmt_cost (costs: target_cost_data, count: num_masks - 1,
4804 kind: vector_stmt, NULL, NULL, NULL_TREE, misalign: 0,
4805 where: vect_body);
4806 }
4807 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4808 {
4809 /* Referring to the functions vect_set_loop_condition_partial_vectors
4810 and vect_set_loop_controls_directly, we need to generate each
4811 length in the prologue and in the loop body if required. Although
4812 there are some possible optimizations, we consider the worst case
4813 here. */
4814
4815 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4816 signed char partial_load_store_bias
4817 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4818 bool need_iterate_p
4819 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4820 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4821
4822 /* Calculate how many statements to be added. */
4823 unsigned int prologue_stmts = 0;
4824 unsigned int body_stmts = 0;
4825
4826 rgroup_controls *rgc;
4827 unsigned int num_vectors_m1;
4828 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4829 if (rgc->type)
4830 {
4831 /* May need one SHIFT for nitems_total computation. */
4832 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4833 if (nitems != 1 && !niters_known_p)
4834 prologue_stmts += 1;
4835
4836 /* May need one MAX and one MINUS for wrap around. */
4837 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4838 prologue_stmts += 2;
4839
4840 	      /* Need one MAX and one MINUS for each batch limit except for
4841 	     the first one.  */
4842 prologue_stmts += num_vectors_m1 * 2;
4843
4844 unsigned int num_vectors = num_vectors_m1 + 1;
4845
4846 /* Need to set up lengths in prologue, only one MIN required
4847 for each since start index is zero. */
4848 prologue_stmts += num_vectors;
4849
4850 /* If we have a non-zero partial load bias, we need one PLUS
4851 to adjust the load length. */
4852 if (partial_load_store_bias != 0)
4853 body_stmts += 1;
4854
4855 /* Each may need two MINs and one MINUS to update lengths in body
4856 for next iteration. */
4857 if (need_iterate_p)
4858 body_stmts += 3 * num_vectors;
4859 }
4860
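      /* A rough worked example with made-up numbers: for a single two-vector
	 rgroup (num_vectors == 2) with nitems == 4, unknown niters, an IV that
	 might wrap, a zero load/store bias and need_iterate_p, the loop above
	 accumulates prologue_stmts = 1 + 2 + 2 + 2 = 7 and body_stmts = 6.  */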
4861 (void) add_stmt_cost (costs: target_cost_data, count: prologue_stmts,
4862 kind: scalar_stmt, where: vect_prologue);
4863 (void) add_stmt_cost (costs: target_cost_data, count: body_stmts,
4864 kind: scalar_stmt, where: vect_body);
4865 }
4866
4867 /* FORNOW: The scalar outside cost is incremented in one of the
4868 following ways:
4869
4870 1. The vectorizer checks for alignment and aliasing and generates
4871 a condition that allows dynamic vectorization. A cost model
4872      check is ANDed with the versioning condition.  Hence the scalar code
4873 path now has the added cost of the versioning check.
4874
4875 if (cost > th & versioning_check)
4876 jmp to vector code
4877
4878      Hence the run-time scalar cost is incremented by the not-taken branch cost.
4879
4880 2. The vectorizer then checks if a prologue is required. If the
4881 cost model check was not done before during versioning, it has to
4882 be done before the prologue check.
4883
4884 if (cost <= th)
4885 prologue = scalar_iters
4886 if (prologue == 0)
4887 jmp to vector code
4888 else
4889 execute prologue
4890 if (prologue == num_iters)
4891 go to exit
4892
4893 Hence the run-time scalar cost is incremented by a taken branch,
4894 plus a not-taken branch, plus a taken branch cost.
4895
4896 3. The vectorizer then checks if an epilogue is required. If the
4897 cost model check was not done before during prologue check, it
4898 has to be done with the epilogue check.
4899
4900 if (prologue == 0)
4901 jmp to vector code
4902 else
4903 execute prologue
4904 if (prologue == num_iters)
4905 go to exit
4906 vector code:
4907 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4908 jmp to epilogue
4909
4910 Hence the run-time scalar cost should be incremented by 2 taken
4911 branches.
4912
4913      TODO: The back end may reorder the basic blocks differently and reverse
4914 conditions/branch directions. Change the estimates below to
4915 something more reasonable. */
4916
4917 /* If the number of iterations is known and we do not do versioning, we can
4918 decide whether to vectorize at compile time. Hence the scalar version
4919      does not carry cost model guard costs.  */
4920 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4921 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4922 {
4923 /* Cost model check occurs at versioning. */
4924 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4925 scalar_outside_cost += vect_get_stmt_cost (type_of_cost: cond_branch_not_taken);
4926 else
4927 {
4928 /* Cost model check occurs at prologue generation. */
4929 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4930 scalar_outside_cost += 2 * vect_get_stmt_cost (type_of_cost: cond_branch_taken)
4931 + vect_get_stmt_cost (type_of_cost: cond_branch_not_taken);
4932 /* Cost model check occurs at epilogue generation. */
4933 else
4934 scalar_outside_cost += 2 * vect_get_stmt_cost (type_of_cost: cond_branch_taken);
4935 }
4936 }
4937
4938 /* Complete the target-specific cost calculations. */
4939 finish_cost (costs: loop_vinfo->vector_costs, scalar_costs: loop_vinfo->scalar_costs,
4940 prologue_cost: &vec_prologue_cost, body_cost: &vec_inside_cost, epilogue_cost: &vec_epilogue_cost,
4941 suggested_unroll_factor);
4942
4943 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4944 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4945 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4946 *suggested_unroll_factor,
4947 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4948 {
4949 if (dump_enabled_p ())
4950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4951 			 "can't unroll as unrolled vectorization factor is larger"
4952 " than maximum vectorization factor: "
4953 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4954 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4955 *suggested_unroll_factor = 1;
4956 }
4957
4958 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4959
4960 if (dump_enabled_p ())
4961 {
4962 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4963 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4964 vec_inside_cost);
4965 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4966 vec_prologue_cost);
4967 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4968 vec_epilogue_cost);
4969 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4970 scalar_single_iter_cost);
4971 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4972 scalar_outside_cost);
4973 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4974 vec_outside_cost);
4975 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4976 peel_iters_prologue);
4977 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4978 peel_iters_epilogue);
4979 }
4980
4981 /* Calculate number of iterations required to make the vector version
4982 profitable, relative to the loop bodies only. The following condition
4983 must hold true:
4984 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4985 where
4986 SIC = scalar iteration cost, VIC = vector iteration cost,
4987 VOC = vector outside cost, VF = vectorization factor,
4988 NPEEL = prologue iterations + epilogue iterations,
4989 SOC = scalar outside cost for run time cost model check. */
4990
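  /* A worked example with made-up costs: if SIC = 4, assumed_vf = 4 and
     VIC = 10, each vector iteration replaces 4 scalar iterations worth
     4 * 4 = 16 units and so saves 16 - 10 = 6 units (saving_per_viter below).
     If VIC were 16 or more, no number of iterations could pay back the
     outside cost and vectorization is rejected just below.  */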
4991 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4992 - vec_inside_cost);
4993 if (saving_per_viter <= 0)
4994 {
4995 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4996 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4997 "vectorization did not happen for a simd loop");
4998
4999 if (dump_enabled_p ())
5000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5001 "cost model: the vector iteration cost = %d "
5002 "divided by the scalar iteration cost = %d "
5003 			 "is greater than or equal to the vectorization factor = %d"
5004 ".\n",
5005 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5006 *ret_min_profitable_niters = -1;
5007 *ret_min_profitable_estimate = -1;
5008 return;
5009 }
5010
5011 /* ??? The "if" arm is written to handle all cases; see below for what
5012 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5013 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5014 {
5015 /* Rewriting the condition above in terms of the number of
5016 vector iterations (vniters) rather than the number of
5017 scalar iterations (niters) gives:
5018
5019 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5020
5021 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5022
5023 For integer N, X and Y when X > 0:
5024
5025 N * X > Y <==> N >= (Y /[floor] X) + 1. */
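      /* Continuing the made-up example above (SIC = 4, VF = 4, VIC = 10, so
	 the saving per vector iteration is 6): with VOC = 30, NPEEL = 0 and
	 SOC = 0 the outside overhead is 30, and the formula gives
	 min_vec_niters = 30 / 6 + 1 = 6, i.e. at least six vector iterations
	 are needed before the loop-body savings cover the outside cost.  */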
5026 int outside_overhead = (vec_outside_cost
5027 - scalar_single_iter_cost * peel_iters_prologue
5028 - scalar_single_iter_cost * peel_iters_epilogue
5029 - scalar_outside_cost);
5030 /* We're only interested in cases that require at least one
5031 vector iteration. */
5032 int min_vec_niters = 1;
5033 if (outside_overhead > 0)
5034 min_vec_niters = outside_overhead / saving_per_viter + 1;
5035
5036 if (dump_enabled_p ())
5037 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5038 min_vec_niters);
5039
5040 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5041 {
5042 /* Now that we know the minimum number of vector iterations,
5043 find the minimum niters for which the scalar cost is larger:
5044
5045 SIC * niters > VIC * vniters + VOC - SOC
5046
5047 We know that the minimum niters is no more than
5048 vniters * VF + NPEEL, but it might be (and often is) less
5049 than that if a partial vector iteration is cheaper than the
5050 equivalent scalar code. */
5051 int threshold = (vec_inside_cost * min_vec_niters
5052 + vec_outside_cost
5053 - scalar_outside_cost);
5054 if (threshold <= 0)
5055 min_profitable_iters = 1;
5056 else
5057 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5058 }
5059 else
5060 /* Convert the number of vector iterations into a number of
5061 scalar iterations. */
5062 min_profitable_iters = (min_vec_niters * assumed_vf
5063 + peel_iters_prologue
5064 + peel_iters_epilogue);
5065 }
5066 else
5067 {
5068 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5069 * assumed_vf
5070 - vec_inside_cost * peel_iters_prologue
5071 - vec_inside_cost * peel_iters_epilogue);
5072 if (min_profitable_iters <= 0)
5073 min_profitable_iters = 0;
5074 else
5075 {
5076 min_profitable_iters /= saving_per_viter;
5077
5078 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5079 <= (((int) vec_inside_cost * min_profitable_iters)
5080 + (((int) vec_outside_cost - scalar_outside_cost)
5081 * assumed_vf)))
5082 min_profitable_iters++;
5083 }
5084 }
5085
5086 if (dump_enabled_p ())
5087 dump_printf (MSG_NOTE,
5088 " Calculated minimum iters for profitability: %d\n",
5089 min_profitable_iters);
5090
5091 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5092 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5093 /* We want the vectorized loop to execute at least once. */
5094 min_profitable_iters = assumed_vf + peel_iters_prologue;
5095 else if (min_profitable_iters < peel_iters_prologue)
5096 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5097 vectorized loop executes at least once. */
5098 min_profitable_iters = peel_iters_prologue;
5099
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_NOTE, vect_location,
5102 " Runtime profitability threshold = %d\n",
5103 min_profitable_iters);
5104
5105 *ret_min_profitable_niters = min_profitable_iters;
5106
5107 /* Calculate number of iterations required to make the vector version
5108 profitable, relative to the loop bodies only.
5109
5110    The non-vectorized variant costs SIC * niters and it must win over the
5111    vector variant on the expected loop trip count.  The following condition
5112    must hold true: SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
5113
5114 if (vec_outside_cost <= 0)
5115 min_profitable_estimate = 0;
5116 /* ??? This "else if" arm is written to handle all cases; see below for
5117 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5118 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5119 {
5120 /* This is a repeat of the code above, but with + SOC rather
5121 than - SOC. */
5122 int outside_overhead = (vec_outside_cost
5123 - scalar_single_iter_cost * peel_iters_prologue
5124 - scalar_single_iter_cost * peel_iters_epilogue
5125 + scalar_outside_cost);
5126 int min_vec_niters = 1;
5127 if (outside_overhead > 0)
5128 min_vec_niters = outside_overhead / saving_per_viter + 1;
5129
5130 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5131 {
5132 int threshold = (vec_inside_cost * min_vec_niters
5133 + vec_outside_cost
5134 + scalar_outside_cost);
5135 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5136 }
5137 else
5138 min_profitable_estimate = (min_vec_niters * assumed_vf
5139 + peel_iters_prologue
5140 + peel_iters_epilogue);
5141 }
5142 else
5143 {
5144 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5145 * assumed_vf
5146 - vec_inside_cost * peel_iters_prologue
5147 - vec_inside_cost * peel_iters_epilogue)
5148 / ((scalar_single_iter_cost * assumed_vf)
5149 - vec_inside_cost);
5150 }
5151 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5152 if (dump_enabled_p ())
5153 dump_printf_loc (MSG_NOTE, vect_location,
5154 " Static estimate profitability threshold = %d\n",
5155 min_profitable_estimate);
5156
5157 *ret_min_profitable_estimate = min_profitable_estimate;
5158}
5159
5160/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5161 vector elements (not bits) for a vector with NELT elements. */
5162static void
5163calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5164 vec_perm_builder *sel)
5165{
5166 /* The encoding is a single stepped pattern. Any wrap-around is handled
5167 by vec_perm_indices. */
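  /* For example, OFFSET == 2 and NELT == 8 encode the stepped pattern
     {2, 3, 4, ...}, which expands to the selector {2, 3, ..., 9}; the
     out-of-range indices 8 and 9 select from the second operand of a
     two-input permute, as handled by vec_perm_indices.  */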
5168 sel->new_vector (full_nelts: nelt, npatterns: 1, nelts_per_pattern: 3);
5169 for (unsigned int i = 0; i < 3; i++)
5170 sel->quick_push (obj: i + offset);
5171}
5172
5173/* Checks whether the target supports whole-vector shifts for vectors of mode
5174 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5175 it supports vec_perm_const with masks for all necessary shift amounts. */
5176static bool
5177have_whole_vector_shift (machine_mode mode)
5178{
5179 if (optab_handler (op: vec_shr_optab, mode) != CODE_FOR_nothing)
5180 return true;
5181
5182 /* Variable-length vectors should be handled via the optab. */
5183 unsigned int nelt;
5184 if (!GET_MODE_NUNITS (mode).is_constant (const_value: &nelt))
5185 return false;
5186
5187 vec_perm_builder sel;
5188 vec_perm_indices indices;
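  /* E.g. for nelt == 8 the loop below checks element shifts of 4, 2 and 1,
     the offsets used by the log2-style shift reduction in the epilogue code.  */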
5189 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5190 {
5191 calc_vec_perm_mask_for_shift (offset: i, nelt, sel: &sel);
5192 indices.new_vector (sel, 2, nelt);
5193 if (!can_vec_perm_const_p (mode, mode, indices, false))
5194 return false;
5195 }
5196 return true;
5197}
5198
5199/* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5200 multiplication operands have differing signs and (b) we intend
5201 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5202 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5203
5204static bool
5205vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5206 stmt_vec_info stmt_info)
5207{
5208 gassign *assign = dyn_cast<gassign *> (p: stmt_info->stmt);
5209 if (!assign || gimple_assign_rhs_code (gs: assign) != DOT_PROD_EXPR)
5210 return false;
5211
5212 tree rhs1 = gimple_assign_rhs1 (gs: assign);
5213 tree rhs2 = gimple_assign_rhs2 (gs: assign);
5214 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5215 return false;
5216
5217 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5218 gcc_assert (reduc_info->is_reduc_info);
5219 return !directly_supported_p (DOT_PROD_EXPR,
5220 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5221 optab_vector_mixed_sign);
5222}
5223
5224/* TODO: There is a close dependency between the vect_model_*_cost and the
5225   vectorizable_* functions.  Design this better to avoid maintenance issues.  */
5226
5227/* Function vect_model_reduction_cost.
5228
5229 Models cost for a reduction operation, including the vector ops
5230 generated within the strip-mine loop in some cases, the initial
5231 definition before the loop, and the epilogue code that must be generated. */
5232
5233static void
5234vect_model_reduction_cost (loop_vec_info loop_vinfo,
5235 stmt_vec_info stmt_info, internal_fn reduc_fn,
5236 vect_reduction_type reduction_type,
5237 int ncopies, stmt_vector_for_cost *cost_vec)
5238{
5239 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5240 tree vectype;
5241 machine_mode mode;
5242 class loop *loop = NULL;
5243
5244 if (loop_vinfo)
5245 loop = LOOP_VINFO_LOOP (loop_vinfo);
5246
5247 /* Condition reductions generate two reductions in the loop. */
5248 if (reduction_type == COND_REDUCTION)
5249 ncopies *= 2;
5250
5251 vectype = STMT_VINFO_VECTYPE (stmt_info);
5252 mode = TYPE_MODE (vectype);
5253 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5254
5255 gimple_match_op op;
5256 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5257 gcc_unreachable ();
5258
5259 bool emulated_mixed_dot_prod
5260 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5261 if (reduction_type == EXTRACT_LAST_REDUCTION)
5262 /* No extra instructions are needed in the prologue. The loop body
5263 operations are costed in vectorizable_condition. */
5264 inside_cost = 0;
5265 else if (reduction_type == FOLD_LEFT_REDUCTION)
5266 {
5267 /* No extra instructions needed in the prologue. */
5268 prologue_cost = 0;
5269
5270 if (reduc_fn != IFN_LAST)
5271 /* Count one reduction-like operation per vector. */
5272 inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vec_to_scalar,
5273 stmt_info, misalign: 0, where: vect_body);
5274 else
5275 {
5276 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5277 unsigned int nelements = ncopies * vect_nunits_for_cost (vec_type: vectype);
5278 inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: nelements,
5279 kind: vec_to_scalar, stmt_info, misalign: 0,
5280 where: vect_body);
5281 inside_cost += record_stmt_cost (body_cost_vec: cost_vec, count: nelements,
5282 kind: scalar_stmt, stmt_info, misalign: 0,
5283 where: vect_body);
5284 }
5285 }
5286 else
5287 {
5288 /* Add in the cost of the initial definitions. */
5289 int prologue_stmts;
5290 if (reduction_type == COND_REDUCTION)
5291 /* For cond reductions we have four vectors: initial index, step,
5292 initial result of the data reduction, initial value of the index
5293 reduction. */
5294 prologue_stmts = 4;
5295 else if (emulated_mixed_dot_prod)
5296 /* We need the initial reduction value and two invariants:
5297 one that contains the minimum signed value and one that
5298 contains half of its negative. */
5299 prologue_stmts = 3;
5300 else
5301 prologue_stmts = 1;
5302 prologue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: prologue_stmts,
5303 kind: scalar_to_vec, stmt_info, misalign: 0,
5304 where: vect_prologue);
5305 }
5306
5307 /* Determine cost of epilogue code.
5308
5309 We have a reduction operator that will reduce the vector in one statement.
5310 Also requires scalar extract. */
5311
5312 if (!loop || !nested_in_vect_loop_p (loop, stmt_info: orig_stmt_info))
5313 {
5314 if (reduc_fn != IFN_LAST)
5315 {
5316 if (reduction_type == COND_REDUCTION)
5317 {
5318 	      /* An EQ stmt and a COND_EXPR stmt.  */
5319 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 2,
5320 kind: vector_stmt, stmt_info, misalign: 0,
5321 where: vect_epilogue);
5322 /* Reduction of the max index and a reduction of the found
5323 values. */
5324 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 2,
5325 kind: vec_to_scalar, stmt_info, misalign: 0,
5326 where: vect_epilogue);
5327 /* A broadcast of the max value. */
5328 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1,
5329 kind: scalar_to_vec, stmt_info, misalign: 0,
5330 where: vect_epilogue);
5331 }
5332 else
5333 {
5334 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1, kind: vector_stmt,
5335 stmt_info, misalign: 0, where: vect_epilogue);
5336 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1,
5337 kind: vec_to_scalar, stmt_info, misalign: 0,
5338 where: vect_epilogue);
5339 }
5340 }
5341 else if (reduction_type == COND_REDUCTION)
5342 {
5343 unsigned estimated_nunits = vect_nunits_for_cost (vec_type: vectype);
5344 /* Extraction of scalar elements. */
5345 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec,
5346 count: 2 * estimated_nunits,
5347 kind: vec_to_scalar, stmt_info, misalign: 0,
5348 where: vect_epilogue);
5349 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5350 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec,
5351 count: 2 * estimated_nunits - 3,
5352 kind: scalar_stmt, stmt_info, misalign: 0,
5353 where: vect_epilogue);
5354 }
5355 else if (reduction_type == EXTRACT_LAST_REDUCTION
5356 || reduction_type == FOLD_LEFT_REDUCTION)
5357 	/* No extra instructions are needed in the epilogue.  */
5358 ;
5359 else
5360 {
5361 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5362 tree bitsize = TYPE_SIZE (op.type);
5363 int element_bitsize = tree_to_uhwi (bitsize);
5364 int nelements = vec_size_in_bits / element_bitsize;
5365
5366 if (op.code == COND_EXPR)
5367 op.code = MAX_EXPR;
5368
5369 /* We have a whole vector shift available. */
5370 if (VECTOR_MODE_P (mode)
5371 && directly_supported_p (op.code, vectype)
5372 && have_whole_vector_shift (mode))
5373 {
5374 /* Final reduction via vector shifts and the reduction operator.
5375 Also requires scalar extract. */
5376 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec,
5377 count: exact_log2 (x: nelements) * 2,
5378 kind: vector_stmt, stmt_info, misalign: 0,
5379 where: vect_epilogue);
5380 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec, count: 1,
5381 kind: vec_to_scalar, stmt_info, misalign: 0,
5382 where: vect_epilogue);
5383 }
5384 else
5385 /* Use extracts and reduction op for final reduction. For N
5386 elements, we have N extracts and N-1 reduction ops. */
5387 epilogue_cost += record_stmt_cost (body_cost_vec: cost_vec,
5388 count: nelements + nelements - 1,
5389 kind: vector_stmt, stmt_info, misalign: 0,
5390 where: vect_epilogue);
5391 }
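      /* A rough illustration with an assumed 128-bit vector of eight 16-bit
	 elements (NELEMENTS == 8): with a whole-vector shift available the
	 branch above costs exact_log2 (8) * 2 == 6 vector statements plus one
	 vec_to_scalar extract; without it, it costs 8 + 7 == 15 statements.  */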
5392 }
5393
5394 if (dump_enabled_p ())
5395 dump_printf (MSG_NOTE,
5396 "vect_model_reduction_cost: inside_cost = %d, "
5397 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5398 prologue_cost, epilogue_cost);
5399}
5400
5401/* SEQ is a sequence of instructions that initialize the reduction
5402 described by REDUC_INFO. Emit them in the appropriate place. */
5403
5404static void
5405vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5406 stmt_vec_info reduc_info, gimple *seq)
5407{
5408 if (reduc_info->reused_accumulator)
5409 {
5410 /* When reusing an accumulator from the main loop, we only need
5411 initialization instructions if the main loop can be skipped.
5412 In that case, emit the initialization instructions at the end
5413 of the guard block that does the skip. */
5414 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5415 gcc_assert (skip_edge);
5416 gimple_stmt_iterator gsi = gsi_last_bb (bb: skip_edge->src);
5417 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5418 }
5419 else
5420 {
5421 /* The normal case: emit the initialization instructions on the
5422 preheader edge. */
5423 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5424 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5425 }
5426}
5427
5428/* Function get_initial_def_for_reduction
5429
5430 Input:
5431 REDUC_INFO - the info_for_reduction
5432 INIT_VAL - the initial value of the reduction variable
5433 NEUTRAL_OP - a value that has no effect on the reduction, as per
5434 neutral_op_for_reduction
5435
5436 Output:
5437 Return a vector variable, initialized according to the operation that
5438 STMT_VINFO performs. This vector will be used as the initial value
5439 of the vector of partial results.
5440
5441 The value we need is a vector in which element 0 has value INIT_VAL
5442 and every other element has value NEUTRAL_OP. */
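/* For example (purely illustrative): for a signed add reduction with
   INIT_VAL 5 and NEUTRAL_OP 0 on a four-element vector, the result is
   {5, 0, 0, 0}.  For a MIN or MAX reduction NEUTRAL_OP equals INIT_VAL,
   so the vector degenerates to a splat of INIT_VAL and the cheaper splat
   path below is used.  */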
5443
5444static tree
5445get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5446 stmt_vec_info reduc_info,
5447 tree init_val, tree neutral_op)
5448{
5449 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5450 tree scalar_type = TREE_TYPE (init_val);
5451 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5452 tree init_def;
5453 gimple_seq stmts = NULL;
5454
5455 gcc_assert (vectype);
5456
5457 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5458 || SCALAR_FLOAT_TYPE_P (scalar_type));
5459
5460 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5461 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5462
5463 if (operand_equal_p (init_val, neutral_op))
5464 {
5465 /* If both elements are equal then the vector described above is
5466 just a splat. */
5467 neutral_op = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: neutral_op);
5468 init_def = gimple_build_vector_from_val (seq: &stmts, type: vectype, op: neutral_op);
5469 }
5470 else
5471 {
5472 neutral_op = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: neutral_op);
5473 init_val = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: init_val);
5474 if (!TYPE_VECTOR_SUBPARTS (node: vectype).is_constant ())
5475 {
5476 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5477 element 0. */
5478 init_def = gimple_build_vector_from_val (seq: &stmts, type: vectype,
5479 op: neutral_op);
5480 init_def = gimple_build (seq: &stmts, fn: CFN_VEC_SHL_INSERT,
5481 type: vectype, args: init_def, args: init_val);
5482 }
5483 else
5484 {
5485 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5486 tree_vector_builder elts (vectype, 1, 2);
5487 elts.quick_push (obj: init_val);
5488 elts.quick_push (obj: neutral_op);
5489 init_def = gimple_build_vector (seq: &stmts, builder: &elts);
5490 }
5491 }
5492
5493 if (stmts)
5494 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, seq: stmts);
5495 return init_def;
5496}
5497
5498/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5499 which performs a reduction involving GROUP_SIZE scalar statements.
5500 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5501 is nonnull, introducing extra elements of that value will not change the
5502 result. */
5503
5504static void
5505get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5506 stmt_vec_info reduc_info,
5507 vec<tree> *vec_oprnds,
5508 unsigned int number_of_vectors,
5509 unsigned int group_size, tree neutral_op)
5510{
5511 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5512 unsigned HOST_WIDE_INT nunits;
5513 unsigned j, number_of_places_left_in_vector;
5514 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5515 unsigned int i;
5516
5517 gcc_assert (group_size == initial_values.length () || neutral_op);
5518
5519 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5520 created vectors. It is greater than 1 if unrolling is performed.
5521
5522 For example, we have two scalar operands, s1 and s2 (e.g., group of
5523 strided accesses of size two), while NUNITS is four (i.e., four scalars
5524 of this type can be packed in a vector). The output vector will contain
5525 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5526 will be 2).
5527
5528 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5529 vectors containing the operands.
5530
5531 For example, NUNITS is four as before, and the group size is 8
5532 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5533 {s5, s6, s7, s8}. */
5534
5535 if (!TYPE_VECTOR_SUBPARTS (node: vector_type).is_constant (const_value: &nunits))
5536 nunits = group_size;
5537
5538 number_of_places_left_in_vector = nunits;
5539 bool constant_p = true;
5540 tree_vector_builder elts (vector_type, nunits, 1);
5541 elts.quick_grow (len: nunits);
5542 gimple_seq ctor_seq = NULL;
5543 for (j = 0; j < nunits * number_of_vectors; ++j)
5544 {
5545 tree op;
5546 i = j % group_size;
5547
5548       /* Get the def before the loop.  In a reduction chain we have only
5549 	 one initial value.  Otherwise we have one initial value per PHI in the group.  */
5550 if (i >= initial_values.length () || (j > i && neutral_op))
5551 op = neutral_op;
5552 else
5553 op = initial_values[i];
5554
5555 /* Create 'vect_ = {op0,op1,...,opn}'. */
5556 number_of_places_left_in_vector--;
5557 elts[nunits - number_of_places_left_in_vector - 1] = op;
5558 if (!CONSTANT_CLASS_P (op))
5559 constant_p = false;
5560
5561 if (number_of_places_left_in_vector == 0)
5562 {
5563 tree init;
5564 if (constant_p && !neutral_op
5565 ? multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vector_type), b: nunits)
5566 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5567 /* Build the vector directly from ELTS. */
5568 init = gimple_build_vector (seq: &ctor_seq, builder: &elts);
5569 else if (neutral_op)
5570 {
5571 /* Build a vector of the neutral value and shift the
5572 other elements into place. */
5573 init = gimple_build_vector_from_val (seq: &ctor_seq, type: vector_type,
5574 op: neutral_op);
5575 int k = nunits;
5576 while (k > 0 && elts[k - 1] == neutral_op)
5577 k -= 1;
5578 while (k > 0)
5579 {
5580 k -= 1;
5581 init = gimple_build (seq: &ctor_seq, fn: CFN_VEC_SHL_INSERT,
5582 type: vector_type, args: init, args: elts[k]);
5583 }
5584 }
5585 else
5586 {
5587 /* First time round, duplicate ELTS to fill the
5588 required number of vectors. */
5589 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5590 elts, number_of_vectors, *vec_oprnds);
5591 break;
5592 }
5593 vec_oprnds->quick_push (obj: init);
5594
5595 number_of_places_left_in_vector = nunits;
5596 elts.new_vector (type: vector_type, npatterns: nunits, nelts_per_pattern: 1);
5597 elts.quick_grow (len: nunits);
5598 constant_p = true;
5599 }
5600 }
5601 if (ctor_seq != NULL)
5602 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, seq: ctor_seq);
5603}
5604
5605/* For a statement STMT_INFO taking part in a reduction operation return
5606 the stmt_vec_info the meta information is stored on. */
5607
5608stmt_vec_info
5609info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5610{
5611 stmt_info = vect_orig_stmt (stmt_info);
5612 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5613 if (!is_a <gphi *> (p: stmt_info->stmt)
5614 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5615 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5616 gphi *phi = as_a <gphi *> (p: stmt_info->stmt);
5617 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5618 {
5619 if (gimple_phi_num_args (gs: phi) == 1)
5620 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5621 }
5622 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5623 {
5624 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5625 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5626 stmt_info = info;
5627 }
5628 return stmt_info;
5629}
5630
5631/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5632 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5633 return false. */
5634
5635static bool
5636vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5637 stmt_vec_info reduc_info)
5638{
5639 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5640 if (!main_loop_vinfo)
5641 return false;
5642
5643 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5644 return false;
5645
5646 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5647 auto_vec<tree, 16> main_loop_results (num_phis);
5648 auto_vec<tree, 16> initial_values (num_phis);
5649 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5650 {
5651 /* The epilogue loop can be entered either from the main loop or
5652 from an earlier guard block. */
5653 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5654 for (tree incoming_value : reduc_info->reduc_initial_values)
5655 {
5656 /* Look for:
5657
5658 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5659 INITIAL_VALUE(guard block)>. */
5660 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5661
5662 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5663 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5664
5665 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5666 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5667
5668 main_loop_results.quick_push (obj: from_main_loop);
5669 initial_values.quick_push (obj: from_skip);
5670 }
5671 }
5672 else
5673 /* The main loop dominates the epilogue loop. */
5674 main_loop_results.splice (src: reduc_info->reduc_initial_values);
5675
5676 /* See if the main loop has the kind of accumulator we need. */
5677 vect_reusable_accumulator *accumulator
5678 = main_loop_vinfo->reusable_accumulators.get (k: main_loop_results[0]);
5679 if (!accumulator
5680 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5681 || !std::equal (first1: main_loop_results.begin (), last1: main_loop_results.end (),
5682 first2: accumulator->reduc_info->reduc_scalar_results.begin ()))
5683 return false;
5684
5685 /* Handle the case where we can reduce wider vectors to narrower ones. */
5686 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5687 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5688 unsigned HOST_WIDE_INT m;
5689 if (!constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: old_vectype),
5690 b: TYPE_VECTOR_SUBPARTS (node: vectype), multiple: &m))
5691 return false;
5692 /* Check the intermediate vector types and operations are available. */
5693 tree prev_vectype = old_vectype;
5694 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (node: old_vectype);
5695 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5696 {
5697 intermediate_nunits = exact_div (a: intermediate_nunits, b: 2);
5698 tree intermediate_vectype = get_related_vectype_for_scalar_type
5699 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5700 if (!intermediate_vectype
5701 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5702 intermediate_vectype)
5703 || !can_vec_extract (TYPE_MODE (prev_vectype),
5704 TYPE_MODE (intermediate_vectype)))
5705 return false;
5706 prev_vectype = intermediate_vectype;
5707 }
5708
5709 /* Non-SLP reductions might apply an adjustment after the reduction
5710 operation, in order to simplify the initialization of the accumulator.
5711 If the epilogue loop carries on from where the main loop left off,
5712 it should apply the same adjustment to the final reduction result.
5713
5714 If the epilogue loop can also be entered directly (rather than via
5715 the main loop), we need to be able to handle that case in the same way,
5716 with the same adjustment. (In principle we could add a PHI node
5717 to select the correct adjustment, but in practice that shouldn't be
5718 necessary.) */
5719 tree main_adjustment
5720 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5721 if (loop_vinfo->main_loop_edge && main_adjustment)
5722 {
5723 gcc_assert (num_phis == 1);
5724 tree initial_value = initial_values[0];
5725 /* Check that we can use INITIAL_VALUE as the adjustment and
5726 initialize the accumulator with a neutral value instead. */
5727 if (!operand_equal_p (initial_value, main_adjustment))
5728 return false;
5729 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5730 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5731 code, initial_value);
5732 }
5733 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5734 reduc_info->reduc_initial_values.truncate (size: 0);
5735 reduc_info->reduc_initial_values.splice (src: initial_values);
5736 reduc_info->reused_accumulator = accumulator;
5737 return true;
5738}
5739
5740/* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5741   CODE, emitting any new statements into SEQ.  Returns a vector def of VECTYPE.  */
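/* For instance, reducing a V8SI partial result down to V4SI with PLUS_EXPR
   takes a single halving step: the low and high V4SI halves are extracted
   (directly via vec_extract, or via the integer-mode punning fallback) and
   added together, and the V4SI sum is returned.  */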
5742
5743static tree
5744vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5745 gimple_seq *seq)
5746{
5747 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5748 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant ();
5749 tree stype = TREE_TYPE (vectype);
5750 tree new_temp = vec_def;
5751 while (nunits > nunits1)
5752 {
5753 nunits /= 2;
5754 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5755 stype, nunits);
5756 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5757
5758 /* The target has to make sure we support lowpart/highpart
5759 extraction, either via direct vector extract or through
5760 	 integer mode punning.  */
5761 tree dst1, dst2;
5762 gimple *epilog_stmt;
5763 if (convert_optab_handler (op: vec_extract_optab,
5764 TYPE_MODE (TREE_TYPE (new_temp)),
5765 TYPE_MODE (vectype1))
5766 != CODE_FOR_nothing)
5767 {
5768 /* Extract sub-vectors directly once vec_extract becomes
5769 a conversion optab. */
5770 dst1 = make_ssa_name (var: vectype1);
5771 epilog_stmt
5772 = gimple_build_assign (dst1, BIT_FIELD_REF,
5773 build3 (BIT_FIELD_REF, vectype1,
5774 new_temp, TYPE_SIZE (vectype1),
5775 bitsize_int (0)));
5776 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5777 dst2 = make_ssa_name (var: vectype1);
5778 epilog_stmt
5779 = gimple_build_assign (dst2, BIT_FIELD_REF,
5780 build3 (BIT_FIELD_REF, vectype1,
5781 new_temp, TYPE_SIZE (vectype1),
5782 bitsize_int (bitsize)));
5783 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5784 }
5785 else
5786 {
5787 	  /* Extract via punning to an appropriately sized integer mode
5788 vector. */
5789 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5790 tree etype = build_vector_type (eltype, 2);
5791 gcc_assert (convert_optab_handler (vec_extract_optab,
5792 TYPE_MODE (etype),
5793 TYPE_MODE (eltype))
5794 != CODE_FOR_nothing);
5795 tree tem = make_ssa_name (var: etype);
5796 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5797 build1 (VIEW_CONVERT_EXPR,
5798 etype, new_temp));
5799 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5800 new_temp = tem;
5801 tem = make_ssa_name (var: eltype);
5802 epilog_stmt
5803 = gimple_build_assign (tem, BIT_FIELD_REF,
5804 build3 (BIT_FIELD_REF, eltype,
5805 new_temp, TYPE_SIZE (eltype),
5806 bitsize_int (0)));
5807 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5808 dst1 = make_ssa_name (var: vectype1);
5809 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5810 build1 (VIEW_CONVERT_EXPR,
5811 vectype1, tem));
5812 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5813 tem = make_ssa_name (var: eltype);
5814 epilog_stmt
5815 = gimple_build_assign (tem, BIT_FIELD_REF,
5816 build3 (BIT_FIELD_REF, eltype,
5817 new_temp, TYPE_SIZE (eltype),
5818 bitsize_int (bitsize)));
5819 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5820 dst2 = make_ssa_name (var: vectype1);
5821 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5822 build1 (VIEW_CONVERT_EXPR,
5823 vectype1, tem));
5824 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5825 }
5826
5827 new_temp = gimple_build (seq, code, type: vectype1, ops: dst1, ops: dst2);
5828 }
5829
5830 return new_temp;
5831}
5832
5833/* Function vect_create_epilog_for_reduction
5834
5835 Create code at the loop-epilog to finalize the result of a reduction
5836 computation.
5837
5838 STMT_INFO is the scalar reduction stmt that is being vectorized.
5839 SLP_NODE is an SLP node containing a group of reduction statements. The
5840 first one in this group is STMT_INFO.
5841 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5842 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5843 (counting from 0)
5844
5845 This function:
5846 1. Completes the reduction def-use cycles.
5847 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5848 by calling the function specified by REDUC_FN if available, or by
5849 other means (whole-vector shifts or a scalar loop).
5850 The function also creates a new phi node at the loop exit to preserve
5851 loop-closed form, as illustrated below.
5852
5853 The flow at the entry to this function:
5854
5855 loop:
5856 vec_def = phi <vec_init, null> # REDUCTION_PHI
5857 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5858 s_loop = scalar_stmt # (scalar) STMT_INFO
5859 loop_exit:
5860 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5861 use <s_out0>
5862 use <s_out0>
5863
5864 The above is transformed by this function into:
5865
5866 loop:
5867 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5868 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5869 s_loop = scalar_stmt # (scalar) STMT_INFO
5870 loop_exit:
5871 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5872 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5873 v_out2 = reduce <v_out1>
5874 s_out3 = extract_field <v_out2, 0>
5875 s_out4 = adjust_result <s_out3>
5876 use <s_out4>
5877 use <s_out4>
5878*/
5879
5880static void
5881vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5882 stmt_vec_info stmt_info,
5883 slp_tree slp_node,
5884 slp_instance slp_node_instance)
5885{
5886 stmt_vec_info reduc_info = info_for_reduction (vinfo: loop_vinfo, stmt_info);
5887 gcc_assert (reduc_info->is_reduc_info);
5888 /* For double reductions we need to get at the inner loop reduction
5889 stmt which has the meta info attached. Our stmt_info is that of the
5890 loop-closed PHI of the inner loop which we remember as
5891 def for the reduction PHI generation. */
5892 bool double_reduc = false;
5893 stmt_vec_info rdef_info = stmt_info;
5894 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5895 {
5896 gcc_assert (!slp_node);
5897 double_reduc = true;
5898 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5899 (gs: stmt_info->stmt, index: 0));
5900 stmt_info = vect_stmt_to_vectorize (stmt_info);
5901 }
5902 gphi *reduc_def_stmt
5903 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5904 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5905 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5906 tree vectype;
5907 machine_mode mode;
5908 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5909 basic_block exit_bb;
5910 tree scalar_dest;
5911 tree scalar_type;
5912 gimple *new_phi = NULL, *phi = NULL;
5913 gimple_stmt_iterator exit_gsi;
5914 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5915 gimple *epilog_stmt = NULL;
5916 gimple *exit_phi;
5917 tree bitsize;
5918 tree def;
5919 tree orig_name, scalar_result;
5920 imm_use_iterator imm_iter, phi_imm_iter;
5921 use_operand_p use_p, phi_use_p;
5922 gimple *use_stmt;
5923 auto_vec<tree> reduc_inputs;
5924 int j, i;
5925 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5926 unsigned int group_size = 1, k;
5927 auto_vec<gimple *> phis;
5928 /* SLP reduction without reduction chain, e.g.,
5929 # a1 = phi <a2, a0>
5930 # b1 = phi <b2, b0>
5931 a2 = operation (a1)
5932 b2 = operation (b1) */
5933 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5934 bool direct_slp_reduc;
5935 tree induction_index = NULL_TREE;
5936
5937 if (slp_node)
5938 group_size = SLP_TREE_LANES (slp_node);
5939
5940 if (nested_in_vect_loop_p (loop, stmt_info))
5941 {
5942 outer_loop = loop;
5943 loop = loop->inner;
5944 gcc_assert (!slp_node && double_reduc);
5945 }
5946
5947 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5948 gcc_assert (vectype);
5949 mode = TYPE_MODE (vectype);
5950
5951 tree induc_val = NULL_TREE;
5952 tree adjustment_def = NULL;
5953 if (slp_node)
5954 ;
5955 else
5956 {
5957 /* Optimize: for induction condition reduction, if we can't use zero
5958 for induc_val, use initial_def. */
5959 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5960 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5961 else if (double_reduc)
5962 ;
5963 else
5964 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5965 }
5966
5967 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5968 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5969 if (slp_reduc)
5970 /* All statements produce live-out values. */
5971 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5972 else if (slp_node)
5973 {
5974 /* The last statement in the reduction chain produces the live-out
5975 value. Note SLP optimization can shuffle scalar stmts to
5976 optimize permutations so we have to search for the last stmt. */
5977 for (k = 0; k < group_size; ++k)
5978 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5979 {
5980 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5981 break;
5982 }
5983 }
5984
5985 unsigned vec_num;
5986 int ncopies;
5987 if (slp_node)
5988 {
5989 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5990 ncopies = 1;
5991 }
5992 else
5993 {
5994 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5995 vec_num = 1;
5996 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5997 }
5998
5999 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6000 which is updated with the current index of the loop for every match of
6001 the original loop's cond_expr (VEC_STMT). This results in a vector
6002 containing the last time the condition passed for that vector lane.
6003 The first match will be a 1 to allow 0 to be used for non-matching
6004 indexes. If there are no matches at all then the vector will be all
6005 zeroes.
6006
6007 PR92772: This algorithm is broken for architectures that support
6008 masked vectors, but do not provide fold_extract_last. */
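  /* As an illustration with a hypothetical VF of 4: the index IV takes the
     values {1, 2, 3, 4}, {5, 6, 7, 8}, ... so a lane whose condition last
     matched in scalar iteration n ends up holding n + 1, while lanes that
     never matched keep the value 0.  */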
6009 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6010 {
6011 auto_vec<std::pair<tree, bool>, 2> ccompares;
6012 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6013 cond_info = vect_stmt_to_vectorize (stmt_info: cond_info);
6014 while (cond_info != reduc_info)
6015 {
6016 if (gimple_assign_rhs_code (gs: cond_info->stmt) == COND_EXPR)
6017 {
6018 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6019 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6020 ccompares.safe_push
6021 (obj: std::make_pair (x: unshare_expr (gimple_assign_rhs1 (gs: vec_stmt)),
6022 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6023 }
6024 cond_info
6025 = loop_vinfo->lookup_def (gimple_op (gs: cond_info->stmt,
6026 i: 1 + STMT_VINFO_REDUC_IDX
6027 (cond_info)));
6028 cond_info = vect_stmt_to_vectorize (stmt_info: cond_info);
6029 }
6030 gcc_assert (ccompares.length () != 0);
6031
6032 tree indx_before_incr, indx_after_incr;
6033 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (node: vectype);
6034 int scalar_precision
6035 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6036 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6037 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6038 (TYPE_MODE (vectype), cr_index_scalar_type,
6039 TYPE_VECTOR_SUBPARTS (node: vectype));
6040
6041 /* First we create a simple vector induction variable which starts
6042 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6043 vector size (STEP). */
6044
6045 /* Create a {1,2,3,...} vector. */
6046 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6047
6048 /* Create a vector of the step value. */
6049 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6050 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6051
6052 /* Create an induction variable. */
6053 gimple_stmt_iterator incr_gsi;
6054 bool insert_after;
6055 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6056 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6057 insert_after, &indx_before_incr, &indx_after_incr);
6058
6059 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6060 filled with zeros (VEC_ZERO). */
6061
6062 /* Create a vector of 0s. */
6063 tree zero = build_zero_cst (cr_index_scalar_type);
6064 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6065
6066 /* Create a vector phi node. */
6067 tree new_phi_tree = make_ssa_name (var: cr_index_vector_type);
6068 new_phi = create_phi_node (new_phi_tree, loop->header);
6069 add_phi_arg (as_a <gphi *> (p: new_phi), vec_zero,
6070 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6071
6072      /* Now take the condition from the loop's original cond_exprs
6073 	 and produce a new cond_expr (INDEX_COND_EXPR) which for
6074 every match uses values from the induction variable
6075 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6076 (NEW_PHI_TREE).
6077 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6078 the new cond_expr (INDEX_COND_EXPR). */
6079 gimple_seq stmts = NULL;
6080 for (int i = ccompares.length () - 1; i != -1; --i)
6081 {
6082 tree ccompare = ccompares[i].first;
6083 if (ccompares[i].second)
6084 new_phi_tree = gimple_build (seq: &stmts, code: VEC_COND_EXPR,
6085 type: cr_index_vector_type,
6086 ops: ccompare,
6087 ops: indx_before_incr, ops: new_phi_tree);
6088 else
6089 new_phi_tree = gimple_build (seq: &stmts, code: VEC_COND_EXPR,
6090 type: cr_index_vector_type,
6091 ops: ccompare,
6092 ops: new_phi_tree, ops: indx_before_incr);
6093 }
6094 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6095
6096 /* Update the phi with the vec cond. */
6097 induction_index = new_phi_tree;
6098 add_phi_arg (as_a <gphi *> (p: new_phi), induction_index,
6099 loop_latch_edge (loop), UNKNOWN_LOCATION);
6100 }
6101
6102 /* 2. Create epilog code.
6103 The reduction epilog code operates across the elements of the vector
6104 of partial results computed by the vectorized loop.
6105 The reduction epilog code consists of:
6106
6107 step 1: compute the scalar result in a vector (v_out2)
6108 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6109 step 3: adjust the scalar result (s_out3) if needed.
6110
6111      Step 1 can be accomplished using one of the following three schemes:
6112 (scheme 1) using reduc_fn, if available.
6113 (scheme 2) using whole-vector shifts, if available.
6114 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6115 combined.
6116
6117 The overall epilog code looks like this:
6118
6119 s_out0 = phi <s_loop> # original EXIT_PHI
6120 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6121 v_out2 = reduce <v_out1> # step 1
6122 s_out3 = extract_field <v_out2, 0> # step 2
6123 s_out4 = adjust_result <s_out3> # step 3
6124
6125 (step 3 is optional, and steps 1 and 2 may be combined).
6126 Lastly, the uses of s_out0 are replaced by s_out4. */
6127
6128
6129 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6130 v_out1 = phi <VECT_DEF>
6131 Store them in NEW_PHIS. */
6132 if (double_reduc)
6133 loop = outer_loop;
6134 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6135 exit_gsi = gsi_after_labels (bb: exit_bb);
6136 reduc_inputs.create (nelems: slp_node ? vec_num : ncopies);
6137 for (unsigned i = 0; i < vec_num; i++)
6138 {
6139 gimple_seq stmts = NULL;
6140 if (slp_node)
6141 def = vect_get_slp_vect_def (slp_node, i);
6142 else
6143 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6144 for (j = 0; j < ncopies; j++)
6145 {
6146 tree new_def = copy_ssa_name (var: def);
6147 phi = create_phi_node (new_def, exit_bb);
6148 if (j)
6149 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6150 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6151 new_def = gimple_convert (seq: &stmts, type: vectype, op: new_def);
6152 reduc_inputs.quick_push (obj: new_def);
6153 }
6154 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6155 }
6156
6157 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6158 (i.e. when reduc_fn is not available) and in the final adjustment
6159 code (if needed). Also get the original scalar reduction variable as
6160      defined in the loop.  In case STMT is a "pattern-stmt" (i.e., it
6161 represents a reduction pattern), the tree-code and scalar-def are
6162 taken from the original stmt that the pattern-stmt (STMT) replaces.
6163 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6164 are taken from STMT. */
6165
6166 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6167 if (orig_stmt_info != stmt_info)
6168 {
6169 /* Reduction pattern */
6170 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6171 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6172 }
6173
6174 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6175 scalar_type = TREE_TYPE (scalar_dest);
6176 scalar_results.truncate (size: 0);
6177 scalar_results.reserve_exact (nelems: group_size);
6178 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6179 bitsize = TYPE_SIZE (scalar_type);
6180
6181 /* True if we should implement SLP_REDUC using native reduction operations
6182 instead of scalar operations. */
6183 direct_slp_reduc = (reduc_fn != IFN_LAST
6184 && slp_reduc
6185 && !TYPE_VECTOR_SUBPARTS (node: vectype).is_constant ());
6186
6187 /* In case of reduction chain, e.g.,
6188 # a1 = phi <a3, a0>
6189 a2 = operation (a1)
6190 a3 = operation (a2),
6191
6192 we may end up with more than one vector result. Here we reduce them
6193 to one vector.
6194
6195 The same is true for a SLP reduction, e.g.,
6196 # a1 = phi <a2, a0>
6197 # b1 = phi <b2, b0>
6198 a2 = operation (a1)
6199        b2 = operation (b1),
6200
6201 where we can end up with more than one vector as well. We can
6202 easily accumulate vectors when the number of vector elements is
6203 a multiple of the SLP group size.
6204
6205      The same is true if we couldn't use a single def-use cycle.  */
6206 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6207 || direct_slp_reduc
6208 || (slp_reduc
6209 && constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vectype), b: group_size))
6210 || ncopies > 1)
6211 {
6212 gimple_seq stmts = NULL;
6213 tree single_input = reduc_inputs[0];
6214 for (k = 1; k < reduc_inputs.length (); k++)
6215 single_input = gimple_build (seq: &stmts, code, type: vectype,
6216 ops: single_input, ops: reduc_inputs[k]);
6217 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6218
6219 reduc_inputs.truncate (size: 0);
6220 reduc_inputs.safe_push (obj: single_input);
6221 }
6222
6223 tree orig_reduc_input = reduc_inputs[0];
6224
6225 /* If this loop is an epilogue loop that can be skipped after the
6226 main loop, we can only share a reduction operation between the
6227 main loop and the epilogue if we put it at the target of the
6228 skip edge.
6229
6230 We can still reuse accumulators if this check fails. Doing so has
6231 the minor(?) benefit of making the epilogue loop's scalar result
6232 independent of the main loop's scalar result. */
6233 bool unify_with_main_loop_p = false;
6234 if (reduc_info->reused_accumulator
6235 && loop_vinfo->skip_this_loop_edge
6236 && single_succ_p (bb: exit_bb)
6237 && single_succ (bb: exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6238 {
6239 unify_with_main_loop_p = true;
6240
6241 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6242 reduc_inputs[0] = make_ssa_name (var: vectype);
6243 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6244 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (bb: exit_bb),
6245 UNKNOWN_LOCATION);
6246 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6247 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6248 exit_gsi = gsi_after_labels (bb: reduc_block);
6249 }
6250
6251 /* Shouldn't be used beyond this point. */
6252 exit_bb = nullptr;
6253
6254 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6255 && reduc_fn != IFN_LAST)
6256 {
6257 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6258 various data values where the condition matched and another vector
6259 (INDUCTION_INDEX) containing all the indexes of those matches. We
6260 need to extract the last matching index (which will be the index with
6261 highest value) and use this to index into the data vector.
6262 For the case where there were no matches, the data vector will contain
6263 all default values and the index vector will be all zeros. */
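      /* Purely as an illustration: with INDUCTION_INDEX = {0, 7, 0, 3} and a
	 data vector {d0, d1, d2, d3}, the REDUC_MAX below yields 7, the
	 comparison gives the mask {0, 1, 0, 0}, the VEC_COND keeps
	 {0, d1, 0, 0}, and the final unsigned REDUC_MAX extracts d1, the
	 value from the last matching lane.  */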
6264
6265 /* Get various versions of the type of the vector of indexes. */
6266 tree index_vec_type = TREE_TYPE (induction_index);
6267 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6268 tree index_scalar_type = TREE_TYPE (index_vec_type);
6269 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6270
6271 /* Get an unsigned integer version of the type of the data vector. */
6272 int scalar_precision
6273 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6274 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6275 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6276 vectype);
6277
6278 /* First we need to create a vector (ZERO_VEC) of zeros and another
6279 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6280 can create using a MAX reduction and then expanding.
6281 In the case where the loop never made any matches, the max index will
6282 be zero. */
6283
6284 /* Vector of {0, 0, 0,...}. */
6285 tree zero_vec = build_zero_cst (vectype);
6286
6287 /* Find maximum value from the vector of found indexes. */
6288 tree max_index = make_ssa_name (var: index_scalar_type);
6289 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6290 1, induction_index);
6291 gimple_call_set_lhs (gs: max_index_stmt, lhs: max_index);
6292 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6293
6294 /* Vector of {max_index, max_index, max_index,...}. */
6295 tree max_index_vec = make_ssa_name (var: index_vec_type);
6296 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6297 max_index);
6298 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6299 max_index_vec_rhs);
6300 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6301
6302 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6303 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6304 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6305 otherwise. Only one value should match, resulting in a vector
6306 (VEC_COND) with one data value and the rest zeros.
6307 In the case where the loop never made any matches, every index will
6308 match, resulting in a vector with all data values (which will all be
6309 the default value). */
6310
6311 /* Compare the max index vector to the vector of found indexes to find
6312 the position of the max value. */
6313 tree vec_compare = make_ssa_name (var: index_vec_cmp_type);
6314 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6315 induction_index,
6316 max_index_vec);
6317 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6318
6319 /* Use the compare to choose either values from the data vector or
6320 zero. */
6321 tree vec_cond = make_ssa_name (var: vectype);
6322 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6323 vec_compare,
6324 reduc_inputs[0],
6325 zero_vec);
6326 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6327
      /* Finally we need to extract the data value from the vector (VEC_COND)
	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
	 reduction, but because this doesn't exist, we can use a MAX reduction
	 instead.  The data value might be signed or a float, so we need to
	 cast it first.
	 In the case where the loop never made any matches, the data values
	 are all identical, and so will reduce down correctly.  */
6335
6336 /* Make the matched data values unsigned. */
6337 tree vec_cond_cast = make_ssa_name (var: vectype_unsigned);
6338 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6339 vec_cond);
6340 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6341 VIEW_CONVERT_EXPR,
6342 vec_cond_cast_rhs);
6343 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6344
6345 /* Reduce down to a scalar value. */
6346 tree data_reduc = make_ssa_name (var: scalar_type_unsigned);
6347 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6348 1, vec_cond_cast);
6349 gimple_call_set_lhs (gs: data_reduc_stmt, lhs: data_reduc);
6350 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6351
6352 /* Convert the reduced value back to the result type and set as the
6353 result. */
6354 gimple_seq stmts = NULL;
6355 new_temp = gimple_build (seq: &stmts, code: VIEW_CONVERT_EXPR, type: scalar_type,
6356 ops: data_reduc);
6357 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6358 scalar_results.safe_push (obj: new_temp);
6359 }
6360 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6361 && reduc_fn == IFN_LAST)
6362 {
6363 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6364 idx = 0;
6365 idx_val = induction_index[0];
6366 val = data_reduc[0];
6367 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6368 if (induction_index[i] > idx_val)
6369 val = data_reduc[i], idx_val = induction_index[i];
6370 return val; */
6371
6372 tree data_eltype = TREE_TYPE (vectype);
6373 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6374 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6375 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6376 /* Enforced by vectorizable_reduction, which ensures we have target
6377 support before allowing a conditional reduction on variable-length
6378 vectors. */
6379 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6380 tree idx_val = NULL_TREE, val = NULL_TREE;
6381 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6382 {
6383 tree old_idx_val = idx_val;
6384 tree old_val = val;
6385 idx_val = make_ssa_name (var: idx_eltype);
6386 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6387 build3 (BIT_FIELD_REF, idx_eltype,
6388 induction_index,
6389 bitsize_int (el_size),
6390 bitsize_int (off)));
6391 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6392 val = make_ssa_name (var: data_eltype);
6393 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6394 build3 (BIT_FIELD_REF,
6395 data_eltype,
6396 reduc_inputs[0],
6397 bitsize_int (el_size),
6398 bitsize_int (off)));
6399 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6400 if (off != 0)
6401 {
6402 tree new_idx_val = idx_val;
6403 if (off != v_size - el_size)
6404 {
6405 new_idx_val = make_ssa_name (var: idx_eltype);
6406 epilog_stmt = gimple_build_assign (new_idx_val,
6407 MAX_EXPR, idx_val,
6408 old_idx_val);
6409 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6410 }
6411 tree cond = make_ssa_name (boolean_type_node);
6412 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6413 idx_val, old_idx_val);
6414 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6415 tree new_val = make_ssa_name (var: data_eltype);
6416 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6417 cond, val, old_val);
6418 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6419 idx_val = new_idx_val;
6420 val = new_val;
6421 }
6422 }
6423 /* Convert the reduced value back to the result type and set as the
6424 result. */
6425 gimple_seq stmts = NULL;
6426 val = gimple_convert (seq: &stmts, type: scalar_type, op: val);
6427 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6428 scalar_results.safe_push (obj: val);
6429 }
6430
6431 /* 2.3 Create the reduction code, using one of the three schemes described
6432 above. In SLP we simply need to extract all the elements from the
6433 vector (without reducing them), so we use scalar shifts. */
6434 else if (reduc_fn != IFN_LAST && !slp_reduc)
6435 {
6436 tree tmp;
6437 tree vec_elem_type;
6438
6439 /* Case 1: Create:
6440 v_out2 = reduc_expr <v_out1> */
6441
6442 if (dump_enabled_p ())
6443 dump_printf_loc (MSG_NOTE, vect_location,
6444 "Reduce using direct vector reduction.\n");
6445
6446 gimple_seq stmts = NULL;
6447 vec_elem_type = TREE_TYPE (vectype);
6448 new_temp = gimple_build (seq: &stmts, fn: as_combined_fn (fn: reduc_fn),
6449 type: vec_elem_type, args: reduc_inputs[0]);
6450 new_temp = gimple_convert (seq: &stmts, type: scalar_type, op: new_temp);
6451 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6452
6453 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6454 && induc_val)
6455 {
	  /* Earlier we set the initial value to be a vector of induc_val
	     values.  Check the result and if it is induc_val then replace
	     it with the original initial value, unless induc_val is
	     the same as initial_def already.  */
6460 tree zcompare = make_ssa_name (boolean_type_node);
6461 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6462 new_temp, induc_val);
6463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6464 tree initial_def = reduc_info->reduc_initial_values[0];
6465 tmp = make_ssa_name (var: new_scalar_dest);
6466 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6467 initial_def, new_temp);
6468 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6469 new_temp = tmp;
6470 }
6471
6472 scalar_results.safe_push (obj: new_temp);
6473 }
6474 else if (direct_slp_reduc)
6475 {
6476 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6477 with the elements for other SLP statements replaced with the
6478 neutral value. We can then do a normal reduction on each vector. */
6479
6480 /* Enforced by vectorizable_reduction. */
6481 gcc_assert (reduc_inputs.length () == 1);
6482 gcc_assert (pow2p_hwi (group_size));
6483
6484 gimple_seq seq = NULL;
6485
6486 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6487 and the same element size as VECTYPE. */
6488 tree index = build_index_vector (vectype, 0, 1);
6489 tree index_type = TREE_TYPE (index);
6490 tree index_elt_type = TREE_TYPE (index_type);
6491 tree mask_type = truth_type_for (index_type);
6492
6493 /* Create a vector that, for each element, identifies which of
6494 the REDUC_GROUP_SIZE results should use it. */
6495 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6496 index = gimple_build (seq: &seq, code: BIT_AND_EXPR, type: index_type, ops: index,
6497 ops: build_vector_from_val (index_type, index_mask));
6498
6499 /* Get a neutral vector value. This is simply a splat of the neutral
6500 scalar value if we have one, otherwise the initial scalar value
6501 is itself a neutral value. */
6502 tree vector_identity = NULL_TREE;
6503 tree neutral_op = NULL_TREE;
6504 if (slp_node)
6505 {
6506 tree initial_value = NULL_TREE;
6507 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6508 initial_value = reduc_info->reduc_initial_values[0];
6509 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6510 initial_value, as_initial: false);
6511 }
6512 if (neutral_op)
6513 vector_identity = gimple_build_vector_from_val (seq: &seq, type: vectype,
6514 op: neutral_op);
6515 for (unsigned int i = 0; i < group_size; ++i)
6516 {
	  /* If there's no universal neutral value, we can use the
	     initial scalar value from the original PHI.  This is used
	     for MIN and MAX reductions, for example.  */
6520 if (!neutral_op)
6521 {
6522 tree scalar_value = reduc_info->reduc_initial_values[i];
6523 scalar_value = gimple_convert (seq: &seq, TREE_TYPE (vectype),
6524 op: scalar_value);
6525 vector_identity = gimple_build_vector_from_val (seq: &seq, type: vectype,
6526 op: scalar_value);
6527 }
6528
6529 /* Calculate the equivalent of:
6530
6531 sel[j] = (index[j] == i);
6532
6533 which selects the elements of REDUC_INPUTS[0] that should
6534 be included in the result. */
6535 tree compare_val = build_int_cst (index_elt_type, i);
6536 compare_val = build_vector_from_val (index_type, compare_val);
6537 tree sel = gimple_build (seq: &seq, code: EQ_EXPR, type: mask_type,
6538 ops: index, ops: compare_val);
6539
	  /* Calculate the equivalent of:

	       vec = sel ? reduc_inputs[0] : vector_identity;

	     VEC is now suitable for a full vector reduction.  */
6545 tree vec = gimple_build (seq: &seq, code: VEC_COND_EXPR, type: vectype,
6546 ops: sel, ops: reduc_inputs[0], ops: vector_identity);
6547
6548 /* Do the reduction and convert it to the appropriate type. */
6549 tree scalar = gimple_build (seq: &seq, fn: as_combined_fn (fn: reduc_fn),
6550 TREE_TYPE (vectype), args: vec);
6551 scalar = gimple_convert (seq: &seq, type: scalar_type, op: scalar);
6552 scalar_results.safe_push (obj: scalar);
6553 }
6554 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6555 }
6556 else
6557 {
6558 bool reduce_with_shift;
6559 tree vec_temp;
6560
6561 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6562
6563 /* See if the target wants to do the final (shift) reduction
6564 in a vector mode of smaller size and first reduce upper/lower
6565 halves against each other. */
6566 enum machine_mode mode1 = mode;
6567 tree stype = TREE_TYPE (vectype);
6568 unsigned nunits = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant ();
6569 unsigned nunits1 = nunits;
6570 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6571 && reduc_inputs.length () == 1)
6572 {
6573 nunits1 = GET_MODE_NUNITS (mode: mode1).to_constant ();
	  /* For SLP reductions we have to make sure lanes match up, but
	     since we're doing an individual-element final reduction, reducing
	     the vector width here is even more important.
	     ??? We can also separate lanes with permutes; for the common
	     case of a power-of-two group size, odd/even extracts would work.  */
6579 if (slp_reduc && nunits != nunits1)
6580 {
6581 nunits1 = least_common_multiple (nunits1, group_size);
6582 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6583 }
6584 }
6585 if (!slp_reduc
6586 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6587 nunits1 = GET_MODE_NUNITS (mode: mode1).to_constant ();
6588
6589 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6590 stype, nunits1);
6591 reduce_with_shift = have_whole_vector_shift (mode: mode1);
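      /* A whole-vector shift is only usable if MODE1 is really a vector mode
	 and the reduction operation itself is supported on VECTYPE1.  */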
6592 if (!VECTOR_MODE_P (mode1)
6593 || !directly_supported_p (code, vectype1))
6594 reduce_with_shift = false;
6595
6596 /* First reduce the vector to the desired vector size we should
6597 do shift reduction on by combining upper and lower halves. */
6598 gimple_seq stmts = NULL;
6599 new_temp = vect_create_partial_epilog (vec_def: reduc_inputs[0], vectype: vectype1,
6600 code, seq: &stmts);
6601 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6602 reduc_inputs[0] = new_temp;
6603
6604 if (reduce_with_shift && !slp_reduc)
6605 {
6606 int element_bitsize = tree_to_uhwi (bitsize);
6607 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6608 for variable-length vectors and also requires direct target support
6609 for loop reductions. */
6610 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6611 int nelements = vec_size_in_bits / element_bitsize;
6612 vec_perm_builder sel;
6613 vec_perm_indices indices;
6614
6615 int elt_offset;
6616
6617 tree zero_vec = build_zero_cst (vectype1);
6618 /* Case 2: Create:
6619 for (offset = nelements/2; offset >= 1; offset/=2)
6620 {
6621 Create: va' = vec_shift <va, offset>
6622 Create: va = vop <va, va'>
6623 } */
6624
6625 tree rhs;
6626
6627 if (dump_enabled_p ())
6628 dump_printf_loc (MSG_NOTE, vect_location,
6629 "Reduce using vector shifts\n");
6630
6631 gimple_seq stmts = NULL;
6632 new_temp = gimple_convert (seq: &stmts, type: vectype1, op: new_temp);
6633 for (elt_offset = nelements / 2;
6634 elt_offset >= 1;
6635 elt_offset /= 2)
6636 {
6637 calc_vec_perm_mask_for_shift (offset: elt_offset, nelt: nelements, sel: &sel);
6638 indices.new_vector (sel, 2, nelements);
6639 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6640 new_name = gimple_build (seq: &stmts, code: VEC_PERM_EXPR, type: vectype1,
6641 ops: new_temp, ops: zero_vec, ops: mask);
6642 new_temp = gimple_build (seq: &stmts, code,
6643 type: vectype1, ops: new_name, ops: new_temp);
6644 }
6645 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6646
6647 /* 2.4 Extract the final scalar result. Create:
6648 s_out3 = extract_field <v_out2, bitpos> */
6649
6650 if (dump_enabled_p ())
6651 dump_printf_loc (MSG_NOTE, vect_location,
6652 "extract scalar result\n");
6653
6654 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6655 bitsize, bitsize_zero_node);
6656 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6657 new_temp = make_ssa_name (var: new_scalar_dest, stmt: epilog_stmt);
6658 gimple_assign_set_lhs (gs: epilog_stmt, lhs: new_temp);
6659 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6660 scalar_results.safe_push (obj: new_temp);
6661 }
6662 else
6663 {
6664 /* Case 3: Create:
6665 s = extract_field <v_out2, 0>
6666 for (offset = element_size;
6667 offset < vector_size;
6668 offset += element_size;)
6669 {
6670 Create: s' = extract_field <v_out2, offset>
6671 Create: s = op <s, s'> // For non SLP cases
6672 } */
6673
6674 if (dump_enabled_p ())
6675 dump_printf_loc (MSG_NOTE, vect_location,
6676 "Reduce using scalar code.\n");
6677
6678 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6679 int element_bitsize = tree_to_uhwi (bitsize);
6680 tree compute_type = TREE_TYPE (vectype);
6681 gimple_seq stmts = NULL;
6682 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6683 {
6684 int bit_offset;
6685 new_temp = gimple_build (seq: &stmts, code: BIT_FIELD_REF, type: compute_type,
6686 ops: vec_temp, ops: bitsize, bitsize_zero_node);
6687
6688 /* In SLP we don't need to apply reduction operation, so we just
6689 collect s' values in SCALAR_RESULTS. */
6690 if (slp_reduc)
6691 scalar_results.safe_push (obj: new_temp);
6692
6693 for (bit_offset = element_bitsize;
6694 bit_offset < vec_size_in_bits;
6695 bit_offset += element_bitsize)
6696 {
6697 tree bitpos = bitsize_int (bit_offset);
6698 new_name = gimple_build (seq: &stmts, code: BIT_FIELD_REF,
6699 type: compute_type, ops: vec_temp,
6700 ops: bitsize, ops: bitpos);
6701 if (slp_reduc)
6702 {
6703 /* In SLP we don't need to apply reduction operation, so
6704 we just collect s' values in SCALAR_RESULTS. */
6705 new_temp = new_name;
6706 scalar_results.safe_push (obj: new_name);
6707 }
6708 else
6709 new_temp = gimple_build (seq: &stmts, code, type: compute_type,
6710 ops: new_name, ops: new_temp);
6711 }
6712 }
6713
	  /* The only case where we need to reduce scalar results in SLP is
	     unrolling.  If the size of SCALAR_RESULTS is greater than
	     REDUC_GROUP_SIZE, we reduce them by combining elements modulo
	     REDUC_GROUP_SIZE.  */
6718 if (slp_reduc)
6719 {
6720 tree res, first_res, new_res;
6721
6722 /* Reduce multiple scalar results in case of SLP unrolling. */
6723 for (j = group_size; scalar_results.iterate (ix: j, ptr: &res);
6724 j++)
6725 {
6726 first_res = scalar_results[j % group_size];
6727 new_res = gimple_build (seq: &stmts, code, type: compute_type,
6728 ops: first_res, ops: res);
6729 scalar_results[j % group_size] = new_res;
6730 }
6731 scalar_results.truncate (size: group_size);
6732 for (k = 0; k < group_size; k++)
6733 scalar_results[k] = gimple_convert (seq: &stmts, type: scalar_type,
6734 op: scalar_results[k]);
6735 }
6736 else
6737 {
6738 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6739 new_temp = gimple_convert (seq: &stmts, type: scalar_type, op: new_temp);
6740 scalar_results.safe_push (obj: new_temp);
6741 }
6742
6743 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6744 }
6745
6746 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6747 && induc_val)
6748 {
      /* Earlier we set the initial value to be a vector of induc_val
	 values.  Check the result and if it is induc_val then replace
	 it with the original initial value, unless induc_val is
	 the same as initial_def already.  */
6753 tree zcompare = make_ssa_name (boolean_type_node);
6754 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6755 induc_val);
6756 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6757 tree initial_def = reduc_info->reduc_initial_values[0];
6758 tree tmp = make_ssa_name (var: new_scalar_dest);
6759 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6760 initial_def, new_temp);
6761 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6762 scalar_results[0] = tmp;
6763 }
6764 }
6765
6766 /* 2.5 Adjust the final result by the initial value of the reduction
6767 variable. (When such adjustment is not needed, then
6768 'adjustment_def' is zero). For example, if code is PLUS we create:
6769 new_temp = loop_exit_def + adjustment_def */
6770
6771 if (adjustment_def)
6772 {
6773 gcc_assert (!slp_reduc);
6774 gimple_seq stmts = NULL;
6775 if (double_reduc)
6776 {
6777 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6778 adjustment_def = gimple_convert (seq: &stmts, type: vectype, op: adjustment_def);
6779 new_temp = gimple_build (seq: &stmts, code, type: vectype,
6780 ops: reduc_inputs[0], ops: adjustment_def);
6781 }
6782 else
6783 {
6784 new_temp = scalar_results[0];
6785 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6786 adjustment_def = gimple_convert (seq: &stmts, TREE_TYPE (vectype),
6787 op: adjustment_def);
6788 new_temp = gimple_convert (seq: &stmts, TREE_TYPE (vectype), op: new_temp);
6789 new_temp = gimple_build (seq: &stmts, code, TREE_TYPE (vectype),
6790 ops: new_temp, ops: adjustment_def);
6791 new_temp = gimple_convert (seq: &stmts, type: scalar_type, op: new_temp);
6792 }
6793
6794 epilog_stmt = gimple_seq_last_stmt (s: stmts);
6795 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6796 scalar_results[0] = new_temp;
6797 }
6798
6799 /* Record this operation if it could be reused by the epilogue loop. */
6800 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6801 && reduc_inputs.length () == 1)
    loop_vinfo->reusable_accumulators.put (scalar_results[0],
					   { orig_reduc_input, reduc_info });
6804
6805 if (double_reduc)
6806 loop = outer_loop;
6807
6808 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6809 phis with new adjusted scalar results, i.e., replace use <s_out0>
6810 with use <s_out4>.
6811
6812 Transform:
6813 loop_exit:
6814 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6815 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6816 v_out2 = reduce <v_out1>
6817 s_out3 = extract_field <v_out2, 0>
6818 s_out4 = adjust_result <s_out3>
6819 use <s_out0>
6820 use <s_out0>
6821
6822 into:
6823
6824 loop_exit:
6825 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6826 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6827 v_out2 = reduce <v_out1>
6828 s_out3 = extract_field <v_out2, 0>
6829 s_out4 = adjust_result <s_out3>
6830 use <s_out4>
6831 use <s_out4> */
6832
6833 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6834 for (k = 0; k < live_out_stmts.size (); k++)
6835 {
6836 stmt_vec_info scalar_stmt_info = vect_orig_stmt (stmt_info: live_out_stmts[k]);
6837 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6838
6839 phis.create (nelems: 3);
6840 /* Find the loop-closed-use at the loop exit of the original scalar
6841 result. (The reduction result is expected to have two immediate uses,
6842 one at the latch block, and one at the loop exit). For double
6843 reductions we are looking for exit phis of the outer loop. */
6844 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6845 {
6846 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6847 {
6848 if (!is_gimple_debug (USE_STMT (use_p)))
6849 phis.safe_push (USE_STMT (use_p));
6850 }
6851 else
6852 {
6853 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6854 {
6855 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6856
6857 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6858 {
6859 if (!flow_bb_inside_loop_p (loop,
6860 gimple_bb (USE_STMT (phi_use_p)))
6861 && !is_gimple_debug (USE_STMT (phi_use_p)))
6862 phis.safe_push (USE_STMT (phi_use_p));
6863 }
6864 }
6865 }
6866 }
6867
6868 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6869 {
6870 /* Replace the uses: */
6871 orig_name = PHI_RESULT (exit_phi);
6872
6873 /* Look for a single use at the target of the skip edge. */
6874 if (unify_with_main_loop_p)
6875 {
6876 use_operand_p use_p;
6877 gimple *user;
6878 if (!single_imm_use (var: orig_name, use_p: &use_p, stmt: &user))
6879 gcc_unreachable ();
6880 orig_name = gimple_get_lhs (user);
6881 }
6882
6883 scalar_result = scalar_results[k];
6884 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6885 {
6886 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6887 SET_USE (use_p, scalar_result);
6888 update_stmt (s: use_stmt);
6889 }
6890 }
6891
6892 phis.release ();
6893 }
6894}
6895
6896/* Return a vector of type VECTYPE that is equal to the vector select
6897 operation "MASK ? VEC : IDENTITY". Insert the select statements
6898 before GSI. */
6899
6900static tree
6901merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6902 tree vec, tree identity)
6903{
6904 tree cond = make_temp_ssa_name (type: vectype, NULL, name: "cond");
6905 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6906 mask, vec, identity);
6907 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6908 return cond;
6909}
6910
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   If MASK is nonzero, mask the input and then operate on it unconditionally.
   Return the SSA name for the result.  */
6916
6917static tree
6918vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6919 tree_code code, tree lhs, tree vector_rhs,
6920 tree mask)
6921{
6922 tree vectype = TREE_TYPE (vector_rhs);
6923 tree scalar_type = TREE_TYPE (vectype);
6924 tree bitsize = TYPE_SIZE (scalar_type);
6925 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6926 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6927
6928 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6929 to perform an unconditional element-wise reduction of it. */
6930 if (mask)
6931 {
6932 tree masked_vector_rhs = make_temp_ssa_name (type: vectype, NULL,
6933 name: "masked_vector_rhs");
6934 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6935 as_initial: false);
6936 tree vector_identity = build_vector_from_val (vectype, neutral_op);
6937 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6938 mask, vector_rhs, vector_identity);
6939 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6940 vector_rhs = masked_vector_rhs;
6941 }
6942
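  /* Extract one element at a time with BIT_FIELD_REF and fold it into LHS
     with CODE, preserving the left-to-right order.  */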
6943 for (unsigned HOST_WIDE_INT bit_offset = 0;
6944 bit_offset < vec_size_in_bits;
6945 bit_offset += element_bitsize)
6946 {
6947 tree bitpos = bitsize_int (bit_offset);
6948 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6949 bitsize, bitpos);
6950
6951 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6952 rhs = make_ssa_name (var: scalar_dest, stmt);
6953 gimple_assign_set_lhs (gs: stmt, lhs: rhs);
6954 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6955
6956 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6957 tree new_name = make_ssa_name (var: scalar_dest, stmt);
6958 gimple_assign_set_lhs (gs: stmt, lhs: new_name);
6959 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6960 lhs = new_name;
6961 }
6962 return lhs;
6963}
6964
6965/* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6966 type of the vector input. */
6967
6968static internal_fn
6969get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6970{
6971 internal_fn mask_reduc_fn;
6972 internal_fn mask_len_reduc_fn;
6973
6974 switch (reduc_fn)
6975 {
6976 case IFN_FOLD_LEFT_PLUS:
6977 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6978 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6979 break;
6980
6981 default:
6982 return IFN_LAST;
6983 }
6984
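  /* Prefer the plain masked variant; fall back to the combined
     mask-and-length variant if only that one is supported.  */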
6985 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6986 OPTIMIZE_FOR_SPEED))
6987 return mask_reduc_fn;
6988 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6989 OPTIMIZE_FOR_SPEED))
6990 return mask_len_reduc_fn;
6991 return IFN_LAST;
6992}
6993
6994/* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6995 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6996 statement. CODE is the operation performed by STMT_INFO and OPS are
6997 its scalar operands. REDUC_INDEX is the index of the operand in
6998 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6999 implements in-order reduction, or IFN_LAST if we should open-code it.
7000 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7001 that should be used to control the operation in a fully-masked loop. */
7002
7003static bool
7004vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7005 stmt_vec_info stmt_info,
7006 gimple_stmt_iterator *gsi,
7007 gimple **vec_stmt, slp_tree slp_node,
7008 gimple *reduc_def_stmt,
7009 code_helper code, internal_fn reduc_fn,
7010 tree *ops, int num_ops, tree vectype_in,
7011 int reduc_index, vec_loop_masks *masks,
7012 vec_loop_lens *lens)
7013{
7014 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7015 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7016 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7017
7018 int ncopies;
7019 if (slp_node)
7020 ncopies = 1;
7021 else
7022 ncopies = vect_get_num_copies (loop_vinfo, vectype: vectype_in);
7023
7024 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7025 gcc_assert (ncopies == 1);
7026
7027 bool is_cond_op = false;
7028 if (!code.is_tree_code ())
7029 {
7030 code = conditional_internal_fn_code (internal_fn (code));
7031 gcc_assert (code != ERROR_MARK);
7032 is_cond_op = true;
7033 }
7034
7035 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7036
7037 if (slp_node)
7038 {
7039 if (is_cond_op)
7040 {
7041 if (dump_enabled_p ())
7042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7043 "fold-left reduction on SLP not supported.\n");
7044 return false;
7045 }
7046
7047 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7048 TYPE_VECTOR_SUBPARTS (vectype_in)));
7049 }
7050
7051 /* The operands either come from a binary operation or an IFN_COND operation.
7052 The former is a gimple assign with binary rhs and the latter is a
7053 gimple call with four arguments. */
7054 gcc_assert (num_ops == 2 || num_ops == 4);
7055 tree op0, opmask;
7056 if (!is_cond_op)
7057 op0 = ops[1 - reduc_index];
7058 else
7059 {
7060 op0 = ops[2];
7061 opmask = ops[0];
7062 gcc_assert (!slp_node);
7063 }
7064
7065 int group_size = 1;
7066 stmt_vec_info scalar_dest_def_info;
7067 auto_vec<tree> vec_oprnds0, vec_opmask;
7068 if (slp_node)
7069 {
7070 auto_vec<vec<tree> > vec_defs (2);
7071 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7072 vec_oprnds0.safe_splice (src: vec_defs[1 - reduc_index]);
7073 vec_defs[0].release ();
7074 vec_defs[1].release ();
7075 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7076 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7077 }
7078 else
7079 {
7080 vect_get_vec_defs_for_operand (vinfo: loop_vinfo, stmt_info, 1,
7081 op: op0, &vec_oprnds0);
7082 scalar_dest_def_info = stmt_info;
7083
7084 /* For an IFN_COND_OP we also need the vector mask operand. */
7085 if (is_cond_op)
7086 vect_get_vec_defs_for_operand (vinfo: loop_vinfo, stmt_info, 1,
7087 op: opmask, &vec_opmask);
7088 }
7089
7090 gimple *sdef = scalar_dest_def_info->stmt;
7091 tree scalar_dest = gimple_get_lhs (sdef);
7092 tree scalar_type = TREE_TYPE (scalar_dest);
7093 tree reduc_var = gimple_phi_result (gs: reduc_def_stmt);
7094
7095 int vec_num = vec_oprnds0.length ();
7096 gcc_assert (vec_num == 1 || slp_node);
7097 tree vec_elem_type = TREE_TYPE (vectype_out);
7098 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7099
7100 tree vector_identity = NULL_TREE;
7101 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7102 {
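      /* The identity value for masked-off elements is zero, except that when
	 signed zeros are honored it must be -0.0, so that folding the
	 identity into the accumulator cannot turn a -0.0 result into +0.0.  */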
7103 vector_identity = build_zero_cst (vectype_out);
7104 if (!HONOR_SIGNED_ZEROS (vectype_out))
7105 ;
7106 else
7107 {
7108 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7109 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7110 vector_identity);
7111 }
7112 }
7113
7114 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7115 int i;
7116 tree def0;
7117 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7118 {
7119 gimple *new_stmt;
7120 tree mask = NULL_TREE;
7121 tree len = NULL_TREE;
7122 tree bias = NULL_TREE;
7123 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7124 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7125 else if (is_cond_op)
7126 mask = vec_opmask[0];
7127 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7128 {
7129 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7130 i, 1);
7131 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7132 bias = build_int_cst (intQI_type_node, biasval);
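	  /* The length-controlled variant still takes a mask operand; without
	     a conditional operation an all-ones mask is used so that only LEN
	     and BIAS limit the active elements.  */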
7133 if (!is_cond_op)
7134 mask = build_minus_one_cst (truth_type_for (vectype_in));
7135 }
7136
7137 /* Handle MINUS by adding the negative. */
7138 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7139 {
7140 tree negated = make_ssa_name (var: vectype_out);
7141 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7142 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7143 def0 = negated;
7144 }
7145
7146 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7147 && mask && mask_reduc_fn == IFN_LAST)
7148 def0 = merge_with_identity (gsi, mask, vectype: vectype_out, vec: def0,
7149 identity: vector_identity);
7150
7151 /* On the first iteration the input is simply the scalar phi
7152 result, and for subsequent iterations it is the output of
7153 the preceding operation. */
7154 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7155 {
7156 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7157 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7158 def0, mask, len, bias);
7159 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7160 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7161 def0, mask);
7162 else
7163 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7164 def0);
7165 /* For chained SLP reductions the output of the previous reduction
7166 operation serves as the input of the next. For the final statement
7167 the output cannot be a temporary - we reuse the original
7168 scalar destination of the last statement. */
7169 if (i != vec_num - 1)
7170 {
7171 gimple_set_lhs (new_stmt, scalar_dest_var);
7172 reduc_var = make_ssa_name (var: scalar_dest_var, stmt: new_stmt);
7173 gimple_set_lhs (new_stmt, reduc_var);
7174 }
7175 }
7176 else
7177 {
7178 reduc_var = vect_expand_fold_left (gsi, scalar_dest: scalar_dest_var,
7179 code: tree_code (code), lhs: reduc_var, vector_rhs: def0,
7180 mask);
7181 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7182 /* Remove the statement, so that we can use the same code paths
7183 as for statements that we've just created. */
7184 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7185 gsi_remove (&tmp_gsi, true);
7186 }
7187
7188 if (i == vec_num - 1)
7189 {
7190 gimple_set_lhs (new_stmt, scalar_dest);
7191 vect_finish_replace_stmt (loop_vinfo,
7192 scalar_dest_def_info,
7193 new_stmt);
7194 }
7195 else
7196 vect_finish_stmt_generation (loop_vinfo,
7197 scalar_dest_def_info,
7198 new_stmt, gsi);
7199
7200 if (slp_node)
7201 slp_node->push_vec_def (def: new_stmt);
7202 else
7203 {
7204 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_stmt);
7205 *vec_stmt = new_stmt;
7206 }
7207 }
7208
7209 return true;
7210}
7211
/* Function is_nonwrapping_integer_induction.

   Check if STMT_VINFO (which is part of loop LOOP) describes an integer
   induction that increments without causing overflow.  */
7216
7217static bool
7218is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7219{
7220 gphi *phi = as_a <gphi *> (p: stmt_vinfo->stmt);
7221 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7222 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7223 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7224 widest_int ni, max_loop_value, lhs_max;
7225 wi::overflow_type overflow = wi::OVF_NONE;
7226
7227 /* Make sure the loop is integer based. */
7228 if (TREE_CODE (base) != INTEGER_CST
7229 || TREE_CODE (step) != INTEGER_CST)
7230 return false;
7231
7232 /* Check that the max size of the loop will not wrap. */
7233
7234 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7235 return true;
7236
7237 if (! max_stmt_executions (loop, &ni))
7238 return false;
7239
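  /* The largest value the induction can reach is BASE + STEP * NI; compute
     it in widest_int, checking each operation for overflow.  */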
7240 max_loop_value = wi::mul (x: wi::to_widest (t: step), y: ni, TYPE_SIGN (lhs_type),
7241 overflow: &overflow);
7242 if (overflow)
7243 return false;
7244
7245 max_loop_value = wi::add (x: wi::to_widest (t: base), y: max_loop_value,
7246 TYPE_SIGN (lhs_type), overflow: &overflow);
7247 if (overflow)
7248 return false;
7249
7250 return (wi::min_precision (x: max_loop_value, TYPE_SIGN (lhs_type))
7251 <= TYPE_PRECISION (lhs_type));
7252}
7253
7254/* Check if masking can be supported by inserting a conditional expression.
7255 CODE is the code for the operation. COND_FN is the conditional internal
7256 function, if it exists. VECTYPE_IN is the type of the vector input. */
7257static bool
7258use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7259 tree vectype_in)
7260{
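  /* If the target supports the conditional internal function directly,
     masking is handled there and no VEC_COND_EXPR workaround is needed.  */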
7261 if (cond_fn != IFN_LAST
7262 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7263 OPTIMIZE_FOR_SPEED))
7264 return false;
7265
7266 if (code.is_tree_code ())
7267 switch (tree_code (code))
7268 {
7269 case DOT_PROD_EXPR:
7270 case SAD_EXPR:
7271 return true;
7272
7273 default:
7274 break;
7275 }
7276 return false;
7277}
7278
7279/* Insert a conditional expression to enable masked vectorization. CODE is the
7280 code for the operation. VOP is the array of operands. MASK is the loop
7281 mask. GSI is a statement iterator used to place the new conditional
7282 expression. */
7283static void
7284build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7285 gimple_stmt_iterator *gsi)
7286{
7287 switch (tree_code (code))
7288 {
7289 case DOT_PROD_EXPR:
7290 {
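	/* Zeroing the masked-off elements of one multiplication operand makes
	   their products, and hence their contribution to the sum, zero.  */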
7291 tree vectype = TREE_TYPE (vop[1]);
7292 tree zero = build_zero_cst (vectype);
7293 tree masked_op1 = make_temp_ssa_name (type: vectype, NULL, name: "masked_op1");
7294 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7295 mask, vop[1], zero);
7296 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7297 vop[1] = masked_op1;
7298 break;
7299 }
7300
7301 case SAD_EXPR:
7302 {
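	/* Copying VOP[0] into the masked-off lanes of VOP[1] makes the
	   absolute difference, and hence the contribution, of those lanes
	   zero.  */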
7303 tree vectype = TREE_TYPE (vop[1]);
7304 tree masked_op1 = make_temp_ssa_name (type: vectype, NULL, name: "masked_op1");
7305 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7306 mask, vop[1], vop[0]);
7307 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7308 vop[1] = masked_op1;
7309 break;
7310 }
7311
7312 default:
7313 gcc_unreachable ();
7314 }
7315}
7316
7317/* Function vectorizable_reduction.
7318
7319 Check if STMT_INFO performs a reduction operation that can be vectorized.
7320 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7321 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7322 Return true if STMT_INFO is vectorizable in this way.
7323
7324 This function also handles reduction idioms (patterns) that have been
7325 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7326 may be of this form:
7327 X = pattern_expr (arg0, arg1, ..., X)
7328 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7329 sequence that had been detected and replaced by the pattern-stmt
7330 (STMT_INFO).
7331
7332 This function also handles reduction of condition expressions, for example:
7333 for (int i = 0; i < N; i++)
7334 if (a[i] < value)
7335 last = a[i];
7336 This is handled by vectorising the loop and creating an additional vector
7337 containing the loop indexes for which "a[i] < value" was true. In the
7338 function epilogue this is reduced to a single max value and then used to
7339 index into the vector of results.
7340
7341 In some cases of reduction patterns, the type of the reduction variable X is
7342 different than the type of the other arguments of STMT_INFO.
7343 In such cases, the vectype that is used when transforming STMT_INFO into
7344 a vector stmt is different than the vectype that is used to determine the
7345 vectorization factor, because it consists of a different number of elements
7346 than the actual number of elements that are being operated upon in parallel.
7347
7348 For example, consider an accumulation of shorts into an int accumulator.
7349 On some targets it's possible to vectorize this pattern operating on 8
7350 shorts at a time (hence, the vectype for purposes of determining the
7351 vectorization factor should be V8HI); on the other hand, the vectype that
7352 is used to create the vector form is actually V4SI (the type of the result).
7353
7354 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7355 indicates what is the actual level of parallelism (V8HI in the example), so
7356 that the right vectorization factor would be derived. This vectype
7357 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7358 be used to create the vectorized stmt. The right vectype for the vectorized
7359 stmt is obtained from the type of the result X:
7360 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7361
7362 This means that, contrary to "regular" reductions (or "regular" stmts in
7363 general), the following equation:
7364 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7365 does *NOT* necessarily hold for reduction patterns. */
7366
7367bool
7368vectorizable_reduction (loop_vec_info loop_vinfo,
7369 stmt_vec_info stmt_info, slp_tree slp_node,
7370 slp_instance slp_node_instance,
7371 stmt_vector_for_cost *cost_vec)
7372{
7373 tree vectype_in = NULL_TREE;
7374 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7375 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7376 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7377 stmt_vec_info cond_stmt_vinfo = NULL;
7378 int i;
7379 int ncopies;
7380 bool single_defuse_cycle = false;
7381 bool nested_cycle = false;
7382 bool double_reduc = false;
7383 int vec_num;
7384 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7385 tree cond_reduc_val = NULL_TREE;
7386
7387 /* Make sure it was already recognized as a reduction computation. */
7388 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7389 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7390 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7391 return false;
7392
7393 /* The stmt we store reduction analysis meta on. */
7394 stmt_vec_info reduc_info = info_for_reduction (vinfo: loop_vinfo, stmt_info);
7395 reduc_info->is_reduc_info = true;
7396
7397 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7398 {
7399 if (is_a <gphi *> (p: stmt_info->stmt))
7400 {
7401 if (slp_node)
7402 {
7403 /* We eventually need to set a vector type on invariant
7404 arguments. */
7405 unsigned j;
7406 slp_tree child;
7407 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7408 if (!vect_maybe_update_slp_op_vectype
7409 (child, SLP_TREE_VECTYPE (slp_node)))
7410 {
7411 if (dump_enabled_p ())
7412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7413 "incompatible vector types for "
7414 "invariants\n");
7415 return false;
7416 }
7417 }
7418 /* Analysis for double-reduction is done on the outer
7419 loop PHI, nested cycles have no further restrictions. */
7420 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7421 }
7422 else
7423 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7424 return true;
7425 }
7426
7427 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7428 stmt_vec_info phi_info = stmt_info;
7429 if (!is_a <gphi *> (p: stmt_info->stmt))
7430 {
7431 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7432 return true;
7433 }
7434 if (slp_node)
7435 {
7436 slp_node_instance->reduc_phis = slp_node;
      /* ??? We're leaving slp_node pointing to the PHIs; we only
	 need it to get at the number of vector stmts, which wasn't
	 yet initialized for the instance root.  */
7440 }
7441 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7442 {
7443 use_operand_p use_p;
7444 gimple *use_stmt;
7445 bool res = single_imm_use (var: gimple_phi_result (gs: stmt_info->stmt),
7446 use_p: &use_p, stmt: &use_stmt);
7447 gcc_assert (res);
7448 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7449 }
7450
7451 /* PHIs should not participate in patterns. */
7452 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7453 gphi *reduc_def_phi = as_a <gphi *> (p: phi_info->stmt);
7454
  /* Verify that following REDUC_IDX from the latch def leads us back to the
     PHI, and compute the reduction chain length.  Discover the real
     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7458 tree reduc_def
7459 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7460 loop_latch_edge
7461 (gimple_bb (reduc_def_phi)->loop_father));
7462 unsigned reduc_chain_length = 0;
7463 bool only_slp_reduc_chain = true;
7464 stmt_info = NULL;
7465 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7466 while (reduc_def != PHI_RESULT (reduc_def_phi))
7467 {
7468 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7469 stmt_vec_info vdef = vect_stmt_to_vectorize (stmt_info: def);
7470 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7471 {
7472 if (dump_enabled_p ())
7473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7474 "reduction chain broken by patterns.\n");
7475 return false;
7476 }
7477 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7478 only_slp_reduc_chain = false;
7479 /* For epilogue generation live members of the chain need
7480 to point back to the PHI via their original stmt for
7481 info_for_reduction to work. For SLP we need to look at
7482 all lanes here - even though we only will vectorize from
7483 the SLP node with live lane zero the other live lanes also
7484 need to be identified as part of a reduction to be able
7485 to skip code generation for them. */
7486 if (slp_for_stmt_info)
7487 {
7488 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7489 if (STMT_VINFO_LIVE_P (s))
7490 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7491 }
7492 else if (STMT_VINFO_LIVE_P (vdef))
7493 STMT_VINFO_REDUC_DEF (def) = phi_info;
7494 gimple_match_op op;
7495 if (!gimple_extract_op (vdef->stmt, &op))
7496 {
7497 if (dump_enabled_p ())
7498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7499 "reduction chain includes unsupported"
7500 " statement type.\n");
7501 return false;
7502 }
7503 if (CONVERT_EXPR_CODE_P (op.code))
7504 {
7505 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7506 {
7507 if (dump_enabled_p ())
7508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7509 "conversion in the reduction chain.\n");
7510 return false;
7511 }
7512 }
7513 else if (!stmt_info)
7514 /* First non-conversion stmt. */
7515 stmt_info = vdef;
7516 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7517 reduc_chain_length++;
7518 if (!stmt_info && slp_node)
7519 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7520 }
7521 /* PHIs should not participate in patterns. */
7522 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7523
7524 if (nested_in_vect_loop_p (loop, stmt_info))
7525 {
7526 loop = loop->inner;
7527 nested_cycle = true;
7528 }
7529
7530 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7531 element. */
7532 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7533 {
7534 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7535 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7536 }
7537 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7538 gcc_assert (slp_node
7539 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7540
7541 /* 1. Is vectorizable reduction? */
7542 /* Not supportable if the reduction variable is used in the loop, unless
7543 it's a reduction chain. */
7544 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7545 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7546 return false;
7547
  /* Reductions that are not used even in an enclosing outer-loop
     are expected to be "live" (used out of the loop).  */
7550 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7551 && !STMT_VINFO_LIVE_P (stmt_info))
7552 return false;
7553
7554 /* 2. Has this been recognized as a reduction pattern?
7555
7556 Check if STMT represents a pattern that has been recognized
7557 in earlier analysis stages. For stmts that represent a pattern,
7558 the STMT_VINFO_RELATED_STMT field records the last stmt in
7559 the original sequence that constitutes the pattern. */
7560
7561 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7562 if (orig_stmt_info)
7563 {
7564 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7565 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7566 }
7567
7568 /* 3. Check the operands of the operation. The first operands are defined
7569 inside the loop body. The last operand is the reduction variable,
7570 which is defined by the loop-header-phi. */
7571
7572 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7573 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7574 gimple_match_op op;
7575 if (!gimple_extract_op (stmt_info->stmt, &op))
7576 gcc_unreachable ();
7577 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7578 || op.code == WIDEN_SUM_EXPR
7579 || op.code == SAD_EXPR);
7580
7581 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7582 && !SCALAR_FLOAT_TYPE_P (op.type))
7583 return false;
7584
7585 /* Do not try to vectorize bit-precision reductions. */
7586 if (!type_has_mode_precision_p (t: op.type))
7587 return false;
7588
  /* For lane-reducing ops we're reducing the number of reduction PHIs,
     which means the only use of the PHI result may be in the lane-reducing
     operation.  */
7591 if (lane_reduc_code_p
7592 && reduc_chain_length != 1
7593 && !only_slp_reduc_chain)
7594 {
7595 if (dump_enabled_p ())
7596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7597 "lane-reducing reduction with extra stmts.\n");
7598 return false;
7599 }
7600
7601 /* All uses but the last are expected to be defined in the loop.
7602 The last use is the reduction variable. In case of nested cycle this
7603 assumption is not true: we use reduc_index to record the index of the
7604 reduction variable. */
7605 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7606 /* We need to skip an extra operand for COND_EXPRs with embedded
7607 comparison. */
7608 unsigned opno_adjust = 0;
7609 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7610 opno_adjust = 1;
7611 for (i = 0; i < (int) op.num_ops; i++)
7612 {
7613 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7614 if (i == 0 && op.code == COND_EXPR)
7615 continue;
7616
7617 stmt_vec_info def_stmt_info;
7618 enum vect_def_type dt;
7619 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7620 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7621 &vectype_op[i], &def_stmt_info))
7622 {
7623 if (dump_enabled_p ())
7624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7625 "use not simple.\n");
7626 return false;
7627 }
7628 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7629 continue;
7630
7631 /* For an IFN_COND_OP we might hit the reduction definition operand
7632 twice (once as definition, once as else). */
7633 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7634 continue;
7635
7636 /* There should be only one cycle def in the stmt, the one
7637 leading to reduc_def. */
7638 if (VECTORIZABLE_CYCLE_DEF (dt))
7639 return false;
7640
7641 if (!vectype_op[i])
7642 vectype_op[i]
7643 = get_vectype_for_scalar_type (loop_vinfo,
7644 TREE_TYPE (op.ops[i]), slp_op[i]);
7645
7646 /* To properly compute ncopies we are interested in the widest
7647 non-reduction input type in case we're looking at a widening
7648 accumulation that we later handle in vect_transform_reduction. */
7649 if (lane_reduc_code_p
7650 && vectype_op[i]
7651 && (!vectype_in
7652 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7653 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7654 vectype_in = vectype_op[i];
7655
7656 if (op.code == COND_EXPR)
7657 {
7658 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7659 if (dt == vect_constant_def)
7660 {
7661 cond_reduc_dt = dt;
7662 cond_reduc_val = op.ops[i];
7663 }
7664 if (dt == vect_induction_def
7665 && def_stmt_info
7666 && is_nonwrapping_integer_induction (stmt_vinfo: def_stmt_info, loop))
7667 {
7668 cond_reduc_dt = dt;
7669 cond_stmt_vinfo = def_stmt_info;
7670 }
7671 }
7672 }
7673 if (!vectype_in)
7674 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7675 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7676
7677 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7678 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7679 /* If we have a condition reduction, see if we can simplify it further. */
7680 if (v_reduc_type == COND_REDUCTION)
7681 {
7682 if (slp_node)
7683 return false;
7684
      /* Fail if the reduction value is used inside the condition itself.  */
7686 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7687 {
7688 if (dump_enabled_p ())
7689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7690 "condition depends on previous iteration\n");
7691 return false;
7692 }
7693
7694 if (reduc_chain_length == 1
7695 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7696 OPTIMIZE_FOR_SPEED)
7697 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7698 vectype_in,
7699 OPTIMIZE_FOR_SPEED)))
7700 {
7701 if (dump_enabled_p ())
7702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7703 "optimizing condition reduction with"
7704 " FOLD_EXTRACT_LAST.\n");
7705 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7706 }
7707 else if (cond_reduc_dt == vect_induction_def)
7708 {
7709 tree base
7710 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7711 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7712
7713 gcc_assert (TREE_CODE (base) == INTEGER_CST
7714 && TREE_CODE (step) == INTEGER_CST);
7715 cond_reduc_val = NULL_TREE;
7716 enum tree_code cond_reduc_op_code = ERROR_MARK;
7717 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7718 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7719 ;
	  /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
	     MIN_EXPR; for now punt if BASE is the minimum value of the type
	     for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
7723 else if (tree_int_cst_sgn (step) == -1)
7724 {
7725 cond_reduc_op_code = MIN_EXPR;
7726 if (tree_int_cst_sgn (base) == -1)
7727 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7728 else if (tree_int_cst_lt (t1: base,
7729 TYPE_MAX_VALUE (TREE_TYPE (base))))
7730 cond_reduc_val
7731 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7732 }
7733 else
7734 {
7735 cond_reduc_op_code = MAX_EXPR;
7736 if (tree_int_cst_sgn (base) == 1)
7737 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7738 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7739 t2: base))
7740 cond_reduc_val
7741 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7742 }
7743 if (cond_reduc_val)
7744 {
7745 if (dump_enabled_p ())
7746 dump_printf_loc (MSG_NOTE, vect_location,
7747 "condition expression based on "
7748 "integer induction.\n");
7749 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7750 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7751 = cond_reduc_val;
7752 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7753 }
7754 }
7755 else if (cond_reduc_dt == vect_constant_def)
7756 {
7757 enum vect_def_type cond_initial_dt;
7758 tree cond_initial_val = vect_phi_initial_value (phi: reduc_def_phi);
7759 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7760 if (cond_initial_dt == vect_constant_def
7761 && types_compatible_p (TREE_TYPE (cond_initial_val),
7762 TREE_TYPE (cond_reduc_val)))
7763 {
7764 tree e = fold_binary (LE_EXPR, boolean_type_node,
7765 cond_initial_val, cond_reduc_val);
7766 if (e && (integer_onep (e) || integer_zerop (e)))
7767 {
7768 if (dump_enabled_p ())
7769 dump_printf_loc (MSG_NOTE, vect_location,
7770 "condition expression based on "
7771 "compile time constant.\n");
7772 /* Record reduction code at analysis stage. */
7773 STMT_VINFO_REDUC_CODE (reduc_info)
7774 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7775 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7776 }
7777 }
7778 }
7779 }
7780
7781 if (STMT_VINFO_LIVE_P (phi_info))
7782 return false;
7783
7784 if (slp_node)
7785 ncopies = 1;
7786 else
7787 ncopies = vect_get_num_copies (loop_vinfo, vectype: vectype_in);
7788
7789 gcc_assert (ncopies >= 1);
7790
7791 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (node: vectype_out);
7792
7793 if (nested_cycle)
7794 {
7795 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7796 == vect_double_reduction_def);
7797 double_reduc = true;
7798 }
7799
7800 /* 4.2. Check support for the epilog operation.
7801
7802 If STMT represents a reduction pattern, then the type of the
7803 reduction variable may be different than the type of the rest
7804 of the arguments. For example, consider the case of accumulation
7805 of shorts into an int accumulator; The original code:
7806 S1: int_a = (int) short_a;
7807 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7808
7809 was replaced with:
7810 STMT: int_acc = widen_sum <short_a, int_acc>
7811
7812 This means that:
7813 1. The tree-code that is used to create the vector operation in the
7814 epilog code (that reduces the partial results) is not the
7815 tree-code of STMT, but is rather the tree-code of the original
7816 stmt from the pattern that STMT is replacing. I.e, in the example
7817 above we want to use 'widen_sum' in the loop, but 'plus' in the
7818 epilog.
7819 2. The type (mode) we use to check available target support
7820 for the vector operation to be created in the *epilog*, is
7821 determined by the type of the reduction variable (in the example
7822 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7823 However the type (mode) we use to check available target support
7824 for the vector operation to be created *inside the loop*, is
7825 determined by the type of the other arguments to STMT (in the
7826 example we'd check this: optab_handler (widen_sum_optab,
7827 vect_short_mode)).
7828
7829 This is contrary to "regular" reductions, in which the types of all
7830 the arguments are the same as the type of the reduction variable.
7831 For "regular" reductions we can therefore use the same vector type
7832 (and also the same tree-code) when generating the epilog code and
7833 when generating the code inside the loop. */
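/* Continuing the accumulation-of-shorts example above, a sketch of the
 generated code (vector types are for illustration only and depend on
 the target):

 loop: vect_int_acc = widen_sum <vect_short_a, vect_int_acc>;
 epilog: int_result = reduc_plus <vect_int_acc>;

 so the in-loop support check is made on the short vector type while
 the epilog support check is made on the int vector type. */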
7834
7835 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7836
7837 /* If-conversion may already have created a conditional operation like
7838 IFN_COND_ADD. Use the internal code for the following checks. */
7839 if (orig_code.is_internal_fn ())
7840 {
7841 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7842 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7843 }
7844
7845 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7846
7847 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7848 if (reduction_type == TREE_CODE_REDUCTION)
7849 {
7850 /* Check whether it's ok to change the order of the computation.
7851 Generally, when vectorizing a reduction we change the order of the
7852 computation. This may change the behavior of the program in some
7853 cases, so we need to check that this is ok. One exception is when
7854 vectorizing an outer-loop: the inner-loop is executed sequentially,
7855 and therefore vectorizing reductions in the inner-loop during
7856 outer-loop vectorization is safe. Likewise when we are vectorizing
7857 a series of reductions using SLP and the VF is one, the reductions
7858 are performed in scalar order. */
7859 if (slp_node
7860 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7861 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7862 ;
7863 else if (needs_fold_left_reduction_p (op.type, orig_code))
7864 {
7865 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7866 is not directly used in stmt. */
7867 if (!only_slp_reduc_chain
7868 && reduc_chain_length != 1)
7869 {
7870 if (dump_enabled_p ())
7871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7872 "in-order reduction chain without SLP.\n");
7873 return false;
7874 }
7875 STMT_VINFO_REDUC_TYPE (reduc_info)
7876 = reduction_type = FOLD_LEFT_REDUCTION;
7877 }
7878 else if (!commutative_binary_op_p (orig_code, op.type)
7879 || !associative_binary_op_p (orig_code, op.type))
7880 {
7881 if (dump_enabled_p ())
7882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7883 "reduction: not commutative/associative\n");
7884 return false;
7885 }
7886 }
7887
7888 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7889 && ncopies > 1)
7890 {
7891 if (dump_enabled_p ())
7892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7893 "multiple types in double reduction or condition "
7894 "reduction or fold-left reduction.\n");
7895 return false;
7896 }
7897
7898 internal_fn reduc_fn = IFN_LAST;
7899 if (reduction_type == TREE_CODE_REDUCTION
7900 || reduction_type == FOLD_LEFT_REDUCTION
7901 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7902 || reduction_type == CONST_COND_REDUCTION)
7903 {
7904 if (reduction_type == FOLD_LEFT_REDUCTION
7905 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7906 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7907 {
7908 if (reduc_fn != IFN_LAST
7909 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7910 OPTIMIZE_FOR_SPEED))
7911 {
7912 if (dump_enabled_p ())
7913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7914 "reduc op not supported by target.\n");
7915
7916 reduc_fn = IFN_LAST;
7917 }
7918 }
7919 else
7920 {
7921 if (!nested_cycle || double_reduc)
7922 {
7923 if (dump_enabled_p ())
7924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7925 "no reduc code for scalar code.\n");
7926
7927 return false;
7928 }
7929 }
7930 }
7931 else if (reduction_type == COND_REDUCTION)
7932 {
7933 int scalar_precision
7934 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7935 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7936 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7937 vectype_out);
7938
7939 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7940 OPTIMIZE_FOR_SPEED))
7941 reduc_fn = IFN_REDUC_MAX;
7942 }
7943 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7944
7945 if (reduction_type != EXTRACT_LAST_REDUCTION
7946 && (!nested_cycle || double_reduc)
7947 && reduc_fn == IFN_LAST
7948 && !nunits_out.is_constant ())
7949 {
7950 if (dump_enabled_p ())
7951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7952 "missing target support for reduction on"
7953 " variable-length vectors.\n");
7954 return false;
7955 }
7956
7957 /* For SLP reductions, see if there is a neutral value we can use. */
7958 tree neutral_op = NULL_TREE;
7959 if (slp_node)
7960 {
7961 tree initial_value = NULL_TREE;
7962 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7963 initial_value = vect_phi_initial_value (reduc_def_phi);
7964 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7965 orig_code, initial_value);
7966 }
7967
7968 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7969 {
7970 /* We can't support in-order reductions of code such as this:
7971
7972 for (int i = 0; i < n1; ++i)
7973 for (int j = 0; j < n2; ++j)
7974 l += a[j];
7975
7976 since GCC effectively transforms the loop when vectorizing:
7977
7978 for (int i = 0; i < n1 / VF; ++i)
7979 for (int j = 0; j < n2; ++j)
7980 for (int k = 0; k < VF; ++k)
7981 l += a[j];
7982
7983 which is a reassociation of the original operation. */
7984 if (dump_enabled_p ())
7985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7986 "in-order double reduction not supported.\n");
7987
7988 return false;
7989 }
7990
7991 if (reduction_type == FOLD_LEFT_REDUCTION
7992 && slp_node
7993 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7994 {
7995 /* We cannot use in-order reductions in this case because there is
7996 an implicit reassociation of the operations involved. */
7997 if (dump_enabled_p ())
7998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7999 "in-order unchained SLP reductions not supported.\n");
8000 return false;
8001 }
8002
8003 /* For double reductions, and for SLP reductions with a neutral value,
8004 we construct a variable-length initial vector by loading a vector
8005 full of the neutral value and then shift-and-inserting the start
8006 values into the low-numbered elements. */
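/* For example (an illustrative sketch only), an add reduction with start
 value S on a variable-length vector would build its initial vector as

 tmp = { 0, 0, ..., 0 } <-- vector of the neutral value
 init = IFN_VEC_SHL_INSERT (tmp, S) <-- { S, 0, ..., 0 }

 which is why IFN_VEC_SHL_INSERT support is required below. */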
8007 if ((double_reduc || neutral_op)
8008 && !nunits_out.is_constant ()
8009 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8010 vectype_out, OPTIMIZE_FOR_SPEED))
8011 {
8012 if (dump_enabled_p ())
8013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8014 "reduction on variable-length vectors requires"
8015 " target support for a vector-shift-and-insert"
8016 " operation.\n");
8017 return false;
8018 }
8019
8020 /* Check extra constraints for variable-length unchained SLP reductions. */
8021 if (slp_node
8022 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8023 && !nunits_out.is_constant ())
8024 {
8025 /* We checked above that we could build the initial vector when
8026 there's a neutral element value. Check here for the case in
8027 which each SLP statement has its own initial value and in which
8028 that value needs to be repeated for every instance of the
8029 statement within the initial vector. */
8030 unsigned int group_size = SLP_TREE_LANES (slp_node);
8031 if (!neutral_op
8032 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8033 TREE_TYPE (vectype_out)))
8034 {
8035 if (dump_enabled_p ())
8036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8037 "unsupported form of SLP reduction for"
8038 " variable-length vectors: cannot build"
8039 " initial vector.\n");
8040 return false;
8041 }
8042 /* The epilogue code relies on the number of elements being a multiple
8043 of the group size. The duplicate-and-interleave approach to setting
8044 up the initial vector does too. */
8045 if (!multiple_p (nunits_out, group_size))
8046 {
8047 if (dump_enabled_p ())
8048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8049 "unsupported form of SLP reduction for"
8050 " variable-length vectors: the vector size"
8051 " is not a multiple of the number of results.\n");
8052 return false;
8053 }
8054 }
8055
8056 if (reduction_type == COND_REDUCTION)
8057 {
8058 widest_int ni;
8059
8060 if (! max_loop_iterations (loop, &ni))
8061 {
8062 if (dump_enabled_p ())
8063 dump_printf_loc (MSG_NOTE, vect_location,
8064 "loop count not known, cannot create cond "
8065 "reduction.\n");
8066 return false;
8067 }
8068 /* Convert backedges to iterations. */
8069 ni += 1;
8070
8071 /* The additional index will be the same type as the condition. Check
8072 that the loop can fit into this less one (because we'll use up the
8073 zero slot for when there are no matches). */
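/* For example (illustration only): with an 8-bit unsigned index type at
 most 254 iterations can be handled, since index zero is reserved for
 the no-match case and the iteration count must stay strictly below the
 maximum index value. */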
8074 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8075 if (wi::geu_p (ni, wi::to_widest (max_index)))
8076 {
8077 if (dump_enabled_p ())
8078 dump_printf_loc (MSG_NOTE, vect_location,
8079 "loop size is greater than data size.\n");
8080 return false;
8081 }
8082 }
8083
8084 /* In case the vectorization factor (VF) is bigger than the number
8085 of elements that we can fit in a vectype (nunits), we have to generate
8086 more than one vector stmt - i.e - we need to "unroll" the
8087 vector stmt by a factor VF/nunits. For more details see documentation
8088 in vectorizable_operation. */
8089
8090 /* If the reduction is used in an outer loop we need to generate
8091 VF intermediate results, like so (e.g. for ncopies=2):
8092 r0 = phi (init, r0)
8093 r1 = phi (init, r1)
8094 r0 = x0 + r0;
8095 r1 = x1 + r1;
8096 (i.e. we generate VF results in 2 registers).
8097 In this case we have a separate def-use cycle for each copy, and therefore
8098 for each copy we get the vector def for the reduction variable from the
8099 respective phi node created for this copy.
8100
8101 Otherwise (the reduction is unused in the loop nest), we can combine
8102 together intermediate results, like so (e.g. for ncopies=2):
8103 r = phi (init, r)
8104 r = x0 + r;
8105 r = x1 + r;
8106 (i.e. we generate VF/2 results in a single register).
8107 In this case for each copy we get the vector def for the reduction variable
8108 from the vectorized reduction operation generated in the previous iteration.
8109
8110 This only works when we see both the reduction PHI and its only consumer
8111 in vectorizable_reduction and there are no intermediate stmts
8112 participating. When unrolling we want each unrolled iteration to have its
8113 own reduction accumulator since one of the main goals of unrolling a
8114 reduction is to reduce the aggregate loop-carried latency. */
8115 if (ncopies > 1
8116 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8117 && reduc_chain_length == 1
8118 && loop_vinfo->suggested_unroll_factor == 1)
8119 single_defuse_cycle = true;
8120
8121 if (single_defuse_cycle || lane_reduc_code_p)
8122 {
8123 gcc_assert (op.code != COND_EXPR);
8124
8125 /* 4. Supportable by target? */
8126 bool ok = true;
8127
8128 /* 4.1. check support for the operation in the loop
8129
8130 This isn't necessary for the lane reduction codes, since they
8131 can only be produced by pattern matching, and it's up to the
8132 pattern matcher to test for support. The main reason for
8133 specifically skipping this step is to avoid rechecking whether
8134 mixed-sign dot-products can be implemented using signed
8135 dot-products. */
8136 machine_mode vec_mode = TYPE_MODE (vectype_in);
8137 if (!lane_reduc_code_p
8138 && !directly_supported_p (op.code, vectype_in, optab_vector))
8139 {
8140 if (dump_enabled_p ())
8141 dump_printf (MSG_NOTE, "op not supported by target.\n");
8142 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8143 || !vect_can_vectorize_without_simd_p (op.code))
8144 ok = false;
8145 else
8146 if (dump_enabled_p ())
8147 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8148 }
8149
8150 if (vect_emulated_vector_p (vectype_in)
8151 && !vect_can_vectorize_without_simd_p (op.code))
8152 {
8153 if (dump_enabled_p ())
8154 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8155 return false;
8156 }
8157
8158 /* lane-reducing operations have to go through vect_transform_reduction.
8159 For the other cases try without the single cycle optimization. */
8160 if (!ok)
8161 {
8162 if (lane_reduc_code_p)
8163 return false;
8164 else
8165 single_defuse_cycle = false;
8166 }
8167 }
8168 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8169
8170 /* If the reduction stmt is one of the patterns that have lane
8171 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8172 if ((ncopies > 1 && ! single_defuse_cycle)
8173 && lane_reduc_code_p)
8174 {
8175 if (dump_enabled_p ())
8176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8177 "multi def-use cycle not possible for lane-reducing "
8178 "reduction operation\n");
8179 return false;
8180 }
8181
8182 if (slp_node
8183 && !(!single_defuse_cycle
8184 && !lane_reduc_code_p
8185 && reduction_type != FOLD_LEFT_REDUCTION))
8186 for (i = 0; i < (int) op.num_ops; i++)
8187 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8188 {
8189 if (dump_enabled_p ())
8190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8191 "incompatible vector types for invariants\n");
8192 return false;
8193 }
8194
8195 if (slp_node)
8196 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8197 else
8198 vec_num = 1;
8199
8200 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8201 reduction_type, ncopies, cost_vec);
8202 /* Cost the reduction op inside the loop if transformed via
8203 vect_transform_reduction. Otherwise this is costed by the
8204 separate vectorizable_* routines. */
8205 if (single_defuse_cycle || lane_reduc_code_p)
8206 {
8207 int factor = 1;
8208 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8209 /* Three dot-products and a subtraction. */
8210 factor = 4;
8211 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8212 stmt_info, 0, vect_body);
8213 }
8214
8215 if (dump_enabled_p ()
8216 && reduction_type == FOLD_LEFT_REDUCTION)
8217 dump_printf_loc (MSG_NOTE, vect_location,
8218 "using an in-order (fold-left) reduction.\n");
8219 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8220 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8221 reductions go through their own vectorizable_* routines. */
8222 if (!single_defuse_cycle
8223 && !lane_reduc_code_p
8224 && reduction_type != FOLD_LEFT_REDUCTION)
8225 {
8226 stmt_vec_info tem
8227 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8228 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8229 {
8230 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8231 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8232 }
8233 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8234 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8235 }
8236 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8237 {
8238 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8239 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8240 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8241
8242 if (reduction_type != FOLD_LEFT_REDUCTION
8243 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8244 && (cond_fn == IFN_LAST
8245 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8246 OPTIMIZE_FOR_SPEED)))
8247 {
8248 if (dump_enabled_p ())
8249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8250 "can't operate on partial vectors because"
8251 " no conditional operation is available.\n");
8252 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8253 }
8254 else if (reduction_type == FOLD_LEFT_REDUCTION
8255 && reduc_fn == IFN_LAST
8256 && !expand_vec_cond_expr_p (vectype_in,
8257 truth_type_for (vectype_in),
8258 SSA_NAME))
8259 {
8260 if (dump_enabled_p ())
8261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8262 "can't operate on partial vectors because"
8263 " no conditional operation is available.\n");
8264 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8265 }
8266 else if (reduction_type == FOLD_LEFT_REDUCTION
8267 && internal_fn_mask_index (reduc_fn) == -1
8268 && FLOAT_TYPE_P (vectype_in)
8269 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8270 {
8271 if (dump_enabled_p ())
8272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8273 "can't operate on partial vectors because"
8274 " signed zeros cannot be preserved.\n");
8275 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8276 }
8277 else
8278 {
8279 internal_fn mask_reduc_fn
8280 = get_masked_reduction_fn (reduc_fn, vectype_in);
8281
8282 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8283 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8284 vectype_in, 1);
8285 else
8286 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8287 vectype_in, NULL);
8288 }
8289 }
8290 return true;
8291}
8292
8293/* STMT_INFO is a dot-product reduction whose multiplication operands
8294 have different signs. Emit a sequence to emulate the operation
8295 using a series of signed DOT_PROD_EXPRs and return the last
8296 statement generated. VEC_DEST is the result of the vector operation
8297 and VOP lists its inputs. */
8298
8299static gassign *
8300vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8301 gimple_stmt_iterator *gsi, tree vec_dest,
8302 tree vop[3])
8303{
8304 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8305 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8306 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8307 gimple *new_stmt;
8308
8309 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8310 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8311 std::swap (vop[0], vop[1]);
8312
8313 /* Convert all inputs to signed types. */
8314 for (int i = 0; i < 3; ++i)
8315 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8316 {
8317 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8318 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8319 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8320 vop[i] = tmp;
8321 }
8322
8323 /* In the comments below we assume 8-bit inputs for simplicity,
8324 but the approach works for any full integer type. */
8325
8326 /* Create a vector of -128. */
8327 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8328 tree min_narrow = build_vector_from_val (narrow_vectype,
8329 min_narrow_elttype);
8330
8331 /* Create a vector of 64. */
8332 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8333 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8334 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8335
8336 /* Emit: SUB_RES = VOP[0] - 128. */
8337 tree sub_res = make_ssa_name (narrow_vectype);
8338 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8339 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8340
8341 /* Emit:
8342
8343 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8344 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8345 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8346
8347 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8348 Doing the two 64 * y steps first allows more time to compute x. */
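 /* As a quick sanity check of the identity (example values only): for
 x = 200 (unsigned) and y = -3 (signed),
 (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600 = 200 * -3,
 and each multiplication operand now fits in a signed 8-bit value. */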
8349 tree stage1 = make_ssa_name (wide_vectype);
8350 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8351 vop[1], half_narrow, vop[2]);
8352 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8353
8354 tree stage2 = make_ssa_name (wide_vectype);
8355 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8356 vop[1], half_narrow, stage1);
8357 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8358
8359 tree stage3 = make_ssa_name (wide_vectype);
8360 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8361 sub_res, vop[1], stage2);
8362 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8363
8364 /* Convert STAGE3 to the reduction type. */
8365 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8366}
8367
8368/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8369 value. */
8370
8371bool
8372vect_transform_reduction (loop_vec_info loop_vinfo,
8373 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8374 gimple **vec_stmt, slp_tree slp_node)
8375{
8376 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8377 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8378 int i;
8379 int ncopies;
8380 int vec_num;
8381
8382 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8383 gcc_assert (reduc_info->is_reduc_info);
8384
8385 if (nested_in_vect_loop_p (loop, stmt_info))
8386 {
8387 loop = loop->inner;
8388 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8389 }
8390
8391 gimple_match_op op;
8392 if (!gimple_extract_op (stmt_info->stmt, &op))
8393 gcc_unreachable ();
8394
8395 /* All uses but the last are expected to be defined in the loop.
8396 The last use is the reduction variable. In case of nested cycle this
8397 assumption is not true: we use reduc_index to record the index of the
8398 reduction variable. */
8399 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8400 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8401 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8402 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8403
8404 if (slp_node)
8405 {
8406 ncopies = 1;
8407 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8408 }
8409 else
8410 {
8411 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8412 vec_num = 1;
8413 }
8414
8415 code_helper code = canonicalize_code (op.code, op.type);
8416 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8417
8418 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8419 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8420 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8421
8422 /* Transform. */
8423 tree new_temp = NULL_TREE;
8424 auto_vec<tree> vec_oprnds0;
8425 auto_vec<tree> vec_oprnds1;
8426 auto_vec<tree> vec_oprnds2;
8427 tree def0;
8428
8429 if (dump_enabled_p ())
8430 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8431
8432 /* FORNOW: Multiple types are not supported for condition. */
8433 if (code == COND_EXPR)
8434 gcc_assert (ncopies == 1);
8435
8436 /* A binary COND_OP reduction must have the same definition and else
8437 value. */
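 /* For instance (hypothetical GIMPLE, for illustration): a masked sum
 reduction appears as

 sum_1 = .COND_ADD (loop_mask_5, sum_0, x_3, sum_0);

 where the accumulator sum_0 is both the first data operand and the
 else value, so inactive lanes pass the accumulator through unchanged. */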
8438 bool cond_fn_p = code.is_internal_fn ()
8439 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8440 if (cond_fn_p)
8441 {
8442 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8443 || code == IFN_COND_MUL || code == IFN_COND_AND
8444 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8445 gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
8446 }
8447
8448 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8449
8450 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8451 if (reduction_type == FOLD_LEFT_REDUCTION)
8452 {
8453 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8454 gcc_assert (code.is_tree_code () || cond_fn_p);
8455 return vectorize_fold_left_reduction
8456 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8457 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8458 reduc_index, masks, lens);
8459 }
8460
8461 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8462 gcc_assert (single_defuse_cycle
8463 || code == DOT_PROD_EXPR
8464 || code == WIDEN_SUM_EXPR
8465 || code == SAD_EXPR);
8466
8467 /* Create the destination vector */
8468 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8469 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8470
8471 /* Get NCOPIES vector definitions for all operands except the reduction
8472 definition. */
8473 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8474 single_defuse_cycle && reduc_index == 0
8475 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8476 single_defuse_cycle && reduc_index == 1
8477 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8478 op.num_ops == 4
8479 || (op.num_ops == 3
8480 && !(single_defuse_cycle && reduc_index == 2))
8481 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8482
8483 /* For single def-use cycles get one copy of the vectorized reduction
8484 definition. */
8485 if (single_defuse_cycle)
8486 {
8487 gcc_assert (!slp_node);
8488 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8489 op.ops[reduc_index],
8490 reduc_index == 0 ? &vec_oprnds0
8491 : (reduc_index == 1 ? &vec_oprnds1
8492 : &vec_oprnds2));
8493 }
8494
8495 bool emulated_mixed_dot_prod
8496 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8497 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8498 {
8499 gimple *new_stmt;
8500 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8501 if (masked_loop_p && !mask_by_cond_expr)
8502 {
8503 /* No conditional ifns have been defined for dot-product yet. */
8504 gcc_assert (code != DOT_PROD_EXPR);
8505
8506 /* Make sure that the reduction accumulator is vop[0]. */
8507 if (reduc_index == 1)
8508 {
8509 gcc_assert (commutative_binary_op_p (code, op.type));
8510 std::swap (vop[0], vop[1]);
8511 }
8512 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8513 vec_num * ncopies, vectype_in, i);
8514 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8515 vop[0], vop[1], vop[0]);
8516 new_temp = make_ssa_name (vec_dest, call);
8517 gimple_call_set_lhs (call, new_temp);
8518 gimple_call_set_nothrow (call, true);
8519 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8520 new_stmt = call;
8521 }
8522 else
8523 {
8524 if (op.num_ops >= 3)
8525 vop[2] = vec_oprnds2[i];
8526
8527 if (masked_loop_p && mask_by_cond_expr)
8528 {
8529 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8530 vec_num * ncopies, vectype_in, i);
8531 build_vect_cond_expr (code, vop, mask, gsi);
8532 }
8533
8534 if (emulated_mixed_dot_prod)
8535 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8536 vec_dest, vop);
8537
8538 else if (code.is_internal_fn () && !cond_fn_p)
8539 new_stmt = gimple_build_call_internal (internal_fn (code),
8540 op.num_ops,
8541 vop[0], vop[1], vop[2]);
8542 else if (code.is_internal_fn () && cond_fn_p)
8543 new_stmt = gimple_build_call_internal (internal_fn (code),
8544 op.num_ops,
8545 vop[0], vop[1], vop[2],
8546 vop[1]);
8547 else
8548 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8549 vop[0], vop[1], vop[2]);
8550 new_temp = make_ssa_name (vec_dest, new_stmt);
8551 gimple_set_lhs (new_stmt, new_temp);
8552 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8553 }
8554
8555 if (slp_node)
8556 slp_node->push_vec_def (new_stmt);
8557 else if (single_defuse_cycle
8558 && i < ncopies - 1)
8559 {
8560 if (reduc_index == 0)
8561 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8562 else if (reduc_index == 1)
8563 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8564 else if (reduc_index == 2)
8565 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8566 }
8567 else
8568 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8569 }
8570
8571 if (!slp_node)
8572 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8573
8574 return true;
8575}
8576
8577/* Transform phase of a cycle PHI. */
8578
8579bool
8580vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8581 stmt_vec_info stmt_info, gimple **vec_stmt,
8582 slp_tree slp_node, slp_instance slp_node_instance)
8583{
8584 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8585 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8586 int i;
8587 int ncopies;
8588 int j;
8589 bool nested_cycle = false;
8590 int vec_num;
8591
8592 if (nested_in_vect_loop_p (loop, stmt_info))
8593 {
8594 loop = loop->inner;
8595 nested_cycle = true;
8596 }
8597
8598 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8599 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8600 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8601 gcc_assert (reduc_info->is_reduc_info);
8602
8603 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8604 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8605 /* Leave the scalar phi in place. */
8606 return true;
8607
8608 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8609 /* For a nested cycle we do not fill the above. */
8610 if (!vectype_in)
8611 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8612 gcc_assert (vectype_in);
8613
8614 if (slp_node)
8615 {
8616 /* The size vect_schedule_slp_instance computes is off for us. */
8617 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8618 * SLP_TREE_LANES (slp_node), vectype_in);
8619 ncopies = 1;
8620 }
8621 else
8622 {
8623 vec_num = 1;
8624 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8625 }
8626
8627 /* Check whether we should use a single PHI node and accumulate
8628 vectors to one before the backedge. */
8629 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8630 ncopies = 1;
8631
8632 /* Create the destination vector */
8633 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8634 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8635 vectype_out);
8636
8637 /* Get the loop-entry arguments. */
8638 tree vec_initial_def = NULL_TREE;
8639 auto_vec<tree> vec_initial_defs;
8640 if (slp_node)
8641 {
8642 vec_initial_defs.reserve (vec_num);
8643 if (nested_cycle)
8644 {
8645 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8646 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8647 &vec_initial_defs);
8648 }
8649 else
8650 {
8651 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8652 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8653 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8654
8655 unsigned int num_phis = stmts.length ();
8656 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8657 num_phis = 1;
8658 initial_values.reserve (num_phis);
8659 for (unsigned int i = 0; i < num_phis; ++i)
8660 {
8661 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8662 initial_values.quick_push (vect_phi_initial_value (this_phi));
8663 }
8664 if (vec_num == 1)
8665 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8666 if (!initial_values.is_empty ())
8667 {
8668 tree initial_value
8669 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8670 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8671 tree neutral_op
8672 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8673 code, initial_value);
8674 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8675 &vec_initial_defs, vec_num,
8676 stmts.length (), neutral_op);
8677 }
8678 }
8679 }
8680 else
8681 {
8682 /* Get at the scalar def before the loop, that defines the initial
8683 value of the reduction variable. */
8684 tree initial_def = vect_phi_initial_value (phi);
8685 reduc_info->reduc_initial_values.safe_push (initial_def);
8686 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8687 and we can't use zero for induc_val, use initial_def. Similarly
8688 for REDUC_MIN and initial_def larger than the base. */
8689 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8690 {
8691 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8692 if (TREE_CODE (initial_def) == INTEGER_CST
8693 && !integer_zerop (induc_val)
8694 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8695 && tree_int_cst_lt (initial_def, induc_val))
8696 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8697 && tree_int_cst_lt (induc_val, initial_def))))
8698 {
8699 induc_val = initial_def;
8700 /* Communicate to the epilogue generation that we used
8701 initial_def. */
8702 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8703 }
8704 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8705 }
8706 else if (nested_cycle)
8707 {
8708 /* Do not use an adjustment def as that case is not supported
8709 correctly if ncopies is not one. */
8710 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8711 ncopies, initial_def,
8712 &vec_initial_defs);
8713 }
8714 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8715 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8716 /* Fill the initial vector with the initial scalar value. */
8717 vec_initial_def
8718 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8719 initial_def, initial_def);
8720 else
8721 {
8722 if (ncopies == 1)
8723 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8724 if (!reduc_info->reduc_initial_values.is_empty ())
8725 {
8726 initial_def = reduc_info->reduc_initial_values[0];
8727 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8728 tree neutral_op
8729 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8730 code, initial_def);
8731 gcc_assert (neutral_op);
8732 /* Try to simplify the vector initialization by applying an
8733 adjustment after the reduction has been performed. */
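 /* E.g. (illustration only) a sum reduction starting from 10 can seed
 the vector with the neutral value 0 and let the epilogue apply the
 start value 10 to the final reduced scalar (for a sum, by adding it),
 which keeps the initial vector cheap to build. */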
8734 if (!reduc_info->reused_accumulator
8735 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8736 && !operand_equal_p (neutral_op, initial_def))
8737 {
8738 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8739 = initial_def;
8740 initial_def = neutral_op;
8741 }
8742 vec_initial_def
8743 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8744 initial_def, neutral_op);
8745 }
8746 }
8747 }
8748
8749 if (vec_initial_def)
8750 {
8751 vec_initial_defs.create (ncopies);
8752 for (i = 0; i < ncopies; ++i)
8753 vec_initial_defs.quick_push (vec_initial_def);
8754 }
8755
8756 if (auto *accumulator = reduc_info->reused_accumulator)
8757 {
8758 tree def = accumulator->reduc_input;
8759 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8760 {
8761 unsigned int nreduc;
8762 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8763 (TREE_TYPE (def)),
8764 TYPE_VECTOR_SUBPARTS (vectype_out),
8765 &nreduc);
8766 gcc_assert (res);
8767 gimple_seq stmts = NULL;
8768 /* Reduce the single vector to a smaller one. */
8769 if (nreduc != 1)
8770 {
8771 /* Perform the reduction in the appropriate type. */
8772 tree rvectype = vectype_out;
8773 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8774 TREE_TYPE (TREE_TYPE (def))))
8775 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8776 TYPE_VECTOR_SUBPARTS
8777 (vectype_out));
8778 def = vect_create_partial_epilog (def, rvectype,
8779 STMT_VINFO_REDUC_CODE
8780 (reduc_info),
8781 &stmts);
8782 }
8783 /* The epilogue loop might use a different vector mode, like
8784 VNx2DI vs. V2DI. */
8785 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8786 {
8787 tree reduc_type = build_vector_type_for_mode
8788 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8789 def = gimple_convert (&stmts, reduc_type, def);
8790 }
8791 /* Adjust the input so we pick up the partially reduced value
8792 for the skip edge in vect_create_epilog_for_reduction. */
8793 accumulator->reduc_input = def;
8794 /* And the reduction could be carried out using a different sign. */
8795 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8796 def = gimple_convert (&stmts, vectype_out, def);
8797 if (loop_vinfo->main_loop_edge)
8798 {
8799 /* While we'd like to insert on the edge this will split
8800 blocks and disturb bookkeeping, we also will eventually
8801 need this on the skip edge. Rely on sinking to
8802 fixup optimal placement and insert in the pred. */
8803 gimple_stmt_iterator gsi
8804 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8805 /* Insert before a cond that eventually skips the
8806 epilogue. */
8807 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8808 gsi_prev (&gsi);
8809 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8810 }
8811 else
8812 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8813 stmts);
8814 }
8815 if (loop_vinfo->main_loop_edge)
8816 vec_initial_defs[0]
8817 = vect_get_main_loop_result (loop_vinfo, def,
8818 vec_initial_defs[0]);
8819 else
8820 vec_initial_defs.safe_push (def);
8821 }
8822
8823 /* Generate the reduction PHIs upfront. */
8824 for (i = 0; i < vec_num; i++)
8825 {
8826 tree vec_init_def = vec_initial_defs[i];
8827 for (j = 0; j < ncopies; j++)
8828 {
8829 /* Create the reduction-phi that defines the reduction
8830 operand. */
8831 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8832
8833 /* Set the loop-entry arg of the reduction-phi. */
8834 if (j != 0 && nested_cycle)
8835 vec_init_def = vec_initial_defs[j];
8836 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8837 UNKNOWN_LOCATION);
8838
8839 /* The loop-latch arg is set in epilogue processing. */
8840
8841 if (slp_node)
8842 slp_node->push_vec_def (new_phi);
8843 else
8844 {
8845 if (j == 0)
8846 *vec_stmt = new_phi;
8847 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8848 }
8849 }
8850 }
8851
8852 return true;
8853}
8854
8855/* Vectorizes LC PHIs. */
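/* For example (illustration only), a loop-closed PHI such as

 exit-bb:
 res_2 = PHI <res_1(loop)>

 has a single argument; it is vectorized by creating a single-argument
 vector PHI that forwards the vectorized definition of res_1. */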
8856
8857bool
8858vectorizable_lc_phi (loop_vec_info loop_vinfo,
8859 stmt_vec_info stmt_info, gimple **vec_stmt,
8860 slp_tree slp_node)
8861{
8862 if (!loop_vinfo
8863 || !is_a <gphi *> (stmt_info->stmt)
8864 || gimple_phi_num_args (stmt_info->stmt) != 1)
8865 return false;
8866
8867 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8868 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8869 return false;
8870
8871 if (!vec_stmt) /* transformation not required. */
8872 {
8873 /* Deal with copies from externs or constants that disguise as
8874 loop-closed PHI nodes (PR97886). */
8875 if (slp_node
8876 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8877 SLP_TREE_VECTYPE (slp_node)))
8878 {
8879 if (dump_enabled_p ())
8880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8881 "incompatible vector types for invariants\n");
8882 return false;
8883 }
8884 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8885 return true;
8886 }
8887
8888 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8889 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8890 basic_block bb = gimple_bb (stmt_info->stmt);
8891 edge e = single_pred_edge (bb);
8892 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8893 auto_vec<tree> vec_oprnds;
8894 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8895 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8896 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8897 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8898 {
8899 /* Create the vectorized LC PHI node. */
8900 gphi *new_phi = create_phi_node (vec_dest, bb);
8901 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8902 if (slp_node)
8903 slp_node->push_vec_def (new_phi);
8904 else
8905 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8906 }
8907 if (!slp_node)
8908 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8909
8910 return true;
8911}
8912
8913/* Vectorizes PHIs. */
8914
8915bool
8916vectorizable_phi (vec_info *,
8917 stmt_vec_info stmt_info, gimple **vec_stmt,
8918 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8919{
8920 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8921 return false;
8922
8923 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8924 return false;
8925
8926 tree vectype = SLP_TREE_VECTYPE (slp_node);
8927
8928 if (!vec_stmt) /* transformation not required. */
8929 {
8930 slp_tree child;
8931 unsigned i;
8932 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8933 if (!child)
8934 {
8935 if (dump_enabled_p ())
8936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8937 "PHI node with unvectorized backedge def\n");
8938 return false;
8939 }
8940 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8941 {
8942 if (dump_enabled_p ())
8943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8944 "incompatible vector types for invariants\n");
8945 return false;
8946 }
8947 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8948 && !useless_type_conversion_p (vectype,
8949 SLP_TREE_VECTYPE (child)))
8950 {
8951 /* With bools we can have mask and non-mask precision vectors
8952 or different non-mask precisions. While pattern recog is
8953 supposed to guarantee consistency here, bugs in it can cause
8954 mismatches (PR103489 and PR103800 for example).
8955 Deal with them here instead of ICEing later. */
8956 if (dump_enabled_p ())
8957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8958 "incompatible vector type setup from "
8959 "bool pattern detection\n");
8960 return false;
8961 }
8962
8963 /* For single-argument PHIs assume coalescing which means zero cost
8964 for the scalar and the vector PHIs. This avoids artificially
8965 favoring the vector path (but may pessimize it in some cases). */
8966 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8967 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8968 vector_stmt, stmt_info, vectype, 0, vect_body);
8969 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8970 return true;
8971 }
8972
8973 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8974 basic_block bb = gimple_bb (stmt_info->stmt);
8975 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8976 auto_vec<gphi *> new_phis;
8977 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8978 {
8979 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8980
8981 /* Skip not yet vectorized defs. */
8982 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8983 && SLP_TREE_VEC_DEFS (child).is_empty ())
8984 continue;
8985
8986 auto_vec<tree> vec_oprnds;
8987 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8988 if (!new_phis.exists ())
8989 {
8990 new_phis.create (vec_oprnds.length ());
8991 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8992 {
8993 /* Create the vectorized LC PHI node. */
8994 new_phis.quick_push (create_phi_node (vec_dest, bb));
8995 slp_node->push_vec_def (new_phis[j]);
8996 }
8997 }
8998 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8999 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9000 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9001 }
9002 /* We should have at least one already vectorized child. */
9003 gcc_assert (new_phis.exists ());
9004
9005 return true;
9006}
9007
9008/* Vectorizes first order recurrences. An overview of the transformation
9009 is described below. Suppose we have the following loop.
9010
9011 int t = 0;
9012 for (int i = 0; i < n; ++i)
9013 {
9014 b[i] = a[i] - t;
9015 t = a[i];
9016 }
9017
9018 There is a first-order recurrence on 't'. For this loop, the scalar IR
9019 looks (simplified) like:
9020
9021 scalar.preheader:
9022 init = 0;
9023
9024 scalar.body:
9025 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9026 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9027 _1 = a[i]
9028 b[i] = _1 - _2
9029 if (i < n) goto scalar.body
9030
9031 In this example, _2 is a recurrence because its value depends on the
9032 previous iteration. We vectorize this as (VF = 4)
9033
9034 vector.preheader:
9035 vect_init = vect_cst(..., ..., ..., 0)
9036
9037 vector.body
9038 i = PHI <0(vector.preheader), i+4(vector.body)>
9039 vect_1 = PHI <vect_init(vector.preheader), vect_2(vector.body)>
9040 vect_2 = a[i, i+1, i+2, i+3];
9041 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9042 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9043 if (..) goto vector.body
9044
9045 In this function, vectorizable_recurr, we code generate both the
9046 vector PHI node and the permute since those together compute the
9047 vectorized value of the scalar PHI. We do not yet have the
9048 backedge value to fill in there nor into the vec_perm. Those
9049 are filled in maybe_set_vectorized_backedge_value and
9050 vect_schedule_scc.
9051
9052 TODO: Since the scalar loop does not have a use of the recurrence
9053 outside of the loop the natural way to implement peeling via
9054 vectorizing the live value doesn't work. For now peeling of loops
9055 with a recurrence is not implemented. For SLP the supported cases
9056 are restricted to those requiring a single vector recurrence PHI. */
9057
9058bool
9059vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9060 gimple **vec_stmt, slp_tree slp_node,
9061 stmt_vector_for_cost *cost_vec)
9062{
9063 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9064 return false;
9065
9066 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9067
9068 /* So far we only support first-order recurrence auto-vectorization. */
9069 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9070 return false;
9071
9072 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9073 unsigned ncopies;
9074 if (slp_node)
9075 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9076 else
9077 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9078 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9079 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9080 /* We need to be able to make progress with a single vector. */
9081 if (maybe_gt (dist * 2, nunits))
9082 {
9083 if (dump_enabled_p ())
9084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9085 "first order recurrence exceeds half of "
9086 "a vector\n");
9087 return false;
9088 }
9089
9090 /* First-order recurrence autovectorization needs to handle permutation
9091 with indices = [nunits-1, nunits, nunits+1, ...]. */
9092 vec_perm_builder sel (nunits, 1, 3);
9093 for (int i = 0; i < 3; ++i)
9094 sel.quick_push (nunits - dist + i);
9095 vec_perm_indices indices (sel, 2, nunits);
9096
9097 if (!vec_stmt) /* transformation not required. */
9098 {
9099 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9100 indices))
9101 return false;
9102
9103 if (slp_node)
9104 {
9105 /* We eventually need to set a vector type on invariant
9106 arguments. */
9107 unsigned j;
9108 slp_tree child;
9109 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9110 if (!vect_maybe_update_slp_op_vectype
9111 (child, SLP_TREE_VECTYPE (slp_node)))
9112 {
9113 if (dump_enabled_p ())
9114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9115 "incompatible vector types for "
9116 "invariants\n");
9117 return false;
9118 }
9119 }
9120 /* The recurrence costs the initialization vector and one permute
9121 for each copy. */
9122 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9123 stmt_info, 0, vect_prologue);
9124 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9125 stmt_info, 0, vect_body);
9126 if (dump_enabled_p ())
9127 dump_printf_loc (MSG_NOTE, vect_location,
9128 "vectorizable_recurr: inside_cost = %d, "
9129 "prologue_cost = %d .\n", inside_cost,
9130 prologue_cost);
9131
9132 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9133 return true;
9134 }
9135
9136 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9137 basic_block bb = gimple_bb (phi);
9138 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9139 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9140 {
9141 gimple_seq stmts = NULL;
9142 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9143 gsi_insert_seq_on_edge_immediate (pe, stmts);
9144 }
9145 tree vec_init = build_vector_from_val (vectype, preheader);
9146 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9147
9148 /* Create the vectorized first-order PHI node. */
9149 tree vec_dest = vect_get_new_vect_var (vectype,
9150 vect_simple_var, "vec_recur_");
9151 gphi *new_phi = create_phi_node (vec_dest, bb);
9152 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9153
9154 /* Insert the shuffles for the first-order recurrence autovectorization:
9155 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9156 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9157
9158 /* Insert the required permute after the latch definition. The
9159 second and later operands are tentative and will be updated when we have
9160 vectorized the latch definition. */
9161 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9162 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9163 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9164 gsi_next (&gsi2);
9165
9166 for (unsigned i = 0; i < ncopies; ++i)
9167 {
9168 vec_dest = make_ssa_name (vectype);
9169 gassign *vperm
9170 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9171 i == 0 ? gimple_phi_result (new_phi) : NULL,
9172 NULL, perm);
9173 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9174
9175 if (slp_node)
9176 slp_node->push_vec_def (vperm);
9177 else
9178 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9179 }
9180
9181 if (!slp_node)
9182 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9183 return true;
9184}
9185
9186/* Return true if VECTYPE represents a vector that requires lowering
9187 by the vector lowering pass. */
9188
9189bool
9190vect_emulated_vector_p (tree vectype)
9191{
9192 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9193 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9194 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9195}
9196
9197/* Return true if we can emulate CODE on an integer mode representation
9198 of a vector. */
9199
9200bool
9201vect_can_vectorize_without_simd_p (tree_code code)
9202{
9203 switch (code)
9204 {
9205 case PLUS_EXPR:
9206 case MINUS_EXPR:
9207 case NEGATE_EXPR:
9208 case BIT_AND_EXPR:
9209 case BIT_IOR_EXPR:
9210 case BIT_XOR_EXPR:
9211 case BIT_NOT_EXPR:
9212 return true;
9213
9214 default:
9215 return false;
9216 }
9217}
9218
9219/* Likewise, but taking a code_helper. */
9220
9221bool
9222vect_can_vectorize_without_simd_p (code_helper code)
9223{
9224 return (code.is_tree_code ()
9225 && vect_can_vectorize_without_simd_p (tree_code (code)));
9226}
9227
9228/* Create vector init for vectorized iv. */
9229static tree
9230vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9231 tree step_expr, poly_uint64 nunits,
9232 tree vectype,
9233 enum vect_induction_op_type induction_type)
9234{
9235 unsigned HOST_WIDE_INT const_nunits;
9236 tree vec_shift, vec_init, new_name;
9237 unsigned i;
9238 tree itype = TREE_TYPE (vectype);
9239
9240 /* iv_loop is the loop to be vectorized. Create the initial vector, e.g.
9241 [X, X>>S, X>>2*S, ...], [X, X*S, X*S^2, ...] or [X, -X, X, -X, ...] depending on INDUCTION_TYPE (S = step_expr, X = init_expr). */
9242 new_name = gimple_convert (stmts, itype, init_expr);
9243 switch (induction_type)
9244 {
9245 case vect_step_op_shr:
9246 case vect_step_op_shl:
9247 /* Build the Initial value from shift_expr. */
9248 vec_init = gimple_build_vector_from_val (stmts,
9249 vectype,
9250 new_name);
9251 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9252 build_zero_cst (itype), step_expr);
9253 vec_init = gimple_build (stmts,
9254 (induction_type == vect_step_op_shr
9255 ? RSHIFT_EXPR : LSHIFT_EXPR),
9256 vectype, vec_init, vec_shift);
9257 break;
9258
9259 case vect_step_op_neg:
9260 {
9261 vec_init = gimple_build_vector_from_val (stmts,
9262 vectype,
9263 new_name);
9264 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9265 vectype, vec_init);
9266 /* The encoding has 2 interleaved stepped patterns. */
9267 vec_perm_builder sel (nunits, 2, 3);
9268 sel.quick_grow (6);
9269 for (i = 0; i < 3; i++)
9270 {
9271 sel[2 * i] = i;
9272 sel[2 * i + 1] = i + nunits;
9273 }
9274 vec_perm_indices indices (sel, 2, nunits);
9275 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9276 fail when vec_init is const vector. In that situation vec_perm is not
9277 really needed. */
9278 tree perm_mask_even
9279 = vect_gen_perm_mask_any (vectype, indices);
9280 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9281 vectype,
9282 vec_init, vec_neg,
9283 perm_mask_even);
9284 }
9285 break;
9286
9287 case vect_step_op_mul:
9288 {
9289 /* Use unsigned mult to avoid undefined behavior on integer overflow. */
9290 gcc_assert (nunits.is_constant (&const_nunits));
9291 tree utype = unsigned_type_for (itype);
9292 tree uvectype = build_vector_type (utype,
9293 TYPE_VECTOR_SUBPARTS (vectype));
9294 new_name = gimple_convert (stmts, utype, new_name);
9295 vec_init = gimple_build_vector_from_val (stmts,
9296 uvectype,
9297 new_name);
9298 tree_vector_builder elts (uvectype, const_nunits, 1);
9299 tree elt_step = build_one_cst (utype);
9300
9301 elts.quick_push (elt_step);
9302 for (i = 1; i < const_nunits; i++)
9303 {
9304 /* Create: elt_step = elt_step * step_expr, giving S^i for lane i. */
9305 elt_step = gimple_build (stmts, MULT_EXPR,
9306 utype, elt_step, step_expr);
9307 elts.quick_push (elt_step);
9308 }
9309 /* Create a vector from [1, S, S^2, ..., S^(nunits-1)]. */
9311 tree vec_mul = gimple_build_vector (stmts, &elts);
9312 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9313 vec_init, vec_mul);
9314 vec_init = gimple_convert (stmts, vectype, vec_init);
9315 }
9316 break;
9317
9318 default:
9319 gcc_unreachable ();
9320 }
9321
9322 return vec_init;
9323}
9324
9325/* Peel init_expr by skip_niter for induction_type. */
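/* For example (illustration only): for a vect_step_op_mul induction with
 step 3 and skip_niters 4 the peeled initial value is
 init_expr * 3^4 = init_expr * 81, with the multiplication carried out
 in the corresponding unsigned type modulo 2^precision. */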
9326tree
9327vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9328 tree skip_niters, tree step_expr,
9329 enum vect_induction_op_type induction_type)
9330{
9331 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9332 tree type = TREE_TYPE (init_expr);
9333 unsigned prec = TYPE_PRECISION (type);
9334 switch (induction_type)
9335 {
9336 case vect_step_op_neg:
9337 if (TREE_INT_CST_LOW (skip_niters) % 2)
9338 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9339 /* else no change. */
9340 break;
9341
9342 case vect_step_op_shr:
9343 case vect_step_op_shl:
9344 skip_niters = gimple_convert (stmts, type, skip_niters);
9345 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9346 /* When the shift amount is >= the precision we need to avoid
9347 undefined behavior. The original loop has none, and semantically
9348 init_expr becomes 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9349 if (!tree_fits_uhwi_p (step_expr)
9350 || tree_to_uhwi (step_expr) >= prec)
9351 {
9352 if (induction_type == vect_step_op_shl
9353 || TYPE_UNSIGNED (type))
9354 init_expr = build_zero_cst (type);
9355 else
9356 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9357 init_expr,
9358 wide_int_to_tree (type, prec - 1));
9359 }
9360 else
9361 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9362 ? RSHIFT_EXPR : LSHIFT_EXPR),
9363 type, init_expr, step_expr);
9364 break;
9365
9366 case vect_step_op_mul:
9367 {
9368 tree utype = unsigned_type_for (type);
9369 init_expr = gimple_convert (stmts, utype, init_expr);
9370 wide_int skipn = wi::to_wide (skip_niters);
9371 wide_int begin = wi::to_wide (step_expr);
9372 auto_mpz base, exp, mod, res;
9373 wi::to_mpz (begin, base, TYPE_SIGN (type));
9374 wi::to_mpz (skipn, exp, UNSIGNED);
9375 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9376 mpz_powm (res, base, exp, mod);
9377 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9378 tree mult_expr = wide_int_to_tree (type: utype, cst: begin);
9379 init_expr = gimple_build (seq: stmts, code: MULT_EXPR, type: utype,
9380 ops: init_expr, ops: mult_expr);
9381 init_expr = gimple_convert (seq: stmts, type, op: init_expr);
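	/* E.g. for an 8-bit type with step 3 and skip_niters 4 this
	   computes 3^4 mod 2^8 = 81, so the peeled initial value is
	   init_expr * 81, evaluated in the unsigned type. */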
9382 }
9383 break;
9384
9385 default:
9386 gcc_unreachable ();
9387 }
9388
9389 return init_expr;
9390}
9391
9392/* Create vector step for vectorized iv. */
9393static tree
9394vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9395 poly_uint64 vf,
9396 enum vect_induction_op_type induction_type)
9397{
9398 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9399 tree new_name = NULL;
9400 /* Step should be pow (step, vf) for mult induction. */
9401 if (induction_type == vect_step_op_mul)
9402 {
9403 gcc_assert (vf.is_constant ());
9404 wide_int begin = wi::to_wide (t: step_expr);
9405
9406 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9407 begin = wi::mul (x: begin, y: wi::to_wide (t: step_expr));
9408
9409 new_name = wide_int_to_tree (TREE_TYPE (step_expr), cst: begin);
9410 }
9411 else if (induction_type == vect_step_op_neg)
9412 /* Do nothing. */
9413 ;
9414 else
9415 new_name = gimple_build (seq: stmts, code: MULT_EXPR, TREE_TYPE (step_expr),
9416 ops: expr, ops: step_expr);
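  /* E.g. for mul induction with step S and VF 4 the per-vector-iteration
     step is S*S*S*S, while for shl/shr it is VF * S, the shift amount
     accumulated over VF scalar iterations. */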
9417 return new_name;
9418}
9419
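/* Materialize the scalar step NEW_NAME as a loop-invariant vector of type
   VECTYPE, to be used as the vector step of the nonlinear iv described by
   INDUCTION_TYPE for STMT_INFO.  Returns NULL for neg induction, which
   needs no step. */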
9420static tree
9421vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9422 stmt_vec_info stmt_info,
9423 tree new_name, tree vectype,
9424 enum vect_induction_op_type induction_type)
9425{
9426 /* No step is needed for neg induction. */
9427 if (induction_type == vect_step_op_neg)
9428 return NULL;
9429
9430 tree t = unshare_expr (new_name);
9431 gcc_assert (CONSTANT_CLASS_P (new_name)
9432 || TREE_CODE (new_name) == SSA_NAME);
9433 tree new_vec = build_vector_from_val (vectype, t);
9434 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9435 new_vec, vectype, NULL);
9436 return vec_step;
9437}
9438
9439/* Advance the vectorized iv INDUC_DEF by VEC_STEP according to
   INDUCTION_TYPE and return the new value. */
9440static tree
9441vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9442 tree induc_def, tree vec_step,
9443 enum vect_induction_op_type induction_type)
9444{
9445 tree vec_def = induc_def;
9446 switch (induction_type)
9447 {
9448 case vect_step_op_mul:
9449 {
9450	/* Use an unsigned mult to avoid undefined behavior from signed
	   integer overflow. */
9451 tree uvectype
9452 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9453 TYPE_VECTOR_SUBPARTS (node: vectype));
9454 vec_def = gimple_convert (seq: stmts, type: uvectype, op: vec_def);
9455 vec_step = gimple_convert (seq: stmts, type: uvectype, op: vec_step);
9456 vec_def = gimple_build (seq: stmts, code: MULT_EXPR, type: uvectype,
9457 ops: vec_def, ops: vec_step);
9458 vec_def = gimple_convert (seq: stmts, type: vectype, op: vec_def);
9459 }
9460 break;
9461
9462 case vect_step_op_shr:
9463 vec_def = gimple_build (seq: stmts, code: RSHIFT_EXPR, type: vectype,
9464 ops: vec_def, ops: vec_step);
9465 break;
9466
9467 case vect_step_op_shl:
9468 vec_def = gimple_build (seq: stmts, code: LSHIFT_EXPR, type: vectype,
9469 ops: vec_def, ops: vec_step);
9470 break;
9471 case vect_step_op_neg:
9472 vec_def = induc_def;
9473 /* Do nothing. */
9474 break;
9475 default:
9476 gcc_unreachable ();
9477 }
9478
9479 return vec_def;
9480
9481}
9482
9483/* Function vectorizable_nonlinear_induction
9484
9485   Check if STMT_INFO performs a nonlinear induction computation that can be
9486 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9487 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9488 basic block.
9489 Return true if STMT_INFO is vectorizable in this way. */
9490
9491static bool
9492vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9493 stmt_vec_info stmt_info,
9494 gimple **vec_stmt, slp_tree slp_node,
9495 stmt_vector_for_cost *cost_vec)
9496{
9497 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9498 unsigned ncopies;
9499 bool nested_in_vect_loop = false;
9500 class loop *iv_loop;
9501 tree vec_def;
9502 edge pe = loop_preheader_edge (loop);
9503 basic_block new_bb;
9504 tree vec_init, vec_step;
9505 tree new_name;
9506 gimple *new_stmt;
9507 gphi *induction_phi;
9508 tree induc_def, vec_dest;
9509 tree init_expr, step_expr;
9510 tree niters_skip;
9511 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9512 unsigned i;
9513 gimple_stmt_iterator si;
9514
9515 gphi *phi = dyn_cast <gphi *> (p: stmt_info->stmt);
9516
9517 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9518 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype);
9519 enum vect_induction_op_type induction_type
9520 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9521
9522 gcc_assert (induction_type > vect_step_op_add);
9523
9524 if (slp_node)
9525 ncopies = 1;
9526 else
9527 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9528 gcc_assert (ncopies >= 1);
9529
9530  /* FORNOW. Only handle nonlinear induction in the loop being vectorized,
     not in a nested loop. */
9531 if (nested_in_vect_loop_p (loop, stmt_info))
9532 {
9533 if (dump_enabled_p ())
9534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9535 "nonlinear induction in nested loop.\n");
9536 return false;
9537 }
9538
9539 iv_loop = loop;
9540 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9541
9542  /* TODO: Support SLP for nonlinear ivs.  This needs a separate vector iv
9543     update for each iv and a permutation to generate the wanted vector iv. */
9544 if (slp_node)
9545 {
9546 if (dump_enabled_p ())
9547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9548 "SLP induction not supported for nonlinear"
9549 " induction.\n");
9550 return false;
9551 }
9552
9553 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9554 {
9555 if (dump_enabled_p ())
9556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9557 "floating point nonlinear induction vectorization"
9558 " not supported.\n");
9559 return false;
9560 }
9561
9562 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9563 init_expr = vect_phi_initial_value (phi);
9564 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9565 && TREE_CODE (step_expr) == INTEGER_CST);
9566  /* step_expr should have the same type as init_expr,
9567     e.g. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9568 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9569
9570 if (TREE_CODE (init_expr) == INTEGER_CST)
9571 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9572 else
9573 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9574 TREE_TYPE (init_expr)));
9575
9576 switch (induction_type)
9577 {
9578 case vect_step_op_neg:
9579 if (TREE_CODE (init_expr) != INTEGER_CST
9580 && TREE_CODE (init_expr) != REAL_CST)
9581 {
9582 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9583 if (!directly_supported_p (NEGATE_EXPR, vectype))
9584 return false;
9585
9586 /* The encoding has 2 interleaved stepped patterns. */
9587 vec_perm_builder sel (nunits, 2, 3);
9588 machine_mode mode = TYPE_MODE (vectype);
9589 sel.quick_grow (len: 6);
9590 for (i = 0; i < 3; i++)
9591 {
9592 sel[i * 2] = i;
9593 sel[i * 2 + 1] = i + nunits;
9594 }
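	  /* E.g. for nunits 8 this encodes the selector { 0, 8, 1, 9, 2, 10 },
	     standing for { 0, 8, 1, 9, 2, 10, 3, 11 }, which interleaves
	     vec_init with its negation lane by lane. */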
9595 vec_perm_indices indices (sel, 2, nunits);
9596 if (!can_vec_perm_const_p (mode, mode, indices))
9597 return false;
9598 }
9599 break;
9600
9601 case vect_step_op_mul:
9602 {
9603 /* Check for backend support of MULT_EXPR. */
9604 if (!directly_supported_p (MULT_EXPR, vectype))
9605 return false;
9606
9607	/* ?? How to construct the vector step for variable-length vectors:
9608	   [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9609 if (!vf.is_constant ())
9610 return false;
9611 }
9612 break;
9613
9614 case vect_step_op_shr:
9615 /* Check for backend support of RSHIFT_EXPR. */
9616 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9617 return false;
9618
9619      /* Don't shift by more than the type precision, to avoid undefined
	 behavior. */
9620 if (!tree_fits_uhwi_p (step_expr)
9621 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9622 TYPE_PRECISION (TREE_TYPE (init_expr))))
9623 return false;
9624 break;
9625
9626 case vect_step_op_shl:
9627      /* Check for backend support of LSHIFT_EXPR. */
9628 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9629 return false;
9630
9631      /* Don't shift by more than the type precision, to avoid undefined
	 behavior. */
9632 if (!tree_fits_uhwi_p (step_expr)
9633 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9634 TYPE_PRECISION (TREE_TYPE (init_expr))))
9635 return false;
9636
9637 break;
9638
9639 default:
9640 gcc_unreachable ();
9641 }
9642
9643 if (!vec_stmt) /* transformation not required. */
9644 {
9645 unsigned inside_cost = 0, prologue_cost = 0;
9646      /* Loop body cost for vec_loop: one vector stmt per copy for the
9647	 iv update. */
9648 inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vector_stmt,
9649 stmt_info, misalign: 0, where: vect_body);
9650
9651      /* Neg induction doesn't have any inside_cost: its vector iv needs
9652	 no update inside the loop. */
9653 if (induction_type == vect_step_op_neg)
9654 inside_cost = 0;
9655
9656 /* prologue cost for vec_init and vec_step. */
9657 prologue_cost = record_stmt_cost (body_cost_vec: cost_vec, count: 2, kind: scalar_to_vec,
9658 stmt_info, misalign: 0, where: vect_prologue);
9659
9660 if (dump_enabled_p ())
9661 dump_printf_loc (MSG_NOTE, vect_location,
9662 "vect_model_induction_cost: inside_cost = %d, "
9663				     "prologue_cost = %d.\n", inside_cost,
9664 prologue_cost);
9665
9666 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9667 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9668 return true;
9669 }
9670
9671 /* Transform. */
9672
9673 /* Compute a vector variable, initialized with the first VF values of
9674 the induction variable. E.g., for an iv with IV_PHI='X' and
9675 evolution S, for a vector of 4 units, we want to compute:
9676 [X, X + S, X + 2*S, X + 3*S]. */
9677
9678 if (dump_enabled_p ())
9679 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9680
9681 pe = loop_preheader_edge (iv_loop);
9682 /* Find the first insertion point in the BB. */
9683 basic_block bb = gimple_bb (g: phi);
9684 si = gsi_after_labels (bb);
9685
9686 gimple_seq stmts = NULL;
9687
9688 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9689 /* If we are using the loop mask to "peel" for alignment then we need
9690 to adjust the start value here. */
9691 if (niters_skip != NULL_TREE)
9692 init_expr = vect_peel_nonlinear_iv_init (stmts: &stmts, init_expr, skip_niters: niters_skip,
9693 step_expr, induction_type);
9694
9695 vec_init = vect_create_nonlinear_iv_init (stmts: &stmts, init_expr,
9696 step_expr, nunits, vectype,
9697 induction_type);
9698 if (stmts)
9699 {
9700 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9701 gcc_assert (!new_bb);
9702 }
9703
9704 stmts = NULL;
9705 new_name = vect_create_nonlinear_iv_step (stmts: &stmts, step_expr,
9706 vf, induction_type);
9707 if (stmts)
9708 {
9709 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9710 gcc_assert (!new_bb);
9711 }
9712
9713 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9714 new_name, vectype,
9715 induction_type);
9716 /* Create the following def-use cycle:
9717 loop prolog:
9718 vec_init = ...
9719 vec_step = ...
9720 loop:
9721 vec_iv = PHI <vec_init, vec_loop>
9722 ...
9723 STMT
9724 ...
9725 vec_loop = vec_iv + vec_step; */
9726
9727 /* Create the induction-phi that defines the induction-operand. */
9728 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9729 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9730 induc_def = PHI_RESULT (induction_phi);
9731
9732 /* Create the iv update inside the loop. */
9733 stmts = NULL;
9734 vec_def = vect_update_nonlinear_iv (stmts: &stmts, vectype,
9735 induc_def, vec_step,
9736 induction_type);
9737
9738 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9739 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9740
9741 /* Set the arguments of the phi node: */
9742 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9743 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9744 UNKNOWN_LOCATION);
9745
9746 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: induction_phi);
9747 *vec_stmt = induction_phi;
9748
9749  /* If the vectorization factor (VF) is bigger than the number
9750     of elements that we can fit in a vectype (nunits), we have to generate
9751     more than one vector stmt, i.e. we need to "unroll" the
9752     vector stmt by a factor VF/nunits.  For more details see the
9753     documentation in vectorizable_operation. */
9754
9755 if (ncopies > 1)
9756 {
9757 stmts = NULL;
9758 /* FORNOW. This restriction should be relaxed. */
9759 gcc_assert (!nested_in_vect_loop);
9760
9761 new_name = vect_create_nonlinear_iv_step (stmts: &stmts, step_expr,
9762 vf: nunits, induction_type);
9763
9764 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9765 new_name, vectype,
9766 induction_type);
9767 vec_def = induc_def;
9768 for (i = 1; i < ncopies; i++)
9769 {
9770 /* vec_i = vec_prev + vec_step. */
9771 stmts = NULL;
9772 vec_def = vect_update_nonlinear_iv (stmts: &stmts, vectype,
9773 induc_def: vec_def, vec_step,
9774 induction_type);
9775 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9776 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9777 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_stmt);
9778 }
9779 }
9780
9781 if (dump_enabled_p ())
9782 dump_printf_loc (MSG_NOTE, vect_location,
9783 "transform induction: created def-use cycle: %G%G",
9784 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9785
9786 return true;
9787}
9788
9789/* Function vectorizable_induction
9790
9791 Check if STMT_INFO performs an induction computation that can be vectorized.
9792 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9793 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9794 Return true if STMT_INFO is vectorizable in this way. */
9795
9796bool
9797vectorizable_induction (loop_vec_info loop_vinfo,
9798 stmt_vec_info stmt_info,
9799 gimple **vec_stmt, slp_tree slp_node,
9800 stmt_vector_for_cost *cost_vec)
9801{
9802 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9803 unsigned ncopies;
9804 bool nested_in_vect_loop = false;
9805 class loop *iv_loop;
9806 tree vec_def;
9807 edge pe = loop_preheader_edge (loop);
9808 basic_block new_bb;
9809 tree new_vec, vec_init, vec_step, t;
9810 tree new_name;
9811 gimple *new_stmt;
9812 gphi *induction_phi;
9813 tree induc_def, vec_dest;
9814 tree init_expr, step_expr;
9815 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9816 unsigned i;
9817 tree expr;
9818 gimple_stmt_iterator si;
9819 enum vect_induction_op_type induction_type
9820 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9821
9822 gphi *phi = dyn_cast <gphi *> (p: stmt_info->stmt);
9823 if (!phi)
9824 return false;
9825
9826 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9827 return false;
9828
9829 /* Make sure it was recognized as induction computation. */
9830 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9831 return false;
9832
9833 /* Handle nonlinear induction in a separate place. */
9834 if (induction_type != vect_step_op_add)
9835 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9836 vec_stmt, slp_node, cost_vec);
9837
9838 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9839 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype);
9840
9841 if (slp_node)
9842 ncopies = 1;
9843 else
9844 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9845 gcc_assert (ncopies >= 1);
9846
9847 /* FORNOW. These restrictions should be relaxed. */
9848 if (nested_in_vect_loop_p (loop, stmt_info))
9849 {
9850 imm_use_iterator imm_iter;
9851 use_operand_p use_p;
9852 gimple *exit_phi;
9853 edge latch_e;
9854 tree loop_arg;
9855
9856 if (ncopies > 1)
9857 {
9858 if (dump_enabled_p ())
9859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9860 "multiple types in nested loop.\n");
9861 return false;
9862 }
9863
9864 exit_phi = NULL;
9865 latch_e = loop_latch_edge (loop->inner);
9866 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9867 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9868 {
9869 gimple *use_stmt = USE_STMT (use_p);
9870 if (is_gimple_debug (gs: use_stmt))
9871 continue;
9872
9873 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (g: use_stmt)))
9874 {
9875 exit_phi = use_stmt;
9876 break;
9877 }
9878 }
9879 if (exit_phi)
9880 {
9881 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9882 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9883 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9884 {
9885 if (dump_enabled_p ())
9886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9887 "inner-loop induction only used outside "
9888 "of the outer vectorized loop.\n");
9889 return false;
9890 }
9891 }
9892
9893 nested_in_vect_loop = true;
9894 iv_loop = loop->inner;
9895 }
9896 else
9897 iv_loop = loop;
9898 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9899
9900 if (slp_node && !nunits.is_constant ())
9901 {
9902 /* The current SLP code creates the step value element-by-element. */
9903 if (dump_enabled_p ())
9904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9905 "SLP induction not supported for variable-length"
9906 " vectors.\n");
9907 return false;
9908 }
9909
9910 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9911 {
9912 if (dump_enabled_p ())
9913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9914 "floating point induction vectorization disabled\n");
9915 return false;
9916 }
9917
9918 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9919 gcc_assert (step_expr != NULL_TREE);
9920 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9921
9922 /* Check for backend support of PLUS/MINUS_EXPR. */
9923 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9924 || !directly_supported_p (MINUS_EXPR, step_vectype))
9925 return false;
9926
9927 if (!vec_stmt) /* transformation not required. */
9928 {
9929 unsigned inside_cost = 0, prologue_cost = 0;
9930 if (slp_node)
9931 {
9932 /* We eventually need to set a vector type on invariant
9933 arguments. */
9934 unsigned j;
9935 slp_tree child;
9936 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9937 if (!vect_maybe_update_slp_op_vectype
9938 (child, SLP_TREE_VECTYPE (slp_node)))
9939 {
9940 if (dump_enabled_p ())
9941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9942 "incompatible vector types for "
9943 "invariants\n");
9944 return false;
9945 }
9946 /* loop cost for vec_loop. */
9947 inside_cost
9948 = record_stmt_cost (body_cost_vec: cost_vec,
9949 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9950 kind: vector_stmt, stmt_info, misalign: 0, where: vect_body);
9951 /* prologue cost for vec_init (if not nested) and step. */
9952 prologue_cost = record_stmt_cost (body_cost_vec: cost_vec, count: 1 + !nested_in_vect_loop,
9953 kind: scalar_to_vec,
9954 stmt_info, misalign: 0, where: vect_prologue);
9955 }
9956 else /* if (!slp_node) */
9957 {
9958 /* loop cost for vec_loop. */
9959 inside_cost = record_stmt_cost (body_cost_vec: cost_vec, count: ncopies, kind: vector_stmt,
9960 stmt_info, misalign: 0, where: vect_body);
9961 /* prologue cost for vec_init and vec_step. */
9962 prologue_cost = record_stmt_cost (body_cost_vec: cost_vec, count: 2, kind: scalar_to_vec,
9963 stmt_info, misalign: 0, where: vect_prologue);
9964 }
9965 if (dump_enabled_p ())
9966 dump_printf_loc (MSG_NOTE, vect_location,
9967 "vect_model_induction_cost: inside_cost = %d, "
9968			 "prologue_cost = %d.\n", inside_cost,
9969 prologue_cost);
9970
9971 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9972 DUMP_VECT_SCOPE ("vectorizable_induction");
9973 return true;
9974 }
9975
9976 /* Transform. */
9977
9978 /* Compute a vector variable, initialized with the first VF values of
9979 the induction variable. E.g., for an iv with IV_PHI='X' and
9980 evolution S, for a vector of 4 units, we want to compute:
9981 [X, X + S, X + 2*S, X + 3*S]. */
9982
9983 if (dump_enabled_p ())
9984 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9985
9986 pe = loop_preheader_edge (iv_loop);
9987 /* Find the first insertion point in the BB. */
9988 basic_block bb = gimple_bb (g: phi);
9989 si = gsi_after_labels (bb);
9990
9991 /* For SLP induction we have to generate several IVs as for example
9992 with group size 3 we need
9993 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9994 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9995 if (slp_node)
9996 {
9997 /* Enforced above. */
9998 unsigned int const_nunits = nunits.to_constant ();
9999
10000 /* The initial values are vectorized, but any lanes > group_size
10001 need adjustment. */
10002 slp_tree init_node
10003 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10004
10005 /* Gather steps. Since we do not vectorize inductions as
10006 cycles we have to reconstruct the step from SCEV data. */
10007 unsigned group_size = SLP_TREE_LANES (slp_node);
10008 tree *steps = XALLOCAVEC (tree, group_size);
10009 tree *inits = XALLOCAVEC (tree, group_size);
10010 stmt_vec_info phi_info;
10011 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10012 {
10013 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10014 if (!init_node)
10015 inits[i] = gimple_phi_arg_def (gs: as_a<gphi *> (p: phi_info->stmt),
10016 index: pe->dest_idx);
10017 }
10018
10019 /* Now generate the IVs. */
10020 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10021 gcc_assert ((const_nunits * nvects) % group_size == 0);
10022 unsigned nivs;
10023 if (nested_in_vect_loop)
10024 nivs = nvects;
10025 else
10026 {
10027 /* Compute the number of distinct IVs we need. First reduce
10028 group_size if it is a multiple of const_nunits so we get
10029 one IV for a group_size of 4 but const_nunits 2. */
10030 unsigned group_sizep = group_size;
10031 if (group_sizep % const_nunits == 0)
10032 group_sizep = group_sizep / const_nunits;
10033 nivs = least_common_multiple (group_sizep,
10034 const_nunits) / const_nunits;
10035 }
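	  /* E.g. group_size 3 with const_nunits 4 needs
	     nivs = lcm (3, 4) / 4 = 3 distinct IVs, while group_size 4
	     with const_nunits 2 reduces group_sizep to 2 and needs
	     only nivs = 1. */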
10036 tree stept = TREE_TYPE (step_vectype);
10037 tree lupdate_mul = NULL_TREE;
10038 if (!nested_in_vect_loop)
10039 {
10040 /* The number of iterations covered in one vector iteration. */
10041 unsigned lup_mul = (nvects * const_nunits) / group_size;
10042 lupdate_mul
10043 = build_vector_from_val (step_vectype,
10044 SCALAR_FLOAT_TYPE_P (stept)
10045 ? build_real_from_wide (stept, lup_mul,
10046 UNSIGNED)
10047 : build_int_cstu (type: stept, lup_mul));
10048 }
10049 tree peel_mul = NULL_TREE;
10050 gimple_seq init_stmts = NULL;
10051 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10052 {
10053 if (SCALAR_FLOAT_TYPE_P (stept))
10054 peel_mul = gimple_build (seq: &init_stmts, code: FLOAT_EXPR, type: stept,
10055 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10056 else
10057 peel_mul = gimple_convert (seq: &init_stmts, type: stept,
10058 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10059 peel_mul = gimple_build_vector_from_val (seq: &init_stmts,
10060 type: step_vectype, op: peel_mul);
10061 }
10062 unsigned ivn;
10063 auto_vec<tree> vec_steps;
10064 for (ivn = 0; ivn < nivs; ++ivn)
10065 {
10066 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10067 tree_vector_builder init_elts (vectype, const_nunits, 1);
10068 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10069 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10070 {
10071 /* The scalar steps of the IVs. */
10072 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10073 elt = gimple_convert (seq: &init_stmts, TREE_TYPE (step_vectype), op: elt);
10074 step_elts.quick_push (obj: elt);
10075 if (!init_node)
10076 {
10077 /* The scalar inits of the IVs if not vectorized. */
10078 elt = inits[(ivn*const_nunits + eltn) % group_size];
10079 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10080 TREE_TYPE (elt)))
10081 elt = gimple_build (seq: &init_stmts, code: VIEW_CONVERT_EXPR,
10082 TREE_TYPE (vectype), ops: elt);
10083 init_elts.quick_push (obj: elt);
10084 }
10085 /* The number of steps to add to the initial values. */
10086 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10087 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10088 ? build_real_from_wide (stept,
10089 mul_elt, UNSIGNED)
10090 : build_int_cstu (type: stept, mul_elt));
10091 }
10092 vec_step = gimple_build_vector (seq: &init_stmts, builder: &step_elts);
10093 vec_steps.safe_push (obj: vec_step);
10094 tree step_mul = gimple_build_vector (seq: &init_stmts, builder: &mul_elts);
10095 if (peel_mul)
10096 step_mul = gimple_build (seq: &init_stmts, code: PLUS_EXPR, type: step_vectype,
10097 ops: step_mul, ops: peel_mul);
10098 if (!init_node)
10099 vec_init = gimple_build_vector (seq: &init_stmts, builder: &init_elts);
10100
10101 /* Create the induction-phi that defines the induction-operand. */
10102 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10103 "vec_iv_");
10104 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10105 induc_def = PHI_RESULT (induction_phi);
10106
10107 /* Create the iv update inside the loop */
10108 tree up = vec_step;
10109 if (lupdate_mul)
10110 up = gimple_build (seq: &init_stmts, code: MULT_EXPR, type: step_vectype,
10111 ops: vec_step, ops: lupdate_mul);
10112 gimple_seq stmts = NULL;
10113 vec_def = gimple_convert (seq: &stmts, type: step_vectype, op: induc_def);
10114 vec_def = gimple_build (seq: &stmts,
10115 code: PLUS_EXPR, type: step_vectype, ops: vec_def, ops: up);
10116 vec_def = gimple_convert (seq: &stmts, type: vectype, op: vec_def);
10117 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10118 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10119 UNKNOWN_LOCATION);
10120
10121 if (init_node)
10122 vec_init = vect_get_slp_vect_def (init_node, ivn);
10123 if (!nested_in_vect_loop
10124 && !integer_zerop (step_mul))
10125 {
10126 vec_def = gimple_convert (seq: &init_stmts, type: step_vectype, op: vec_init);
10127 up = gimple_build (seq: &init_stmts, code: MULT_EXPR, type: step_vectype,
10128 ops: vec_step, ops: step_mul);
10129 vec_def = gimple_build (seq: &init_stmts, code: PLUS_EXPR, type: step_vectype,
10130 ops: vec_def, ops: up);
10131 vec_init = gimple_convert (seq: &init_stmts, type: vectype, op: vec_def);
10132 }
10133
10134 /* Set the arguments of the phi node: */
10135 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10136
10137 slp_node->push_vec_def (def: induction_phi);
10138 }
10139 if (!nested_in_vect_loop)
10140 {
10141 /* Fill up to the number of vectors we need for the whole group. */
10142 nivs = least_common_multiple (group_size,
10143 const_nunits) / const_nunits;
10144 vec_steps.reserve (nelems: nivs-ivn);
10145 for (; ivn < nivs; ++ivn)
10146 {
10147 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10148 vec_steps.quick_push (obj: vec_steps[0]);
10149 }
10150 }
10151
10152 /* Re-use IVs when we can. We are generating further vector
10153 stmts by adding VF' * stride to the IVs generated above. */
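  /* E.g. with group_size 3, const_nunits 4 and nvects 6 we created
     nivs = 3 IVs above and now generate the remaining 3 vectors from
     them by adding lcm (3, 4) / 3 = 4 times the per-lane step. */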
10154 if (ivn < nvects)
10155 {
10156 unsigned vfp
10157 = least_common_multiple (group_size, const_nunits) / group_size;
10158 tree lupdate_mul
10159 = build_vector_from_val (step_vectype,
10160 SCALAR_FLOAT_TYPE_P (stept)
10161 ? build_real_from_wide (stept,
10162 vfp, UNSIGNED)
10163 : build_int_cstu (type: stept, vfp));
10164 for (; ivn < nvects; ++ivn)
10165 {
10166 gimple *iv
10167 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10168 tree def = gimple_get_lhs (iv);
10169 if (ivn < 2*nivs)
10170 vec_steps[ivn - nivs]
10171 = gimple_build (seq: &init_stmts, code: MULT_EXPR, type: step_vectype,
10172 ops: vec_steps[ivn - nivs], ops: lupdate_mul);
10173 gimple_seq stmts = NULL;
10174 def = gimple_convert (seq: &stmts, type: step_vectype, op: def);
10175 def = gimple_build (seq: &stmts, code: PLUS_EXPR, type: step_vectype,
10176 ops: def, ops: vec_steps[ivn % nivs]);
10177 def = gimple_convert (seq: &stmts, type: vectype, op: def);
10178 if (gimple_code (g: iv) == GIMPLE_PHI)
10179 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10180 else
10181 {
10182 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10183 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10184 }
10185 slp_node->push_vec_def (def);
10186 }
10187 }
10188
10189 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10190 gcc_assert (!new_bb);
10191
10192 return true;
10193 }
10194
10195 init_expr = vect_phi_initial_value (phi);
10196
10197 gimple_seq stmts = NULL;
10198 if (!nested_in_vect_loop)
10199 {
10200 /* Convert the initial value to the IV update type. */
10201 tree new_type = TREE_TYPE (step_expr);
10202 init_expr = gimple_convert (seq: &stmts, type: new_type, op: init_expr);
10203
10204 /* If we are using the loop mask to "peel" for alignment then we need
10205 to adjust the start value here. */
10206 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10207 if (skip_niters != NULL_TREE)
10208 {
10209 if (FLOAT_TYPE_P (vectype))
10210 skip_niters = gimple_build (seq: &stmts, code: FLOAT_EXPR, type: new_type,
10211 ops: skip_niters);
10212 else
10213 skip_niters = gimple_convert (seq: &stmts, type: new_type, op: skip_niters);
10214 tree skip_step = gimple_build (seq: &stmts, code: MULT_EXPR, type: new_type,
10215 ops: skip_niters, ops: step_expr);
10216 init_expr = gimple_build (seq: &stmts, code: MINUS_EXPR, type: new_type,
10217 ops: init_expr, ops: skip_step);
10218 }
10219 }
10220
10221 if (stmts)
10222 {
10223 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10224 gcc_assert (!new_bb);
10225 }
10226
10227 /* Create the vector that holds the initial_value of the induction. */
10228 if (nested_in_vect_loop)
10229 {
10230 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10231 been created during vectorization of previous stmts. We obtain it
10232 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10233 auto_vec<tree> vec_inits;
10234 vect_get_vec_defs_for_operand (vinfo: loop_vinfo, stmt_info, 1,
10235 op: init_expr, &vec_inits);
10236 vec_init = vec_inits[0];
10237 /* If the initial value is not of proper type, convert it. */
10238 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10239 {
10240 new_stmt
10241 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10242 vect_simple_var,
10243 "vec_iv_"),
10244 VIEW_CONVERT_EXPR,
10245 build1 (VIEW_CONVERT_EXPR, vectype,
10246 vec_init));
10247 vec_init = gimple_assign_lhs (gs: new_stmt);
10248 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10249 new_stmt);
10250 gcc_assert (!new_bb);
10251 }
10252 }
10253 else
10254 {
10255 /* iv_loop is the loop to be vectorized. Create:
10256 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10257 stmts = NULL;
10258 new_name = gimple_convert (seq: &stmts, TREE_TYPE (step_expr), op: init_expr);
10259
10260 unsigned HOST_WIDE_INT const_nunits;
10261 if (nunits.is_constant (const_value: &const_nunits))
10262 {
10263 tree_vector_builder elts (step_vectype, const_nunits, 1);
10264 elts.quick_push (obj: new_name);
10265 for (i = 1; i < const_nunits; i++)
10266 {
10267 /* Create: new_name_i = new_name + step_expr */
10268 new_name = gimple_build (seq: &stmts, code: PLUS_EXPR, TREE_TYPE (new_name),
10269 ops: new_name, ops: step_expr);
10270 elts.quick_push (obj: new_name);
10271 }
10272 /* Create a vector from [new_name_0, new_name_1, ...,
10273 new_name_nunits-1] */
10274 vec_init = gimple_build_vector (seq: &stmts, builder: &elts);
10275 }
10276 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10277 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10278 vec_init = gimple_build (seq: &stmts, code: VEC_SERIES_EXPR, type: step_vectype,
10279 ops: new_name, ops: step_expr);
10280 else
10281 {
10282 /* Build:
10283 [base, base, base, ...]
10284 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10285 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10286 gcc_assert (flag_associative_math);
10287 tree index = build_index_vector (step_vectype, 0, 1);
10288 tree base_vec = gimple_build_vector_from_val (seq: &stmts, type: step_vectype,
10289 op: new_name);
10290 tree step_vec = gimple_build_vector_from_val (seq: &stmts, type: step_vectype,
10291 op: step_expr);
10292 vec_init = gimple_build (seq: &stmts, code: FLOAT_EXPR, type: step_vectype, ops: index);
10293 vec_init = gimple_build (seq: &stmts, code: MULT_EXPR, type: step_vectype,
10294 ops: vec_init, ops: step_vec);
10295 vec_init = gimple_build (seq: &stmts, code: PLUS_EXPR, type: step_vectype,
10296 ops: vec_init, ops: base_vec);
10297 }
10298 vec_init = gimple_convert (seq: &stmts, type: vectype, op: vec_init);
10299
10300 if (stmts)
10301 {
10302 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10303 gcc_assert (!new_bb);
10304 }
10305 }
10306
10307
10308 /* Create the vector that holds the step of the induction. */
10309 if (nested_in_vect_loop)
10310 /* iv_loop is nested in the loop to be vectorized. Generate:
10311 vec_step = [S, S, S, S] */
10312 new_name = step_expr;
10313 else
10314 {
10315 /* iv_loop is the loop to be vectorized. Generate:
10316 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10317 gimple_seq seq = NULL;
10318 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10319 {
10320 expr = build_int_cst (integer_type_node, vf);
10321 expr = gimple_build (seq: &seq, code: FLOAT_EXPR, TREE_TYPE (step_expr), ops: expr);
10322 }
10323 else
10324 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10325 new_name = gimple_build (seq: &seq, code: MULT_EXPR, TREE_TYPE (step_expr),
10326 ops: expr, ops: step_expr);
10327 if (seq)
10328 {
10329 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10330 gcc_assert (!new_bb);
10331 }
10332 }
10333
10334 t = unshare_expr (new_name);
10335 gcc_assert (CONSTANT_CLASS_P (new_name)
10336 || TREE_CODE (new_name) == SSA_NAME);
10337 new_vec = build_vector_from_val (step_vectype, t);
10338 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10339 new_vec, step_vectype, NULL);
10340
10341
10342 /* Create the following def-use cycle:
10343 loop prolog:
10344 vec_init = ...
10345 vec_step = ...
10346 loop:
10347 vec_iv = PHI <vec_init, vec_loop>
10348 ...
10349 STMT
10350 ...
10351 vec_loop = vec_iv + vec_step; */
10352
10353 /* Create the induction-phi that defines the induction-operand. */
10354 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10355 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10356 induc_def = PHI_RESULT (induction_phi);
10357
10358 /* Create the iv update inside the loop */
10359 stmts = NULL;
10360 vec_def = gimple_convert (seq: &stmts, type: step_vectype, op: induc_def);
10361 vec_def = gimple_build (seq: &stmts, code: PLUS_EXPR, type: step_vectype, ops: vec_def, ops: vec_step);
10362 vec_def = gimple_convert (seq: &stmts, type: vectype, op: vec_def);
10363 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10364 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10365
10366 /* Set the arguments of the phi node: */
10367 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10368 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10369 UNKNOWN_LOCATION);
10370
10371 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: induction_phi);
10372 *vec_stmt = induction_phi;
10373
10374  /* If the vectorization factor (VF) is bigger than the number
10375     of elements that we can fit in a vectype (nunits), we have to generate
10376     more than one vector stmt, i.e. we need to "unroll" the
10377     vector stmt by a factor VF/nunits.  For more details see the
10378     documentation in vectorizable_operation. */
10379
10380 if (ncopies > 1)
10381 {
10382 gimple_seq seq = NULL;
10383 /* FORNOW. This restriction should be relaxed. */
10384 gcc_assert (!nested_in_vect_loop);
10385
10386 /* Create the vector that holds the step of the induction. */
10387 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10388 {
10389 expr = build_int_cst (integer_type_node, nunits);
10390 expr = gimple_build (seq: &seq, code: FLOAT_EXPR, TREE_TYPE (step_expr), ops: expr);
10391 }
10392 else
10393 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10394 new_name = gimple_build (seq: &seq, code: MULT_EXPR, TREE_TYPE (step_expr),
10395 ops: expr, ops: step_expr);
10396 if (seq)
10397 {
10398 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10399 gcc_assert (!new_bb);
10400 }
10401
10402 t = unshare_expr (new_name);
10403 gcc_assert (CONSTANT_CLASS_P (new_name)
10404 || TREE_CODE (new_name) == SSA_NAME);
10405 new_vec = build_vector_from_val (step_vectype, t);
10406 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10407 new_vec, step_vectype, NULL);
10408
10409 vec_def = induc_def;
10410 for (i = 1; i < ncopies + 1; i++)
10411 {
10412 /* vec_i = vec_prev + vec_step */
10413 gimple_seq stmts = NULL;
10414 vec_def = gimple_convert (seq: &stmts, type: step_vectype, op: vec_def);
10415 vec_def = gimple_build (seq: &stmts,
10416 code: PLUS_EXPR, type: step_vectype, ops: vec_def, ops: vec_step);
10417 vec_def = gimple_convert (seq: &stmts, type: vectype, op: vec_def);
10418
10419 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10420 if (i < ncopies)
10421 {
10422 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10423 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (obj: new_stmt);
10424 }
10425 else
10426 {
10427 /* vec_1 = vec_iv + (VF/n * S)
10428 vec_2 = vec_1 + (VF/n * S)
10429 ...
10430 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10431
10432	     vec_n is used as vec_loop so that we save a register for the
10433	     large step VF * S and the operations that compute it. */
10434 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10435 UNKNOWN_LOCATION);
10436 }
10437 }
10438 }
10439
10440 if (dump_enabled_p ())
10441 dump_printf_loc (MSG_NOTE, vect_location,
10442 "transform induction: created def-use cycle: %G%G",
10443 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10444
10445 return true;
10446}
10447
10448/* Function vectorizable_live_operation.
10449
10450 STMT_INFO computes a value that is used outside the loop. Check if
10451 it can be supported. */
10452
10453bool
10454vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10455 slp_tree slp_node, slp_instance slp_node_instance,
10456 int slp_index, bool vec_stmt_p,
10457 stmt_vector_for_cost *cost_vec)
10458{
10459 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
10460 imm_use_iterator imm_iter;
10461 tree lhs, lhs_type, bitsize;
10462 tree vectype = (slp_node
10463 ? SLP_TREE_VECTYPE (slp_node)
10464 : STMT_VINFO_VECTYPE (stmt_info));
10465 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype);
10466 int ncopies;
10467 gimple *use_stmt;
10468 auto_vec<tree> vec_oprnds;
10469 int vec_entry = 0;
10470 poly_uint64 vec_index = 0;
10471
10472 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10473
10474 /* If a stmt of a reduction is live, vectorize it via
10475 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10476 validity so just trigger the transform here. */
10477 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10478 {
10479 if (!vec_stmt_p)
10480 return true;
10481 if (slp_node)
10482 {
10483 /* For reduction chains the meta-info is attached to
10484 the group leader. */
10485 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10486 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10487 /* For SLP reductions we vectorize the epilogue for
10488 all involved stmts together. */
10489 else if (slp_index != 0)
10490 return true;
10491 }
10492 stmt_vec_info reduc_info = info_for_reduction (vinfo: loop_vinfo, stmt_info);
10493 gcc_assert (reduc_info->is_reduc_info);
10494 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10495 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10496 return true;
10497 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10498 slp_node_instance);
10499 return true;
10500 }
10501
10502 /* If STMT is not relevant and it is a simple assignment and its inputs are
10503 invariant then it can remain in place, unvectorized. The original last
10504 scalar value that it computes will be used. */
10505 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10506 {
10507 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10508 if (dump_enabled_p ())
10509 dump_printf_loc (MSG_NOTE, vect_location,
10510			 "statement is simple and uses invariants.  Leaving in "
10511 "place.\n");
10512 return true;
10513 }
10514
10515 if (slp_node)
10516 ncopies = 1;
10517 else
10518 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10519
10520 if (slp_node)
10521 {
10522 gcc_assert (slp_index >= 0);
10523
10524 /* Get the last occurrence of the scalar index from the concatenation of
10525 all the slp vectors. Calculate which slp vector it is and the index
10526 within. */
10527 int num_scalar = SLP_TREE_LANES (slp_node);
10528 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10529 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
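      /* E.g. with 2 vectors of 4 lanes and 6 scalar lanes, slp_index 5
	 (the last scalar) gives pos = 8 - 6 + 5 = 7, i.e. lane 3 of
	 vector 1. */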
10530
10531 /* Calculate which vector contains the result, and which lane of
10532 that vector we need. */
10533 if (!can_div_trunc_p (a: pos, b: nunits, quotient: &vec_entry, remainder: &vec_index))
10534 {
10535 if (dump_enabled_p ())
10536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10537 "Cannot determine which vector holds the"
10538 " final result.\n");
10539 return false;
10540 }
10541 }
10542
10543 if (!vec_stmt_p)
10544 {
10545 /* No transformation required. */
10546 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10547 {
10548 if (slp_node)
10549 {
10550 if (dump_enabled_p ())
10551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10552 "can't operate on partial vectors "
10553 "because an SLP statement is live after "
10554 "the loop.\n");
10555 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10556 }
10557 else if (ncopies > 1)
10558 {
10559 if (dump_enabled_p ())
10560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10561 "can't operate on partial vectors "
10562 "because ncopies is greater than 1.\n");
10563 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10564 }
10565 else
10566 {
10567 gcc_assert (ncopies == 1 && !slp_node);
10568 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10569 OPTIMIZE_FOR_SPEED))
10570 vect_record_loop_mask (loop_vinfo,
10571 &LOOP_VINFO_MASKS (loop_vinfo),
10572 1, vectype, NULL);
10573 else if (can_vec_extract_var_idx_p (
10574 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10575 vect_record_loop_len (loop_vinfo,
10576 &LOOP_VINFO_LENS (loop_vinfo),
10577 1, vectype, 1);
10578 else
10579 {
10580 if (dump_enabled_p ())
10581 dump_printf_loc (
10582 MSG_MISSED_OPTIMIZATION, vect_location,
10583 "can't operate on partial vectors "
10584 "because the target doesn't support extract "
10585 "last reduction.\n");
10586 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10587 }
10588 }
10589 }
10590 /* ??? Enable for loop costing as well. */
10591 if (!loop_vinfo)
10592 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10593 0, vect_epilogue);
10594 return true;
10595 }
10596
10597 /* Use the lhs of the original scalar statement. */
10598 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10599 if (dump_enabled_p ())
10600 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10601 "stmt %G", stmt);
10602
10603 lhs = gimple_get_lhs (stmt);
10604 lhs_type = TREE_TYPE (lhs);
10605
10606 bitsize = vector_element_bits_tree (vectype);
10607
10608 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10609 tree vec_lhs, bitstart;
10610 gimple *vec_stmt;
10611 if (slp_node)
10612 {
10613 gcc_assert (!loop_vinfo
10614 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10615 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10616
10617 /* Get the correct slp vectorized stmt. */
10618 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10619 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10620
10621 /* Get entry to use. */
10622 bitstart = bitsize_int (vec_index);
10623 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10624 }
10625 else
10626 {
10627 /* For multiple copies, get the last copy. */
10628 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10629 vec_lhs = gimple_get_lhs (vec_stmt);
10630
10631 /* Get the last lane in the vector. */
10632 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10633 }
10634
10635 if (loop_vinfo)
10636 {
10637      /* To ensure that VEC_LHS as used by the lane-extraction stmts satisfies
10638	 the loop-closed PHI requirement, insert one phi node for it.  It looks like:
10639 loop;
10640 BB:
10641 # lhs' = PHI <lhs>
10642 ==>
10643 loop;
10644 BB:
10645 # vec_lhs' = PHI <vec_lhs>
10646 new_tree = lane_extract <vec_lhs', ...>;
10647 lhs' = new_tree; */
10648
10649 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10650 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10651 gcc_assert (single_pred_p (exit_bb));
10652
10653 tree vec_lhs_phi = copy_ssa_name (var: vec_lhs);
10654 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10655 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10656
10657 gimple_seq stmts = NULL;
10658 tree new_tree;
10659 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10660 {
10661 /* Emit:
10662
10663 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10664
10665	     where VEC_LHS is the vectorized live-out result and LEN is the
10666	     loop length of the final iteration. */
10667 gcc_assert (ncopies == 1 && !slp_node);
10668 gimple_seq tem = NULL;
10669 gimple_stmt_iterator gsi = gsi_last (seq&: tem);
10670 tree len
10671 = vect_get_loop_len (loop_vinfo, &gsi,
10672 &LOOP_VINFO_LENS (loop_vinfo),
10673 1, vectype, 0, 0);
10674
10675 /* BIAS - 1. */
10676 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10677 tree bias_minus_one
10678 = int_const_binop (MINUS_EXPR,
10679 build_int_cst (TREE_TYPE (len), biasval),
10680 build_one_cst (TREE_TYPE (len)));
10681
10682 /* LAST_INDEX = LEN + (BIAS - 1). */
10683 tree last_index = gimple_build (seq: &stmts, code: PLUS_EXPR, TREE_TYPE (len),
10684 ops: len, ops: bias_minus_one);
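	  /* With the common bias of 0 this is simply LEN - 1, the index
	     of the last active lane in the final iteration. */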
10685
10686 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10687 tree scalar_res
10688 = gimple_build (seq: &stmts, fn: CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10689 args: vec_lhs_phi, args: last_index);
10690
10691 /* Convert the extracted vector element to the scalar type. */
10692 new_tree = gimple_convert (seq: &stmts, type: lhs_type, op: scalar_res);
10693 }
10694 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10695 {
10696 /* Emit:
10697
10698 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10699
10700 where VEC_LHS is the vectorized live-out result and MASK is
10701 the loop mask for the final iteration. */
10702 gcc_assert (ncopies == 1 && !slp_node);
10703 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10704 gimple_seq tem = NULL;
10705 gimple_stmt_iterator gsi = gsi_last (seq&: tem);
10706 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10707 &LOOP_VINFO_MASKS (loop_vinfo),
10708 1, vectype, 0);
10709 gimple_seq_add_seq (&stmts, tem);
10710 tree scalar_res = gimple_build (seq: &stmts, fn: CFN_EXTRACT_LAST, type: scalar_type,
10711 args: mask, args: vec_lhs_phi);
10712
10713 /* Convert the extracted vector element to the scalar type. */
10714 new_tree = gimple_convert (seq: &stmts, type: lhs_type, op: scalar_res);
10715 }
10716 else
10717 {
10718 tree bftype = TREE_TYPE (vectype);
10719 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10720 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10721 new_tree = build3 (BIT_FIELD_REF, bftype,
10722 vec_lhs_phi, bitsize, bitstart);
10723 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10724 &stmts, true, NULL_TREE);
10725 }
10726
10727 gimple_stmt_iterator exit_gsi = gsi_after_labels (bb: exit_bb);
10728 if (stmts)
10729 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10730
10731 /* Remove existing phis that copy from lhs and create copies
10732 from new_tree. */
10733 gimple_stmt_iterator gsi;
10734 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (i: gsi);)
10735 {
10736 gimple *phi = gsi_stmt (i: gsi);
10737 if ((gimple_phi_arg_def (gs: phi, index: 0) == lhs))
10738 {
10739 remove_phi_node (&gsi, false);
10740 tree lhs_phi = gimple_phi_result (gs: phi);
10741 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10742 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10743 }
10744 else
10745 gsi_next (i: &gsi);
10746 }
10747
10748  /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10749 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10750 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10751 }
10752 else
10753 {
10754 /* For basic-block vectorization simply insert the lane-extraction. */
10755 tree bftype = TREE_TYPE (vectype);
10756 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10757 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10758 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10759 vec_lhs, bitsize, bitstart);
10760 gimple_seq stmts = NULL;
10761 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10762 &stmts, true, NULL_TREE);
10763 if (TREE_CODE (new_tree) == SSA_NAME
10764 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10765 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10766 if (is_a <gphi *> (p: vec_stmt))
10767 {
10768 gimple_stmt_iterator si = gsi_after_labels (bb: gimple_bb (g: vec_stmt));
10769 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10770 }
10771 else
10772 {
10773 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10774 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10775 }
10776
10777 /* Replace use of lhs with newly computed result. If the use stmt is a
10778 single arg PHI, just replace all uses of PHI result. It's necessary
10779     because the lcssa PHI defining lhs may appear before the newly inserted
     stmt. */
10780 use_operand_p use_p;
10781 stmt_vec_info use_stmt_info;
10782 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10783 if (!is_gimple_debug (gs: use_stmt)
10784 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10785 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10786 {
10787 /* ??? This can happen when the live lane ends up being
10788 rooted in a vector construction code-generated by an
10789 external SLP node (and code-generation for that already
10790 happened). See gcc.dg/vect/bb-slp-47.c.
10791 Doing this is what would happen if that vector CTOR
10792 were not code-generated yet so it is not too bad.
10793 ??? In fact we'd likely want to avoid this situation
10794 in the first place. */
10795 if (TREE_CODE (new_tree) == SSA_NAME
10796 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10797 && gimple_code (g: use_stmt) != GIMPLE_PHI
10798 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10799 use_stmt))
10800 {
10801 if (dump_enabled_p ())
10802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10803 "Using original scalar computation for "
10804				     "live lane because use precedes vector "
10805 "def\n");
10806 continue;
10807 }
10808 /* ??? It can also happen that we end up pulling a def into
10809 a loop where replacing out-of-loop uses would require
10810 a new LC SSA PHI node. Retain the original scalar in
10811 those cases as well. PR98064. */
10812 if (TREE_CODE (new_tree) == SSA_NAME
10813 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10814 && (gimple_bb (g: use_stmt)->loop_father
10815 != gimple_bb (g: vec_stmt)->loop_father)
10816 && !flow_loop_nested_p (gimple_bb (g: vec_stmt)->loop_father,
10817 gimple_bb (g: use_stmt)->loop_father))
10818 {
10819 if (dump_enabled_p ())
10820 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10821 "Using original scalar computation for "
10822 "live lane because there is an out-of-loop "
10823 "definition for it\n");
10824 continue;
10825 }
10826 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10827 SET_USE (use_p, new_tree);
10828 update_stmt (s: use_stmt);
10829 }
10830 }
10831
10832 return true;
10833}
10834
10835/* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10836
10837static void
10838vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10839{
10840 ssa_op_iter op_iter;
10841 imm_use_iterator imm_iter;
10842 def_operand_p def_p;
10843 gimple *ustmt;
10844
10845 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10846 {
10847 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10848 {
10849 basic_block bb;
10850
10851 if (!is_gimple_debug (gs: ustmt))
10852 continue;
10853
10854 bb = gimple_bb (g: ustmt);
10855
10856 if (!flow_bb_inside_loop_p (loop, bb))
10857 {
10858 if (gimple_debug_bind_p (s: ustmt))
10859 {
10860 if (dump_enabled_p ())
10861 dump_printf_loc (MSG_NOTE, vect_location,
10862 "killing debug use\n");
10863
10864 gimple_debug_bind_reset_value (dbg: ustmt);
10865 update_stmt (s: ustmt);
10866 }
10867 else
10868 gcc_unreachable ();
10869 }
10870 }
10871 }
10872}
10873
10874/* Given loop represented by LOOP_VINFO, return true if computation of
10875 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10876 otherwise. */
10877
10878static bool
10879loop_niters_no_overflow (loop_vec_info loop_vinfo)
10880{
10881 /* Constant case. */
10882 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10883 {
10884 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10885 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10886
10887 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10888 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10889 if (wi::to_widest (t: cst_nitersm1) < wi::to_widest (t: cst_niters))
10890 return true;
10891 }
10892
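  /* In the symbolic case NITERS = NITERSM1 + 1 can only overflow when
     NITERSM1 is the maximum value of its type, i.e. when the loop runs
     for exactly 2^precision iterations, so it is enough to verify that
     the maximum iteration count is strictly below the type's maximum. */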
10893 widest_int max;
10894 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10895 /* Check the upper bound of loop niters. */
10896 if (get_max_loop_iterations (loop, nit: &max))
10897 {
10898 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10899 signop sgn = TYPE_SIGN (type);
10900 widest_int type_max = widest_int::from (x: wi::max_value (type), sgn);
10901 if (max < type_max)
10902 return true;
10903 }
10904 return false;
10905}
10906
10907/* Return a mask type with half the number of elements as OLD_TYPE,
10908 given that it should have mode NEW_MODE. */
10909
10910tree
10911vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10912{
10913 poly_uint64 nunits = exact_div (a: TYPE_VECTOR_SUBPARTS (node: old_type), b: 2);
10914 return build_truth_vector_type_for_mode (nunits, new_mode);
10915}
10916
10917/* Return a mask type with twice as many elements as OLD_TYPE,
10918 given that it should have mode NEW_MODE. */
10919
10920tree
10921vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10922{
10923 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: old_type) * 2;
10924 return build_truth_vector_type_for_mode (nunits, new_mode);
10925}
10926
10927/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10928 contain a sequence of NVECTORS masks that each control a vector of type
10929 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10930 these vector masks with the vector version of SCALAR_MASK. */
10931
10932void
10933vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10934 unsigned int nvectors, tree vectype, tree scalar_mask)
10935{
10936 gcc_assert (nvectors != 0);
10937
10938 if (scalar_mask)
10939 {
10940 scalar_cond_masked_key cond (scalar_mask, nvectors);
10941 loop_vinfo->scalar_cond_masked_set.add (k: cond);
10942 }
10943
10944 masks->mask_set.add (k: std::make_pair (x&: vectype, y&: nvectors));
10945}
10946
10947/* Given a complete set of masks MASKS, extract mask number INDEX
10948 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10949 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10950
10951 See the comment above vec_loop_masks for more details about the mask
10952 arrangement. */
10953
10954tree
10955vect_get_loop_mask (loop_vec_info loop_vinfo,
10956 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10957 unsigned int nvectors, tree vectype, unsigned int index)
10958{
10959 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10960 == vect_partial_vectors_while_ult)
10961 {
10962 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10963 tree mask_type = rgm->type;
10964
10965 /* Populate the rgroup's mask array, if this is the first time we've
10966 used it. */
10967 if (rgm->controls.is_empty ())
10968 {
10969 rgm->controls.safe_grow_cleared (len: nvectors, exact: true);
10970 for (unsigned int i = 0; i < nvectors; ++i)
10971 {
10972 tree mask = make_temp_ssa_name (type: mask_type, NULL, name: "loop_mask");
10973 /* Provide a dummy definition until the real one is available. */
10974 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10975 rgm->controls[i] = mask;
10976 }
10977 }
10978
10979 tree mask = rgm->controls[index];
10980 if (maybe_ne (a: TYPE_VECTOR_SUBPARTS (node: mask_type),
10981 b: TYPE_VECTOR_SUBPARTS (node: vectype)))
10982 {
10983 /* A loop mask for data type X can be reused for data type Y
10984 if X has N times more elements than Y and if Y's elements
10985 are N times bigger than X's. In this case each sequence
10986 of N elements in the loop mask will be all-zero or all-one.
10987 We can then view-convert the mask so that each sequence of
10988 N elements is replaced by a single element. */
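 /* For example, assuming a mask recorded for a vector of 16 QImode
 elements and a use with a vector of 4 SImode elements (N == 4),
 each group of 4 mask elements is known to be all-zero or all-one,
 so the view-convert below yields a valid 4-element mask. */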
10989 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10990 TYPE_VECTOR_SUBPARTS (vectype)));
10991 gimple_seq seq = NULL;
10992 mask_type = truth_type_for (vectype);
10993 mask = gimple_build (seq: &seq, code: VIEW_CONVERT_EXPR, type: mask_type, ops: mask);
10994 if (seq)
10995 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10996 }
10997 return mask;
10998 }
10999 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11000 == vect_partial_vectors_avx512)
11001 {
11002 /* The number of scalars per iteration and the number of vectors are
11003 both compile-time constants. */
11004 unsigned int nscalars_per_iter
11005 = exact_div (a: nvectors * TYPE_VECTOR_SUBPARTS (node: vectype),
11006 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11007
11008 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11009
11010 /* The stored nV is dependent on the mask type produced. */
11011 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11012 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11013 == rgm->factor);
11014 nvectors = rgm->factor;
11015
11016 /* Populate the rgroup's mask array, if this is the first time we've
11017 used it. */
11018 if (rgm->controls.is_empty ())
11019 {
11020 rgm->controls.safe_grow_cleared (len: nvectors, exact: true);
11021 for (unsigned int i = 0; i < nvectors; ++i)
11022 {
11023 tree mask = make_temp_ssa_name (type: rgm->type, NULL, name: "loop_mask");
11024 /* Provide a dummy definition until the real one is available. */
11025 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11026 rgm->controls[i] = mask;
11027 }
11028 }
11029 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11030 TYPE_VECTOR_SUBPARTS (vectype)))
11031 return rgm->controls[index];
11032
11033 /* Split the vector if needed. Since we are dealing with integer mode
11034 masks with AVX512 we can operate on the integer representation
11035 performing the whole vector shifting. */
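 /* For example, assuming RGM->TYPE has 64 elements and VECTYPE has 16,
 FACTOR is 4; INDEX 5 then selects control vector VI == 1 and
 sub-part VPART == 1, which is extracted by shifting the integer
 representation right by 16 bits and truncating it to the 16-bit
 integer mode of the mask type. */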
11036 unsigned HOST_WIDE_INT factor;
11037 bool ok = constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: rgm->type),
11038 b: TYPE_VECTOR_SUBPARTS (node: vectype), multiple: &factor);
11039 gcc_assert (ok);
11040 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11041 tree mask_type = truth_type_for (vectype);
11042 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11043 unsigned vi = index / factor;
11044 unsigned vpart = index % factor;
11045 tree vec = rgm->controls[vi];
11046 gimple_seq seq = NULL;
11047 vec = gimple_build (seq: &seq, code: VIEW_CONVERT_EXPR,
11048 type: lang_hooks.types.type_for_mode
11049 (TYPE_MODE (rgm->type), 1), ops: vec);
11050 /* For integer mode masks simply shift the right bits into position. */
11051 if (vpart != 0)
11052 vec = gimple_build (seq: &seq, code: RSHIFT_EXPR, TREE_TYPE (vec), ops: vec,
11053 ops: build_int_cst (integer_type_node,
11054 (TYPE_VECTOR_SUBPARTS (node: vectype)
11055 * vpart)));
11056 vec = gimple_convert (seq: &seq, type: lang_hooks.types.type_for_mode
11057 (TYPE_MODE (mask_type), 1), op: vec);
11058 vec = gimple_build (seq: &seq, code: VIEW_CONVERT_EXPR, type: mask_type, ops: vec);
11059 if (seq)
11060 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11061 return vec;
11062 }
11063 else
11064 gcc_unreachable ();
11065}
11066
11067/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11068 lengths for controlling an operation on VECTYPE. The operation splits
11069 each element of VECTYPE into FACTOR separate subelements, measuring the
11070 length as a number of these subelements. */
11071
11072void
11073vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11074 unsigned int nvectors, tree vectype, unsigned int factor)
11075{
11076 gcc_assert (nvectors != 0);
11077 if (lens->length () < nvectors)
11078 lens->safe_grow_cleared (len: nvectors, exact: true);
11079 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11080
11081 /* The number of scalars per iteration, the bytes occupied by a scalar
11082 and the number of vectors are all compile-time constants. */
11083 unsigned int nscalars_per_iter
11084 = exact_div (a: nvectors * TYPE_VECTOR_SUBPARTS (node: vectype),
11085 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11086
11087 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11088 {
11089 /* For now, we only support cases in which all loads and stores fall back
11090 to VnQI or none do. */
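 /* For example, assuming a byte-length fallback, a group accessing
 V4SI data through VnQI loads and stores would record FACTOR == 4
 and measure its lengths in bytes rather than in SImode elements. */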
11091 gcc_assert (!rgl->max_nscalars_per_iter
11092 || (rgl->factor == 1 && factor == 1)
11093 || (rgl->max_nscalars_per_iter * rgl->factor
11094 == nscalars_per_iter * factor));
11095 rgl->max_nscalars_per_iter = nscalars_per_iter;
11096 rgl->type = vectype;
11097 rgl->factor = factor;
11098 }
11099}
11100
11101/* Given a complete set of lengths LENS, extract length number INDEX
11102 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11103 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11104 multiplied by the number of elements that should be processed.
11105 Insert any set-up statements before GSI. */
11106
11107tree
11108vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11109 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11110 unsigned int index, unsigned int factor)
11111{
11112 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11113 bool use_bias_adjusted_len =
11114 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11115
11116 /* Populate the rgroup's len array, if this is the first time we've
11117 used it. */
11118 if (rgl->controls.is_empty ())
11119 {
11120 rgl->controls.safe_grow_cleared (len: nvectors, exact: true);
11121 for (unsigned int i = 0; i < nvectors; ++i)
11122 {
11123 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11124 gcc_assert (len_type != NULL_TREE);
11125
11126 tree len = make_temp_ssa_name (type: len_type, NULL, name: "loop_len");
11127
11128 /* Provide a dummy definition until the real one is available. */
11129 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11130 rgl->controls[i] = len;
11131
11132 if (use_bias_adjusted_len)
11133 {
11134 gcc_assert (i == 0);
11135 tree adjusted_len =
11136 make_temp_ssa_name (type: len_type, NULL, name: "adjusted_loop_len");
11137 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11138 rgl->bias_adjusted_ctrl = adjusted_len;
11139 }
11140 }
11141 }
11142
11143 if (use_bias_adjusted_len)
11144 return rgl->bias_adjusted_ctrl;
11145
11146 tree loop_len = rgl->controls[index];
11147 if (rgl->factor == 1 && factor == 1)
11148 {
11149 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (node: rgl->type);
11150 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (node: vectype);
11151 if (maybe_ne (a: nunits1, b: nunits2))
11152 {
11153 /* A loop len for data type X can be reused for data type Y
11154 if X has N times more elements than Y and if Y's elements
11155 are N times bigger than X's. */
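 /* For example, a length computed for 16 QImode elements can control
 4 SImode elements of the same vector size once it is divided by 4
 below. */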
11156 gcc_assert (multiple_p (nunits1, nunits2));
11157 factor = exact_div (a: nunits1, b: nunits2).to_constant ();
11158 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11159 gimple_seq seq = NULL;
11160 loop_len = gimple_build (seq: &seq, code: RDIV_EXPR, type: iv_type, ops: loop_len,
11161 ops: build_int_cst (iv_type, factor));
11162 if (seq)
11163 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11164 }
11165 }
11166 return loop_len;
11167}
11168
11169/* Scale profiling counters by estimation for LOOP which is vectorized
11170 by factor VF.
11171 If FLAT is true, the loop we started with had unrealistically flat
11172 profile. */
11173
11174static void
11175scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11176{
11177 /* For flat profiles do not scale down proportionally by VF and only
11178 cap by known iteration count bounds. */
11179 if (flat)
11180 {
11181 if (dump_file && (dump_flags & TDF_DETAILS))
11182 fprintf (stream: dump_file,
11183 format: "Vectorized loop profile seems flat; not scaling iteration "
11184 "count down by the vectorization factor %i\n", vf);
11185 scale_loop_profile (loop, profile_probability::always (),
11186 get_likely_max_loop_iterations_int (loop));
11187 return;
11188 }
11189 /* The loop body executes VF times fewer iterations and the exit probability increases VF times. */
11190 profile_count entry_count = loop_preheader_edge (loop)->count ();
11191
11192 /* If we have an unreliable loop profile, avoid dropping the entry
11193 count below the header count. This can happen when the loop
11194 has an unrealistically low trip count. */
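 /* For example, with an entry count of 100 and a header count of 300,
 a VF of 8 is halved twice (300 < 100 * 8 and 300 < 100 * 4) down
 to 2, at which point 300 >= 100 * 2 and scaling proceeds with the
 reduced factor. */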
11195 while (vf > 1
11196 && loop->header->count > entry_count
11197 && loop->header->count < entry_count * vf)
11198 {
11199 if (dump_file && (dump_flags & TDF_DETAILS))
11200 fprintf (stream: dump_file,
11201 format: "Vectorization factor %i seems too large for profile "
11202 "previously believed to be consistent; reducing.\n", vf);
11203 vf /= 2;
11204 }
11205
11206 if (entry_count.nonzero_p ())
11207 set_edge_probability_and_rescale_others
11208 (exit_e,
11209 entry_count.probability_in (overall: loop->header->count / vf));
11210 /* Avoid producing a very large exit probability when we do not have
11211 a sensible profile. */
11212 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11213 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11214 loop->latch->count = single_pred_edge (bb: loop->latch)->count ();
11215
11216 scale_loop_profile (loop, profile_probability::always () / vf,
11217 get_likely_max_loop_iterations_int (loop));
11218}
11219
11220/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11221 latch edge values originally defined by it. */
11222
11223static void
11224maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11225 stmt_vec_info def_stmt_info)
11226{
11227 tree def = gimple_get_lhs (vect_orig_stmt (stmt_info: def_stmt_info)->stmt);
11228 if (!def || TREE_CODE (def) != SSA_NAME)
11229 return;
11230 stmt_vec_info phi_info;
11231 imm_use_iterator iter;
11232 use_operand_p use_p;
11233 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11234 {
11235 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11236 if (!phi)
11237 continue;
11238 if (!(gimple_bb (g: phi)->loop_father->header == gimple_bb (g: phi)
11239 && (phi_info = loop_vinfo->lookup_stmt (phi))
11240 && STMT_VINFO_RELEVANT_P (phi_info)))
11241 continue;
11242 loop_p loop = gimple_bb (g: phi)->loop_father;
11243 edge e = loop_latch_edge (loop);
11244 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11245 continue;
11246
11247 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11248 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11249 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11250 {
11251 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11252 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11253 gcc_assert (phi_defs.length () == latch_defs.length ());
11254 for (unsigned i = 0; i < phi_defs.length (); ++i)
11255 add_phi_arg (as_a <gphi *> (p: phi_defs[i]),
11256 gimple_get_lhs (latch_defs[i]), e,
11257 gimple_phi_arg_location (phi, i: e->dest_idx));
11258 }
11259 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11260 {
11261 /* For first order recurrences we have to update both uses of
11262 the latch definition, the one in the PHI node and the one
11263 in the generated VEC_PERM_EXPR. */
11264 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11265 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11266 gcc_assert (phi_defs.length () == latch_defs.length ());
11267 tree phidef = gimple_assign_rhs1 (gs: phi_defs[0]);
11268 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11269 for (unsigned i = 0; i < phi_defs.length (); ++i)
11270 {
11271 gassign *perm = as_a <gassign *> (p: phi_defs[i]);
11272 if (i > 0)
11273 gimple_assign_set_rhs1 (gs: perm, rhs: gimple_get_lhs (latch_defs[i-1]));
11274 gimple_assign_set_rhs2 (gs: perm, rhs: gimple_get_lhs (latch_defs[i]));
11275 update_stmt (s: perm);
11276 }
11277 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11278 gimple_phi_arg_location (phi, i: e->dest_idx));
11279 }
11280 }
11281}
11282
11283/* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11284 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11285 stmt_vec_info. */
11286
11287static bool
11288vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11289 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11290{
11291 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11292 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11293
11294 if (dump_enabled_p ())
11295 dump_printf_loc (MSG_NOTE, vect_location,
11296 "------>vectorizing statement: %G", stmt_info->stmt);
11297
11298 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11299 vect_loop_kill_debug_uses (loop, stmt_info);
11300
11301 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11302 && !STMT_VINFO_LIVE_P (stmt_info))
11303 return false;
11304
11305 if (STMT_VINFO_VECTYPE (stmt_info))
11306 {
11307 poly_uint64 nunits
11308 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11309 if (!STMT_SLP_TYPE (stmt_info)
11310 && maybe_ne (a: nunits, b: vf)
11311 && dump_enabled_p ())
11312 /* For SLP VF is set according to unrolling factor, and not
11313 to vector size, hence for SLP this print is not valid. */
11314 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11315 }
11316
11317 /* Pure SLP statements have already been vectorized. We still need
11318 to apply loop vectorization to hybrid SLP statements. */
11319 if (PURE_SLP_STMT (stmt_info))
11320 return false;
11321
11322 if (dump_enabled_p ())
11323 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11324
11325 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11326 *seen_store = stmt_info;
11327
11328 return true;
11329}
11330
11331/* Helper function to pass to simplify_replace_tree to enable replacing trees
11332 in the hash_map with their corresponding values. */
11333
11334static tree
11335find_in_mapping (tree t, void *context)
11336{
11337 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11338
11339 tree *value = mapping->get (k: t);
11340 return value ? *value : t;
11341}
11342
11343/* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11344 original loop that has now been vectorized.
11345
11346 The inits of the data_references need to be advanced with the number of
11347 iterations of the main loop. This has been computed in vect_do_peeling and
11348 is stored in parameter ADVANCE. We first restore the data_references
11349 initial offset with the values recored in ORIG_DRS_INIT.
11350
11351 Since the loop_vec_info of this EPILOGUE was constructed for the original
11352 loop, its stmt_vec_infos all point to the original statements. These need
11353 to be updated to point to their corresponding copies, as do the SSA_NAMEs
11354 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11355
11356 The data_references' connections also need to be updated. Their
11357 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11358 stmt_vec_infos, their statements need to point to their corresponding copy,
11359 if they are gather loads or scatter stores then their reference needs to be
11360 updated to point to its corresponding copy and finally we set
11361 'base_misaligned' to false as we have already peeled for alignment in the
11362 prologue of the main loop. */
11363
11364static void
11365update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11366{
11367 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (loop: epilogue);
11368 auto_vec<gimple *> stmt_worklist;
11369 hash_map<tree,tree> mapping;
11370 gimple *orig_stmt, *new_stmt;
11371 gimple_stmt_iterator epilogue_gsi;
11372 gphi_iterator epilogue_phi_gsi;
11373 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11374 basic_block *epilogue_bbs = get_loop_body (epilogue);
11375 unsigned i;
11376
11377 free (LOOP_VINFO_BBS (epilogue_vinfo));
11378 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11379
11380 /* Advance data_reference's with the number of iterations of the previous
11381 loop and its prologue. */
11382 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11383
11384
11385 /* The EPILOGUE loop is a copy of the original loop so they share the same
11386 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11387 point to the copied statements. We also create a mapping from each LHS in
11388 the original loop to the corresponding LHS in the EPILOGUE, and create worklists
11389 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11390 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11391 {
11392 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11393 !gsi_end_p (i: epilogue_phi_gsi); gsi_next (i: &epilogue_phi_gsi))
11394 {
11395 new_stmt = epilogue_phi_gsi.phi ();
11396
11397 gcc_assert (gimple_uid (new_stmt) > 0);
11398 stmt_vinfo
11399 = epilogue_vinfo->stmt_vec_infos[gimple_uid (g: new_stmt) - 1];
11400
11401 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11402 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11403
11404 mapping.put (k: gimple_phi_result (gs: orig_stmt),
11405 v: gimple_phi_result (gs: new_stmt));
11406 /* PHI nodes can not have patterns or related statements. */
11407 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11408 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11409 }
11410
11411 for (epilogue_gsi = gsi_start_bb (bb: epilogue_bbs[i]);
11412 !gsi_end_p (i: epilogue_gsi); gsi_next (i: &epilogue_gsi))
11413 {
11414 new_stmt = gsi_stmt (i: epilogue_gsi);
11415 if (is_gimple_debug (gs: new_stmt))
11416 continue;
11417
11418 gcc_assert (gimple_uid (new_stmt) > 0);
11419 stmt_vinfo
11420 = epilogue_vinfo->stmt_vec_infos[gimple_uid (g: new_stmt) - 1];
11421
11422 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11423 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11424
11425 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11426 mapping.put (k: old_lhs, v: gimple_get_lhs (new_stmt));
11427
11428 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11429 {
11430 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11431 for (gimple_stmt_iterator gsi = gsi_start (seq);
11432 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
11433 stmt_worklist.safe_push (obj: gsi_stmt (i: gsi));
11434 }
11435
11436 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11437 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11438 {
11439 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11440 stmt_worklist.safe_push (obj: stmt);
11441 /* Set BB such that the assert in
11442 'get_initial_def_for_reduction' is able to determine that
11443 the BB of the related stmt is inside this loop. */
11444 gimple_set_bb (stmt,
11445 gimple_bb (g: new_stmt));
11446 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11447 gcc_assert (related_vinfo == NULL
11448 || related_vinfo == stmt_vinfo);
11449 }
11450 }
11451 }
11452
11453 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11454 using the original main loop and thus need to be updated to refer to the
11455 cloned variables used in the epilogue. */
11456 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11457 {
11458 gimple *stmt = stmt_worklist[i];
11459 tree *new_op;
11460
11461 for (unsigned j = 1; j < gimple_num_ops (gs: stmt); ++j)
11462 {
11463 tree op = gimple_op (gs: stmt, i: j);
11464 if ((new_op = mapping.get(k: op)))
11465 gimple_set_op (gs: stmt, i: j, op: *new_op);
11466 else
11467 {
11468 /* PR92429: The last argument of simplify_replace_tree disables
11469 folding when replacing arguments. This is required as
11470 otherwise you might end up with different statements than the
11471 ones analyzed in vect_loop_analyze, leading to different
11472 vectorization. */
11473 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11474 &find_in_mapping, &mapping, do_fold: false);
11475 gimple_set_op (gs: stmt, i: j, op);
11476 }
11477 }
11478 }
11479
11480 struct data_reference *dr;
11481 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11482 FOR_EACH_VEC_ELT (datarefs, i, dr)
11483 {
11484 orig_stmt = DR_STMT (dr);
11485 gcc_assert (gimple_uid (orig_stmt) > 0);
11486 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (g: orig_stmt) - 1];
11487 /* Data references for gather loads and scatter stores do not use the
11488 updated offset we set using ADVANCE. Instead we have to make sure the
11489 reference in each data reference points to the corresponding copy of
11490 the original in the epilogue. Make sure to update both
11491 gather/scatters recognized by dataref analysis and also other
11492 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11493 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_info: stmt_vinfo);
11494 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11495 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11496 {
11497 DR_REF (dr)
11498 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11499 &find_in_mapping, &mapping);
11500 DR_BASE_ADDRESS (dr)
11501 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11502 &find_in_mapping, &mapping);
11503 }
11504 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11505 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11506 /* The vector size of the epilogue is smaller than that of the main loop,
11507 so the alignment is either the same or lower. This means the dr will
11508 by definition be aligned. */
11509 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11510 }
11511
11512 epilogue_vinfo->shared->datarefs_copy.release ();
11513 epilogue_vinfo->shared->save_datarefs ();
11514}
11515
11516/* Function vect_transform_loop.
11517
11518 The analysis phase has determined that the loop is vectorizable.
11519 Vectorize the loop: create vectorized stmts to replace the scalar
11520 stmts in the loop, and update the loop exit condition.
11521 Returns the scalar epilogue loop, if any. */
11522
11523class loop *
11524vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11525{
11526 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11527 class loop *epilogue = NULL;
11528 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11529 int nbbs = loop->num_nodes;
11530 int i;
11531 tree niters_vector = NULL_TREE;
11532 tree step_vector = NULL_TREE;
11533 tree niters_vector_mult_vf = NULL_TREE;
11534 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11535 unsigned int lowest_vf = constant_lower_bound (a: vf);
11536 gimple *stmt;
11537 bool check_profitability = false;
11538 unsigned int th;
11539 bool flat = maybe_flat_loop_profile (loop);
11540
11541 DUMP_VECT_SCOPE ("vec_transform_loop");
11542
11543 loop_vinfo->shared->check_datarefs ();
11544
11545 /* Use the more conservative vectorization threshold. If the number
11546 of iterations is constant assume the cost check has been performed
11547 by our caller. If the threshold makes all loops profitable that
11548 run at least the (estimated) vectorization factor number of times,
11549 checking is pointless, too. */
11550 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11551 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11552 {
11553 if (dump_enabled_p ())
11554 dump_printf_loc (MSG_NOTE, vect_location,
11555 "Profitability threshold is %d loop iterations.\n",
11556 th);
11557 check_profitability = true;
11558 }
11559
11560 /* Make sure there exists a single-predecessor exit bb. Do this before
11561 versioning. */
11562 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11563 if (! single_pred_p (bb: e->dest))
11564 {
11565 split_loop_exit_edge (e, true);
11566 if (dump_enabled_p ())
11567 dump_printf (MSG_NOTE, "split exit edge\n");
11568 }
11569
11570 /* Version the loop first, if required, so the profitability check
11571 comes first. */
11572
11573 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11574 {
11575 class loop *sloop
11576 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11577 sloop->force_vectorize = false;
11578 check_profitability = false;
11579 }
11580
11581 /* Make sure there exists a single-predecessor exit bb also on the
11582 scalar loop copy. Do this after versioning but before peeling
11583 so the CFG structure is fine for both the scalar and the if-converted
11584 loop, making slpeel_duplicate_current_defs_from_edges face matched
11585 loop-closed PHI nodes on the exit. */
11586 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11587 {
11588 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11589 if (! single_pred_p (bb: e->dest))
11590 {
11591 split_loop_exit_edge (e, true);
11592 if (dump_enabled_p ())
11593 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11594 }
11595 }
11596
11597 tree niters = vect_build_loop_niters (loop_vinfo);
11598 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11599 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11600 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11601 tree advance;
11602 drs_init_vec orig_drs_init;
11603
11604 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11605 &step_vector, &niters_vector_mult_vf, th,
11606 check_profitability, niters_no_overflow,
11607 &advance);
11608 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11609 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11610 {
11611 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11612 block after the loop exit. We need to scale all of that. */
11613 basic_block preheader
11614 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11615 preheader->count
11616 = preheader->count.apply_probability
11617 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11618 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11619 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11620 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11621 = preheader->count;
11622 }
11623
11624 if (niters_vector == NULL_TREE)
11625 {
11626 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11627 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11628 && known_eq (lowest_vf, vf))
11629 {
11630 niters_vector
11631 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11632 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11633 step_vector = build_one_cst (TREE_TYPE (niters));
11634 }
11635 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11636 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11637 &step_vector, niters_no_overflow);
11638 else
11639 /* vect_do_peeling subtracted the number of peeled prologue
11640 iterations from LOOP_VINFO_NITERS. */
11641 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11642 &niters_vector, &step_vector,
11643 niters_no_overflow);
11644 }
11645
11646 /* 1) Make sure the loop header has exactly two entries
11647 2) Make sure we have a preheader basic block. */
11648
11649 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11650
11651 split_edge (loop_preheader_edge (loop));
11652
11653 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11654 /* This will deal with any possible peeling. */
11655 vect_prepare_for_masked_peels (loop_vinfo);
11656
11657 /* Schedule the SLP instances first, then handle loop vectorization
11658 below. */
11659 if (!loop_vinfo->slp_instances.is_empty ())
11660 {
11661 DUMP_VECT_SCOPE ("scheduling SLP instances");
11662 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11663 }
11664
11665 /* FORNOW: the vectorizer supports only loops whose body consists
11666 of one basic block (header + empty latch). When the vectorizer
11667 supports more involved loop forms, the order in which the BBs are
11668 traversed will need to be reconsidered. */
11669
11670 for (i = 0; i < nbbs; i++)
11671 {
11672 basic_block bb = bbs[i];
11673 stmt_vec_info stmt_info;
11674
11675 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si);
11676 gsi_next (i: &si))
11677 {
11678 gphi *phi = si.phi ();
11679 if (dump_enabled_p ())
11680 dump_printf_loc (MSG_NOTE, vect_location,
11681 "------>vectorizing phi: %G", (gimple *) phi);
11682 stmt_info = loop_vinfo->lookup_stmt (phi);
11683 if (!stmt_info)
11684 continue;
11685
11686 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11687 vect_loop_kill_debug_uses (loop, stmt_info);
11688
11689 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11690 && !STMT_VINFO_LIVE_P (stmt_info))
11691 continue;
11692
11693 if (STMT_VINFO_VECTYPE (stmt_info)
11694 && (maybe_ne
11695 (a: TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), b: vf))
11696 && dump_enabled_p ())
11697 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11698
11699 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11700 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11701 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11702 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11703 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11704 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11705 && ! PURE_SLP_STMT (stmt_info))
11706 {
11707 if (dump_enabled_p ())
11708 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11709 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11710 }
11711 }
11712
11713 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (i: si);
11714 gsi_next (i: &si))
11715 {
11716 gphi *phi = si.phi ();
11717 stmt_info = loop_vinfo->lookup_stmt (phi);
11718 if (!stmt_info)
11719 continue;
11720
11721 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11722 && !STMT_VINFO_LIVE_P (stmt_info))
11723 continue;
11724
11725 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11727 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11728 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11729 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11730 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11731 && ! PURE_SLP_STMT (stmt_info))
11732 maybe_set_vectorized_backedge_value (loop_vinfo, def_stmt_info: stmt_info);
11733 }
11734
11735 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11736 !gsi_end_p (i: si);)
11737 {
11738 stmt = gsi_stmt (i: si);
11739 /* During vectorization remove existing clobber stmts. */
11740 if (gimple_clobber_p (s: stmt))
11741 {
11742 unlink_stmt_vdef (stmt);
11743 gsi_remove (&si, true);
11744 release_defs (stmt);
11745 }
11746 else
11747 {
11748 /* Ignore vector stmts created in the outer loop. */
11749 stmt_info = loop_vinfo->lookup_stmt (stmt);
11750
11751 /* vector stmts created in the outer-loop during vectorization of
11752 stmts in an inner-loop may not have a stmt_info, and do not
11753 need to be vectorized. */
11754 stmt_vec_info seen_store = NULL;
11755 if (stmt_info)
11756 {
11757 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11758 {
11759 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11760 for (gimple_stmt_iterator subsi = gsi_start (seq&: def_seq);
11761 !gsi_end_p (i: subsi); gsi_next (i: &subsi))
11762 {
11763 stmt_vec_info pat_stmt_info
11764 = loop_vinfo->lookup_stmt (gsi_stmt (i: subsi));
11765 vect_transform_loop_stmt (loop_vinfo, stmt_info: pat_stmt_info,
11766 gsi: &si, seen_store: &seen_store);
11767 }
11768 stmt_vec_info pat_stmt_info
11769 = STMT_VINFO_RELATED_STMT (stmt_info);
11770 if (vect_transform_loop_stmt (loop_vinfo, stmt_info: pat_stmt_info,
11771 gsi: &si, seen_store: &seen_store))
11772 maybe_set_vectorized_backedge_value (loop_vinfo,
11773 def_stmt_info: pat_stmt_info);
11774 }
11775 else
11776 {
11777 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, gsi: &si,
11778 seen_store: &seen_store))
11779 maybe_set_vectorized_backedge_value (loop_vinfo,
11780 def_stmt_info: stmt_info);
11781 }
11782 }
11783 gsi_next (i: &si);
11784 if (seen_store)
11785 {
11786 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11787 /* Interleaving. The vectorization of the
11788 interleaving chain was completed; free all
11789 the stores in the chain. */
11790 vect_remove_stores (loop_vinfo,
11791 DR_GROUP_FIRST_ELEMENT (seen_store));
11792 else
11793 /* Free the attached stmt_vec_info and remove the stmt. */
11794 loop_vinfo->remove_stmt (stmt_info);
11795 }
11796 }
11797 }
11798
11799 /* Stub out scalar statements that must not survive vectorization.
11800 Doing this here helps with grouped statements, or statements that
11801 are involved in patterns. */
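 /* For example, a scalar call such as
 _1 = .MASK_LOAD (ptr_2, 32B, cond_3);
 left over from pattern recognition is replaced by _1 = 0 below,
 and a scalar conditional internal function such as .COND_ADD is
 replaced by its else argument. */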
11802 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11803 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
11804 {
11805 gcall *call = dyn_cast <gcall *> (p: gsi_stmt (i: gsi));
11806 if (!call || !gimple_call_internal_p (gs: call))
11807 continue;
11808 internal_fn ifn = gimple_call_internal_fn (gs: call);
11809 if (ifn == IFN_MASK_LOAD)
11810 {
11811 tree lhs = gimple_get_lhs (call);
11812 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11813 {
11814 tree zero = build_zero_cst (TREE_TYPE (lhs));
11815 gimple *new_stmt = gimple_build_assign (lhs, zero);
11816 gsi_replace (&gsi, new_stmt, true);
11817 }
11818 }
11819 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11820 {
11821 tree lhs = gimple_get_lhs (call);
11822 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11823 {
11824 tree else_arg
11825 = gimple_call_arg (gs: call, index: gimple_call_num_args (gs: call) - 1);
11826 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11827 gsi_replace (&gsi, new_stmt, true);
11828 }
11829 }
11830 }
11831 } /* BBs in loop */
11832
11833 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11834 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11835 if (integer_onep (step_vector))
11836 niters_no_overflow = true;
11837 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11838 niters_vector, step_vector, niters_vector_mult_vf,
11839 !niters_no_overflow);
11840
11841 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11842
11843 /* True if the final iteration might not handle a full vector's
11844 worth of scalar iterations. */
11845 bool final_iter_may_be_partial
11846 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11847 /* The minimum number of iterations performed by the epilogue. This
11848 is 1 when peeling for gaps because we always need a final scalar
11849 iteration. */
11850 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11851 /* +1 to convert latch counts to loop iteration counts,
11852 -min_epilogue_iters to remove iterations that cannot be performed
11853 by the vector code. */
11854 int bias_for_lowest = 1 - min_epilogue_iters;
11855 int bias_for_assumed = bias_for_lowest;
11856 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11857 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11858 {
11859 /* When the amount of peeling is known at compile time, the first
11860 iteration will have exactly alignment_npeels active elements.
11861 In the worst case it will have at least one. */
11862 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11863 bias_for_lowest += lowest_vf - min_first_active;
11864 bias_for_assumed += assumed_vf - min_first_active;
11865 }
11866 /* In these calculations the "- 1" converts loop iteration counts
11867 back to latch counts. */
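 /* For example, assuming an original upper bound of 99 latch iterations
 (100 scalar iterations), BIAS_FOR_LOWEST == 1 and LOWEST_VF == 8
 without partial vectors, the new bound is
 floor (100 / 8) - 1 == 11 latch iterations. */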
11868 if (loop->any_upper_bound)
11869 {
11870 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11871 loop->nb_iterations_upper_bound
11872 = (final_iter_may_be_partial
11873 ? wi::udiv_ceil (x: loop->nb_iterations_upper_bound + bias_for_lowest,
11874 y: lowest_vf) - 1
11875 : wi::udiv_floor (x: loop->nb_iterations_upper_bound + bias_for_lowest,
11876 y: lowest_vf) - 1);
11877 if (main_vinfo
11878 /* Both peeling for alignment and peeling for gaps can end up
11879 with the scalar epilogue running for more than VF-1 iterations. */
11880 && !main_vinfo->peeling_for_alignment
11881 && !main_vinfo->peeling_for_gaps)
11882 {
11883 unsigned int bound;
11884 poly_uint64 main_iters
11885 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11886 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11887 main_iters
11888 = upper_bound (a: main_iters,
11889 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11890 if (can_div_away_from_zero_p (a: main_iters,
11891 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11892 quotient: &bound))
11893 loop->nb_iterations_upper_bound
11894 = wi::umin (x: (bound_wide_int) (bound - 1),
11895 y: loop->nb_iterations_upper_bound);
11896 }
11897 }
11898 if (loop->any_likely_upper_bound)
11899 loop->nb_iterations_likely_upper_bound
11900 = (final_iter_may_be_partial
11901 ? wi::udiv_ceil (x: loop->nb_iterations_likely_upper_bound
11902 + bias_for_lowest, y: lowest_vf) - 1
11903 : wi::udiv_floor (x: loop->nb_iterations_likely_upper_bound
11904 + bias_for_lowest, y: lowest_vf) - 1);
11905 if (loop->any_estimate)
11906 loop->nb_iterations_estimate
11907 = (final_iter_may_be_partial
11908 ? wi::udiv_ceil (x: loop->nb_iterations_estimate + bias_for_assumed,
11909 y: assumed_vf) - 1
11910 : wi::udiv_floor (x: loop->nb_iterations_estimate + bias_for_assumed,
11911 y: assumed_vf) - 1);
11912 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11913 vf: assumed_vf, flat);
11914
11915 if (dump_enabled_p ())
11916 {
11917 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11918 {
11919 dump_printf_loc (MSG_NOTE, vect_location,
11920 "LOOP VECTORIZED\n");
11921 if (loop->inner)
11922 dump_printf_loc (MSG_NOTE, vect_location,
11923 "OUTER LOOP VECTORIZED\n");
11924 dump_printf (MSG_NOTE, "\n");
11925 }
11926 else
11927 dump_printf_loc (MSG_NOTE, vect_location,
11928 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11929 GET_MODE_NAME (loop_vinfo->vector_mode));
11930 }
11931
11932 /* Loops vectorized with a variable factor won't benefit from
11933 unrolling/peeling. */
11934 if (!vf.is_constant ())
11935 {
11936 loop->unroll = 1;
11937 if (dump_enabled_p ())
11938 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11939 " variable-length vectorization factor\n");
11940 }
11941 /* Free SLP instances here because otherwise stmt reference counting
11942 won't work. */
11943 slp_instance instance;
11944 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11945 vect_free_slp_instance (instance);
11946 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11947 /* Clear the safelen field since its value is invalid after vectorization:
11948 the vectorized loop can have loop-carried dependencies. */
11949 loop->safelen = 0;
11950
11951 if (epilogue)
11952 {
11953 update_epilogue_loop_vinfo (epilogue, advance);
11954
11955 epilogue->simduid = loop->simduid;
11956 epilogue->force_vectorize = loop->force_vectorize;
11957 epilogue->dont_vectorize = false;
11958 }
11959
11960 return epilogue;
11961}
11962
11963/* The code below performs a simple optimization: it reverts
11964 if-conversion for masked stores, i.e. if the mask of a store is zero,
11965 the store is skipped, and so are the producers of the stored values where possible.
11966 For example,
11967 for (i=0; i<n; i++)
11968 if (c[i])
11969 {
11970 p1[i] += 1;
11971 p2[i] = p3[i] +2;
11972 }
11973 this transformation will produce the following semi-hammock:
11974
11975 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11976 {
11977 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11978 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11979 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11980 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11981 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11982 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11983 }
11984*/
11985
11986void
11987optimize_mask_stores (class loop *loop)
11988{
11989 basic_block *bbs = get_loop_body (loop);
11990 unsigned nbbs = loop->num_nodes;
11991 unsigned i;
11992 basic_block bb;
11993 class loop *bb_loop;
11994 gimple_stmt_iterator gsi;
11995 gimple *stmt;
11996 auto_vec<gimple *> worklist;
11997 auto_purge_vect_location sentinel;
11998
11999 vect_location = find_loop_location (loop);
12000 /* Pick up all masked stores in loop if any. */
12001 for (i = 0; i < nbbs; i++)
12002 {
12003 bb = bbs[i];
12004 for (gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi);
12005 gsi_next (i: &gsi))
12006 {
12007 stmt = gsi_stmt (i: gsi);
12008 if (gimple_call_internal_p (gs: stmt, fn: IFN_MASK_STORE))
12009 worklist.safe_push (obj: stmt);
12010 }
12011 }
12012
12013 free (ptr: bbs);
12014 if (worklist.is_empty ())
12015 return;
12016
12017 /* Loop has masked stores. */
12018 while (!worklist.is_empty ())
12019 {
12020 gimple *last, *last_store;
12021 edge e, efalse;
12022 tree mask;
12023 basic_block store_bb, join_bb;
12024 gimple_stmt_iterator gsi_to;
12025 tree vdef, new_vdef;
12026 gphi *phi;
12027 tree vectype;
12028 tree zero;
12029
12030 last = worklist.pop ();
12031 mask = gimple_call_arg (gs: last, index: 2);
12032 bb = gimple_bb (g: last);
12033 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
12034 to the same loop as if_bb. That loop can be different from LOOP when a
12035 two-level loop nest is vectorized and the mask_store belongs to the
12036 inner one. */
12037 e = split_block (bb, last);
12038 bb_loop = bb->loop_father;
12039 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12040 join_bb = e->dest;
12041 store_bb = create_empty_bb (bb);
12042 add_bb_to_loop (store_bb, bb_loop);
12043 e->flags = EDGE_TRUE_VALUE;
12044 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12045 /* Put STORE_BB to likely part. */
12046 efalse->probability = profile_probability::likely ();
12047 e->probability = efalse->probability.invert ();
12048 store_bb->count = efalse->count ();
12049 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12050 if (dom_info_available_p (CDI_DOMINATORS))
12051 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12052 if (dump_enabled_p ())
12053 dump_printf_loc (MSG_NOTE, vect_location,
12054 "Create new block %d to sink mask stores.",
12055 store_bb->index);
12056 /* Create vector comparison with boolean result. */
12057 vectype = TREE_TYPE (mask);
12058 zero = build_zero_cst (vectype);
12059 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12060 gsi = gsi_last_bb (bb);
12061 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12062 /* Create new PHI node for vdef of the last masked store:
12063 .MEM_2 = VDEF <.MEM_1>
12064 will be converted to
12065 .MEM.3 = VDEF <.MEM_1>
12066 and new PHI node will be created in join bb
12067 .MEM_2 = PHI <.MEM_1, .MEM_3>
12068 */
12069 vdef = gimple_vdef (g: last);
12070 new_vdef = make_ssa_name (var: gimple_vop (cfun), stmt: last);
12071 gimple_set_vdef (g: last, vdef: new_vdef);
12072 phi = create_phi_node (vdef, join_bb);
12073 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12074
12075 /* Put all masked stores with the same mask to STORE_BB if possible. */
12076 while (true)
12077 {
12078 gimple_stmt_iterator gsi_from;
12079 gimple *stmt1 = NULL;
12080
12081 /* Move masked store to STORE_BB. */
12082 last_store = last;
12083 gsi = gsi_for_stmt (last);
12084 gsi_from = gsi;
12085 /* Shift GSI to the previous stmt for further traversal. */
12086 gsi_prev (i: &gsi);
12087 gsi_to = gsi_start_bb (bb: store_bb);
12088 gsi_move_before (&gsi_from, &gsi_to);
12089 /* Set GSI_TO to the start of the now non-empty block. */
12090 gsi_to = gsi_start_bb (bb: store_bb);
12091 if (dump_enabled_p ())
12092 dump_printf_loc (MSG_NOTE, vect_location,
12093 "Move stmt to created bb\n%G", last);
12094 /* Move all stored value producers if possible. */
12095 while (!gsi_end_p (i: gsi))
12096 {
12097 tree lhs;
12098 imm_use_iterator imm_iter;
12099 use_operand_p use_p;
12100 bool res;
12101
12102 /* Skip debug statements. */
12103 if (is_gimple_debug (gs: gsi_stmt (i: gsi)))
12104 {
12105 gsi_prev (i: &gsi);
12106 continue;
12107 }
12108 stmt1 = gsi_stmt (i: gsi);
12109 /* Do not consider statements writing to memory or having
12110 a volatile operand. */
12111 if (gimple_vdef (g: stmt1)
12112 || gimple_has_volatile_ops (stmt: stmt1))
12113 break;
12114 gsi_from = gsi;
12115 gsi_prev (i: &gsi);
12116 lhs = gimple_get_lhs (stmt1);
12117 if (!lhs)
12118 break;
12119
12120 /* LHS of vectorized stmt must be SSA_NAME. */
12121 if (TREE_CODE (lhs) != SSA_NAME)
12122 break;
12123
12124 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12125 {
12126 /* Remove dead scalar statement. */
12127 if (has_zero_uses (var: lhs))
12128 {
12129 gsi_remove (&gsi_from, true);
12130 continue;
12131 }
12132 }
12133
12134 /* Check that LHS does not have uses outside of STORE_BB. */
12135 res = true;
12136 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12137 {
12138 gimple *use_stmt;
12139 use_stmt = USE_STMT (use_p);
12140 if (is_gimple_debug (gs: use_stmt))
12141 continue;
12142 if (gimple_bb (g: use_stmt) != store_bb)
12143 {
12144 res = false;
12145 break;
12146 }
12147 }
12148 if (!res)
12149 break;
12150
12151 if (gimple_vuse (g: stmt1)
12152 && gimple_vuse (g: stmt1) != gimple_vuse (g: last_store))
12153 break;
12154
12155 /* Can move STMT1 to STORE_BB. */
12156 if (dump_enabled_p ())
12157 dump_printf_loc (MSG_NOTE, vect_location,
12158 "Move stmt to created bb\n%G", stmt1);
12159 gsi_move_before (&gsi_from, &gsi_to);
12160 /* Shift GSI_TO for further insertion. */
12161 gsi_prev (i: &gsi_to);
12162 }
12163 /* Put other masked stores with the same mask to STORE_BB. */
12164 if (worklist.is_empty ()
12165 || gimple_call_arg (gs: worklist.last (), index: 2) != mask
12166 || worklist.last () != stmt1)
12167 break;
12168 last = worklist.pop ();
12169 }
12170 add_phi_arg (phi, gimple_vuse (g: last_store), e, UNKNOWN_LOCATION);
12171 }
12172}
12173
12174/* Decide whether it is possible to use a zero-based induction variable
12175 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12176 the value that the induction variable must be able to hold in order
12177 to ensure that the rgroups eventually have no active vector elements.
12178 Return -1 otherwise. */
12179
12180widest_int
12181vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12182{
12183 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12184 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12185 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12186
12187 /* Calculate the value that the induction variable must be able
12188 to hit in order to ensure that we end the loop with an all-false mask.
12189 This involves adding the maximum number of inactive trailing scalar
12190 iterations. */
12191 widest_int iv_limit = -1;
12192 if (max_loop_iterations (loop, &iv_limit))
12193 {
12194 if (niters_skip)
12195 {
12196 /* Add the maximum number of skipped iterations to the
12197 maximum iteration count. */
12198 if (TREE_CODE (niters_skip) == INTEGER_CST)
12199 iv_limit += wi::to_widest (t: niters_skip);
12200 else
12201 iv_limit += max_vf - 1;
12202 }
12203 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12204 /* Make a conservatively-correct assumption. */
12205 iv_limit += max_vf - 1;
12206
12207 /* IV_LIMIT is the maximum number of latch iterations, which is also
12208 the maximum in-range IV value. Round this value down to the previous
12209 vector alignment boundary and then add an extra full iteration. */
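 /* For example, assuming a maximum of 1000 latch iterations and a
 constant VF of 16 (so MAX_VF == 16 as well), IV_LIMIT becomes
 (1000 & -16) + 16 == 1008. */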
12210 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12211 iv_limit = (iv_limit & -(int) known_alignment (a: vf)) + max_vf;
12212 }
12213 return iv_limit;
12214}
12215
12216/* For the given rgroup_controls RGC, check whether an induction variable
12217 would ever hit a value that produces a set of all-false masks or zero
12218 lengths before wrapping around. Return true if it's possible to wrap
12219 around before hitting the desirable value, otherwise return false. */
12220
12221bool
12222vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12223{
12224 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12225
12226 if (iv_limit == -1)
12227 return true;
12228
12229 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12230 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12231 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12232
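 /* For example, with a 32-bit COMPARE_TYPE, 4 items per scalar
 iteration and an IV limit of 2^30, the IV would need to count up
 to 2^32, which requires 33 bits of precision and therefore wraps. */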
12233 if (wi::min_precision (x: iv_limit * nitems, sgn: UNSIGNED) > compare_precision)
12234 return true;
12235
12236 return false;
12237}
12238
