1/* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "tree.h"
27#include "gimple.h"
28#include "cfghooks.h"
29#include "tree-pass.h"
30#include "ssa.h"
31#include "fold-const.h"
32#include "cfganal.h"
33#include "gimplify.h"
34#include "gimple-iterator.h"
35#include "gimplify-me.h"
36#include "tree-cfg.h"
37#include "tree-ssa-loop-manip.h"
38#include "tree-into-ssa.h"
39#include "tree-ssa.h"
40#include "cfgloop.h"
41#include "tree-scalar-evolution.h"
42#include "tree-vectorizer.h"
43#include "tree-ssa-loop-ivopts.h"
44#include "gimple-fold.h"
45#include "tree-ssa-loop-niter.h"
46#include "internal-fn.h"
47#include "stor-layout.h"
48#include "optabs-query.h"
49#include "vec-perm-indices.h"
50#include "insn-config.h"
51#include "rtl.h"
52#include "recog.h"
53#include "langhooks.h"
54#include "tree-vector-builder.h"
55#include "optabs-tree.h"
56
57/*************************************************************************
58 Simple Loop Peeling Utilities
59
60 Utilities to support loop peeling for vectorization purposes.
61 *************************************************************************/
62
63
64/* Renames the use *OP_P. */
65
66static void
67rename_use_op (use_operand_p op_p)
68{
69 tree new_name;
70
71 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
72 return;
73
74 new_name = get_current_def (USE_FROM_PTR (op_p));
75
76 /* Something defined outside of the loop. */
77 if (!new_name)
78 return;
79
80 /* An ordinary ssa name defined in the loop. */
81
82 SET_USE (op_p, new_name);
83}
84
85
86/* Renames the variables in basic block BB. Allow renaming of PHI arguments
87 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
88 true. */
89
90static void
91rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
92{
93 gimple *stmt;
94 use_operand_p use_p;
95 ssa_op_iter iter;
96 edge e;
97 edge_iterator ei;
98 class loop *loop = bb->loop_father;
99 class loop *outer_loop = NULL;
100
101 if (rename_from_outer_loop)
102 {
103 gcc_assert (loop);
104 outer_loop = loop_outer (loop);
105 }
106
107 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi);
108 gsi_next (i: &gsi))
109 {
110 stmt = gsi_stmt (i: gsi);
111 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
112 rename_use_op (op_p: use_p);
113 }
114
115 FOR_EACH_EDGE (e, ei, bb->preds)
116 {
117 if (!flow_bb_inside_loop_p (loop, e->src))
118 {
119 if (!rename_from_outer_loop)
120 continue;
121 if (e->src != outer_loop->header)
122 {
123 if (outer_loop->inner->next)
124 {
125 /* If outer_loop has 2 inner loops, allow there to
126 be an extra basic block which decides which of the
127 two loops to use using LOOP_VECTORIZED. */
128 if (!single_pred_p (bb: e->src)
129 || single_pred (bb: e->src) != outer_loop->header)
130 continue;
131 }
132 }
133 }
134 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi);
135 gsi_next (i: &gsi))
136 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
137 }
138}
139
140
141struct adjust_info
142{
143 tree from, to;
144 basic_block bb;
145};
146
147/* A stack of values to be adjusted in debug stmts. We have to
148 process them LIFO, so that the closest substitution applies. If we
149 processed them FIFO, without the stack, we might substitute uses
150 with a PHI DEF that would soon become non-dominant, and when we got
151 to the suitable one, it wouldn't have anything to substitute any
152 more. */
153static vec<adjust_info, va_heap> adjust_vec;
154
155/* Adjust any debug stmts that referenced AI->from values to use the
156 loop-closed AI->to, if the references are dominated by AI->bb and
157 not by the definition of AI->from. */
158
159static void
160adjust_debug_stmts_now (adjust_info *ai)
161{
162 basic_block bbphi = ai->bb;
163 tree orig_def = ai->from;
164 tree new_def = ai->to;
165 imm_use_iterator imm_iter;
166 gimple *stmt;
167 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
168
169 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
170
171 /* Adjust any debug stmts that held onto non-loop-closed
172 references. */
173 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
174 {
175 use_operand_p use_p;
176 basic_block bbuse;
177
178 if (!is_gimple_debug (gs: stmt))
179 continue;
180
181 gcc_assert (gimple_debug_bind_p (stmt));
182
183 bbuse = gimple_bb (g: stmt);
184
185 if ((bbuse == bbphi
186 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
187 && !(bbuse == bbdef
188 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
189 {
190 if (new_def)
191 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
192 SET_USE (use_p, new_def);
193 else
194 {
195 gimple_debug_bind_reset_value (dbg: stmt);
196 update_stmt (s: stmt);
197 }
198 }
199 }
200}
201
202/* Adjust debug stmts as scheduled before. */
203
204static void
205adjust_vec_debug_stmts (void)
206{
207 if (!MAY_HAVE_DEBUG_BIND_STMTS)
208 return;
209
210 gcc_assert (adjust_vec.exists ());
211
212 while (!adjust_vec.is_empty ())
213 {
214 adjust_debug_stmts_now (ai: &adjust_vec.last ());
215 adjust_vec.pop ();
216 }
217}
218
219/* Adjust any debug stmts that referenced FROM values to use the
220 loop-closed TO, if the references are dominated by BB and not by
221 the definition of FROM. If adjust_vec is non-NULL, adjustments
222 will be postponed until adjust_vec_debug_stmts is called. */
223
224static void
225adjust_debug_stmts (tree from, tree to, basic_block bb)
226{
227 adjust_info ai;
228
229 if (MAY_HAVE_DEBUG_BIND_STMTS
230 && TREE_CODE (from) == SSA_NAME
231 && ! SSA_NAME_IS_DEFAULT_DEF (from)
232 && ! virtual_operand_p (op: from))
233 {
234 ai.from = from;
235 ai.to = to;
236 ai.bb = bb;
237
238 if (adjust_vec.exists ())
239 adjust_vec.safe_push (obj: ai);
240 else
241 adjust_debug_stmts_now (ai: &ai);
242 }
243}
244
245/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
246 to adjust any debug stmts that referenced the old phi arg,
247 presumably non-loop-closed references left over from other
248 transformations. */
249
250static void
251adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
252{
253 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
254
255 gcc_assert (TREE_CODE (orig_def) != SSA_NAME
256 || orig_def != new_def);
257
258 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
259
260 if (MAY_HAVE_DEBUG_BIND_STMTS)
261 adjust_debug_stmts (from: orig_def, PHI_RESULT (update_phi),
262 bb: gimple_bb (g: update_phi));
263}
264
265/* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
266 that the control should have during the first iteration and NEXT_CTRL is the
267 value that it should have on subsequent iterations. */
268
269static void
270vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
271 tree next_ctrl)
272{
273 gphi *phi = create_phi_node (ctrl, loop->header);
274 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
275 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
276}
277
278/* Add SEQ to the end of LOOP's preheader block. */
279
280static void
281add_preheader_seq (class loop *loop, gimple_seq seq)
282{
283 if (seq)
284 {
285 edge pe = loop_preheader_edge (loop);
286 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
287 gcc_assert (!new_bb);
288 }
289}
290
291/* Add SEQ to the beginning of LOOP's header block. */
292
293static void
294add_header_seq (class loop *loop, gimple_seq seq)
295{
296 if (seq)
297 {
298 gimple_stmt_iterator gsi = gsi_after_labels (bb: loop->header);
299 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
300 }
301}
302
303/* Return true if the target can interleave elements of two vectors.
304 OFFSET is 0 if the first half of the vectors should be interleaved
305 or 1 if the second half should. When returning true, store the
306 associated permutation in INDICES. */
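
/* For example (illustrative only): with 4-element vectors, OFFSET 0
   selects the interleaved low halves { 0, 4, 1, 5 } of the two inputs
   and OFFSET 1 the interleaved high halves { 2, 6, 3, 7 }.  */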
307
308static bool
309interleave_supported_p (vec_perm_indices *indices, tree vectype,
310 unsigned int offset)
311{
312 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (node: vectype);
313 poly_uint64 base = exact_div (a: nelts, b: 2) * offset;
314 vec_perm_builder sel (nelts, 2, 3);
315 for (unsigned int i = 0; i < 3; ++i)
316 {
317 sel.quick_push (obj: base + i);
318 sel.quick_push (obj: base + i + nelts);
319 }
320 indices->new_vector (sel, 2, nelts);
321 return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
322 *indices);
323}
324
325/* Try to use permutes to define the masks in DEST_RGM using the masks
326 in SRC_RGM, given that the former has twice as many masks as the
327 latter. Return true on success, adding any new statements to SEQ. */
328
329static bool
330vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
331 rgroup_controls *src_rgm)
332{
333 tree src_masktype = src_rgm->type;
334 tree dest_masktype = dest_rgm->type;
335 machine_mode src_mode = TYPE_MODE (src_masktype);
336 insn_code icode1, icode2;
337 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
338 && (icode1 = optab_handler (op: vec_unpacku_hi_optab,
339 mode: src_mode)) != CODE_FOR_nothing
340 && (icode2 = optab_handler (op: vec_unpacku_lo_optab,
341 mode: src_mode)) != CODE_FOR_nothing)
342 {
343 /* Unpacking the source masks gives at least as many mask bits as
344 we need. We can then VIEW_CONVERT any excess bits away. */
345 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
346 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
347 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
348 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
349 {
350 tree src = src_rgm->controls[i / 2];
351 tree dest = dest_rgm->controls[i];
352 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
353 ? VEC_UNPACK_HI_EXPR
354 : VEC_UNPACK_LO_EXPR);
355 gassign *stmt;
356 if (dest_masktype == unpack_masktype)
357 stmt = gimple_build_assign (dest, code, src);
358 else
359 {
360 tree temp = make_ssa_name (var: unpack_masktype);
361 stmt = gimple_build_assign (temp, code, src);
362 gimple_seq_add_stmt (seq, stmt);
363 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
364 build1 (VIEW_CONVERT_EXPR,
365 dest_masktype, temp));
366 }
367 gimple_seq_add_stmt (seq, stmt);
368 }
369 return true;
370 }
371 vec_perm_indices indices[2];
372 if (dest_masktype == src_masktype
373 && interleave_supported_p (indices: &indices[0], vectype: src_masktype, offset: 0)
374 && interleave_supported_p (indices: &indices[1], vectype: src_masktype, offset: 1))
375 {
376 /* The destination requires twice as many mask bits as the source, so
377 we can use interleaving permutes to double up the number of bits. */
378 tree masks[2];
379 for (unsigned int i = 0; i < 2; ++i)
380 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
381 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
382 {
383 tree src = src_rgm->controls[i / 2];
384 tree dest = dest_rgm->controls[i];
385 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
386 src, src, masks[i & 1]);
387 gimple_seq_add_stmt (seq, stmt);
388 }
389 return true;
390 }
391 return false;
392}
393
394/* Populate DEST_RGM->controls, given that they should add up to STEP.
395
396 STEP = MIN_EXPR <ivtmp_34, VF>;
397
398 First length (MIN (X, VF/N)):
399 loop_len_15 = MIN_EXPR <STEP, VF/N>;
400
401 Second length:
402 tmp = STEP - loop_len_15;
403 loop_len_16 = MIN (tmp, VF/N);
404
405 Third length:
406 tmp2 = tmp - loop_len_16;
407 loop_len_17 = MIN (tmp2, VF/N);
408
409 Last length:
410 loop_len_18 = tmp2 - loop_len_17;
411*/
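
/* As a concrete, illustrative example: with four controls and VF/N == 4,
   STEP == 10 would give

     loop_len_15 = MIN (10, 4) = 4
     loop_len_16 = MIN (10 - 4, 4) = 4
     loop_len_17 = MIN (6 - 4, 4) = 2
     loop_len_18 = 2 - 2 = 0

   so the lengths add up to STEP as required.  */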
412
413static void
414vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
415 rgroup_controls *dest_rgm, tree step)
416{
417 tree ctrl_type = dest_rgm->type;
418 poly_uint64 nitems_per_ctrl
419 = TYPE_VECTOR_SUBPARTS (node: ctrl_type) * dest_rgm->factor;
420 tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
421
422 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
423 {
424 tree ctrl = dest_rgm->controls[i];
425 if (i == 0)
426 {
427 /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */
428 gassign *assign
429 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
430 gimple_seq_add_stmt (seq, assign);
431 }
432 else if (i == dest_rgm->controls.length () - 1)
433 {
434 /* Last iteration: Remain capped to the range [0, VF/N]. */
435 gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
436 dest_rgm->controls[i - 1]);
437 gimple_seq_add_stmt (seq, assign);
438 }
439 else
440 {
441 /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
442 step = gimple_build (seq, code: MINUS_EXPR, type: iv_type, ops: step,
443 ops: dest_rgm->controls[i - 1]);
444 gassign *assign
445 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
446 gimple_seq_add_stmt (seq, assign);
447 }
448 }
449}
450
451/* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
452 for all the rgroup controls in RGC and return a control that is nonzero
453 when the loop needs to iterate. Add any new preheader statements to
454 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
455
456 RGC belongs to loop LOOP. The loop originally iterated NITERS
457 times and has been vectorized according to LOOP_VINFO.
458
459 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
460 starts with NITERS_SKIP dummy iterations of the scalar loop before
461 the real work starts. The mask elements for these dummy iterations
462 must be 0, to ensure that the extra iterations do not have an effect.
463
464 It is known that:
465
466 NITERS * RGC->max_nscalars_per_iter * RGC->factor
467
468 does not overflow. However, MIGHT_WRAP_P says whether an induction
469 variable that starts at 0 and has step:
470
471 VF * RGC->max_nscalars_per_iter * RGC->factor
472
473 might overflow before hitting a value above:
474
475 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
476
477 This means that we cannot guarantee that such an induction variable
478 would ever hit a value that produces a set of all-false masks or zero
479 lengths for RGC.
480
481 Note: the cost of the code generated by this function is modeled
482 by vect_estimate_min_profitable_iters, so changes here may need
483 corresponding changes there. */
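
/* As an illustrative example of MIGHT_WRAP_P: with a 16-bit IV whose step
   VF * RGC->max_nscalars_per_iter * RGC->factor is 16, and with

     (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor

   equal to 65530, the IV takes the values 0, 16, ..., 65520 and then wraps
   back to 0, so it never reaches a value above 65530 and never produces an
   all-false mask or zero length on its own.  */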
484
485static tree
486vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
487 gimple_seq *preheader_seq,
488 gimple_seq *header_seq,
489 gimple_stmt_iterator loop_cond_gsi,
490 rgroup_controls *rgc, tree niters,
491 tree niters_skip, bool might_wrap_p,
492 tree *iv_step, tree *compare_step)
493{
494 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
495 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
496 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
497
498 tree ctrl_type = rgc->type;
499 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
500 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (node: ctrl_type) * rgc->factor;
501 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
502 tree length_limit = NULL_TREE;
  /* For lengths, we need length_limit to ensure that each length stays
     in range.  */
504 if (!use_masks_p)
505 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
506
507 /* Calculate the maximum number of item values that the rgroup
508 handles in total, the number that it handles for each iteration
509 of the vector loop, and the number that it should skip during the
510 first iteration of the vector loop. */
511 tree nitems_total = niters;
512 tree nitems_step = build_int_cst (iv_type, vf);
513 tree nitems_skip = niters_skip;
514 if (nitems_per_iter != 1)
515 {
516 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
517 these multiplications don't overflow. */
518 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
519 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
520 nitems_total = gimple_build (seq: preheader_seq, code: MULT_EXPR, type: compare_type,
521 ops: nitems_total, ops: compare_factor);
522 nitems_step = gimple_build (seq: preheader_seq, code: MULT_EXPR, type: iv_type,
523 ops: nitems_step, ops: iv_factor);
524 if (nitems_skip)
525 nitems_skip = gimple_build (seq: preheader_seq, code: MULT_EXPR, type: compare_type,
526 ops: nitems_skip, ops: compare_factor);
527 }
528
529 /* Create an induction variable that counts the number of items
530 processed. */
531 tree index_before_incr, index_after_incr;
532 gimple_stmt_iterator incr_gsi;
533 bool insert_after;
534 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
535 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
536 {
537 /* Create an IV that counts down from niters_total and whose step
538 is the (variable) amount processed in the current iteration:
539 ...
540 _10 = (unsigned long) count_12(D);
541 ...
542 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
543 _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
544 ...
545 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
546 ...
547 ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
548 ...
549 if (ivtmp_9 > POLY_INT_CST [4, 4])
550 goto <bb 4>; [83.33%]
551 else
552 goto <bb 5>; [16.67%]
553 */
554 nitems_total = gimple_convert (seq: preheader_seq, type: iv_type, op: nitems_total);
555 tree step = rgc->controls.length () == 1 ? rgc->controls[0]
556 : make_ssa_name (var: iv_type);
557 /* Create decrement IV. */
558 if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
559 {
560 create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
561 insert_after, &index_before_incr, &index_after_incr);
562 tree len = gimple_build (seq: header_seq, code: IFN_SELECT_VL, type: iv_type,
563 ops: index_before_incr, ops: nitems_step);
564 gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
565 }
566 else
567 {
568 create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
569 &incr_gsi, insert_after, &index_before_incr,
570 &index_after_incr);
571 gimple_seq_add_stmt (header_seq,
572 gimple_build_assign (step, MIN_EXPR,
573 index_before_incr,
574 nitems_step));
575 }
576 *iv_step = step;
577 *compare_step = nitems_step;
578 return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
579 : index_before_incr;
580 }
581
582 /* Create increment IV. */
583 create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
584 loop, &incr_gsi, insert_after, &index_before_incr,
585 &index_after_incr);
586
587 tree zero_index = build_int_cst (compare_type, 0);
588 tree test_index, test_limit, first_limit;
589 gimple_stmt_iterator *test_gsi;
590 if (might_wrap_p)
591 {
592 /* In principle the loop should stop iterating once the incremented
593 IV reaches a value greater than or equal to:
594
595 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
596
597 However, there's no guarantee that this addition doesn't overflow
598 the comparison type, or that the IV hits a value above it before
599 wrapping around. We therefore adjust the limit down by one
600 IV step:
601
602 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
603 -[infinite-prec] NITEMS_STEP
604
605 and compare the IV against this limit _before_ incrementing it.
606 Since the comparison type is unsigned, we actually want the
607 subtraction to saturate at zero:
608
609 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
610 -[sat] NITEMS_STEP
611
612 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
613
614 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
615
616 where the rightmost subtraction can be done directly in
617 COMPARE_TYPE. */
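      /* For instance (purely illustrative): with NITEMS_TOTAL == 100,
         NITEMS_STEP == 16 and NITEMS_SKIP == 3, the limit computed below is
         MAX (100, 13) - 13 == 87, the saturating value of (100 + 3) - 16.  */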
618 test_index = index_before_incr;
619 tree adjust = gimple_convert (seq: preheader_seq, type: compare_type,
620 op: nitems_step);
621 if (nitems_skip)
622 adjust = gimple_build (seq: preheader_seq, code: MINUS_EXPR, type: compare_type,
623 ops: adjust, ops: nitems_skip);
624 test_limit = gimple_build (seq: preheader_seq, code: MAX_EXPR, type: compare_type,
625 ops: nitems_total, ops: adjust);
626 test_limit = gimple_build (seq: preheader_seq, code: MINUS_EXPR, type: compare_type,
627 ops: test_limit, ops: adjust);
628 test_gsi = &incr_gsi;
629
630 /* Get a safe limit for the first iteration. */
631 if (nitems_skip)
632 {
633 /* The first vector iteration can handle at most NITEMS_STEP
634 items. NITEMS_STEP <= CONST_LIMIT, and adding
635 NITEMS_SKIP to that cannot overflow. */
636 tree const_limit = build_int_cst (compare_type,
637 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
638 * nitems_per_iter);
639 first_limit = gimple_build (seq: preheader_seq, code: MIN_EXPR, type: compare_type,
640 ops: nitems_total, ops: const_limit);
641 first_limit = gimple_build (seq: preheader_seq, code: PLUS_EXPR, type: compare_type,
642 ops: first_limit, ops: nitems_skip);
643 }
644 else
645 /* For the first iteration it doesn't matter whether the IV hits
646 a value above NITEMS_TOTAL. That only matters for the latch
647 condition. */
648 first_limit = nitems_total;
649 }
650 else
651 {
652 /* Test the incremented IV, which will always hit a value above
653 the bound before wrapping. */
654 test_index = index_after_incr;
655 test_limit = nitems_total;
656 if (nitems_skip)
657 test_limit = gimple_build (seq: preheader_seq, code: PLUS_EXPR, type: compare_type,
658 ops: test_limit, ops: nitems_skip);
659 test_gsi = &loop_cond_gsi;
660
661 first_limit = test_limit;
662 }
663
664 /* Convert the IV value to the comparison type (either a no-op or
665 a demotion). */
666 gimple_seq test_seq = NULL;
667 test_index = gimple_convert (seq: &test_seq, type: compare_type, op: test_index);
668 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
669
670 /* Provide a definition of each control in the group. */
671 tree next_ctrl = NULL_TREE;
672 tree ctrl;
673 unsigned int i;
674 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
675 {
676 /* Previous controls will cover BIAS items. This control covers the
677 next batch. */
678 poly_uint64 bias = nitems_per_ctrl * i;
679 tree bias_tree = build_int_cst (compare_type, bias);
680
681 /* See whether the first iteration of the vector loop is known
682 to have a full control. */
683 poly_uint64 const_limit;
684 bool first_iteration_full
685 = (poly_int_tree_p (t: first_limit, value: &const_limit)
686 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
687
688 /* Rather than have a new IV that starts at BIAS and goes up to
689 TEST_LIMIT, prefer to use the same 0-based IV for each control
690 and adjust the bound down by BIAS. */
691 tree this_test_limit = test_limit;
692 if (i != 0)
693 {
694 this_test_limit = gimple_build (seq: preheader_seq, code: MAX_EXPR,
695 type: compare_type, ops: this_test_limit,
696 ops: bias_tree);
697 this_test_limit = gimple_build (seq: preheader_seq, code: MINUS_EXPR,
698 type: compare_type, ops: this_test_limit,
699 ops: bias_tree);
700 }
701
702 /* Create the initial control. First include all items that
703 are within the loop limit. */
704 tree init_ctrl = NULL_TREE;
705 if (!first_iteration_full)
706 {
707 tree start, end;
708 if (first_limit == test_limit)
709 {
710 /* Use a natural test between zero (the initial IV value)
711 and the loop limit. The "else" block would be valid too,
712 but this choice can avoid the need to load BIAS_TREE into
713 a register. */
714 start = zero_index;
715 end = this_test_limit;
716 }
717 else
718 {
719 /* FIRST_LIMIT is the maximum number of items handled by the
720 first iteration of the vector loop. Test the portion
721 associated with this control. */
722 start = bias_tree;
723 end = first_limit;
724 }
725
726 if (use_masks_p)
727 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
728 start, end, "max_mask");
729 else
730 {
731 init_ctrl = make_temp_ssa_name (type: compare_type, NULL, name: "max_len");
732 gimple_seq seq = vect_gen_len (init_ctrl, start,
733 end, length_limit);
734 gimple_seq_add_seq (preheader_seq, seq);
735 }
736 }
737
738 /* Now AND out the bits that are within the number of skipped
739 items. */
740 poly_uint64 const_skip;
741 if (nitems_skip
742 && !(poly_int_tree_p (t: nitems_skip, value: &const_skip)
743 && known_le (const_skip, bias)))
744 {
745 gcc_assert (use_masks_p);
746 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
747 bias_tree, nitems_skip);
748 if (init_ctrl)
749 init_ctrl = gimple_build (seq: preheader_seq, code: BIT_AND_EXPR, type: ctrl_type,
750 ops: init_ctrl, ops: unskipped_mask);
751 else
752 init_ctrl = unskipped_mask;
753 }
754
755 if (!init_ctrl)
756 {
757 /* First iteration is full. */
758 if (use_masks_p)
759 init_ctrl = build_minus_one_cst (ctrl_type);
760 else
761 init_ctrl = length_limit;
762 }
763
764 /* Get the control value for the next iteration of the loop. */
765 if (use_masks_p)
766 {
767 gimple_seq stmts = NULL;
768 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
769 this_test_limit, "next_mask");
770 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
771 }
772 else
773 {
774 next_ctrl = make_temp_ssa_name (type: compare_type, NULL, name: "next_len");
775 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
776 length_limit);
777 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
778 }
779
780 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
781 }
782
783 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
784 if (partial_load_bias != 0)
785 {
786 tree adjusted_len = rgc->bias_adjusted_ctrl;
787 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
788 rgc->controls[0],
789 build_int_cst
790 (TREE_TYPE (rgc->controls[0]),
791 partial_load_bias));
792 gimple_seq_add_stmt (header_seq, minus);
793 }
794
795 return next_ctrl;
796}
797
798/* Set up the iteration condition and rgroup controls for LOOP, given
799 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
800 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
801 the number of iterations of the original scalar loop that should be
802 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
803 for vect_set_loop_condition.
804
805 Insert the branch-back condition before LOOP_COND_GSI and return the
806 final gcond. */
807
808static gcond *
809vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
810 loop_vec_info loop_vinfo, tree niters,
811 tree final_iv, bool niters_maybe_zero,
812 gimple_stmt_iterator loop_cond_gsi)
813{
814 gimple_seq preheader_seq = NULL;
815 gimple_seq header_seq = NULL;
816
817 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
818 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
819 unsigned int compare_precision = TYPE_PRECISION (compare_type);
820 tree orig_niters = niters;
821
822 /* Type of the initial value of NITERS. */
823 tree ni_actual_type = TREE_TYPE (niters);
824 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
825 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
826 if (niters_skip)
827 niters_skip = gimple_convert (seq: &preheader_seq, type: compare_type, op: niters_skip);
828
829 /* Convert NITERS to the same size as the compare. */
830 if (compare_precision > ni_actual_precision
831 && niters_maybe_zero)
832 {
833 /* We know that there is always at least one iteration, so if the
834 count is zero then it must have wrapped. Cope with this by
835 subtracting 1 before the conversion and adding 1 to the result. */
836 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
837 niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, type: ni_actual_type,
838 ops: niters, ops: build_minus_one_cst (ni_actual_type));
839 niters = gimple_convert (seq: &preheader_seq, type: compare_type, op: niters);
840 niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, type: compare_type,
841 ops: niters, ops: build_one_cst (compare_type));
842 }
843 else
844 niters = gimple_convert (seq: &preheader_seq, type: compare_type, op: niters);
845
846 /* Iterate over all the rgroups and fill in their controls. We could use
847 the first control from any rgroup for the loop condition; here we
848 arbitrarily pick the last. */
849 tree test_ctrl = NULL_TREE;
850 tree iv_step = NULL_TREE;
851 tree compare_step = NULL_TREE;
852 rgroup_controls *rgc;
853 rgroup_controls *iv_rgc = nullptr;
854 unsigned int i;
855 auto_vec<rgroup_controls> *controls = use_masks_p
856 ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
857 : &LOOP_VINFO_LENS (loop_vinfo);
858 FOR_EACH_VEC_ELT (*controls, i, rgc)
859 if (!rgc->controls.is_empty ())
860 {
861 /* First try using permutes. This adds a single vector
862 instruction to the loop for each mask, but needs no extra
863 loop invariants or IVs. */
864 unsigned int nmasks = i + 1;
865 if (use_masks_p && (nmasks & 1) == 0)
866 {
867 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
868 if (!half_rgc->controls.is_empty ()
869 && vect_maybe_permute_loop_masks (seq: &header_seq, dest_rgm: rgc, src_rgm: half_rgc))
870 continue;
871 }
872
873 if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
874 || !iv_rgc
875 || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
876 != rgc->max_nscalars_per_iter * rgc->factor))
877 {
878 /* See whether zero-based IV would ever generate all-false masks
879 or zero length before wrapping around. */
880 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
881
882 /* Set up all controls for this group. */
883 test_ctrl
884 = vect_set_loop_controls_directly (loop, loop_vinfo,
885 preheader_seq: &preheader_seq, header_seq: &header_seq,
886 loop_cond_gsi, rgc, niters,
887 niters_skip, might_wrap_p,
888 iv_step: &iv_step, compare_step: &compare_step);
889
890 iv_rgc = rgc;
891 }
892
893 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
894 && rgc->controls.length () > 1)
895 {
896 /* vect_set_loop_controls_directly creates an IV whose step
897 is equal to the expected sum of RGC->controls. Use that
898 information to populate RGC->controls. */
899 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
900 gcc_assert (iv_step);
901 vect_adjust_loop_lens_control (iv_type, seq: &header_seq, dest_rgm: rgc, step: iv_step);
902 }
903 }
904
905 /* Emit all accumulated statements. */
906 add_preheader_seq (loop, seq: preheader_seq);
907 add_header_seq (loop, seq: header_seq);
908
909 /* Get a boolean result that tells us whether to iterate. */
910 gcond *cond_stmt;
911 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
912 && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
913 {
914 gcc_assert (compare_step);
915 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
916 cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
917 NULL_TREE);
918 }
919 else
920 {
921 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
922 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
923 cond_stmt
924 = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
925 }
926 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
927
928 /* The loop iterates (NITERS - 1) / VF + 1 times.
929 Subtract one from this to get the latch count. */
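  /* For example (illustrative): NITERS == 10 and VF == 4 gives
     (10 - 1) / 4 + 1 == 3 vector iterations and a latch count of 2.  */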
930 tree step = build_int_cst (compare_type,
931 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
932 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
933 build_minus_one_cst (compare_type));
934 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
935 niters_minus_one, step);
936
937 if (final_iv)
938 {
939 gassign *assign = gimple_build_assign (final_iv, orig_niters);
940 gsi_insert_on_edge_immediate (exit_edge, assign);
941 }
942
943 return cond_stmt;
944}
945
946/* Set up the iteration condition and rgroup controls for LOOP in AVX512
947 style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
948 vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
949 the number of iterations of the original scalar loop that should be
950 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
951 for vect_set_loop_condition.
952
953 Insert the branch-back condition before LOOP_COND_GSI and return the
954 final gcond. */
955
956static gcond *
957vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
958 edge exit_edge,
959 loop_vec_info loop_vinfo, tree niters,
960 tree final_iv,
961 bool niters_maybe_zero,
962 gimple_stmt_iterator loop_cond_gsi)
963{
964 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
965 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
966 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
967 tree orig_niters = niters;
968 gimple_seq preheader_seq = NULL;
969
970 /* Create an IV that counts down from niters and whose step
971 is the number of iterations processed in the current iteration.
972 Produce the controls with compares like the following.
973
       # iv_2 = PHI <niters, iv_3>
       rem_4 = MIN <iv_2, VF>;
       remv_6 = { rem_4, rem_4, rem_4, ... }
       mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv_6;
       iv_3 = iv_2 - VF;
       if (iv_2 > VF)
         continue;

     Where the constant is built with elements at most VF - 1 and
     repetitions according to max_nscalars_per_iter, which is guaranteed
     to be the same within a group.  */
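
  /* As a purely illustrative example: with VF == 4, a mask type with
     8 elements and max_nscalars_per_iter == 2, the constant becomes
     { 0, 0, 1, 1, 2, 2, 3, 3 }; if 3 scalar iterations remain, the
     compare yields { 1, 1, 1, 1, 1, 1, 0, 0 }, enabling the six lanes
     that belong to those 3 iterations.  */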
985
986 /* Convert NITERS to the determined IV type. */
987 if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
988 && niters_maybe_zero)
989 {
990 /* We know that there is always at least one iteration, so if the
991 count is zero then it must have wrapped. Cope with this by
992 subtracting 1 before the conversion and adding 1 to the result. */
993 gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
994 niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, TREE_TYPE (niters),
995 ops: niters, ops: build_minus_one_cst (TREE_TYPE (niters)));
996 niters = gimple_convert (seq: &preheader_seq, type: iv_type, op: niters);
997 niters = gimple_build (seq: &preheader_seq, code: PLUS_EXPR, type: iv_type,
998 ops: niters, ops: build_one_cst (iv_type));
999 }
1000 else
1001 niters = gimple_convert (seq: &preheader_seq, type: iv_type, op: niters);
1002
1003 /* Bias the initial value of the IV in case we need to skip iterations
1004 at the beginning. */
1005 tree niters_adj = niters;
1006 if (niters_skip)
1007 {
1008 tree skip = gimple_convert (seq: &preheader_seq, type: iv_type, op: niters_skip);
1009 niters_adj = gimple_build (seq: &preheader_seq, code: PLUS_EXPR,
1010 type: iv_type, ops: niters, ops: skip);
1011 }
1012
1013 /* The iteration step is the vectorization factor. */
1014 tree iv_step = build_int_cst (iv_type, vf);
1015
1016 /* Create the decrement IV. */
1017 tree index_before_incr, index_after_incr;
1018 gimple_stmt_iterator incr_gsi;
1019 bool insert_after;
1020 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1021 create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1022 &incr_gsi, insert_after, &index_before_incr,
1023 &index_after_incr);
1024
1025 /* Iterate over all the rgroups and fill in their controls. */
1026 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1027 {
1028 if (rgc.controls.is_empty ())
1029 continue;
1030
1031 tree ctrl_type = rgc.type;
1032 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (node: ctrl_type);
1033
1034 tree vectype = rgc.compare_type;
1035
1036 /* index_after_incr is the IV specifying the remaining iterations in
1037 the next iteration. */
1038 tree rem = index_after_incr;
1039 /* When the data type for the compare to produce the mask is
1040 smaller than the IV type we need to saturate. Saturate to
1041 the smallest possible value (IV_TYPE) so we only have to
1042 saturate once (CSE will catch redundant ones we add). */
1043 if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1044 rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1045 UNKNOWN_LOCATION,
1046 MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1047 rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1048 UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1049
1050 /* Build a data vector composed of the remaining iterations. */
1051 rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1052 UNKNOWN_LOCATION, vectype, rem);
1053
1054 /* Provide a definition of each vector in the control group. */
1055 tree next_ctrl = NULL_TREE;
1056 tree first_rem = NULL_TREE;
1057 tree ctrl;
1058 unsigned int i;
1059 FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1060 {
1061 /* Previous controls will cover BIAS items. This control covers the
1062 next batch. */
1063 poly_uint64 bias = nitems_per_ctrl * i;
1064
          /* Build the constant to compare the remaining iters against,
             this is something like { 0, 0, 1, 1, 2, 2, 3, 3, ... }
             appropriately split into pieces.  */
1068 unsigned n = TYPE_VECTOR_SUBPARTS (node: ctrl_type).to_constant ();
1069 tree_vector_builder builder (vectype, n, 1);
1070 for (unsigned i = 0; i < n; ++i)
1071 {
1072 unsigned HOST_WIDE_INT val
1073 = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1074 gcc_assert (val < vf.to_constant ());
1075 builder.quick_push (obj: build_int_cst (TREE_TYPE (vectype), val));
1076 }
1077 tree cmp_series = builder.build ();
1078
1079 /* Create the initial control. First include all items that
1080 are within the loop limit. */
1081 tree init_ctrl = NULL_TREE;
1082 poly_uint64 const_limit;
1083 /* See whether the first iteration of the vector loop is known
1084 to have a full control. */
1085 if (poly_int_tree_p (t: niters, value: &const_limit)
1086 && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1087 init_ctrl = build_minus_one_cst (ctrl_type);
1088 else
1089 {
1090 /* The remaining work items initially are niters. Saturate,
1091 splat and compare. */
1092 if (!first_rem)
1093 {
1094 first_rem = niters;
1095 if (TYPE_PRECISION (TREE_TYPE (vectype))
1096 < TYPE_PRECISION (iv_type))
1097 first_rem = gimple_build (seq: &preheader_seq,
1098 code: MIN_EXPR, TREE_TYPE (first_rem),
1099 ops: first_rem, ops: iv_step);
1100 first_rem = gimple_convert (seq: &preheader_seq, TREE_TYPE (vectype),
1101 op: first_rem);
1102 first_rem = gimple_build_vector_from_val (seq: &preheader_seq,
1103 type: vectype, op: first_rem);
1104 }
1105 init_ctrl = gimple_build (seq: &preheader_seq, code: LT_EXPR, type: ctrl_type,
1106 ops: cmp_series, ops: first_rem);
1107 }
1108
1109 /* Now AND out the bits that are within the number of skipped
1110 items. */
1111 poly_uint64 const_skip;
1112 if (niters_skip
1113 && !(poly_int_tree_p (t: niters_skip, value: &const_skip)
1114 && known_le (const_skip, bias)))
1115 {
1116 /* For integer mode masks it's cheaper to shift out the bits
1117 since that avoids loading a constant. */
1118 gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1119 init_ctrl = gimple_build (seq: &preheader_seq, code: VIEW_CONVERT_EXPR,
1120 type: lang_hooks.types.type_for_mode
1121 (TYPE_MODE (ctrl_type), 1),
1122 ops: init_ctrl);
              /* ??? But when the shift amount isn't constant this requires
                 a round-trip to GPRs.  We could apply the bias to either
                 side of the compare instead.  */
1126 tree shift = gimple_build (seq: &preheader_seq, code: MULT_EXPR,
1127 TREE_TYPE (niters_skip), ops: niters_skip,
1128 ops: build_int_cst (TREE_TYPE (niters_skip),
1129 rgc.max_nscalars_per_iter));
1130 init_ctrl = gimple_build (seq: &preheader_seq, code: LSHIFT_EXPR,
1131 TREE_TYPE (init_ctrl),
1132 ops: init_ctrl, ops: shift);
1133 init_ctrl = gimple_build (seq: &preheader_seq, code: VIEW_CONVERT_EXPR,
1134 type: ctrl_type, ops: init_ctrl);
1135 }
1136
1137 /* Get the control value for the next iteration of the loop. */
1138 next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1139 UNKNOWN_LOCATION,
1140 LT_EXPR, ctrl_type, cmp_series, rem);
1141
1142 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1143 }
1144 }
1145
1146 /* Emit all accumulated statements. */
1147 add_preheader_seq (loop, seq: preheader_seq);
1148
1149 /* Adjust the exit test using the decrementing IV. */
1150 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1151 /* When we peel for alignment with niter_skip != 0 this can
1152 cause niter + niter_skip to wrap and since we are comparing the
1153 value before the decrement here we get a false early exit.
1154 We can't compare the value after decrement either because that
1155 decrement could wrap as well as we're not doing a saturating
1156 decrement. To avoid this situation we force a larger
1157 iv_type. */
1158 gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1159 NULL_TREE, NULL_TREE);
1160 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1161
1162 /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1163 Subtract one from this to get the latch count. */
1164 tree niters_minus_one
1165 = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1166 build_minus_one_cst (TREE_TYPE (orig_niters)));
1167 tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1168 if (niters_skip)
1169 niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
1170 fold_convert (iv_type, niters_skip));
1171 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1172 niters_adj2, iv_step);
1173
1174 if (final_iv)
1175 {
1176 gassign *assign = gimple_build_assign (final_iv, orig_niters);
1177 gsi_insert_on_edge_immediate (single_exit (loop), assign);
1178 }
1179
1180 return cond_stmt;
1181}
1182
1183
1184/* Like vect_set_loop_condition, but handle the case in which the vector
1185 loop handles exactly VF scalars per iteration. */
1186
1187static gcond *
1188vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1189 class loop *loop, tree niters, tree step,
1190 tree final_iv, bool niters_maybe_zero,
1191 gimple_stmt_iterator loop_cond_gsi)
1192{
1193 tree indx_before_incr, indx_after_incr;
1194 gcond *cond_stmt;
1195 gcond *orig_cond;
1196 edge pe = loop_preheader_edge (loop);
1197 gimple_stmt_iterator incr_gsi;
1198 bool insert_after;
1199 enum tree_code code;
1200 tree niters_type = TREE_TYPE (niters);
1201
1202 orig_cond = get_loop_exit_condition (exit_edge);
1203 gcc_assert (orig_cond);
1204 loop_cond_gsi = gsi_for_stmt (orig_cond);
1205
1206 tree init, limit;
1207 if (!niters_maybe_zero && integer_onep (step))
1208 {
1209 /* In this case we can use a simple 0-based IV:
1210
1211 A:
1212 x = 0;
1213 do
1214 {
1215 ...
1216 x += 1;
1217 }
1218 while (x < NITERS); */
1219 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1220 init = build_zero_cst (niters_type);
1221 limit = niters;
1222 }
1223 else
1224 {
1225 /* The following works for all values of NITERS except 0:
1226
1227 B:
1228 x = 0;
1229 do
1230 {
1231 ...
1232 x += STEP;
1233 }
1234 while (x <= NITERS - STEP);
1235
1236 so that the loop continues to iterate if x + STEP - 1 < NITERS
1237 but stops if x + STEP - 1 >= NITERS.
1238
1239 However, if NITERS is zero, x never hits a value above NITERS - STEP
1240 before wrapping around. There are two obvious ways of dealing with
1241 this:
1242
1243 - start at STEP - 1 and compare x before incrementing it
1244 - start at -1 and compare x after incrementing it
1245
1246 The latter is simpler and is what we use. The loop in this case
1247 looks like:
1248
1249 C:
1250 x = -1;
1251 do
1252 {
1253 ...
1254 x += STEP;
1255 }
1256 while (x < NITERS - STEP);
1257
1258 In both cases the loop limit is NITERS - STEP. */
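      /* For example (illustrative): in case B with NITERS == 10 and
         STEP == 4 the limit is 6; x reaches 4 and then 8 after the
         increments, so the loop executes NITERS / STEP == 2 times.  */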
1259 gimple_seq seq = NULL;
1260 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1261 limit = gimple_build (seq: &seq, code: MINUS_EXPR, TREE_TYPE (limit), ops: limit, ops: step);
1262 if (seq)
1263 {
1264 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1265 gcc_assert (!new_bb);
1266 }
1267 if (niters_maybe_zero)
1268 {
1269 /* Case C. */
1270 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1271 init = build_all_ones_cst (niters_type);
1272 }
1273 else
1274 {
1275 /* Case B. */
1276 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1277 init = build_zero_cst (niters_type);
1278 }
1279 }
1280
1281 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1282 create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1283 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1284 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1285 true, NULL_TREE, true,
1286 GSI_SAME_STMT);
1287 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1288 true, GSI_SAME_STMT);
1289
1290 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1291 NULL_TREE);
1292
1293 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1294
1295 /* Record the number of latch iterations. */
1296 if (limit == niters)
1297 /* Case A: the loop iterates NITERS times. Subtract one to get the
1298 latch count. */
1299 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1300 build_int_cst (niters_type, 1));
1301 else
1302 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1303 Subtract one from this to get the latch count. */
1304 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1305 limit, step);
1306
1307 if (final_iv)
1308 {
1309 gassign *assign;
1310 gcc_assert (single_pred_p (exit_edge->dest));
1311 tree phi_dest
1312 = integer_zerop (init) ? final_iv : copy_ssa_name (var: indx_after_incr);
1313 /* Make sure to maintain LC SSA form here and elide the subtraction
1314 if the value is zero. */
1315 gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1316 add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1317 if (!integer_zerop (init))
1318 {
1319 assign = gimple_build_assign (final_iv, MINUS_EXPR,
1320 phi_dest, init);
1321 gimple_stmt_iterator gsi = gsi_after_labels (bb: exit_edge->dest);
1322 gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1323 }
1324 }
1325
1326 return cond_stmt;
1327}
1328
1329/* If we're using fully-masked loops, make LOOP iterate:
1330
1331 N == (NITERS - 1) / STEP + 1
1332
1333 times. When NITERS is zero, this is equivalent to making the loop
1334 execute (1 << M) / STEP times, where M is the precision of NITERS.
1335 NITERS_MAYBE_ZERO is true if this last case might occur.
1336
1337 If we're not using fully-masked loops, make LOOP iterate:
1338
1339 N == (NITERS - STEP) / STEP + 1
1340
1341 times, where NITERS is known to be outside the range [1, STEP - 1].
1342 This is equivalent to making the loop execute NITERS / STEP times
1343 when NITERS is nonzero and (1 << M) / STEP times otherwise.
1344 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1345
1346 If FINAL_IV is nonnull, it is an SSA name that should be set to
1347 N * STEP on exit from the loop.
1348
1349 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
1350
1351void
1352vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1353 tree niters, tree step, tree final_iv,
1354 bool niters_maybe_zero)
1355{
1356 gcond *cond_stmt;
1357 gcond *orig_cond = get_loop_exit_condition (loop_e);
1358 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1359
1360 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1361 {
1362 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1363 cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, exit_edge: loop_e,
1364 loop_vinfo,
1365 niters, final_iv,
1366 niters_maybe_zero,
1367 loop_cond_gsi);
1368 else
1369 cond_stmt = vect_set_loop_condition_partial_vectors (loop, exit_edge: loop_e,
1370 loop_vinfo,
1371 niters, final_iv,
1372 niters_maybe_zero,
1373 loop_cond_gsi);
1374 }
1375 else
1376 cond_stmt = vect_set_loop_condition_normal (loop_vinfo, exit_edge: loop_e, loop,
1377 niters,
1378 step, final_iv,
1379 niters_maybe_zero,
1380 loop_cond_gsi);
1381
1382 /* Remove old loop exit test. */
1383 stmt_vec_info orig_cond_info;
1384 if (loop_vinfo
1385 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1386 loop_vinfo->remove_stmt (orig_cond_info);
1387 else
1388 gsi_remove (&loop_cond_gsi, true);
1389
1390 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1392 (gimple *) cond_stmt);
1393}
1394
1395/* Given LOOP this function generates a new copy of it and puts it
1396 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1397 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1398 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  If FLOW_LOOPS is true, connect LOOP to SCALAR_LOOP
   as a continuation.  This is correct for cases where one loop continues from
   the other, as in the vectorizer, but not for uses such as loop distribution
   where the contents of the loop body are split but the iteration space of
   both copies remains the same.

   If UPDATED_DOMS is not NULL it is updated with the list of basic blocks
   whose dominators were updated during the peeling.  */
1407
1408class loop *
1409slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1410 class loop *scalar_loop,
1411 edge scalar_exit, edge e, edge *new_e,
1412 bool flow_loops)
1413{
1414 class loop *new_loop;
1415 basic_block *new_bbs, *bbs, *pbbs;
1416 bool at_exit;
1417 bool was_imm_dom;
1418 basic_block exit_dest;
1419 edge exit, new_exit;
1420 bool duplicate_outer_loop = false;
1421
1422 exit = loop_exit;
1423 at_exit = (e == exit);
1424 if (!at_exit && e != loop_preheader_edge (loop))
1425 return NULL;
1426
1427 if (scalar_loop == NULL)
1428 {
1429 scalar_loop = loop;
1430 scalar_exit = loop_exit;
1431 }
1432 else if (scalar_loop == loop)
1433 scalar_exit = loop_exit;
1434 else
1435 {
      /* The loop has been versioned; match exits up using the aux index.  */
1437 for (edge exit : get_loop_exit_edges (scalar_loop))
1438 if (exit->aux == loop_exit->aux)
1439 {
1440 scalar_exit = exit;
1441 break;
1442 }
1443
1444 gcc_assert (scalar_exit);
1445 }
1446
1447 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1448 pbbs = bbs + 1;
1449 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1450 /* Allow duplication of outer loops. */
1451 if (scalar_loop->inner)
1452 duplicate_outer_loop = true;
1453
1454 /* Generate new loop structure. */
1455 new_loop = duplicate_loop (scalar_loop, loop_outer (loop: scalar_loop));
1456 duplicate_subloops (scalar_loop, new_loop);
1457
1458 exit_dest = exit->dest;
1459 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1460 exit_dest) == loop->header ?
1461 true : false);
1462
1463 /* Also copy the pre-header, this avoids jumping through hoops to
1464 duplicate the loop entry PHI arguments. Create an empty
1465 pre-header unconditionally for this. */
1466 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1467 edge entry_e = single_pred_edge (bb: preheader);
1468 bbs[0] = preheader;
1469 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1470
1471 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1472 &scalar_exit, 1, &new_exit, NULL,
1473 at_exit ? loop->latch : e->src, true);
1474 exit = loop_exit;
1475 basic_block new_preheader = new_bbs[0];
1476
1477 gcc_assert (new_exit);
1478
1479 /* Record the new loop exit information. new_loop doesn't have SCEV data and
1480 so we must initialize the exit information. */
1481 if (new_e)
1482 *new_e = new_exit;
1483
1484 /* Before installing PHI arguments make sure that the edges
1485 into them match that of the scalar loop we analyzed. This
1486 makes sure the SLP tree matches up between the main vectorized
1487 loop and the epilogue vectorized copies. */
1488 if (single_succ_edge (bb: preheader)->dest_idx
1489 != single_succ_edge (bb: new_bbs[0])->dest_idx)
1490 {
1491 basic_block swap_bb = new_bbs[1];
1492 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1493 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1494 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1495 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1496 }
1497 if (duplicate_outer_loop)
1498 {
1499 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1500 if (loop_preheader_edge (scalar_loop)->dest_idx
1501 != loop_preheader_edge (new_inner_loop)->dest_idx)
1502 {
1503 basic_block swap_bb = new_inner_loop->header;
1504 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1505 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1506 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1507 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1508 }
1509 }
1510
1511 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1512
1513 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1514 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1515 rename_variables_in_bb (bb: new_bbs[i], rename_from_outer_loop: duplicate_outer_loop);
1516
1517 /* Rename the exit uses. */
1518 for (edge exit : get_loop_exit_edges (new_loop))
1519 for (auto gsi = gsi_start_phis (exit->dest);
1520 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
1521 {
1522 tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1523 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1524 if (MAY_HAVE_DEBUG_BIND_STMTS)
1525 adjust_debug_stmts (from: orig_def, PHI_RESULT (gsi.phi ()), bb: exit->dest);
1526 }
1527
1528 auto loop_exits = get_loop_exit_edges (loop);
1529 auto_vec<basic_block> doms;
1530
1531 if (at_exit) /* Add the loop copy at exit. */
1532 {
1533 if (scalar_loop != loop && new_exit->dest != exit_dest)
1534 {
1535 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1536 flush_pending_stmts (new_exit);
1537 }
1538
1539 auto_vec <gimple *> new_phis;
1540 hash_map <tree, tree> new_phi_args;
      /* First create the empty phi nodes so that when we flush the
         statements they can be filled in.  However, because there is no
         order between the PHI nodes in the exits and the loop headers, we
         need to order them based on the order of the two headers.  First
         record the new phi nodes.  */
1546 for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
1547 !gsi_end_p (i: gsi_from); gsi_next (i: &gsi_from))
1548 {
1549 gimple *from_phi = gsi_stmt (i: gsi_from);
1550 tree new_res = copy_ssa_name (var: gimple_phi_result (gs: from_phi));
1551 gphi *res = create_phi_node (new_res, new_preheader);
1552 new_phis.safe_push (obj: res);
1553 }
1554
1555 /* Then redirect the edges and flush the changes. This writes out the new
1556 SSA names. */
1557 for (edge exit : loop_exits)
1558 {
1559 edge temp_e = redirect_edge_and_branch (exit, new_preheader);
1560 flush_pending_stmts (temp_e);
1561 }
1562 /* Record the new SSA names in the cache so that we can skip materializing
1563 them again when we fill in the rest of the LCSSA variables. */
1564 for (auto phi : new_phis)
1565 {
1566 tree new_arg = gimple_phi_arg (gs: phi, index: 0)->def;
1567
1568 if (!SSA_VAR_P (new_arg))
1569 continue;
          /* If the PHI MEM node dominates the loop then we shouldn't create
             a new LC-SSA PHI for it in the intermediate block.  */
          /* A MEM phi that constitutes a new DEF for the vUSE chain can
             either be a .VDEF or a PHI that operates on MEM, and said
             definition must not be inside the main loop; or the argument
             must be a default definition (a parameter).  In the last two
             cases we may remove a non-MEM PHI node, but since they dominate
             both loops the removal is unlikely to cause trouble as the exits
             must already be using them.  */
1578 if (virtual_operand_p (op: new_arg)
1579 && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
1580 || !flow_bb_inside_loop_p (loop,
1581 gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
1582 {
1583 auto gsi = gsi_for_stmt (phi);
1584 remove_phi_node (&gsi, true);
1585 continue;
1586 }
1587 new_phi_args.put (k: new_arg, v: gimple_phi_result (gs: phi));
1588
1589 if (TREE_CODE (new_arg) != SSA_NAME)
1590 continue;
1591 }
1592
1593 /* Copy the current loop LC PHI nodes between the original loop exit
1594 block and the new loop header. This allows us to later split the
1595 preheader block and still find the right LC nodes. */
1596 edge loop_entry = single_succ_edge (bb: new_preheader);
1597 if (flow_loops)
1598 for (auto gsi_from = gsi_start_phis (loop->header),
1599 gsi_to = gsi_start_phis (new_loop->header);
1600 !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to);
1601 gsi_next (i: &gsi_from), gsi_next (i: &gsi_to))
1602 {
1603 gimple *from_phi = gsi_stmt (i: gsi_from);
1604 gimple *to_phi = gsi_stmt (i: gsi_to);
1605 tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1606 loop_latch_edge (loop));
1607
1608 /* Check if we've already created a new phi node during edge
1609 redirection. If we have, only propagate the value downwards. */
1610 if (tree *res = new_phi_args.get (k: new_arg))
1611 {
1612 adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_entry, new_def: *res);
1613 continue;
1614 }
1615
1616 tree new_res = copy_ssa_name (var: gimple_phi_result (gs: from_phi));
1617 gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1618
1619 /* Main loop exit should use the final iter value. */
1620 add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
1621
1622 adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_entry, new_def: new_res);
1623 }
1624
1625 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1626
1627 if (was_imm_dom || duplicate_outer_loop)
1628 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1629
1630 /* And remove the unnecessary forwarder again.  Keep the other
1631 one so we have a proper pre-header for the loop at the exit edge. */
1632 redirect_edge_pred (single_succ_edge (bb: preheader),
1633 single_pred (bb: preheader));
1634 delete_basic_block (preheader);
1635 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1636 loop_preheader_edge (scalar_loop)->src);
1637 }
1638 else /* Add the copy at entry. */
1639 {
1640 /* Copy the current loop LC PHI nodes between the original loop exit
1641 block and the new loop header. This allows us to later split the
1642 preheader block and still find the right LC nodes. */
1643 if (flow_loops)
1644 for (auto gsi_from = gsi_start_phis (new_loop->header),
1645 gsi_to = gsi_start_phis (loop->header);
1646 !gsi_end_p (i: gsi_from) && !gsi_end_p (i: gsi_to);
1647 gsi_next (i: &gsi_from), gsi_next (i: &gsi_to))
1648 {
1649 gimple *from_phi = gsi_stmt (i: gsi_from);
1650 gimple *to_phi = gsi_stmt (i: gsi_to);
1651 tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1652 loop_latch_edge (new_loop));
1653 adjust_phi_and_debug_stmts (update_phi: to_phi, e: loop_preheader_edge (loop),
1654 new_def: new_arg);
1655 }
1656
1657 if (scalar_loop != loop)
1658 {
1659 /* Remove the unnecessary forwarder of scalar_loop again. */
1660 redirect_edge_pred (single_succ_edge (bb: preheader),
1661 single_pred (bb: preheader));
1662 delete_basic_block (preheader);
1663 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1664 loop_preheader_edge (scalar_loop)->src);
1665 preheader = split_edge (loop_preheader_edge (loop));
1666 entry_e = single_pred_edge (bb: preheader);
1667 }
1668
1669 redirect_edge_and_branch_force (entry_e, new_preheader);
1670 flush_pending_stmts (entry_e);
1671 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1672
1673 redirect_edge_and_branch_force (new_exit, preheader);
1674 flush_pending_stmts (new_exit);
1675 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1676
1677 /* And remove the unnecessary forwarder again.  Keep the other
1678 one so we have a proper pre-header for the loop at the exit edge. */
1679 redirect_edge_pred (single_succ_edge (bb: new_preheader),
1680 single_pred (bb: new_preheader));
1681 delete_basic_block (new_preheader);
1682 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1683 loop_preheader_edge (new_loop)->src);
1684 }
1685
1686 free (ptr: new_bbs);
1687 free (ptr: bbs);
1688
1689 checking_verify_dominators (dir: CDI_DOMINATORS);
1690
1691 return new_loop;
1692}
1693
1694
1695/* Given the condition expression COND, put it as the last statement of
1696 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1697 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1698 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1699 new edge as irreducible if IRREDUCIBLE_P is true. */
1700
1701static edge
1702slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1703 basic_block guard_to, basic_block dom_bb,
1704 profile_probability probability, bool irreducible_p)
1705{
1706 gimple_stmt_iterator gsi;
1707 edge new_e, enter_e;
1708 gcond *cond_stmt;
1709 gimple_seq gimplify_stmt_list = NULL;
1710
1711 enter_e = EDGE_SUCC (guard_bb, 0);
1712 enter_e->flags &= ~EDGE_FALLTHRU;
1713 enter_e->flags |= EDGE_FALSE_VALUE;
1714 gsi = gsi_last_bb (bb: guard_bb);
1715
1716 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1717 is_gimple_condexpr_for_cond, NULL_TREE);
1718 if (gimplify_stmt_list)
1719 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1720
1721 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1722 gsi = gsi_last_bb (bb: guard_bb);
1723 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1724
1725 /* Add new edge to connect guard block to the merge/loop-exit block. */
1726 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1727
1728 new_e->probability = probability;
1729 if (irreducible_p)
1730 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1731
1732 enter_e->probability = probability.invert ();
1733 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1734
1735 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1736 if (enter_e->dest->loop_father->header == enter_e->dest)
1737 split_edge (enter_e);
1738
1739 return new_e;
1740}
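
/* As an illustrative sketch (block names invented, not taken from any
   particular dump), the transformation above turns

     guard_bb:
       ...
       fallthru to old_dest

   into

     guard_bb:
       ...
       if (COND)               <- COND gimplified at the end of guard_bb
         goto GUARD_TO;        <- new skip edge, probability PROBABILITY
       else
         goto old_dest;        <- old edge, now EDGE_FALSE_VALUE

   and the new skip edge is returned.  */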
1741
1742
1743/* This function verifies that the following restrictions apply to LOOP:
1744 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1745 for an innermost loop, or of 5 basic blocks for an outer loop.
1746 (2) it is single entry, single exit
1747 (3) its exit condition is the last stmt in the header
1748 (4) E is the entry/exit edge of LOOP.
1749 */
1750
1751bool
1752slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
1753 const_edge e)
1754{
1755 edge entry_e = loop_preheader_edge (loop);
1756 gcond *orig_cond = get_loop_exit_condition (exit_e);
1757 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (bb: exit_e->src);
1758 unsigned int num_bb = loop->inner ? 5 : 2;
1759
1760 /* All loops have an outer scope; the only loop for which loop_outer (loop)
1761 is NULL is the root of the loop tree, i.e. the function itself.  */
1762 if (!loop_outer (loop)
1763 || loop->num_nodes != num_bb
1764 || !empty_block_p (loop->latch)
1765 || !exit_e
1766 /* Verify that new loop exit condition can be trivially modified. */
1767 || (!orig_cond || orig_cond != gsi_stmt (i: loop_exit_gsi))
1768 || (e != exit_e && e != entry_e))
1769 return false;
1770
1771 basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
1772 get_loop_body_with_size (loop, bbs, loop->num_nodes);
1773 bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
1774 free (ptr: bbs);
1775 return ret;
1776}
1777
1778/* Function find_loop_location.
1779
1780 Extract the location of the loop in the source code.
1781 If the loop is not well formed for vectorization, an estimated
1782 location is calculated.
1783 Return the loop location on success and an unknown location otherwise. */
1784
1785dump_user_location_t
1786find_loop_location (class loop *loop)
1787{
1788 gimple *stmt = NULL;
1789 basic_block bb;
1790 gimple_stmt_iterator si;
1791
1792 if (!loop)
1793 return dump_user_location_t ();
1794
1795 if (loops_state_satisfies_p (flags: LOOPS_HAVE_RECORDED_EXITS))
1796 {
1797 /* We only care about the loop location, so use any exit with location
1798 information. */
1799 for (edge e : get_loop_exit_edges (loop))
1800 {
1801 stmt = get_loop_exit_condition (e);
1802
1803 if (stmt
1804 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1805 return stmt;
1806 }
1807 }
1808
1809 /* If we got here the loop is probably not "well formed";
1810 try to estimate the loop location.  */
1811
1812 if (!loop->header)
1813 return dump_user_location_t ();
1814
1815 bb = loop->header;
1816
1817 for (si = gsi_start_bb (bb); !gsi_end_p (i: si); gsi_next (i: &si))
1818 {
1819 stmt = gsi_stmt (i: si);
1820 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1821 return stmt;
1822 }
1823
1824 return dump_user_location_t ();
1825}
1826
1827/* Return true if the phi described by STMT_INFO defines an IV of the
1828 loop to be vectorized. */
1829
1830static bool
1831iv_phi_p (stmt_vec_info stmt_info)
1832{
1833 gphi *phi = as_a <gphi *> (p: stmt_info->stmt);
1834 if (virtual_operand_p (PHI_RESULT (phi)))
1835 return false;
1836
1837 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1838 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1839 return false;
1840
1841 return true;
1842}
1843
1844/* Return true if the vectorizer can peel for a nonlinear IV. */
1845static bool
1846vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
1847 stmt_vec_info stmt_info)
1848{
1849 enum vect_induction_op_type induction_type
1850 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
1851 tree niters_skip;
1852 /* Init_expr will be updated by vect_update_ivs_after_vectorizer,
1853 if niters or vf is unknown:
1854 For shift, when the shift amount >= precision, the result is undefined.
1855 For mult, we don't know how to generate
1856 init_expr * pow (step, niters) for variable niters.
1857 For neg, it should be ok, since the niters of the vectorized main loop
1858 will always be a multiple of 2. */
1859 if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1860 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
1861 && induction_type != vect_step_op_neg)
1862 {
1863 if (dump_enabled_p ())
1864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 "Peeling for epilogue is not supported"
1866 " for nonlinear induction except neg"
1867 " when iteration count is unknown.\n");
1868 return false;
1869 }
1870
1871 /* Avoid compile time hog on vect_peel_nonlinear_iv_init. */
1872 if (induction_type == vect_step_op_mul)
1873 {
1874 tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
1875 tree type = TREE_TYPE (step_expr);
1876
1877 if (wi::exact_log2 (wi::to_wide (t: step_expr)) == -1
1878 && LOOP_VINFO_INT_NITERS (loop_vinfo) >= TYPE_PRECISION (type))
1879 {
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1882 "Avoid compile time hog on"
1883 " vect_peel_nonlinear_iv_init"
1884 " for nonlinear induction vec_step_op_mul"
1885 " when iteration count is too big.\n");
1886 return false;
1887 }
1888 }
1889
1890 /* We also don't support peeling for neg when niter is variable.
1891 ??? Generate something like niter_expr & 1 ? init_expr : -init_expr? */
1892 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
1893 if ((niters_skip != NULL_TREE
1894 && (TREE_CODE (niters_skip) != INTEGER_CST
1895 || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
1896 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
1897 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
1898 {
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1901 "Peeling for alignement is not supported"
1902 " for nonlinear induction when niters_skip"
1903 " is not constant.\n");
1904 return false;
1905 }
1906
1907 return true;
1908}
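
/* For example (values invented for illustration): for a multiplicative
   IV with init_expr 3 and step 2, peeling NITERS iterations requires a
   new initial value of 3 * pow (2, NITERS); with constant NITERS = 4
   that is simply 3 * 16 = 48, but for variable NITERS we have no cheap
   way of materializing pow (2, NITERS), which is one of the cases the
   checks above reject.  */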
1909
1910/* Function vect_can_advance_ivs_p
1911
1912 In case the number of iterations that LOOP iterates is unknown at compile
1913 time, an epilog loop will be generated, and the loop induction variables
1914 (IVs) will be "advanced" to the value they are supposed to take just before
1915 the epilog loop. Here we check that the access function of the loop IVs
1916 and the expression that represents the loop bound are simple enough.
1917 These restrictions will be relaxed in the future. */
1918
1919bool
1920vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1921{
1922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1923 basic_block bb = loop->header;
1924 gphi_iterator gsi;
1925
1926 /* Analyze phi functions of the loop header. */
1927
1928 if (dump_enabled_p ())
1929 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1930 for (gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
1931 {
1932 tree evolution_part;
1933 enum vect_induction_op_type induction_type;
1934
1935 gphi *phi = gsi.phi ();
1936 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1939 phi_info->stmt);
1940
1941 /* Skip virtual phi's. The data dependences that are associated with
1942 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1943
1944 Skip reduction phis. */
1945 if (!iv_phi_p (stmt_info: phi_info))
1946 {
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_NOTE, vect_location,
1949 "reduc or virtual phi. skip.\n");
1950 continue;
1951 }
1952
1953 induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1954 if (induction_type != vect_step_op_add)
1955 {
1956 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, stmt_info: phi_info))
1957 return false;
1958
1959 continue;
1960 }
1961
1962 /* Analyze the evolution function. */
1963
1964 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1965 if (evolution_part == NULL_TREE)
1966 {
1967 if (dump_enabled_p ())
1968 dump_printf (MSG_MISSED_OPTIMIZATION,
1969 "No access function or evolution.\n");
1970 return false;
1971 }
1972
1973 /* FORNOW: We do not transform initial conditions of IVs
1974 which evolution functions are not invariants in the loop. */
1975
1976 if (!expr_invariant_in_loop_p (loop, evolution_part))
1977 {
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1980 "evolution not invariant in loop.\n");
1981 return false;
1982 }
1983
1984 /* FORNOW: We do not transform initial conditions of IVs
1985 which evolution functions are a polynomial of degree >= 2. */
1986
1987 if (tree_is_chrec (expr: evolution_part))
1988 {
1989 if (dump_enabled_p ())
1990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1991 "evolution is chrec.\n");
1992 return false;
1993 }
1994 }
1995
1996 return true;
1997}
1998
1999
2000/* Function vect_update_ivs_after_vectorizer.
2001
2002 "Advance" the induction variables of LOOP to the value they should take
2003 after the execution of LOOP. This is currently necessary because the
2004 vectorizer does not handle induction variables that are used after the
2005 loop. Such a situation occurs when the last iterations of LOOP are
2006 peeled, because:
2007 1. We introduced new uses after LOOP for IVs that were not originally used
2008 after LOOP: the IVs of LOOP are now used by an epilog loop.
2009 2. LOOP is going to be vectorized; this means that it will iterate N/VF
2010 times, whereas the loop IVs should be bumped N times.
2011
2012 Input:
2013 - LOOP - a loop that is going to be vectorized. The last few iterations
2014 of LOOP were peeled.
2015 - NITERS - the number of iterations that LOOP executes (before it is
2016 vectorized), i.e., the number of times the ivs should be bumped.
2017 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2018 coming out from LOOP on which there are uses of the LOOP ivs
2019 (this is the path from LOOP->exit to epilog_loop->preheader).
2020
2021 The new definitions of the ivs are placed in LOOP->exit.
2022 The phi args associated with the edge UPDATE_E in the bb
2023 UPDATE_E->dest are updated accordingly.
2024
2025 Assumption 1: Like the rest of the vectorizer, this function assumes
2026 a single loop exit that has a single predecessor.
2027
2028 Assumption 2: The phi nodes in the LOOP header and in update_bb are
2029 organized in the same order.
2030
2031 Assumption 3: The access function of the ivs is simple enough (see
2032 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
2033
2034 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2035 coming out of LOOP on which the ivs of LOOP are used (this is the path
2036 that leads to the epilog loop; other paths skip the epilog loop). This
2037 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2038 needs to have its phis updated.
2039 */
2040
2041static void
2042vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2043 tree niters, edge update_e)
2044{
2045 gphi_iterator gsi, gsi1;
2046 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2047 basic_block update_bb = update_e->dest;
2048
2049 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2050
2051 /* Make sure there exists a single-predecessor exit bb: */
2052 gcc_assert (single_pred_p (exit_bb));
2053 gcc_assert (single_succ_edge (exit_bb) == update_e);
2054
2055 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2056 !gsi_end_p (i: gsi) && !gsi_end_p (i: gsi1);
2057 gsi_next (i: &gsi), gsi_next (i: &gsi1))
2058 {
2059 tree init_expr;
2060 tree step_expr, off;
2061 tree type;
2062 tree var, ni, ni_name;
2063 gimple_stmt_iterator last_gsi;
2064
2065 gphi *phi = gsi.phi ();
2066 gphi *phi1 = gsi1.phi ();
2067 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2068 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE, vect_location,
2070 "vect_update_ivs_after_vectorizer: phi: %G",
2071 (gimple *) phi);
2072
2073 /* Skip reduction and virtual phis. */
2074 if (!iv_phi_p (stmt_info: phi_info))
2075 {
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_NOTE, vect_location,
2078 "reduc or virtual phi. skip.\n");
2079 continue;
2080 }
2081
2082 type = TREE_TYPE (gimple_phi_result (phi));
2083 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2084 step_expr = unshare_expr (step_expr);
2085
2086 /* FORNOW: We do not support IVs whose evolution function is a polynomial
2087 of degree >= 2 or exponential. */
2088 gcc_assert (!tree_is_chrec (step_expr));
2089
2090 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2091 gimple_seq stmts = NULL;
2092 enum vect_induction_op_type induction_type
2093 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2094
2095 if (induction_type == vect_step_op_add)
2096 {
2097 tree stype = TREE_TYPE (step_expr);
2098 off = fold_build2 (MULT_EXPR, stype,
2099 fold_convert (stype, niters), step_expr);
2100 if (POINTER_TYPE_P (type))
2101 ni = fold_build_pointer_plus (init_expr, off);
2102 else
2103 ni = fold_convert (type,
2104 fold_build2 (PLUS_EXPR, stype,
2105 fold_convert (stype, init_expr),
2106 off));
2107 }
2108 /* Don't bother calling vect_peel_nonlinear_iv_init. */
2109 else if (induction_type == vect_step_op_neg)
2110 ni = init_expr;
2111 else
2112 ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2113 niters, step_expr,
2114 induction_type);
2115
2116 var = create_tmp_var (type, "tmp");
2117
2118 last_gsi = gsi_last_bb (bb: exit_bb);
2119 gimple_seq new_stmts = NULL;
2120 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2121 /* Exit_bb shouldn't be empty. */
2122 if (!gsi_end_p (i: last_gsi))
2123 {
2124 gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2125 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2126 }
2127 else
2128 {
2129 gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2130 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2131 }
2132
2133 /* Fix phi expressions in the successor bb. */
2134 adjust_phi_and_debug_stmts (update_phi: phi1, e: update_e, new_def: ni_name);
2135 }
2136}
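
/* As an illustration (values invented): a vect_step_op_add IV with
   init_expr 0 and step_expr 4 in a loop executing NITERS = 100 scalar
   iterations is advanced to

     ni = 0 + 100 * 4 = 400

   and the corresponding PHI argument on UPDATE_E is replaced so that
   the epilog loop resumes the IV from 400.  */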
2137
2138/* Return a gimple value containing the misalignment (measured in vector
2139 elements) for the loop described by LOOP_VINFO, i.e. how many elements
2140 it is away from a perfectly aligned address. Add any new statements
2141 to SEQ. */
2142
2143static tree
2144get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2145{
2146 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2147 stmt_vec_info stmt_info = dr_info->stmt;
2148 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2149
2150 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2151 unsigned HOST_WIDE_INT target_align_c;
2152 tree target_align_minus_1;
2153
2154 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2155 size_zero_node) < 0;
2156 tree offset = (negative
2157 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2158 * TREE_INT_CST_LOW
2159 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2160 : size_zero_node);
2161 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2162 stmt_info, seq,
2163 offset);
2164 tree type = unsigned_type_for (TREE_TYPE (start_addr));
2165 if (target_align.is_constant (const_value: &target_align_c))
2166 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2167 else
2168 {
2169 tree vla = build_int_cst (type, target_align);
2170 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
2171 fold_build2 (MINUS_EXPR, type,
2172 build_int_cst (type, 0), vla));
2173 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
2174 build_int_cst (type, 1));
2175 }
2176
2177 HOST_WIDE_INT elem_size
2178 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2179 tree elem_size_log = build_int_cst (type, exact_log2 (x: elem_size));
2180
2181 /* Create: misalign_in_bytes = addr & (target_align - 1). */
2182 tree int_start_addr = fold_convert (type, start_addr);
2183 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2184 target_align_minus_1);
2185
2186 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
2187 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2188 elem_size_log);
2189
2190 return misalign_in_elems;
2191}
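
/* As an illustration (numbers invented): for a data reference with
   4-byte elements, a 16-byte target alignment and a start address
   ending in ...0c, the expressions built above evaluate to

     misalign_in_bytes = addr & (16 - 1) = 12
     misalign_in_elems = 12 >> log2 (4)  = 3

   i.e. the first access is three elements past an aligned boundary.  */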
2192
2193/* Function vect_gen_prolog_loop_niters
2194
2195 Generate the number of iterations which should be peeled as prolog for the
2196 loop represented by LOOP_VINFO. It is calculated as the misalignment of
2197 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2198 As a result, after the execution of this loop, the data reference DR will
2199 refer to an aligned location. The following computation is generated:
2200
2201 If the misalignment of DR is known at compile time:
2202 addr_mis = int mis = DR_MISALIGNMENT (dr);
2203 Else, compute address misalignment in bytes:
2204 addr_mis = addr & (target_align - 1)
2205
2206 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2207
2208 (elem_size = element type size; an element is the scalar element whose type
2209 is the inner type of the vectype)
2210
2211 The computations will be emitted at the end of BB. We also compute and
2212 store the upper bound (inclusive) of the result in BOUND.
2213
2214 When the step of the data-ref in the loop is not 1 (as in interleaved data
2215 and SLP), the number of iterations of the prolog must be divided by the step
2216 (which is equal to the size of the interleaved group).
2217
2218 The above formulas assume that VF == number of elements in the vector. This
2219 may not hold when there are multiple-types in the loop.
2220 In this case, for some data-references in the loop the VF does not represent
2221 the number of elements that fit in the vector. Therefore, instead of VF we
2222 use TYPE_VECTOR_SUBPARTS. */
2223
2224static tree
2225vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2226 basic_block bb, int *bound)
2227{
2228 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2229 tree var;
2230 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2231 gimple_seq stmts = NULL, new_stmts = NULL;
2232 tree iters, iters_name;
2233 stmt_vec_info stmt_info = dr_info->stmt;
2234 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2235 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2236
2237 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2238 {
2239 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2240
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_NOTE, vect_location,
2243 "known peeling = %d.\n", npeel);
2244
2245 iters = build_int_cst (niters_type, npeel);
2246 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2247 }
2248 else
2249 {
2250 tree misalign_in_elems = get_misalign_in_elems (seq: &stmts, loop_vinfo);
2251 tree type = TREE_TYPE (misalign_in_elems);
2252 HOST_WIDE_INT elem_size
2253 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2254 /* We only do prolog peeling if the target alignment is known at compile
2255 time. */
2256 poly_uint64 align_in_elems =
2257 exact_div (a: target_align, b: elem_size);
2258 tree align_in_elems_minus_1 =
2259 build_int_cst (type, align_in_elems - 1);
2260 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2261
2262 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
2263 & (align_in_elems - 1)). */
2264 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2265 size_zero_node) < 0;
2266 if (negative)
2267 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2268 align_in_elems_tree);
2269 else
2270 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2271 misalign_in_elems);
2272 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2273 iters = fold_convert (niters_type, iters);
2274 unsigned HOST_WIDE_INT align_in_elems_c;
2275 if (align_in_elems.is_constant (const_value: &align_in_elems_c))
2276 *bound = align_in_elems_c - 1;
2277 else
2278 *bound = -1;
2279 }
2280
2281 if (dump_enabled_p ())
2282 dump_printf_loc (MSG_NOTE, vect_location,
2283 "niters for prolog loop: %T\n", iters);
2284
2285 var = create_tmp_var (niters_type, "prolog_loop_niters");
2286 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2287
2288 if (new_stmts)
2289 gimple_seq_add_seq (&stmts, new_stmts);
2290 if (stmts)
2291 {
2292 gcc_assert (single_succ_p (bb));
2293 gimple_stmt_iterator gsi = gsi_last_bb (bb);
2294 if (gsi_end_p (i: gsi))
2295 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2296 else
2297 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2298 }
2299 return iters_name;
2300}
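
/* Continuing the illustration above (numbers invented): with
   align_in_elems = 16 / 4 = 4 and misalign_in_elems = 3, the generated
   expression evaluates at run time to

     iters = (4 - 3) & (4 - 1) = 1

   so peeling one scalar iteration aligns the data reference, and
   *BOUND is set to 4 - 1 = 3.  */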
2301
2302
2303/* Function vect_update_init_of_dr
2304
2305 If CODE is PLUS, the vector loop starts NITERS iterations after the
2306 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2307 iterations before the scalar one (using masking to skip inactive
2308 elements). This function updates the information recorded in DR to
2309 account for the difference. Specifically, it updates the OFFSET
2310 field of DR_INFO. */
2311
2312static void
2313vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2314{
2315 struct data_reference *dr = dr_info->dr;
2316 tree offset = dr_info->offset;
2317 if (!offset)
2318 offset = build_zero_cst (sizetype);
2319
2320 niters = fold_build2 (MULT_EXPR, sizetype,
2321 fold_convert (sizetype, niters),
2322 fold_convert (sizetype, DR_STEP (dr)));
2323 offset = fold_build2 (code, sizetype,
2324 fold_convert (sizetype, offset), niters);
2325 dr_info->offset = offset;
2326}
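
/* For example (values invented): peeling NITERS = 3 iterations for a
   data reference with DR_STEP 4 and a previously zero offset uses
   CODE = PLUS_EXPR and records

     offset = 0 + 3 * 4 = 12

   so the vector loop's accesses start 12 bytes after the original
   base.  */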
2327
2328
2329/* Function vect_update_inits_of_drs
2330
2331 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
2332 CODE and NITERS are as for vect_update_init_of_dr. */
2333
2334void
2335vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2336 tree_code code)
2337{
2338 unsigned int i;
2339 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2340 struct data_reference *dr;
2341
2342 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2343
2344 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
2345 here, but since we might use these niters to update the epilogue's niters
2346 and data references we can't insert them here as this definition might not
2347 always dominate its uses. */
2348 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2349 niters = fold_convert (sizetype, niters);
2350
2351 FOR_EACH_VEC_ELT (datarefs, i, dr)
2352 {
2353 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2354 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2355 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2356 vect_update_init_of_dr (dr_info, niters, code);
2357 }
2358}
2359
2360/* For the information recorded in LOOP_VINFO prepare the loop for peeling
2361 by masking. This involves calculating the number of iterations to
2362 be peeled and then aligning all memory references appropriately. */
2363
2364void
2365vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2366{
2367 tree misalign_in_elems;
2368 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2369
2370 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2371
2372 /* From the information recorded in LOOP_VINFO get the number of iterations
2373 that need to be skipped via masking. */
2374 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2375 {
2376 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2377 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2378 misalign_in_elems = build_int_cst (type, misalign);
2379 }
2380 else
2381 {
2382 gimple_seq seq1 = NULL, seq2 = NULL;
2383 misalign_in_elems = get_misalign_in_elems (seq: &seq1, loop_vinfo);
2384 misalign_in_elems = fold_convert (type, misalign_in_elems);
2385 misalign_in_elems = force_gimple_operand (misalign_in_elems,
2386 &seq2, true, NULL_TREE);
2387 gimple_seq_add_seq (&seq1, seq2);
2388 if (seq1)
2389 {
2390 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2391 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2392 gcc_assert (!new_bb);
2393 }
2394 }
2395
2396 if (dump_enabled_p ())
2397 dump_printf_loc (MSG_NOTE, vect_location,
2398 "misalignment for fully-masked loop: %T\n",
2399 misalign_in_elems);
2400
2401 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2402
2403 vect_update_inits_of_drs (loop_vinfo, niters: misalign_in_elems, code: MINUS_EXPR);
2404}
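
/* For example (values invented, assuming the vectorization factor
   matches the alignment step): with VF = 8 and a known peeling amount
   of 3, the number of inactive initial elements is 8 - 3 = 5;
   LOOP_VINFO_MASK_SKIP_NITERS is set to 5 and all data references are
   shifted back by 5 iterations' worth of DR_STEP, so the first masked
   vector iteration covers the skipped elements as inactive lanes.  */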
2405
2406/* This function builds ni_name = number of iterations. Statements
2407 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2408 it to TRUE if a new ssa_var is generated. */
2409
2410tree
2411vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2412{
2413 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2414 if (TREE_CODE (ni) == INTEGER_CST)
2415 return ni;
2416 else
2417 {
2418 tree ni_name, var;
2419 gimple_seq stmts = NULL;
2420 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2421
2422 var = create_tmp_var (TREE_TYPE (ni), "niters");
2423 ni_name = force_gimple_operand (ni, &stmts, false, var);
2424 if (stmts)
2425 {
2426 gsi_insert_seq_on_edge_immediate (pe, stmts);
2427 if (new_var_p != NULL)
2428 *new_var_p = true;
2429 }
2430
2431 return ni_name;
2432 }
2433}
2434
2435/* Calculate the number of iterations above which the vectorized loop will
2436 be preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
2437 of prolog loop. If it's integer const, the integer number is also passed
2438 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
2439 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
2440 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
2441 threshold below which the scalar (rather than vectorized) loop will be
2442 executed. This function stores the upper bound (inclusive) of the result
2443 in BOUND_SCALAR. */
2444
2445static tree
2446vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2447 int bound_prolog, poly_int64 bound_epilog, int th,
2448 poly_uint64 *bound_scalar,
2449 bool check_profitability)
2450{
2451 tree type = TREE_TYPE (niters_prolog);
2452 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2453 build_int_cst (type, bound_epilog));
2454
2455 *bound_scalar = bound_prolog + bound_epilog;
2456 if (check_profitability)
2457 {
2458 /* TH indicates the minimum niters of vectorized loop, while we
2459 compute the maximum niters of scalar loop. */
2460 th--;
2461 /* Peeling for constant times. */
2462 if (int_niters_prolog >= 0)
2463 {
2464 *bound_scalar = upper_bound (a: int_niters_prolog + bound_epilog, b: th);
2465 return build_int_cst (type, *bound_scalar);
2466 }
2467 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2468 and BOUND_EPILOG are inclusive upper bounds. */
2469 if (known_ge (th, bound_prolog + bound_epilog))
2470 {
2471 *bound_scalar = th;
2472 return build_int_cst (type, th);
2473 }
2474 /* Need to do runtime comparison. */
2475 else if (maybe_gt (th, bound_epilog))
2476 {
2477 *bound_scalar = upper_bound (a: *bound_scalar, b: th);
2478 return fold_build2 (MAX_EXPR, type,
2479 build_int_cst (type, th), niters);
2480 }
2481 }
2482 return niters;
2483}
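
/* For example (values invented): with TH = 12 (so the maximum scalar
   niters is TH - 1 = 11), a constant NITERS_PROLOG of 2 and
   BOUND_EPILOG of 7, the function returns

     max (2 + 7, 11) = 11

   and sets *BOUND_SCALAR to 11; vect_do_peeling later compares the
   returned value against NITERSM1 to decide whether to skip the
   vector loop.  */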
2484
2485/* NITERS is the number of times that the original scalar loop executes
2486 after peeling. Work out the maximum number of iterations N that can
2487 be handled by the vectorized form of the loop and then either:
2488
2489 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2490
2491 niters_vector = N
2492
2493 b) set *STEP_VECTOR_PTR to one and generate:
2494
2495 niters_vector = N / vf
2496
2497 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2498 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2499 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
2500
2501void
2502vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2503 tree *niters_vector_ptr, tree *step_vector_ptr,
2504 bool niters_no_overflow)
2505{
2506 tree ni_minus_gap, var;
2507 tree niters_vector, step_vector, type = TREE_TYPE (niters);
2508 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2509 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2510 tree log_vf = NULL_TREE;
2511
2512 /* If epilogue loop is required because of data accesses with gaps, we
2513 subtract one iteration from the total number of iterations here for
2514 correct calculation of RATIO. */
2515 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2516 {
2517 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2518 build_one_cst (type));
2519 if (!is_gimple_val (ni_minus_gap))
2520 {
2521 var = create_tmp_var (type, "ni_gap");
2522 gimple *stmts = NULL;
2523 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2524 true, var);
2525 gsi_insert_seq_on_edge_immediate (pe, stmts);
2526 }
2527 }
2528 else
2529 ni_minus_gap = niters;
2530
2531 /* To silence some unexpected warnings, simply initialize to 0. */
2532 unsigned HOST_WIDE_INT const_vf = 0;
2533 if (vf.is_constant (const_value: &const_vf)
2534 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2535 {
2536 /* Create: niters >> log2(vf) */
2537 /* If it's known that niters == number of latch executions + 1 doesn't
2538 overflow, we can generate niters >> log2(vf); otherwise we generate
2539 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2540 will be at least one. */
2541 log_vf = build_int_cst (type, exact_log2 (x: const_vf));
2542 if (niters_no_overflow)
2543 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2544 else
2545 niters_vector
2546 = fold_build2 (PLUS_EXPR, type,
2547 fold_build2 (RSHIFT_EXPR, type,
2548 fold_build2 (MINUS_EXPR, type,
2549 ni_minus_gap,
2550 build_int_cst (type, vf)),
2551 log_vf),
2552 build_int_cst (type, 1));
2553 step_vector = build_one_cst (type);
2554 }
2555 else
2556 {
2557 niters_vector = ni_minus_gap;
2558 step_vector = build_int_cst (type, vf);
2559 }
2560
2561 if (!is_gimple_val (niters_vector))
2562 {
2563 var = create_tmp_var (type, "bnd");
2564 gimple_seq stmts = NULL;
2565 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2566 gsi_insert_seq_on_edge_immediate (pe, stmts);
2567 /* The peeling algorithm guarantees that the vector loop bound is at
2568 least ONE; we set range information to make the niters analyzer's
2569 life easier.  Note the number of latch iterations can be TYPE_MAX_VALUE,
2570 so we have to represent the vector niter as (TYPE_MAX_VALUE + 1) >> log_vf. */
2571 if (stmts != NULL && log_vf)
2572 {
2573 if (niters_no_overflow)
2574 {
2575 value_range vr (type,
2576 wi::one (TYPE_PRECISION (type)),
2577 wi::rshift (x: wi::max_value (TYPE_PRECISION (type),
2578 TYPE_SIGN (type)),
2579 y: exact_log2 (x: const_vf),
2580 TYPE_SIGN (type)));
2581 set_range_info (niters_vector, vr);
2582 }
2583 /* For VF == 1 the vector IV might also overflow so we cannot
2584 assert a minimum value of 1. */
2585 else if (const_vf > 1)
2586 {
2587 value_range vr (type,
2588 wi::one (TYPE_PRECISION (type)),
2589 wi::rshift (x: wi::max_value (TYPE_PRECISION (type),
2590 TYPE_SIGN (type))
2591 - (const_vf - 1),
2592 y: exact_log2 (x: const_vf), TYPE_SIGN (type))
2593 + 1);
2594 set_range_info (niters_vector, vr);
2595 }
2596 }
2597 }
2598 *niters_vector_ptr = niters_vector;
2599 *step_vector_ptr = step_vector;
2600
2601 return;
2602}
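
/* For example (values invented): with NITERS = 103, a constant VF of 8,
   no peeling for gaps and no possibility of overflow, case b) applies:

     niters_vector = 103 >> log2 (8) = 12,  step_vector = 1

   i.e. twelve full vector iterations, leaving 103 - 12 * 8 = 7 scalar
   iterations for the epilog.  With a variable VF, case a) applies and
   NITERS itself is stored with a step of VF.  */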
2603
2604/* Given NITERS_VECTOR which is the number of iterations for vectorized
2605 loop specified by LOOP_VINFO after vectorization, compute the number
2606 of iterations before vectorization (niters_vector * vf) and store it
2607 to NITERS_VECTOR_MULT_VF_PTR. */
2608
2609static void
2610vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2611 tree niters_vector,
2612 tree *niters_vector_mult_vf_ptr)
2613{
2614 /* We should be using a step_vector of VF if VF is variable. */
2615 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2616 tree type = TREE_TYPE (niters_vector);
2617 tree log_vf = build_int_cst (type, exact_log2 (x: vf));
2618 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2619
2620 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2621 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2622 niters_vector, log_vf);
2623 if (!is_gimple_val (niters_vector_mult_vf))
2624 {
2625 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2626 gimple_seq stmts = NULL;
2627 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2628 &stmts, true, var);
2629 gimple_stmt_iterator gsi = gsi_start_bb (bb: exit_bb);
2630 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2631 }
2632 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2633}
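
/* Continuing the example above (values invented): niters_vector = 12
   and VF = 8 give

     niters_vector_mult_vf = 12 << log2 (8) = 96

   which is the number of scalar iterations covered by the vector
   loop.  */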
2634
2635/* Function slpeel_add_loop_guard adds guard skipping from the beginning
2636 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2637 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2638 appear like below:
2639
2640 guard_bb:
2641 if (cond)
2642 goto merge_bb;
2643 else
2644 goto skip_loop;
2645
2646 skip_loop:
2647 header_a:
2648 i_1 = PHI<i_0, i_2>;
2649 ...
2650 i_2 = i_1 + 1;
2651 if (cond_a)
2652 goto latch_a;
2653 else
2654 goto exit_a;
2655 latch_a:
2656 goto header_a;
2657
2658 exit_a:
2659 i_5 = PHI<i_2>;
2660
2661 merge_bb:
2662 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2663
2664 update_loop:
2665 header_b:
2666 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2667 ...
2668 i_4 = i_3 + 1;
2669 if (cond_b)
2670 goto latch_b;
2671 else
2672 goto exit_bb;
2673 latch_b:
2674 goto header_b;
2675
2676 exit_bb:
2677
2678 This function creates PHI nodes at merge_bb and replaces the use of i_5
2679 in the update_loop's PHI node with the new PHI result. */
2680
2681static void
2682slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2683 class loop *update_loop,
2684 edge guard_edge, edge merge_edge)
2685{
2686 location_t merge_loc, guard_loc;
2687 edge orig_e = loop_preheader_edge (skip_loop);
2688 edge update_e = loop_preheader_edge (update_loop);
2689 gphi_iterator gsi_orig, gsi_update;
2690
2691 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2692 gsi_update = gsi_start_phis (update_loop->header));
2693 !gsi_end_p (i: gsi_orig) && !gsi_end_p (i: gsi_update);
2694 gsi_next (i: &gsi_orig), gsi_next (i: &gsi_update))
2695 {
2696 gphi *orig_phi = gsi_orig.phi ();
2697 gphi *update_phi = gsi_update.phi ();
2698
2699 /* Generate new phi node at merge bb of the guard. */
2700 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2701 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2702
2703 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2704 args in NEW_PHI for these edges. */
2705 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2706 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2707 merge_loc = gimple_phi_arg_location_from_edge (phi: update_phi, e: update_e);
2708 guard_loc = gimple_phi_arg_location_from_edge (phi: orig_phi, e: orig_e);
2709 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2710 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2711
2712 /* Update phi in UPDATE_PHI. */
2713 adjust_phi_and_debug_stmts (update_phi, e: update_e, new_def: new_res);
2714 }
2715}
2716
2717/* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2718 Return a value that equals:
2719
2720 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2721 - SKIP_VALUE when the main loop is skipped. */
2722
2723tree
2724vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2725 tree skip_value)
2726{
2727 gcc_assert (loop_vinfo->main_loop_edge);
2728
2729 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2730 basic_block bb = loop_vinfo->main_loop_edge->dest;
2731 gphi *new_phi = create_phi_node (phi_result, bb);
2732 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2733 UNKNOWN_LOCATION);
2734 add_phi_arg (new_phi, skip_value,
2735 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2736 return phi_result;
2737}
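
/* For example (SSA names invented): called with MAIN_LOOP_VALUE _23 and
   SKIP_VALUE 0, this emits in the destination block of the main-loop
   and skip edges

     result_24 = PHI <_23 (main loop edge), 0 (skip edge)>

   and returns result_24.  */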
2738
2739/* Function vect_do_peeling.
2740
2741 Input:
2742 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2743
2744 preheader:
2745 LOOP:
2746 header_bb:
2747 loop_body
2748 if (exit_loop_cond) goto exit_bb
2749 else goto header_bb
2750 exit_bb:
2751
2752 - NITERS: The number of iterations of the loop.
2753 - NITERSM1: The number of iterations of the loop's latch.
2754 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2755 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2756 CHECK_PROFITABILITY is true.
2757 Output:
2758 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2759 iterate after vectorization; see vect_set_loop_condition for details.
2760 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2761 should be set to the number of scalar iterations handled by the
2762 vector loop. The SSA name is only used on exit from the loop.
2763
2764 This function peels prolog and epilog from the loop, adds guards skipping
2765 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2766 would look like:
2767
2768 guard_bb_1:
2769 if (prefer_scalar_loop) goto merge_bb_1
2770 else goto guard_bb_2
2771
2772 guard_bb_2:
2773 if (skip_prolog) goto merge_bb_2
2774 else goto prolog_preheader
2775
2776 prolog_preheader:
2777 PROLOG:
2778 prolog_header_bb:
2779 prolog_body
2780 if (exit_prolog_cond) goto prolog_exit_bb
2781 else goto prolog_header_bb
2782 prolog_exit_bb:
2783
2784 merge_bb_2:
2785
2786 vector_preheader:
2787 VECTOR LOOP:
2788 vector_header_bb:
2789 vector_body
2790 if (exit_vector_cond) goto vector_exit_bb
2791 else goto vector_header_bb
2792 vector_exit_bb:
2793
2794 guard_bb_3:
2795 if (skip_epilog) goto merge_bb_3
2796 else goto epilog_preheader
2797
2798 merge_bb_1:
2799
2800 epilog_preheader:
2801 EPILOG:
2802 epilog_header_bb:
2803 epilog_body
2804 if (exit_epilog_cond) goto merge_bb_3
2805 else goto epilog_header_bb
2806
2807 merge_bb_3:
2808
2809 Note this function peels prolog and epilog only if it's necessary,
2810 as well as guards.
2811 This function returns the epilogue loop if a decision was made to vectorize
2812 it, otherwise NULL.
2813
2814 The analysis resulting in this epilogue loop's loop_vec_info was performed
2815 in the same vect_analyze_loop call as the main loop's. At that time
2816 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2817 vectorization factors than the main loop. This list is stored in the main
2818 loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
2819 vectorize the epilogue loop for a lower vectorization factor, the
2820 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2821 updated and linked to the epilogue loop. This is later used to vectorize
2822 the epilogue. The reason the loop_vec_info needs updating is that it was
2823 constructed based on the original main loop, and the epilogue loop is a
2824 copy of this loop, so all links pointing to statements in the original loop
2825 need updating. Furthermore, these loop_vec_infos share the
2826 data_reference's records, which will also need to be updated.
2827
2828 TODO: Guard for prefer_scalar_loop should be emitted along with
2829 versioning conditions if loop versioning is needed. */
2830
2831
2832class loop *
2833vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2834 tree *niters_vector, tree *step_vector,
2835 tree *niters_vector_mult_vf_var, int th,
2836 bool check_profitability, bool niters_no_overflow,
2837 tree *advance)
2838{
2839 edge e, guard_e;
2840 tree type = TREE_TYPE (niters), guard_cond;
2841 basic_block guard_bb, guard_to;
2842 profile_probability prob_prolog, prob_vector, prob_epilog;
2843 int estimated_vf;
2844 int prolog_peeling = 0;
2845 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2846 /* We currently do not support prolog peeling if the target alignment is not
2847 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2848 target alignment being constant. */
2849 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2850 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2851 return NULL;
2852
2853 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2854 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2855
2856 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2857 poly_uint64 bound_epilog = 0;
2858 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2859 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2860 bound_epilog += vf - 1;
2861 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2862 bound_epilog += 1;
2863 bool epilog_peeling = maybe_ne (a: bound_epilog, b: 0U);
2864 poly_uint64 bound_scalar = bound_epilog;
2865
2866 if (!prolog_peeling && !epilog_peeling)
2867 return NULL;
2868
2869 /* Before doing any peeling make sure to reset debug binds outside of
2870 the loop referring to defs not in LC SSA. */
2871 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2872 for (unsigned i = 0; i < loop->num_nodes; ++i)
2873 {
2874 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2875 imm_use_iterator ui;
2876 gimple *use_stmt;
2877 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi);
2878 gsi_next (i: &gsi))
2879 {
2880 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2881 if (gimple_debug_bind_p (s: use_stmt)
2882 && loop != gimple_bb (g: use_stmt)->loop_father
2883 && !flow_loop_nested_p (loop,
2884 gimple_bb (g: use_stmt)->loop_father))
2885 {
2886 gimple_debug_bind_reset_value (dbg: use_stmt);
2887 update_stmt (s: use_stmt);
2888 }
2889 }
2890 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi);
2891 gsi_next (i: &gsi))
2892 {
2893 ssa_op_iter op_iter;
2894 def_operand_p def_p;
2895 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2896 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2897 if (gimple_debug_bind_p (s: use_stmt)
2898 && loop != gimple_bb (g: use_stmt)->loop_father
2899 && !flow_loop_nested_p (loop,
2900 gimple_bb (g: use_stmt)->loop_father))
2901 {
2902 gimple_debug_bind_reset_value (dbg: use_stmt);
2903 update_stmt (s: use_stmt);
2904 }
2905 }
2906 }
2907
2908 prob_vector = profile_probability::guessed_always ().apply_scale (num: 9, den: 10);
2909 estimated_vf = vect_vf_for_cost (loop_vinfo);
2910 if (estimated_vf == 2)
2911 estimated_vf = 3;
2912 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2913 .apply_scale (num: estimated_vf - 1, den: estimated_vf);
2914
2915 class loop *prolog, *epilog = NULL;
2916 class loop *first_loop = loop;
2917 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2918
2919 /* SSA form needs to be up-to-date since we are going to manually
2920 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
2921 pending SSA-update state after that, so we have to make sure not to lose
2922 any pending update needs. */
2923 gcc_assert (!need_ssa_update_p (cfun));
2924
2925 /* If we're vectorizing an epilogue loop, we have ensured that the
2926 virtual operand is in SSA form throughout the vectorized main loop.
2927 Normally it is possible to trace the updated
2928 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2929 back to scalar-stmt vuses, meaning that the effect of the SSA update
2930 remains local to the main loop. However, there are rare cases in
2931 which the vectorized loop should have vdefs even when the original scalar
2932 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2933 introduces clobbers of the temporary vector array, which in turn
2934 needs new vdefs. If the scalar loop doesn't write to memory, these
2935 new vdefs will be the only ones in the vector loop.
2936 We are currently deferring the update of virtual SSA form and the
2937 creation of a virtual PHI for this case, so we do not have to make sure
2938 the newly introduced virtual def is in LCSSA form. */
2939
2940 if (MAY_HAVE_DEBUG_BIND_STMTS)
2941 {
2942 gcc_assert (!adjust_vec.exists ());
2943 adjust_vec.create (nelems: 32);
2944 }
2945 initialize_original_copy_tables ();
2946
2947 /* Record the anchor bb at which the guard should be placed if the scalar
2948 loop might be preferred. */
2949 basic_block anchor = loop_preheader_edge (loop)->src;
2950
2951 /* Generate the number of iterations for the prolog loop. We do this here
2952 so that we can also get the upper bound on the number of iterations. */
2953 tree niters_prolog;
2954 int bound_prolog = 0;
2955 if (prolog_peeling)
2956 {
2957 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, bb: anchor,
2958 bound: &bound_prolog);
2959 /* If alignment peeling is known, we will always execute the prolog. */
2960 if (TREE_CODE (niters_prolog) == INTEGER_CST)
2961 prob_prolog = profile_probability::always ();
2962 }
2963 else
2964 niters_prolog = build_int_cst (type, 0);
2965
2966 loop_vec_info epilogue_vinfo = NULL;
2967 if (vect_epilogues)
2968 {
2969 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2970 loop_vinfo->epilogue_vinfos.ordered_remove (ix: 0);
2971 }
2972
2973 tree niters_vector_mult_vf = NULL_TREE;
2974 /* Save NITERS before the loop, as this may be changed by the prologue. */
2975 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2976 edge update_e = NULL, skip_e = NULL;
2977 unsigned int lowest_vf = constant_lower_bound (a: vf);
2978 /* Prolog loop may be skipped. */
2979 bool skip_prolog = (prolog_peeling != 0);
2980 /* Skip this loop to the epilog when there are not enough iterations
2981 to enter the vectorized loop.  If true we should perform runtime
2982 checks on NITERS to decide whether to skip the current vectorized
2983 loop.  If we know the number of scalar iterations we only add such
2984 a runtime check when that number may be smaller than the number of
2985 iterations required to enter the vector loop; for this we use the
2986 upper bounds on the prolog and epilog peeling.  When we don't know
2987 the number of iterations and don't require versioning it is because
2988 we have asserted that there are enough scalar iterations to enter
2989 the main loop, so this skip is not necessary.  When we are
2990 versioning then we only add such a skip if we have chosen to
2991 vectorize the epilogue. */
2992 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2993 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2994 b: bound_prolog + bound_epilog)
2995 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2996 || vect_epilogues));
2997 /* Epilog loop must be executed if the number of iterations for epilog
2998 loop is known at compile time, otherwise we need to add a check at
2999 the end of vector loop and skip to the end of epilog loop. */
3000 bool skip_epilog = (prolog_peeling < 0
3001 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3002 || !vf.is_constant ());
3003 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
3004 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3005 skip_epilog = false;
3006
3007 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3008 auto_vec<profile_count> original_counts;
3009 basic_block *original_bbs = NULL;
3010
3011 if (skip_vector)
3012 {
3013 split_edge (loop_preheader_edge (loop));
3014
3015 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3016 {
3017 original_bbs = get_loop_body (loop);
3018 for (unsigned int i = 0; i < loop->num_nodes; i++)
3019 original_counts.safe_push (obj: original_bbs[i]->count);
3020 }
3021
3022 /* Due to the order in which we peel prolog and epilog, we first
3023 propagate probability to the whole loop. The purpose is to
3024 avoid adjusting probabilities of both prolog and vector loops
3025 separately. Note in this case, the probability of epilog loop
3026 needs to be scaled back later. */
3027 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3028 if (prob_vector.initialized_p ())
3029 {
3030 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3031 scale_loop_profile (loop, prob_vector, -1);
3032 }
3033 }
3034
3035 if (vect_epilogues)
3036 {
3037 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3038 use the original scalar loop as remaining epilogue if necessary. */
3039 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3040 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3041 LOOP_VINFO_SCALAR_IV_EXIT (epilogue_vinfo)
3042 = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3043 }
3044
3045 if (prolog_peeling)
3046 {
3047 e = loop_preheader_edge (loop);
3048 edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3049 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e));
3050
3051 /* Peel prolog and put it on preheader edge of loop. */
3052 edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3053 edge prolog_e = NULL;
3054 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, loop_exit: exit_e,
3055 scalar_loop, scalar_exit: scalar_e,
3056 e, new_e: &prolog_e);
3057 gcc_assert (prolog);
3058 prolog->force_vectorize = false;
3059
3060 first_loop = prolog;
3061 reset_original_copy_tables ();
3062
3063 /* Update the number of iterations for prolog loop. */
3064 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3065 vect_set_loop_condition (loop: prolog, loop_e: prolog_e, NULL, niters: niters_prolog,
3066 step: step_prolog, NULL_TREE, niters_maybe_zero: false);
3067
3068 /* Skip the prolog loop. */
3069 if (skip_prolog)
3070 {
3071 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3072 niters_prolog, build_int_cst (type, 0));
3073 guard_bb = loop_preheader_edge (prolog)->src;
3074 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3075 guard_to = split_edge (loop_preheader_edge (loop));
3076 guard_e = slpeel_add_loop_guard (guard_bb, cond: guard_cond,
3077 guard_to, dom_bb: guard_bb,
3078 probability: prob_prolog.invert (),
3079 irreducible_p: irred_flag);
3080 e = EDGE_PRED (guard_to, 0);
3081 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3082 slpeel_update_phi_nodes_for_guard1 (skip_loop: prolog, update_loop: loop, guard_edge: guard_e, merge_edge: e);
3083
3084 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3085 scale_loop_profile (prolog, prob_prolog, bound_prolog - 1);
3086 }
3087
3088 /* Update init address of DRs. */
3089 vect_update_inits_of_drs (loop_vinfo, niters: niters_prolog, code: PLUS_EXPR);
3090 /* Update niters for vector loop. */
3091 LOOP_VINFO_NITERS (loop_vinfo)
3092 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3093 LOOP_VINFO_NITERSM1 (loop_vinfo)
3094 = fold_build2 (MINUS_EXPR, type,
3095 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3096 bool new_var_p = false;
3097 niters = vect_build_loop_niters (loop_vinfo, new_var_p: &new_var_p);
3098 /* It's guaranteed that vector loop bound before vectorization is at
3099 least VF, so set range information for newly generated var. */
3100 if (new_var_p)
3101 {
3102 value_range vr (type,
3103 wi::to_wide (t: build_int_cst (type, lowest_vf)),
3104 wi::to_wide (TYPE_MAX_VALUE (type)));
3105 set_range_info (niters, vr);
3106 }
3107
3108 /* Prolog iterates at most bound_prolog times, latch iterates at
3109 most bound_prolog - 1 times. */
3110 record_niter_bound (prolog, bound_prolog - 1, false, true);
3111 delete_update_ssa ();
3112 adjust_vec_debug_stmts ();
3113 scev_reset ();
3114 }
3115 basic_block bb_before_epilog = NULL;
3116
3117 if (epilog_peeling)
3118 {
3119 e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3120 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3121
3122 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3123 said epilog then we should use a copy of the main loop as a starting
3124 point. This loop may have already had some preliminary transformations
3125 to allow for more optimal vectorization, for example if-conversion.
3126 If we are not vectorizing the epilog then we should use the scalar loop
3127 as the transformations mentioned above make less or no sense when not
3128 vectorizing. */
3129 edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3130 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3131 edge epilog_e = vect_epilogues ? e : scalar_e;
3132 edge new_epilog_e = NULL;
3133      epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog,
3134						       epilog_e, e,
3135						       &new_epilog_e);
3136 LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
3137 gcc_assert (epilog);
3138 epilog->force_vectorize = false;
3139 bb_before_epilog = loop_preheader_edge (epilog)->src;
3140
3141      /* The scalar version of the loop may be preferred.  In this case, add
3142	 a guard and skip to the epilog.  Note this only happens when the
3143	 number of iterations of the loop is unknown at compile time;
3144	 otherwise the loop would not have been vectorized.  */
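      /* Concretely, the guard built below branches to the preheader of the
	 epilogue (scalar) loop when NITERSM1 is below the scalar-loop
	 threshold computed by vect_gen_scalar_loop_niters, bypassing the
	 vector loop entirely.  */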
3145 if (skip_vector)
3146 {
3147	  /* An additional epilogue iteration is peeled if a gap exists.  */
3148	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3149						bound_prolog, bound_epilog,
3150						th, &bound_scalar,
3151						check_profitability);
3152 /* Build guard against NITERSM1 since NITERS may overflow. */
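	  /* (If the original loop runs exactly TYPE_MAX + 1 times, NITERS
	     wraps around to zero while NITERSM1 is still TYPE_MAX, so the
	     comparison below stays correct.)  */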
3153 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3154 guard_bb = anchor;
3155 guard_to = split_edge (loop_preheader_edge (epilog));
3156	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3157					   guard_to, guard_bb,
3158					   prob_vector.invert (),
3159					   irred_flag);
3160 skip_e = guard_e;
3161 e = EDGE_PRED (guard_to, 0);
3162 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3163	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3164
3165 /* Simply propagate profile info from guard_bb to guard_to which is
3166 a merge point of control flow. */
3167 profile_count old_count = guard_to->count;
3168 guard_to->count = guard_bb->count;
3169
3170 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3171 if (vect_epilogues || scalar_loop == NULL)
3172 {
3173	      gcc_assert (epilog->num_nodes == loop->num_nodes);
3174	      basic_block *bbs = get_loop_body (epilog);
3175	      for (unsigned int i = 0; i < epilog->num_nodes; i++)
3176		{
3177		  gcc_assert (get_bb_original (bbs[i]) == original_bbs[i]);
3178		  bbs[i]->count = original_counts[i];
3179		}
3180	      free (bbs);
3181	      free (original_bbs);
3182 }
3183 else if (old_count.nonzero_p ())
3184	    scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3185
3186 /* Only need to handle basic block before epilog loop if it's not
3187 the guard_bb, which is the case when skip_vector is true. */
3188 if (guard_bb != bb_before_epilog)
3189	    bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3190 bb_before_epilog = loop_preheader_edge (epilog)->src;
3191 }
3192      /* If the loop is peeled a non-zero constant number of times, niters now
3193	 refers to orig_niters - prolog_peeling; it cannot overflow even if
3194	 orig_niters overflows.  */
3195 niters_no_overflow |= (prolog_peeling > 0);
3196 vect_gen_vector_loop_niters (loop_vinfo, niters,
3197				   niters_vector, step_vector,
3198 niters_no_overflow);
3199 if (!integer_onep (*step_vector))
3200 {
3201	  /* On exit from the loop we will have an easy way of calculating
3202 NITERS_VECTOR / STEP * STEP. Install a dummy definition
3203 until then. */
3204 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3205 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3206 *niters_vector_mult_vf_var = niters_vector_mult_vf;
3207 }
3208 else
3209	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3210					     &niters_vector_mult_vf);
3211 /* Update IVs of original loop as if they were advanced by
3212 niters_vector_mult_vf steps. */
3213 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3214 update_e = skip_vector ? e : loop_preheader_edge (epilog);
3215      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3216 update_e);
3217
3218 if (skip_epilog)
3219 {
3220 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3221 niters, niters_vector_mult_vf);
3222 guard_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
3223 edge epilog_e = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3224 guard_to = epilog_e->dest;
3225	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3226					   skip_vector ? anchor : guard_bb,
3227					   prob_epilog.invert (),
3228					   irred_flag);
3229 if (vect_epilogues)
3230 epilogue_vinfo->skip_this_loop_edge = guard_e;
3231 edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
3232 gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3233 for (gphi_iterator gsi = gsi_start_phis (guard_to);
3234	       !gsi_end_p (gsi); gsi_next (&gsi))
3235 {
3236	      /* We expect all the PHIs we have on epilog_e to also be
3237		 on the main loop exit.  But sometimes a stray virtual
3238		 definition can appear at epilog_e which we can then
3239		 take as the same on all exits; we removed the LC SSA
3240		 PHI on the main exit beforehand so we would not need
3241		 to create a loop PHI for it.  */
3242	      if (virtual_operand_p (gimple_phi_result (*gsi))
3243		  && (gsi_end_p (gsi2)
3244		      || !virtual_operand_p (gimple_phi_result (*gsi2))))
3245		add_phi_arg (*gsi,
3246			     gimple_phi_arg_def_from_edge (*gsi, epilog_e),
3247 guard_e, UNKNOWN_LOCATION);
3248 else
3249 {
3250		  add_phi_arg (*gsi, gimple_phi_result (*gsi2), guard_e,
3251 UNKNOWN_LOCATION);
3252		  gsi_next (&gsi2);
3253 }
3254 }
3255
3256 /* Only need to handle basic block before epilog loop if it's not
3257 the guard_bb, which is the case when skip_vector is true. */
3258 if (guard_bb != bb_before_epilog)
3259 {
3260 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3261
3262 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3263 }
3264 scale_loop_profile (epilog, prob_epilog, -1);
3265 }
3266
3267 unsigned HOST_WIDE_INT bound;
3268      if (bound_scalar.is_constant (&bound))
3269 {
3270 gcc_assert (bound != 0);
3271 /* -1 to convert loop iterations to latch iterations. */
3272 record_niter_bound (epilog, bound - 1, false, true);
3273 scale_loop_profile (epilog, profile_probability::always (),
3274 bound - 1);
3275 }
3276
3277 delete_update_ssa ();
3278 adjust_vec_debug_stmts ();
3279 scev_reset ();
3280 }
3281
3282 if (vect_epilogues)
3283 {
3284 epilog->aux = epilogue_vinfo;
3285 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3286 LOOP_VINFO_IV_EXIT (epilogue_vinfo)
3287 = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3288
3289      loop_constraint_clear (epilog, LOOP_C_INFINITE);
3290
3291      /* We now must calculate NITERS, the iterations done by the previous
3292	 loop, and EPILOGUE_NITERS, those to be done by the epilogue.  */
3293 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3294 niters_prolog, niters_vector_mult_vf);
3295
3296      /* If skip_vector, the previous loop may have been skipped; insert a
3297	 PHI node to determine whether we are coming from the previous
3298	 vectorized loop via the update_e edge or from the skip_vector basic
3299	 block via the skip_e edge.  */
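      /* The PHI built below is effectively
	   advance = PHI <niters_prolog + niters_vector_mult_vf (update_e),
			  0 (skip_e)>
	 so ADVANCE is zero whenever the main vector loop was skipped.  */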
3300 if (skip_vector)
3301 {
3302 gcc_assert (update_e != NULL && skip_e != NULL);
3303 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3304 update_e->dest);
3305 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3306 gimple *stmt = gimple_build_assign (new_ssa, niters);
3307 gimple_stmt_iterator gsi;
3308 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3309 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3310 {
3311 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3312 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3313 }
3314 else
3315 {
3316	      gsi = gsi_last_bb (update_e->src);
3317 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3318 }
3319
3320 niters = new_ssa;
3321 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3322 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3323 UNKNOWN_LOCATION);
3324 niters = PHI_RESULT (new_phi);
3325 epilogue_vinfo->main_loop_edge = update_e;
3326 epilogue_vinfo->skip_main_loop_edge = skip_e;
3327 }
3328
3329 /* Set ADVANCE to the number of iterations performed by the previous
3330 loop and its prologue. */
3331 *advance = niters;
3332
3333 /* Subtract the number of iterations performed by the vectorized loop
3334 from the number of total iterations. */
3335 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3336 before_loop_niters,
3337 niters);
3338
3339 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3340 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3341 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3342 epilogue_niters,
3343 build_one_cst (TREE_TYPE (epilogue_niters)));
3344
3345 /* Decide what to do if the number of epilogue iterations is not
3346 a multiple of the epilogue loop's vectorization factor.
3347 We should have rejected the loop during the analysis phase
3348 if this fails. */
3349 bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo);
3350 gcc_assert (res);
3351 }
3352
3353 adjust_vec.release ();
3354 free_original_copy_tables ();
3355
3356 return vect_epilogues ? epilog : NULL;
3357}
3358
3359/* Function vect_create_cond_for_niters_checks.
3360
3361   Create a conditional expression that represents the run-time checks for
3362   the loop's niter assumptions.  The loop is guaranteed to terminate if
3363   the run-time checks hold.
3364
3365   Input:
3366   COND_EXPR  - input conditional expression.  New conditions will be chained
3367		with logical AND operation.  A null COND_EXPR is treated
3368		as "true".
3369   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
3370		to be checked.
3371
3372 Output:
3373 COND_EXPR - conditional expression.
3374
3375 The returned COND_EXPR is the conditional expression to be used in the
3376 if statement that controls which version of the loop gets executed at
3377 runtime. */
3378
3379static void
3380vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3381{
3382 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3383
3384 if (*cond_expr)
3385 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3386 *cond_expr, part_cond_expr);
3387 else
3388 *cond_expr = part_cond_expr;
3389}
3390
3391/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3392 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3393
3394static void
3395chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3396{
3397 if (*cond_expr)
3398 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3399 *cond_expr, part_cond_expr);
3400 else
3401 *cond_expr = part_cond_expr;
3402}
3403
3404/* Function vect_create_cond_for_align_checks.
3405
3406 Create a conditional expression that represents the alignment checks for
3407   all of the data references (array element references) whose alignment must be
3408 checked at runtime.
3409
3410 Input:
3411 COND_EXPR - input conditional expression. New conditions will be chained
3412 with logical AND operation.
3413 LOOP_VINFO - two fields of the loop information are used.
3414 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3415 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3416
3417 Output:
3418 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3419 expression.
3420 The returned value is the conditional expression to be used in the if
3421 statement that controls which version of the loop gets executed at runtime.
3422
3423 The algorithm makes two assumptions:
3424 1) The number of bytes "n" in a vector is a power of 2.
3425   2) An address "a" is aligned if a%n is zero, and this
3426 test can be done as a&(n-1) == 0. For example, for 16
3427 byte vectors the test is a&0xf == 0. */
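/* For example (illustrative only), with 16-byte vectors and two data
   references the sequence built below is equivalent to:

     addr2int0 = (signed ptr-sized int) address_of_first_vector_of_dr_1;
     addr2int1 = (signed ptr-sized int) address_of_first_vector_of_dr_2;
     orptrs1 = addr2int0 | addr2int1;
     andmask = orptrs1 & 0xf;
     cond = (andmask == 0);   true iff both addresses are 16-byte aligned

   using the temporary names generated by the code below.  */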
3428
3429static void
3430vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3431 tree *cond_expr,
3432 gimple_seq *cond_expr_stmt_list)
3433{
3434 const vec<stmt_vec_info> &may_misalign_stmts
3435 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3436 stmt_vec_info stmt_info;
3437 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3438 tree mask_cst;
3439 unsigned int i;
3440 tree int_ptrsize_type;
3441 char tmp_name[20];
3442 tree or_tmp_name = NULL_TREE;
3443 tree and_tmp_name;
3444 gimple *and_stmt;
3445 tree ptrsize_zero;
3446 tree part_cond_expr;
3447
3448 /* Check that mask is one less than a power of 2, i.e., mask is
3449 all zeros followed by all ones. */
3450 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3451
3452 int_ptrsize_type = signed_type_for (ptr_type_node);
3453
3454  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3455     of the first vector of the i'th data reference.  */
3456
3457 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3458 {
3459 gimple_seq new_stmt_list = NULL;
3460 tree addr_base;
3461 tree addr_tmp_name;
3462 tree new_or_tmp_name;
3463 gimple *addr_stmt, *or_stmt;
3464 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3465 bool negative = tree_int_cst_compare
3466 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3467 tree offset = negative
3468 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3469 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3470 : size_zero_node;
3471
3472 /* create: addr_tmp = (int)(address_of_first_vector) */
3473 addr_base =
3474 vect_create_addr_base_for_vector_ref (loop_vinfo,
3475 stmt_info, &new_stmt_list,
3476 offset);
3477 if (new_stmt_list != NULL)
3478 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3479
3480      sprintf (tmp_name, "addr2int%d", i);
3481      addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3482 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3483 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3484
3485      /* The addresses are ORed together.  */
3486
3487 if (or_tmp_name != NULL_TREE)
3488 {
3489 /* create: or_tmp = or_tmp | addr_tmp */
3490	  sprintf (tmp_name, "orptrs%d", i);
3491	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3492 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3493 or_tmp_name, addr_tmp_name);
3494 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3495 or_tmp_name = new_or_tmp_name;
3496 }
3497 else
3498 or_tmp_name = addr_tmp_name;
3499
3500 } /* end for i */
3501
3502 mask_cst = build_int_cst (int_ptrsize_type, mask);
3503
3504 /* create: and_tmp = or_tmp & mask */
3505  and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3506
3507 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3508 or_tmp_name, mask_cst);
3509 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3510
3511 /* Make and_tmp the left operand of the conditional test against zero.
3512     If and_tmp has a nonzero bit then some address is unaligned.  */
3513 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3514 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3515 and_tmp_name, ptrsize_zero);
3516 chain_cond_expr (cond_expr, part_cond_expr);
3517}
3518
3519/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3520 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3521 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3522 and this new condition are true. Treat a null *COND_EXPR as "true". */
3523
3524static void
3525vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3526{
3527 const vec<vec_object_pair> &pairs
3528 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3529 unsigned int i;
3530 vec_object_pair *pair;
3531 FOR_EACH_VEC_ELT (pairs, i, pair)
3532 {
3533 tree addr1 = build_fold_addr_expr (pair->first);
3534 tree addr2 = build_fold_addr_expr (pair->second);
3535 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3536 addr1, addr2);
3537 chain_cond_expr (cond_expr, part_cond_expr);
3538 }
3539}
3540
3541/* Create an expression that is true when all lower-bound conditions for
3542 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3543
3544static void
3545vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3546{
3547 const vec<vec_lower_bound> &lower_bounds
3548 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3549 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3550 {
3551 tree expr = lower_bounds[i].expr;
3552 tree type = unsigned_type_for (TREE_TYPE (expr));
3553 expr = fold_convert (type, expr);
3554 poly_uint64 bound = lower_bounds[i].min_value;
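      /* When the bound applies to a signed quantity, the code below checks
	 |EXPR| >= BOUND using unsigned wrap-around arithmetic:
	 (unsigned) EXPR + (BOUND - 1) >= 2 * BOUND - 1 holds exactly when
	 EXPR <= -BOUND or EXPR >= BOUND.  For example, with BOUND = 4 in an
	 8-bit type, EXPR = -4 gives 252 + 3 = 255 >= 7 (passes) while
	 EXPR = -3 gives 253 + 3 = 0 after wrapping (fails).  */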
3555 if (!lower_bounds[i].unsigned_p)
3556 {
3557 expr = fold_build2 (PLUS_EXPR, type, expr,
3558 build_int_cstu (type, bound - 1));
3559 bound += bound - 1;
3560 }
3561 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3562 build_int_cstu (type, bound));
3563 chain_cond_expr (cond_expr, part_cond_expr);
3564 }
3565}
3566
3567/* Function vect_create_cond_for_alias_checks.
3568
3569 Create a conditional expression that represents the run-time checks for
3570 overlapping of address ranges represented by a list of data references
3571 relations passed as input.
3572
3573 Input:
3574 COND_EXPR - input conditional expression. New conditions will be chained
3575 with logical AND operation. If it is NULL, then the function
3576 is used to return the number of alias checks.
3577 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3578 to be checked.
3579
3580 Output:
3581 COND_EXPR - conditional expression.
3582
3583 The returned COND_EXPR is the conditional expression to be used in the if
3584 statement that controls which version of the loop gets executed at runtime.
3585*/
3586
3587void
3588vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3589{
3590 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3591 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3592
3593 if (comp_alias_ddrs.is_empty ())
3594 return;
3595
3596 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3597 &comp_alias_ddrs, cond_expr);
3598 if (dump_enabled_p ())
3599 dump_printf_loc (MSG_NOTE, vect_location,
3600 "created %u versioning for alias checks.\n",
3601 comp_alias_ddrs.length ());
3602}
3603
3604
3605/* Function vect_loop_versioning.
3606
3607   If the loop has data references that may or may not be aligned and/or
3608 has data reference relations whose independence was not proven then
3609 two versions of the loop need to be generated, one which is vectorized
3610 and one which isn't. A test is then generated to control which of the
3611 loops is executed. The test checks for the alignment of all of the
3612 data references that may or may not be aligned. An additional
3613   sequence of runtime tests is generated for each pair of DDRs whose
3614   independence was not proven.  The vectorized version of the loop is
3615 executed only if both alias and alignment tests are passed.
3616
3617 The test generated to check which version of loop is executed
3618 is modified to also check for profitability as indicated by the
3619 cost model threshold TH.
3620
3621 The versioning precondition(s) are placed in *COND_EXPR and
3622 *COND_EXPR_STMT_LIST. */
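/* Schematically (illustration only), the generated structure is:

     if (cost-model check && niter/alignment/alias/simd checks)
       vectorized loop version
     else
       scalar loop version

   with the cost-model check possibly split off into its own basic block,
   see below.  */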
3623
3624class loop *
3625vect_loop_versioning (loop_vec_info loop_vinfo,
3626 gimple *loop_vectorized_call)
3627{
3628 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3629 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3630 basic_block condition_bb;
3631 gphi_iterator gsi;
3632 gimple_stmt_iterator cond_exp_gsi;
3633 basic_block merge_bb;
3634 basic_block new_exit_bb;
3635 edge new_exit_e, e;
3636 gphi *orig_phi, *new_phi;
3637 tree cond_expr = NULL_TREE;
3638 gimple_seq cond_expr_stmt_list = NULL;
3639 tree arg;
3640 profile_probability prob = profile_probability::likely ();
3641 gimple_seq gimplify_stmt_list = NULL;
3642 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3643 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3644 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3645 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3646 poly_uint64 versioning_threshold
3647 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3648 tree version_simd_if_cond
3649 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3650 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3651
3652 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3653      && !ordered_p (th, versioning_threshold))
3654 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3655 build_int_cst (TREE_TYPE (scalar_loop_iters),
3656 th - 1));
3657  if (maybe_ne (versioning_threshold, 0U))
3658 {
3659 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3660 build_int_cst (TREE_TYPE (scalar_loop_iters),
3661 versioning_threshold - 1));
3662 if (cond_expr)
3663 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3664 expr, cond_expr);
3665 else
3666 cond_expr = expr;
3667 }
3668
3669 tree cost_name = NULL_TREE;
3670 profile_probability prob2 = profile_probability::always ();
3671 if (cond_expr
3672 && EXPR_P (cond_expr)
3673 && (version_niter
3674 || version_align
3675 || version_alias
3676 || version_simd_if_cond))
3677 {
3678 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3679 &cond_expr_stmt_list,
3680 is_gimple_val, NULL_TREE);
3681 /* Split prob () into two so that the overall probability of passing
3682 both the cost-model and versioning checks is the orig prob. */
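      /* For instance, if the combined probability should be 0.81, each of
	 the two guards gets sqrt (0.81) = 0.9, and 0.9 * 0.9 = 0.81 again.  */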
3683 prob2 = prob = prob.sqrt ();
3684 }
3685
3686 if (version_niter)
3687    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3688
3689 if (cond_expr)
3690 {
3691 gimple_seq tem = NULL;
3692 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3693 &tem, is_gimple_condexpr_for_cond,
3694 NULL_TREE);
3695 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3696 }
3697
3698 if (version_align)
3699    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3700				       &cond_expr_stmt_list);
3701
3702 if (version_alias)
3703 {
3704      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3705      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3706      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3707 }
3708
3709 if (version_simd_if_cond)
3710 {
3711 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3712 if (flag_checking)
3713 if (basic_block bb
3714 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3715 gcc_assert (bb != loop->header
3716 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3717 && (scalar_loop == NULL
3718 || (bb != scalar_loop->header
3719 && dominated_by_p (CDI_DOMINATORS,
3720 scalar_loop->header, bb))));
3721 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3722 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3723 version_simd_if_cond, zero);
3724 if (cond_expr)
3725 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3726 c, cond_expr);
3727 else
3728 cond_expr = c;
3729 if (dump_enabled_p ())
3730 dump_printf_loc (MSG_NOTE, vect_location,
3731 "created versioning for simd if condition check.\n");
3732 }
3733
3734 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3735 &gimplify_stmt_list,
3736 is_gimple_condexpr_for_cond, NULL_TREE);
3737 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3738
3739 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3740 invariant in. */
3741 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3742  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3743       !gsi_end_p (gsi); gsi_next (&gsi))
3744    {
3745      gimple *stmt = gsi_stmt (gsi);
3746      update_stmt (stmt);
3747 ssa_op_iter iter;
3748 use_operand_p use_p;
3749 basic_block def_bb;
3750 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3751 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3752 && flow_bb_inside_loop_p (outermost, def_bb))
3753 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3754 }
3755
3756 /* Search for the outermost loop we can version. Avoid versioning of
3757 non-perfect nests but allow if-conversion versioned loops inside. */
3758 class loop *loop_to_version = loop;
3759 if (flow_loop_nested_p (outermost, loop))
3760 {
3761 if (dump_enabled_p ())
3762 dump_printf_loc (MSG_NOTE, vect_location,
3763 "trying to apply versioning to outer loop %d\n",
3764 outermost->num);
3765 if (outermost->num == 0)
3766 outermost = superloop_at_depth (loop, 1);
3767 /* And avoid applying versioning on non-perfect nests. */
3768 while (loop_to_version != outermost
3769	     && (e = single_exit (loop_outer (loop_to_version)))
3770	     && !(e->flags & EDGE_COMPLEX)
3771	     && (!loop_outer (loop_to_version)->inner->next
3772		 || vect_loop_vectorized_call (loop_to_version))
3773	     && (!loop_outer (loop_to_version)->inner->next
3774		 || !loop_outer (loop_to_version)->inner->next->next))
3775	loop_to_version = loop_outer (loop_to_version);
3776 }
3777
3778 /* Apply versioning. If there is already a scalar version created by
3779 if-conversion re-use that. Note we cannot re-use the copy of
3780 an if-converted outer-loop when vectorizing the inner loop only. */
3781 gcond *cond;
3782 if ((!loop_to_version->inner || loop == loop_to_version)
3783 && loop_vectorized_call)
3784 {
3785 gcc_assert (scalar_loop);
3786      condition_bb = gimple_bb (loop_vectorized_call);
3787      cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
3788      gimple_cond_set_condition_from_tree (cond, cond_expr);
3789      update_stmt (cond);
3790
3791 if (cond_expr_stmt_list)
3792 {
3793 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3794 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3795 GSI_SAME_STMT);
3796 }
3797
3798 /* if-conversion uses profile_probability::always () for both paths,
3799 reset the paths probabilities appropriately. */
3800 edge te, fe;
3801 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3802 te->probability = prob;
3803 fe->probability = prob.invert ();
3804      /* We can scale loop counts immediately but have to postpone
3805	 scaling the scalar loop because we re-use it during peeling.
3806
3807	 Ifcvt duplicates the loop preheader, the loop body and produces a
3808	 basic block after the loop exit.  We need to scale all of that.  */
3809 basic_block preheader = loop_preheader_edge (loop_to_version)->src;
3810      preheader->count = preheader->count.apply_probability (prob * prob2);
3811 scale_loop_frequencies (loop_to_version, prob * prob2);
3812 single_exit (loop_to_version)->dest->count = preheader->count;
3813 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();
3814
3815 nloop = scalar_loop;
3816 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_NOTE, vect_location,
3818 "reusing %sloop version created by if conversion\n",
3819 loop_to_version != loop ? "outer " : "");
3820 }
3821 else
3822 {
3823 if (loop_to_version != loop
3824 && dump_enabled_p ())
3825 dump_printf_loc (MSG_NOTE, vect_location,
3826 "applying loop versioning to outer loop %d\n",
3827 loop_to_version->num);
3828
3829 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3830
3831 initialize_original_copy_tables ();
3832 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3833 prob * prob2, (prob * prob2).invert (),
3834 prob * prob2, (prob * prob2).invert (),
3835 true);
3836      /* We will later insert a second conditional so the overall outcome of
3837	 both is prob * prob2.  */
3838 edge true_e, false_e;
3839 extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
3840 true_e->probability = prob;
3841 false_e->probability = prob.invert ();
3842 gcc_assert (nloop);
3843 nloop = get_loop_copy (loop);
3844
3845 /* For cycle vectorization with SLP we rely on the PHI arguments
3846 appearing in the same order as the SLP node operands which for the
3847 loop PHI nodes means the preheader edge dest index needs to remain
3848 the same for the analyzed loop which also becomes the vectorized one.
3849 Make it so in case the state after versioning differs by redirecting
3850 the first edge into the header to the same destination which moves
3851 it last. */
3852 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3853 {
3854 edge e = EDGE_PRED (loop->header, 0);
3855 ssa_redirect_edge (e, e->dest);
3856 flush_pending_stmts (e);
3857 }
3858 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3859
3860 /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3861 reap those otherwise; they also refer to the original
3862 loops. */
3863 class loop *l = loop;
3864 while (gimple *call = vect_loop_vectorized_call (l))
3865 {
3866 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3867 fold_loop_internal_call (call, boolean_false_node);
3868	  l = loop_outer (l);
3869 }
3870 free_original_copy_tables ();
3871
3872 if (cond_expr_stmt_list)
3873 {
3874	  cond_exp_gsi = gsi_last_bb (condition_bb);
3875 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3876 GSI_SAME_STMT);
3877 }
3878
3879 /* Loop versioning violates an assumption we try to maintain during
3880 vectorization - that the loop exit block has a single predecessor.
3881 After versioning, the exit block of both loop versions is the same
3882 basic block (i.e. it has two predecessors). Just in order to simplify
3883 following transformations in the vectorizer, we fix this situation
3884 here by adding a new (empty) block on the exit-edge of the loop,
3885 with the proper loop-exit phis to maintain loop-closed-form.
3886	 If loop versioning wasn't done from loop, but scalar_loop instead,
3887	 merge_bb will already have just a single predecessor.  */
3888
3889 merge_bb = single_exit (loop_to_version)->dest;
3890 if (EDGE_COUNT (merge_bb->preds) >= 2)
3891 {
3892 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3893 new_exit_bb = split_edge (single_exit (loop_to_version));
3894 new_exit_e = single_exit (loop_to_version);
3895 e = EDGE_SUCC (new_exit_bb, 0);
3896
3897	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3898	       gsi_next (&gsi))
3899 {
3900 tree new_res;
3901 orig_phi = gsi.phi ();
3902 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3903 new_phi = create_phi_node (new_res, new_exit_bb);
3904 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3905 add_phi_arg (new_phi, arg, new_exit_e,
3906			   gimple_phi_arg_location_from_edge (orig_phi, e));
3907	      adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3908 }
3909 }
3910
3911 update_ssa (TODO_update_ssa_no_phi);
3912 }
3913
3914 /* Split the cost model check off to a separate BB. Costing assumes
3915 this is the only thing we perform when we enter the scalar loop
3916 from a failed cost decision. */
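  /* Roughly: the block computing COST_NAME is split after its definition,
     a new conditional "if (cost_name != false)" terminates that block, and
     its false edge is wired straight to the scalar-loop destination of the
     versioning condition, so a failed cost check reaches the scalar loop
     without executing the now-unnecessary versioning checks.  */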
3917 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3918 {
3919 gimple *def = SSA_NAME_DEF_STMT (cost_name);
3920 gcc_assert (gimple_bb (def) == condition_bb);
3921 /* All uses of the cost check are 'true' after the check we
3922 are going to insert. */
3923 replace_uses_by (cost_name, boolean_true_node);
3924 /* And we're going to build the new single use of it. */
3925 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3926 NULL_TREE, NULL_TREE);
3927      edge e = split_block (gimple_bb (def), def);
3928 gimple_stmt_iterator gsi = gsi_for_stmt (def);
3929 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3930 edge true_e, false_e;
3931 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3932 e->flags &= ~EDGE_FALLTHRU;
3933 e->flags |= EDGE_TRUE_VALUE;
3934 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3935 e->probability = prob2;
3936 e2->probability = prob2.invert ();
3937 e->dest->count = e->count ();
3938 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3939 auto_vec<basic_block, 3> adj;
3940 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3941 son;
3942 son = next_dom_son (CDI_DOMINATORS, son))
3943 if (EDGE_COUNT (son->preds) > 1)
3944	  adj.safe_push (son);
3945 for (auto son : adj)
3946 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
3947 //debug_bb (condition_bb);
3948 //debug_bb (e->src);
3949 }
3950
3951 if (version_niter)
3952 {
3953 /* The versioned loop could be infinite, we need to clear existing
3954 niter information which is copied from the original loop. */
3955 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3956 vect_free_loop_info_assumptions (nloop);
3957 }
3958
3959 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3960 && dump_enabled_p ())
3961 {
3962 if (version_alias)
3963 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3964 vect_location,
3965 "loop versioned for vectorization because of "
3966 "possible aliasing\n");
3967 if (version_align)
3968 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3969 vect_location,
3970 "loop versioned for vectorization to enhance "
3971 "alignment\n");
3972
3973 }
3974
3975 return nloop;
3976}
3977
