1/* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "rtl.h"
28#include "tree.h"
29#include "gimple.h"
30#include "predict.h"
31#include "memmodel.h"
32#include "tm_p.h"
33#include "ssa.h"
34#include "optabs-tree.h"
35#include "cgraph.h"
36#include "dumpfile.h"
37#include "alias.h"
38#include "fold-const.h"
39#include "stor-layout.h"
40#include "tree-eh.h"
41#include "gimplify.h"
42#include "gimple-iterator.h"
43#include "gimplify-me.h"
44#include "tree-ssa-loop-ivopts.h"
45#include "tree-ssa-loop-manip.h"
46#include "tree-ssa-loop.h"
47#include "cfgloop.h"
48#include "tree-scalar-evolution.h"
49#include "tree-vectorizer.h"
50#include "expr.h"
51#include "builtins.h"
52#include "tree-cfg.h"
53#include "tree-hash-traits.h"
54#include "vec-perm-indices.h"
55#include "internal-fn.h"
56#include "gimple-fold.h"
57
58/* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
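/* (Illustration: for COUNT == 4 this asks whether the target can load or
   store four vectors of type VECTYPE as a single operation on a
   4 * VECTYPE array, which is how an interleaved group of four accesses
   is implemented when load/store-lanes are available.)  */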
60
61static bool
62vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64{
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98}
99
100/* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
103
104static cgraph_node*
105simd_clone_call_p (gimple *stmt)
106{
  gcall *call = dyn_cast <gcall *> (stmt);
  if (!call)
    return NULL;

  tree fndecl = NULL_TREE;
  if (gimple_call_internal_p (call, IFN_MASK_CALL))
    fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
  else
    fndecl = gimple_call_fndecl (stmt);

  if (fndecl == NULL_TREE)
    return NULL;

  cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
123
124 return NULL;
125}
126
127
128
129/* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
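/* For example (a sketch, not tied to any particular testcase), in

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the smallest scalar type is "short" even though every lhs in the loop
   is "int", and it is the short elements that determine the VF.  */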
145
146tree
147vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
148{
149 HOST_WIDE_INT lhs, rhs;
150
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
155
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
157
  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
160 {
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
      if (gimple_assign_cast_p (assign)
	  || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
168 {
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
170
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
174 }
175 }
  else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
177 {
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
180 {
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
182 {
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
186 {
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
189 }
190 }
191 }
192 }
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
	{
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (internal_load_fn_p (ifn))
	    /* For loads the LHS type does the trick.  */
	    i = ~0U;
	  else if (internal_store_fn_p (ifn))
	    {
	      /* For stores use the type of the stored value.  */
	      i = internal_fn_stored_value_index (ifn);
	      scalar_type = TREE_TYPE (gimple_call_arg (call, i));
	      i = ~0U;
	    }
	  else if (internal_fn_mask_index (ifn) == 0)
	    i = 1;
	}
      if (i < gimple_call_num_args (call))
213 {
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
216 {
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
220 }
221 }
222 }
223
224 return scalar_type;
225}
226
227
228/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
231
232static opt_result
233vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
234{
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
    return opt_result::failure_at (vect_location,
				   "will not create alias checks, as"
				   " --param vect-max-version-for-alias-checks"
				   " == 0\n");
242
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
248
  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
251}
252
253/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
254
255static void
256vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
257{
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
262
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
268}
269
270/* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
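/* (E.g. the vectorized store of a group is emitted at the position of the
   last scalar store in the group, while the vectorized load of a group is
   emitted at the position of its first scalar load; this function checks
   that the re-positioning does not swap a read and a write that the
   scalar code ordered the other way.)  */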
273
274static bool
275vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
276{
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
279
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
284
285 /* STMT_A and STMT_B belong to overlapping groups. All loads are
286 emitted at the position of the first scalar load.
287 Stores in a group are emitted at the position of the last scalar store.
288 Compute that position and check whether the resulting order matches
289 the current one. */
290 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
291 if (il_a)
292 {
293 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
294 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
295 s = DR_GROUP_NEXT_ELEMENT (s))
	  il_a = get_later_stmt (il_a, s);
297 else /* DR_IS_READ */
298 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
299 s = DR_GROUP_NEXT_ELEMENT (s))
	  if (get_later_stmt (il_a, s) == il_a)
301 il_a = s;
302 }
303 else
304 il_a = stmtinfo_a;
305 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
306 if (il_b)
307 {
308 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
309 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
310 s = DR_GROUP_NEXT_ELEMENT (s))
	  il_b = get_later_stmt (il_b, s);
312 else /* DR_IS_READ */
313 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
314 s = DR_GROUP_NEXT_ELEMENT (s))
	  if (get_later_stmt (il_b, s) == il_b)
316 il_b = s;
317 }
318 else
319 il_b = stmtinfo_b;
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
322}
323
324/* A subroutine of vect_analyze_data_ref_dependence. Handle
325 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
326 distances. These distances are conservatively correct but they don't
327 reflect a guaranteed dependence.
328
329 Return true if this function does all the work necessary to avoid
330 an alias or false if the caller should use the dependence distances
331 to limit the vectorization factor in the usual way. LOOP_DEPTH is
332 the depth of the loop described by LOOP_VINFO and the other arguments
333 are as for vect_analyze_data_ref_dependence. */
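/* (Such DDRs arise e.g. for two accesses whose bases may or may not be the
   same object: the recorded distances describe the dependence under the
   assumption that the bases overlap, so a runtime alias check can still
   prove the accesses independent.)  */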
334
335static bool
336vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
337 loop_vec_info loop_vinfo,
338 int loop_depth, unsigned int *max_vf)
339{
340 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
341 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
342 {
343 int dist = dist_v[loop_depth];
344 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
345 {
346 /* If the user asserted safelen >= DIST consecutive iterations
347 can be executed concurrently, assume independence.
348
349 ??? An alternative would be to add the alias check even
350 in this case, and vectorize the fallback loop with the
351 maximum VF set to safelen. However, if the user has
352 explicitly given a length, it's less likely that that
353 would be a win. */
	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
355 {
356 if ((unsigned int) loop->safelen < *max_vf)
357 *max_vf = loop->safelen;
358 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
359 continue;
360 }
361
362 /* For dependence distances of 2 or more, we have the option
363 of limiting VF or checking for an alias at runtime.
364 Prefer to check at runtime if we can, to avoid limiting
365 the VF unnecessarily when the bases are in fact independent.
366
367 Note that the alias checks will be removed if the VF ends up
368 being small enough. */
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
371 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
372 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
373 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
374 }
375 }
376 return true;
377}
378
379
380/* Function vect_analyze_data_ref_dependence.
381
382 FIXME: I needed to change the sense of the returned flag.
383
384 Return FALSE if there (might) exist a dependence between a memory-reference
385 DRA and a memory-reference DRB. When versioning for alias may check a
386 dependence at run-time, return TRUE. Adjust *MAX_VF according to
387 the data dependence. */
388
389static opt_result
390vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
391 loop_vec_info loop_vinfo,
392 unsigned int *max_vf)
393{
394 unsigned int i;
395 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
396 struct data_reference *dra = DDR_A (ddr);
397 struct data_reference *drb = DDR_B (ddr);
398 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
399 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
400 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
401 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
402 lambda_vector dist_v;
403 unsigned int loop_depth;
404
405 /* If user asserted safelen consecutive iterations can be
406 executed concurrently, assume independence. */
407 auto apply_safelen = [&]()
408 {
409 if (loop->safelen >= 2)
410 {
411 if ((unsigned int) loop->safelen < *max_vf)
412 *max_vf = loop->safelen;
413 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
414 return true;
415 }
416 return false;
417 };
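  /* (loop->safelen is typically set by constructs such as
     "#pragma omp simd safelen(N)" or "#pragma GCC ivdep", which assert
     that at least N consecutive iterations may execute concurrently.)  */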
418
419 /* In loop analysis all data references should be vectorizable. */
420 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
421 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
422 gcc_unreachable ();
423
424 /* Independent data accesses. */
425 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
426 return opt_result::success ();
427
428 if (dra == drb
429 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
430 return opt_result::success ();
431
432 /* We do not have to consider dependences between accesses that belong
433 to the same group, unless the stride could be smaller than the
434 group size. */
435 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
436 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
437 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
438 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
439 return opt_result::success ();
440
441 /* Even if we have an anti-dependence then, as the vectorized loop covers at
442 least two scalar iterations, there is always also a true dependence.
443 As the vectorizer does not re-order loads and stores we can ignore
444 the anti-dependence if TBAA can disambiguate both DRs similar to the
445 case with known negative distance anti-dependences (positive
446 distance anti-dependences would violate TBAA constraints). */
447 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
448 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
449 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
450 get_alias_set (DR_REF (drb))))
451 return opt_result::success ();
452
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
455 {
456 if (apply_safelen ())
457 return opt_result::success ();
458
      return opt_result::failure_at
	(stmtinfo_a->stmt,
	 "possible alias involving gather/scatter between %T and %T\n",
	 DR_REF (dra), DR_REF (drb));
463 }
464
465 /* Unknown data dependence. */
466 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
467 {
468 if (apply_safelen ())
469 return opt_result::success ();
470
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
473 "versioning for alias required: "
474 "can't determine dependence between %T and %T\n",
475 DR_REF (dra), DR_REF (drb));
476
477 /* Add to list of ddrs that need to be tested at run-time. */
478 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
479 }
480
481 /* Known data dependence. */
482 if (DDR_NUM_DIST_VECTS (ddr) == 0)
483 {
484 if (apply_safelen ())
485 return opt_result::success ();
486
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
489 "versioning for alias required: "
490 "bad dist vector for %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
492 /* Add to list of ddrs that need to be tested at run-time. */
493 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
494 }
495
  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
497
498 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
499 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
500 loop_depth, max_vf))
501 return opt_result::success ();
502
503 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
504 {
505 int dist = dist_v[loop_depth];
506
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location,
509 "dependence distance = %d.\n", dist);
510
511 if (dist == 0)
512 {
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance == 0 between %T and %T\n",
516 DR_REF (dra), DR_REF (drb));
517
518 /* When we perform grouped accesses and perform implicit CSE
519 by detecting equal accesses and doing disambiguation with
520 runtime alias tests like for
521 .. = a[i];
522 .. = a[i+1];
523 a[i] = ..;
524 a[i+1] = ..;
525 *p = ..;
526 .. = a[i];
527 .. = a[i+1];
528 where we will end up loading { a[i], a[i+1] } once, make
529 sure that inserting group loads before the first load and
530 stores after the last store will do the right thing.
531 Similar for groups like
532 a[i] = ...;
533 ... = a[i];
534 a[i+1] = ...;
535 where loads from the group interleave with the store. */
536 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
	    return opt_result::failure_at (stmtinfo_a->stmt,
					   "READ_WRITE dependence"
					   " in interleaving.\n");
540
541 if (loop->safelen < 2)
542 {
	      tree indicator = dr_zero_step_indicator (dra);
	      if (!indicator || integer_zerop (indicator))
		return opt_result::failure_at (stmtinfo_a->stmt,
					       "access also has a zero step\n");
	      else if (TREE_CODE (indicator) != INTEGER_CST)
		vect_check_nonzero_value (loop_vinfo, indicator);
549 }
550 continue;
551 }
552
553 if (dist > 0 && DDR_REVERSED_P (ddr))
554 {
555 /* If DDR_REVERSED_P the order of the data-refs in DDR was
556 reversed (to make distance vector positive), and the actual
557 distance is negative. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "dependence distance negative.\n");
561 /* When doing outer loop vectorization, we need to check if there is
562 a backward dependence at the inner loop level if the dependence
563 at the outer loop is reversed. See PR81740. */
	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
	      || nested_in_vect_loop_p (loop, stmtinfo_b))
	    {
	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
							 DDR_LOOP_NEST (ddr));
	      if (dist_v[inner_depth] < 0)
		return opt_result::failure_at (stmtinfo_a->stmt,
					       "not vectorized, dependence "
					       "between data-refs %T and %T\n",
					       DR_REF (dra), DR_REF (drb));
574 }
575 /* Record a negative dependence distance to later limit the
576 amount of stmt copying / unrolling we can perform.
577 Only need to handle read-after-write dependence. */
578 if (DR_IS_READ (drb)
579 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
580 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
581 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
582 continue;
583 }
584
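      /* For example, for

	   a[i+3] = a[i] + 1;

	 the dependence distance is 3: with a vectorization factor of up
	 to 3 every load only reads elements stored by earlier vector
	 iterations, while a larger VF would read elements of the current
	 vector iteration before they are written.  */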
      unsigned int abs_dist = abs (dist);
586 if (abs_dist >= 2 && abs_dist < *max_vf)
587 {
588 /* The dependence distance requires reduction of the maximal
589 vectorization factor. */
590 *max_vf = abs_dist;
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "adjusting maximal vectorization factor to %i\n",
594 *max_vf);
595 }
596
597 if (abs_dist >= *max_vf)
598 {
599 /* Dependence distance does not create dependence, as far as
600 vectorization is concerned, in this case. */
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "dependence distance >= VF.\n");
604 continue;
605 }
606
      return opt_result::failure_at (stmtinfo_a->stmt,
				     "not vectorized, possible dependence "
				     "between data-refs %T and %T\n",
				     DR_REF (dra), DR_REF (drb));
611 }
612
613 return opt_result::success ();
614}
615
616/* Function vect_analyze_data_ref_dependences.
617
618 Examine all the data references in the loop, and make sure there do not
619 exist any data dependences between them. Set *MAX_VF according to
620 the maximum vectorization factor the data dependences allow. */
621
622opt_result
623vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
624 unsigned int *max_vf)
625{
626 unsigned int i;
627 struct data_dependence_relation *ddr;
628
629 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
630
631 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
632 {
633 LOOP_VINFO_DDRS (loop_vinfo)
634 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
635 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
636 /* We do not need read-read dependences. */
637 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
638 &LOOP_VINFO_DDRS (loop_vinfo),
639 LOOP_VINFO_LOOP_NEST (loop_vinfo),
640 false);
641 gcc_assert (res);
642 }
643
644 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
645
646 /* For epilogues we either have no aliases or alias versioning
647 was applied to original loop. Therefore we may just get max_vf
648 using VF of original loop. */
649 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
650 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
651 else
652 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
653 {
654 opt_result res
655 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
656 if (!res)
657 return res;
658 }
659
660 return opt_result::success ();
661}
662
663
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  When versioning for alias
   may check a dependence at run-time, return FALSE.  */
670
671static bool
672vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
673 struct data_dependence_relation *ddr)
674{
675 struct data_reference *dra = DDR_A (ddr);
676 struct data_reference *drb = DDR_B (ddr);
677 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
678 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
679
  /* We need to check dependences of statements marked as unvectorizable
     as well; they can still prohibit vectorization.  */
682
683 /* Independent data accesses. */
684 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
685 return false;
686
687 if (dra == drb)
688 return false;
689
690 /* Read-read is OK. */
691 if (DR_IS_READ (dra) && DR_IS_READ (drb))
692 return false;
693
694 /* If dra and drb are part of the same interleaving chain consider
695 them independent. */
696 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
697 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
698 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
699 return false;
700
701 /* Unknown data dependence. */
702 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
703 {
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
706 "can't determine dependence between %T and %T\n",
707 DR_REF (dra), DR_REF (drb));
708 }
709 else if (dump_enabled_p ())
710 dump_printf_loc (MSG_NOTE, vect_location,
711 "determined dependence between %T and %T\n",
712 DR_REF (dra), DR_REF (drb));
713
714 return true;
715}
716
717
718/* Analyze dependences involved in the transform of a store SLP NODE. */
719
720static bool
721vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
722{
723 /* This walks over all stmts involved in the SLP store done
724 in NODE verifying we can sink them up to the last stmt in the
725 group. */
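  /* (E.g. for an SLP group { a[i] = x; a[i+1] = y; } with an unrelated
     "*p = z;" between the two scalar stores, we must verify that *p
     cannot alias a[i], because the whole group will be emitted at the
     position of the a[i+1] store.)  */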
726 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
727 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
728
729 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
730 {
731 stmt_vec_info access_info
732 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
733 if (access_info == last_access_info)
734 continue;
735 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
736 ao_ref ref;
737 bool ref_initialized_p = false;
738 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
	   gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (! gimple_vuse (stmt))
743 continue;
744
745 /* If we couldn't record a (single) data reference for this
746 stmt we have to resort to the alias oracle. */
747 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
748 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
749 if (!dr_b)
750 {
751 /* We are moving a store - this means
752 we cannot use TBAA for disambiguation. */
753 if (!ref_initialized_p)
754 ao_ref_init (&ref, DR_REF (dr_a));
755 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
756 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
757 return false;
758 continue;
759 }
760
761 gcc_assert (!gimple_visited_p (stmt));
762
763 ddr_p ddr = initialize_data_dependence_relation (dr_a,
764 dr_b, vNULL);
765 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
766 free_dependence_relation (ddr);
767 if (dependent)
768 return false;
769 }
770 }
771 return true;
772}
773
774/* Analyze dependences involved in the transform of a load SLP NODE. STORES
775 contain the vector of scalar stores of this instance if we are
776 disambiguating the loads. */
777
778static bool
779vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
780 vec<stmt_vec_info> stores,
781 stmt_vec_info last_store_info)
782{
783 /* This walks over all stmts involved in the SLP load done
784 in NODE verifying we can hoist them up to the first stmt in the
785 group. */
786 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
787 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
788
789 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
790 {
791 stmt_vec_info access_info
792 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
793 if (access_info == first_access_info)
794 continue;
795 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
796 ao_ref ref;
797 bool ref_initialized_p = false;
798 hash_set<stmt_vec_info> grp_visited;
799 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
	   gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (! gimple_vdef (stmt))
804 continue;
805
806 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
807
808 /* If we run into a store of this same instance (we've just
809 marked those) then delay dependence checking until we run
810 into the last store because this is where it will have
811 been sunk to (and we verified that we can do that already). */
812 if (gimple_visited_p (stmt))
813 {
814 if (stmt_info != last_store_info)
815 continue;
816
817 for (stmt_vec_info &store_info : stores)
818 {
819 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
820 ddr_p ddr = initialize_data_dependence_relation
821 (dr_a, store_dr, vNULL);
822 bool dependent
823 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
824 free_dependence_relation (ddr);
825 if (dependent)
826 return false;
827 }
828 continue;
829 }
830
831 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
832 {
833 /* We are hoisting a load - this means we can use TBAA for
834 disambiguation. */
835 if (!ref_initialized_p)
836 ao_ref_init (&ref, DR_REF (dr_a));
837 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
838 {
839 /* If we couldn't record a (single) data reference for this
840 stmt we have to give up now. */
841 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
842 if (!dr_b)
843 return false;
844 ddr_p ddr = initialize_data_dependence_relation (dr_a,
845 dr_b, vNULL);
846 bool dependent
847 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
848 free_dependence_relation (ddr);
849 if (dependent)
850 return false;
851 }
852 /* No dependence. */
853 return true;
854 };
855 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
856 {
857 /* When we run into a store group we have to honor
858 that earlier stores might be moved here. We don't
859 know exactly which and where to since we lack a
860 back-mapping from DR to SLP node, so assume all
861 earlier stores are sunk here. It's enough to
862 consider the last stmt of a group for this.
863 ??? Both this and the fact that we disregard that
864 the conflicting instance might be removed later
865 is overly conservative. */
866 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
867 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
868 store_info != NULL;
869 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
870 if ((store_info == stmt_info
		     || get_later_stmt (store_info, stmt_info) == stmt_info)
872 && !check_hoist (store_info))
873 return false;
874 }
875 else
876 {
877 if (!check_hoist (stmt_info))
878 return false;
879 }
880 }
881 }
882 return true;
883}
884
885
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance INSTANCE and make
   sure there do not exist any data dependences between them that would
   prevent scheduling its loads and stores at the vectorized positions.  */
891
892bool
893vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
894{
895 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
896
897 /* The stores of this instance are at the root of the SLP tree. */
898 slp_tree store = NULL;
899 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
900 store = SLP_INSTANCE_TREE (instance);
901
902 /* Verify we can sink stores to the vectorized stmt insert location. */
903 stmt_vec_info last_store_info = NULL;
904 if (store)
905 {
      if (! vect_slp_analyze_store_dependences (vinfo, store))
907 return false;
908
909 /* Mark stores in this instance and remember the last one. */
910 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
911 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
913 }
914
915 bool res = true;
916
917 /* Verify we can sink loads to the vectorized stmt insert location,
918 special-casing stores of this instance. */
919 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
    if (! vect_slp_analyze_load_dependences (vinfo, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store_info))
924 {
925 res = false;
926 break;
927 }
928
929 /* Unset the visited flag. */
930 if (store)
931 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
933
934 return res;
935}
936
937/* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
938 applied. */
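/* (E.g. with a target alignment of 16 bytes, an access whose first byte
   lands at a multiple of 16 plus 4 has misalignment 4; OFFSET and the
   position within a DR group are added before the modulo is taken.)  */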
939
940int
941dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
942{
943 HOST_WIDE_INT diff = 0;
944 /* Alignment is only analyzed for the first element of a DR group,
945 use that but adjust misalignment by the offset of the access. */
946 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
947 {
948 dr_vec_info *first_dr
949 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
950 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
951 INTEGER_CSTs and the first element in the group has the lowest
952 address. */
953 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
954 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
955 gcc_assert (diff >= 0);
956 dr_info = first_dr;
957 }
958
959 int misalign = dr_info->misalignment;
960 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
961 if (misalign == DR_MISALIGNMENT_UNKNOWN)
962 return misalign;
963
964 /* If the access is only aligned for a vector type with smaller alignment
965 requirement the access has unknown misalignment. */
  if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
		targetm.vectorize.preferred_vector_alignment (vectype)))
968 return DR_MISALIGNMENT_UNKNOWN;
969
970 /* Apply the offset from the DR group start and the externally supplied
971 offset which can for example result from a negative stride access. */
972 poly_int64 misalignment = misalign + diff + offset;
973
974 /* vect_compute_data_ref_alignment will have ensured that target_alignment
975 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
976 unsigned HOST_WIDE_INT target_alignment_c
977 = dr_info->target_alignment.to_constant ();
  if (!known_misalignment (misalignment, target_alignment_c, &misalign))
979 return DR_MISALIGNMENT_UNKNOWN;
980 return misalign;
981}
982
983/* Record the base alignment guarantee given by DRB, which occurs
984 in STMT_INFO. */
985
986static void
987vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
988 innermost_loop_behavior *drb)
989{
990 bool existed;
991 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
993 if (!existed || entry.second->base_alignment < drb->base_alignment)
994 {
      entry = std::make_pair (stmt_info, drb);
996 if (dump_enabled_p ())
997 dump_printf_loc (MSG_NOTE, vect_location,
998 "recording new base alignment for %T\n"
999 " alignment: %d\n"
1000 " misalignment: %d\n"
1001 " based on: %G",
1002 drb->base_address,
1003 drb->base_alignment,
1004 drb->base_misalignment,
1005 stmt_info->stmt);
1006 }
1007}
1008
1009/* If the region we're going to vectorize is reached, all unconditional
1010 data references occur at least once. We can therefore pool the base
1011 alignment guarantees from each unconditional reference. Do this by
1012 going through all the data references in VINFO and checking whether
1013 the containing statement makes the reference unconditionally. If so,
1014 record the alignment of the base address in VINFO so that it can be
1015 used for all other references with the same base. */
1016
1017void
1018vect_record_base_alignments (vec_info *vinfo)
1019{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1021 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1022 for (data_reference *dr : vinfo->shared->datarefs)
1023 {
1024 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1025 stmt_vec_info stmt_info = dr_info->stmt;
1026 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1027 && STMT_VINFO_VECTORIZABLE (stmt_info)
1028 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1029 {
	  vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1031
1032 /* If DR is nested in the loop that is being vectorized, we can also
1033 record the alignment of the base wrt the outer loop. */
1034 if (loop && nested_in_vect_loop_p (loop, stmt_info))
	    vect_record_base_alignment
	      (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1037 }
1038 }
1039}
1040
1041/* Function vect_compute_data_ref_alignment
1042
1043 Compute the misalignment of the data reference DR_INFO when vectorizing
1044 with VECTYPE.
1045
1046 Output:
1047 1. initialized misalignment info for DR_INFO
1048
1049 FOR NOW: No analysis is actually performed. Misalignment is calculated
1050 only for trivial cases. TODO. */
1051
1052static void
1053vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1054 tree vectype)
1055{
1056 stmt_vec_info stmt_info = dr_info->stmt;
1057 vec_base_alignments *base_alignments = &vinfo->base_alignments;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1059 class loop *loop = NULL;
1060 tree ref = DR_REF (dr_info->dr);
1061
1062 if (dump_enabled_p ())
1063 dump_printf_loc (MSG_NOTE, vect_location,
1064 "vect_compute_data_ref_alignment:\n");
1065
1066 if (loop_vinfo)
1067 loop = LOOP_VINFO_LOOP (loop_vinfo);
1068
1069 /* Initialize misalignment to unknown. */
1070 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1071
1072 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1073 return;
1074
1075 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1076 bool step_preserves_misalignment_p;
1077
1078 poly_uint64 vector_alignment
    = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1080 BITS_PER_UNIT);
1081 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1082
1083 /* If the main loop has peeled for alignment we have no way of knowing
1084 whether the data accesses in the epilogues are aligned. We can't at
1085 compile time answer the question whether we have entered the main loop or
1086 not. Fixes PR 92351. */
1087 if (loop_vinfo)
1088 {
1089 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1090 if (orig_loop_vinfo
1091 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1092 return;
1093 }
1094
1095 unsigned HOST_WIDE_INT vect_align_c;
  if (!vector_alignment.is_constant (&vect_align_c))
1097 return;
1098
1099 /* No step for BB vectorization. */
1100 if (!loop)
1101 {
1102 gcc_assert (integer_zerop (drb->step));
1103 step_preserves_misalignment_p = true;
1104 }
1105
1106 /* In case the dataref is in an inner-loop of the loop that is being
1107 vectorized (LOOP), we use the base and misalignment information
1108 relative to the outer-loop (LOOP). This is ok only if the misalignment
1109 stays the same throughout the execution of the inner-loop, which is why
1110 we have to check that the stride of the dataref in the inner-loop evenly
1111 divides by the vector alignment. */
1112 else if (nested_in_vect_loop_p (loop, stmt_info))
1113 {
1114 step_preserves_misalignment_p
1115 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1116
1117 if (dump_enabled_p ())
1118 {
1119 if (step_preserves_misalignment_p)
1120 dump_printf_loc (MSG_NOTE, vect_location,
1121 "inner step divides the vector alignment.\n");
1122 else
1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124 "inner step doesn't divide the vector"
1125 " alignment.\n");
1126 }
1127 }
1128
1129 /* Similarly we can only use base and misalignment information relative to
1130 an innermost loop if the misalignment stays the same throughout the
1131 execution of the loop. As above, this is the case if the stride of
1132 the dataref evenly divides by the alignment. */
1133 else
1134 {
1135 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1136 step_preserves_misalignment_p
      = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1138
1139 if (!step_preserves_misalignment_p && dump_enabled_p ())
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "step doesn't divide the vector alignment.\n");
1142 }
1143
1144 unsigned int base_alignment = drb->base_alignment;
1145 unsigned int base_misalignment = drb->base_misalignment;
1146
1147 /* Calculate the maximum of the pooled base address alignment and the
1148 alignment that we can compute for DR itself. */
1149 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
    = base_alignments->get (drb->base_address);
1151 if (entry
1152 && base_alignment < (*entry).second->base_alignment
1153 && (loop_vinfo
	  || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
			      gimple_bb (entry->first->stmt))
	      && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1157 || (entry->first->dr_aux.group <= dr_info->group)))))
1158 {
1159 base_alignment = entry->second->base_alignment;
1160 base_misalignment = entry->second->base_misalignment;
1161 }
1162
1163 if (drb->offset_alignment < vect_align_c
1164 || !step_preserves_misalignment_p
1165 /* We need to know whether the step wrt the vectorized loop is
1166 negative when computing the starting misalignment below. */
1167 || TREE_CODE (drb->step) != INTEGER_CST)
1168 {
1169 if (dump_enabled_p ())
1170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1171 "Unknown alignment for access: %T\n", ref);
1172 return;
1173 }
1174
1175 if (base_alignment < vect_align_c)
1176 {
1177 unsigned int max_alignment;
1178 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1179 if (max_alignment < vect_align_c
1180 || !vect_can_force_dr_alignment_p (base,
1181 vect_align_c * BITS_PER_UNIT))
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_NOTE, vect_location,
1185 "can't force alignment of ref: %T\n", ref);
1186 return;
1187 }
1188
1189 /* Force the alignment of the decl.
1190 NOTE: This is the only change to the code we make during
1191 the analysis phase, before deciding to vectorize the loop. */
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_NOTE, vect_location,
1194 "force alignment of %T\n", ref);
1195
1196 dr_info->base_decl = base;
1197 dr_info->base_misaligned = true;
1198 base_misalignment = 0;
1199 }
1200 poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1202
1203 unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1205 {
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "Non-constant misalignment for access: %T\n", ref);
1209 return;
1210 }
1211
1212 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1213
1214 if (dump_enabled_p ())
1215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1216 "misalign = %d bytes of ref %T\n",
1217 const_misalignment, ref);
1218
1219 return;
1220}
1221
1222/* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1223 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1224 is made aligned via peeling. */
1225
1226static bool
1227vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1228 dr_vec_info *dr_peel_info)
1229{
1230 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1231 DR_TARGET_ALIGNMENT (dr_info)))
1232 {
1233 poly_offset_int diff
1234 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1235 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1236 if (known_eq (diff, 0)
	|| multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1238 return true;
1239 }
1240 return false;
1241}
1242
1243/* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1244 aligned via peeling. */
1245
1246static bool
1247vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1248 dr_vec_info *dr_peel_info)
1249{
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
			DR_BASE_ADDRESS (dr_peel_info->dr), 0)
      || !operand_equal_p (DR_OFFSET (dr_info->dr),
			   DR_OFFSET (dr_peel_info->dr), 0)
      || !operand_equal_p (DR_STEP (dr_info->dr),
			   DR_STEP (dr_peel_info->dr), 0))
1256 return false;
1257
1258 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1259}
1260
1261/* Compute the value for dr_info->misalign so that the access appears
1262 aligned. This is used by peeling to compensate for dr_misalignment
1263 applying the offset for negative step. */
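/* (E.g. for a negative-step access with a 4-element vector of 4-byte
   elements and a 16-byte target alignment this returns 12, because
   dr_misalignment is queried with an offset of -12 for such accesses.)  */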
1264
1265int
1266vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1267{
1268 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1269 return 0;
1270
1271 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1272 poly_int64 misalignment
    = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1274 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1275
1276 unsigned HOST_WIDE_INT target_alignment_c;
1277 int misalign;
  if (!dr_info->target_alignment.is_constant (&target_alignment_c)
      || !known_misalignment (misalignment, target_alignment_c, &misalign))
1280 return DR_MISALIGNMENT_UNKNOWN;
1281 return misalign;
1282}
1283
1284/* Function vect_update_misalignment_for_peel.
1285 Sets DR_INFO's misalignment
1286 - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1288 - to -1 (unknown) otherwise.
1289
1290 DR_INFO - the data reference whose misalignment is to be adjusted.
1291 DR_PEEL_INFO - the data reference whose misalignment is being made
1292 zero in the vector loop by the peel.
1293 NPEEL - the number of iterations in the peel loop if the misalignment
1294 of DR_PEEL_INFO is known at compile time. */
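/* (E.g. peeling NPEEL = 2 iterations of an access with a 4-byte step and
   a 16-byte target alignment moves its misalignment forward by 8 bytes
   modulo 16.)  */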
1295
1296static void
1297vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1298 dr_vec_info *dr_peel_info, int npeel)
1299{
  /* If dr_info is aligned whenever dr_peel_info is, then mark it so.  */
1301 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1302 {
1303 SET_DR_MISALIGNMENT (dr_info,
1304 vect_dr_misalign_for_aligned_access (dr_peel_info));
1305 return;
1306 }
1307
1308 unsigned HOST_WIDE_INT alignment;
  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
      && known_alignment_for_access_p (dr_info,
				       STMT_VINFO_VECTYPE (dr_info->stmt))
      && known_alignment_for_access_p (dr_peel_info,
				       STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1314 {
1315 int misal = dr_info->misalignment;
1316 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1317 misal &= alignment - 1;
      set_dr_misalignment (dr_info, misal);
1319 return;
1320 }
1321
1322 if (dump_enabled_p ())
1323 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1324 "to unknown (-1).\n");
1325 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1326}
1327
1328/* Return true if alignment is relevant for DR_INFO. */
1329
1330static bool
1331vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1332{
1333 stmt_vec_info stmt_info = dr_info->stmt;
1334
1335 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1336 return false;
1337
1338 /* For interleaving, only the alignment of the first access matters. */
1339 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1340 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1341 return false;
1342
1343 /* Scatter-gather and invariant accesses continue to address individual
1344 scalars, so vector-level alignment is irrelevant. */
1345 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1346 || integer_zerop (DR_STEP (dr_info->dr)))
1347 return false;
1348
1349 /* Strided accesses perform only component accesses, alignment is
1350 irrelevant for them. */
1351 if (STMT_VINFO_STRIDED_P (stmt_info)
1352 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1353 return false;
1354
1355 return true;
1356}
1357
/* Given a memory reference EXP return whether its alignment is less
1359 than its size. */
1360
1361static bool
1362not_size_aligned (tree exp)
1363{
1364 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1365 return true;
1366
1367 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1368 > get_object_alignment (exp));
1369}
1370
1371/* Function vector_alignment_reachable_p
1372
1373 Return true if vector alignment for DR_INFO is reachable by peeling
1374 a few loop iterations. Return false otherwise. */
1375
1376static bool
1377vector_alignment_reachable_p (dr_vec_info *dr_info)
1378{
1379 stmt_vec_info stmt_info = dr_info->stmt;
1380 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1381
1382 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1383 {
1384 /* For interleaved access we peel only if number of iterations in
1385 the prolog loop ({VF - misalignment}), is a multiple of the
1386 number of the interleaved accesses. */
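      /* (E.g. with 8 elements per vector, a misalignment of 2 elements
	 and a group size of 4, the prolog would need 8 - 2 = 6
	 iterations, which is not a multiple of 4, so peeling cannot be
	 used.)  */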
1387 int elem_size, mis_in_elements;
1388
1389 /* FORNOW: handle only known alignment. */
1390 if (!known_alignment_for_access_p (dr_info, vectype))
1391 return false;
1392
      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1394 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1395 elem_size = vector_element_size (vector_size, nelements);
1396 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1397
      if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1399 return false;
1400 }
1401
1402 /* If misalignment is known at the compile time then allow peeling
1403 only if natural alignment is reachable through peeling. */
1404 if (known_alignment_for_access_p (dr_info, vectype)
1405 && !aligned_access_p (dr_info, vectype))
1406 {
1407 HOST_WIDE_INT elmsize =
1408 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1409 if (dump_enabled_p ())
1410 {
1411 dump_printf_loc (MSG_NOTE, vect_location,
1412 "data size = %wd. misalignment = %d.\n", elmsize,
1413 dr_misalignment (dr_info, vectype));
1414 }
1415 if (dr_misalignment (dr_info, vectype) % elmsize)
1416 {
1417 if (dump_enabled_p ())
1418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419 "data size does not divide the misalignment.\n");
1420 return false;
1421 }
1422 }
1423
1424 if (!known_alignment_for_access_p (dr_info, vectype))
1425 {
1426 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1427 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1428 if (dump_enabled_p ())
1429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1430 "Unknown misalignment, %snaturally aligned\n",
1431 is_packed ? "not " : "");
1432 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1433 }
1434
1435 return true;
1436}
1437
1438
1439/* Calculate the cost of the memory access represented by DR_INFO. */
1440
1441static void
1442vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1443 dr_alignment_support alignment_support_scheme,
1444 int misalignment,
1445 unsigned int *inside_cost,
1446 unsigned int *outside_cost,
1447 stmt_vector_for_cost *body_cost_vec,
1448 stmt_vector_for_cost *prologue_cost_vec)
1449{
1450 stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1452 int ncopies;
1453
1454 if (PURE_SLP_STMT (stmt_info))
1455 ncopies = 1;
1456 else
1457 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1458
1459 if (DR_IS_READ (dr_info->dr))
1460 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1461 misalignment, true, inside_cost,
1462 outside_cost, prologue_cost_vec, body_cost_vec, false);
1463 else
    vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1465 misalignment, inside_cost, body_cost_vec);
1466
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_NOTE, vect_location,
1469 "vect_get_data_access_cost: inside_cost = %d, "
1470 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1471}
1472
1473
1474typedef struct _vect_peel_info
1475{
1476 dr_vec_info *dr_info;
1477 int npeel;
1478 unsigned int count;
1479} *vect_peel_info;
1480
1481typedef struct _vect_peel_extended_info
1482{
1483 vec_info *vinfo;
1484 struct _vect_peel_info peel_info;
1485 unsigned int inside_cost;
1486 unsigned int outside_cost;
1487} *vect_peel_extended_info;
1488
1489
1490/* Peeling hashtable helpers. */
1491
1492struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1493{
1494 static inline hashval_t hash (const _vect_peel_info *);
1495 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1496};
1497
1498inline hashval_t
1499peel_info_hasher::hash (const _vect_peel_info *peel_info)
1500{
1501 return (hashval_t) peel_info->npeel;
1502}
1503
1504inline bool
1505peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1506{
1507 return (a->npeel == b->npeel);
1508}
1509
1510
1511/* Insert DR_INFO into peeling hash table with NPEEL as key. */
1512
1513static void
1514vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1515 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1516 int npeel, bool supportable_if_not_aligned)
1517{
1518 struct _vect_peel_info elem, *slot;
1519 _vect_peel_info **new_slot;
1520
1521 elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
1523 if (slot)
1524 slot->count++;
1525 else
1526 {
1527 slot = XNEW (struct _vect_peel_info);
1528 slot->npeel = npeel;
1529 slot->dr_info = dr_info;
1530 slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
1532 *new_slot = slot;
1533 }
1534
1535 /* If this DR is not supported with unknown misalignment then bias
1536 this slot when the cost model is disabled. */
1537 if (!supportable_if_not_aligned
1538 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1539 slot->count += VECT_MAX_COST;
1540}
1541
1542
1543/* Traverse peeling hash table to find peeling option that aligns maximum
1544 number of data accesses. */
1545
1546int
1547vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1548 _vect_peel_extended_info *max)
1549{
1550 vect_peel_info elem = *slot;
1551
1552 if (elem->count > max->peel_info.count
1553 || (elem->count == max->peel_info.count
1554 && max->peel_info.npeel > elem->npeel))
1555 {
1556 max->peel_info.npeel = elem->npeel;
1557 max->peel_info.count = elem->count;
1558 max->peel_info.dr_info = elem->dr_info;
1559 }
1560
1561 return 1;
1562}
1563
/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
   data access costs for all data refs.  If the misalignment of DR0_INFO
   is unknown, npeel is computed at runtime, but DR0_INFO's misalignment
   will be zero after peeling.  */
1568
1569static void
1570vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1571 dr_vec_info *dr0_info,
1572 unsigned int *inside_cost,
1573 unsigned int *outside_cost,
1574 stmt_vector_for_cost *body_cost_vec,
1575 stmt_vector_for_cost *prologue_cost_vec,
1576 unsigned int npeel)
1577{
1578 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1579
1580 bool dr0_alignment_known_p
1581 = (dr0_info
       && known_alignment_for_access_p (dr0_info,
1583 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1584
1585 for (data_reference *dr : datarefs)
1586 {
1587 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1588 if (!vect_relevant_for_alignment_p (dr_info))
1589 continue;
1590
1591 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1592 dr_alignment_support alignment_support_scheme;
1593 int misalignment;
1594 unsigned HOST_WIDE_INT alignment;
1595
1596 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1597 size_zero_node) < 0;
1598 poly_int64 off = 0;
1599 if (negative)
	off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1601 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1602
1603 if (npeel == 0)
	misalignment = dr_misalignment (dr_info, vectype, off);
      else if (dr_info == dr0_info
	       || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1607 misalignment = 0;
1608 else if (!dr0_alignment_known_p
1609 || !known_alignment_for_access_p (dr_info, vectype)
	       || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1611 misalignment = DR_MISALIGNMENT_UNKNOWN;
1612 else
1613 {
	  misalignment = dr_misalignment (dr_info, vectype, off);
1615 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1616 misalignment &= alignment - 1;
1617 }
1618 alignment_support_scheme
1619 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1620 misalignment);
1621
      vect_get_data_access_cost (loop_vinfo, dr_info,
1623 alignment_support_scheme, misalignment,
1624 inside_cost, outside_cost,
1625 body_cost_vec, prologue_cost_vec);
1626 }
1627}
1628
1629/* Traverse peeling hash table and calculate cost for each peeling option.
1630 Find the one with the lowest cost. */
1631
1632int
1633vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1634 _vect_peel_extended_info *min)
1635{
1636 vect_peel_info elem = *slot;
1637 int dummy;
1638 unsigned int inside_cost = 0, outside_cost = 0;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1640 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1641 epilogue_cost_vec;
1642
  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);
1646
1647 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info: elem->dr_info, inside_cost: &inside_cost,
1648 outside_cost: &outside_cost, body_cost_vec: &body_cost_vec,
1649 prologue_cost_vec: &prologue_cost_vec, npeel: elem->npeel);
1650
1651 body_cost_vec.release ();
1652
1653 outside_cost += vect_get_known_peeling_cost
1654 (loop_vinfo, elem->npeel, &dummy,
1655 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1656 &prologue_cost_vec, &epilogue_cost_vec);
1657
1658 /* Prologue and epilogue costs are added to the target model later.
1659 These costs depend only on the scalar iteration cost, the
1660 number of peeling iterations finally chosen, and the number of
1661 misaligned statements. So discard the information found here. */
1662 prologue_cost_vec.release ();
1663 epilogue_cost_vec.release ();
1664
1665 if (inside_cost < min->inside_cost
1666 || (inside_cost == min->inside_cost
1667 && outside_cost < min->outside_cost))
1668 {
1669 min->inside_cost = inside_cost;
1670 min->outside_cost = outside_cost;
1671 min->peel_info.dr_info = elem->dr_info;
1672 min->peel_info.npeel = elem->npeel;
1673 min->peel_info.count = elem->count;
1674 }
1675
1676 return 1;
1677}
1678
1679
1680/* Choose best peeling option by traversing peeling hash table and either
1681 choosing an option with the lowest cost (if cost model is enabled) or the
1682 option that aligns as many accesses as possible. */
1683
1684static struct _vect_peel_extended_info
1685vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1686 loop_vec_info loop_vinfo)
1687{
1688 struct _vect_peel_extended_info res;
1689
1690 res.peel_info.dr_info = NULL;
1691 res.vinfo = loop_vinfo;
1692
1693 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1694 {
1695 res.inside_cost = INT_MAX;
1696 res.outside_cost = INT_MAX;
1697 peeling_htab->traverse <_vect_peel_extended_info *,
1698 vect_peeling_hash_get_lowest_cost> (argument: &res);
1699 }
1700 else
1701 {
1702 res.peel_info.count = 0;
1703 peeling_htab->traverse <_vect_peel_extended_info *,
1704 vect_peeling_hash_get_most_frequent> (argument: &res);
1705 res.inside_cost = 0;
1706 res.outside_cost = 0;
1707 }
1708
1709 return res;
1710}
1711
1712/* Return true if peeling NPEEL iterations to align DR0_INFO leaves all other relevant data refs with a supportable alignment.  */
1713
1714static bool
1715vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1716 unsigned npeel)
1717{
1718 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1719 enum dr_alignment_support supportable_dr_alignment;
1720
1721 bool dr0_alignment_known_p
1722 = known_alignment_for_access_p (dr_info: dr0_info,
1723 STMT_VINFO_VECTYPE (dr0_info->stmt));
1724
1725 /* Ensure that all data refs can be vectorized after the peel. */
1726 for (data_reference *dr : datarefs)
1727 {
1728 if (dr == dr0_info->dr)
1729 continue;
1730
1731 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1732 if (!vect_relevant_for_alignment_p (dr_info)
1733 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info: dr0_info))
1734 continue;
1735
1736 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1737 int misalignment;
1738 unsigned HOST_WIDE_INT alignment;
1739 if (!dr0_alignment_known_p
1740 || !known_alignment_for_access_p (dr_info, vectype)
1741 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (const_value: &alignment))
1742 misalignment = DR_MISALIGNMENT_UNKNOWN;
1743 else
1744 {
1745 misalignment = dr_misalignment (dr_info, vectype);
1746 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1747 misalignment &= alignment - 1;
1748 }
1749 supportable_dr_alignment
1750 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1751 misalignment);
1752 if (supportable_dr_alignment == dr_unaligned_unsupported)
1753 return false;
1754 }
1755
1756 return true;
1757}
1758
1759/* Compare two data-references DRA and DRB to group them into chunks
1760 with related alignment. */
1761
1762static int
1763dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1764{
1765 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1766 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1767 int cmp;
1768
1769 /* Stabilize sort. */
1770 if (dra == drb)
1771 return 0;
1772
1773 /* Ordering of DRs according to base. */
1774 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1775 DR_BASE_ADDRESS (drb));
1776 if (cmp != 0)
1777 return cmp;
1778
1779 /* And according to DR_OFFSET. */
1780 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1781 if (cmp != 0)
1782 return cmp;
1783
1784  /* And by DR_STEP.  */
1785 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1786 if (cmp != 0)
1787 return cmp;
1788
1789  /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
1790 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1791 if (cmp == 0)
1792 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1793 return cmp;
1794}
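
/* As an illustration, for references a[i], a[i+1] and b[i] in one loop the
   two a[] accesses compare equal on base, offset and step and differ only
   in DR_INIT, so the comparator places them next to each other, while b[i]
   ends up in a separate chunk.  */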
1795
1796/* Function vect_enhance_data_refs_alignment
1797
1798 This pass will use loop versioning and loop peeling in order to enhance
1799 the alignment of data references in the loop.
1800
1801 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1802 original loop is to be vectorized. Any other loops that are created by
1803   the transformations performed in this pass are not supposed to be
1804 vectorized. This restriction will be relaxed.
1805
1806 This pass will require a cost model to guide it whether to apply peeling
1807 or versioning or a combination of the two. For example, the scheme that
1808   Intel uses when given a loop with several memory accesses is as follows:
1809   choose one memory access ('p') whose alignment you want to force by doing
1810 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1811 other accesses are not necessarily aligned, or (2) use loop versioning to
1812 generate one loop in which all accesses are aligned, and another loop in
1813 which only 'p' is necessarily aligned.
1814
1815 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1816   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1817 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1818
1819 Devising a cost model is the most critical aspect of this work. It will
1820 guide us on which access to peel for, whether to use loop versioning, how
1821 many versions to create, etc. The cost model will probably consist of
1822 generic considerations as well as target specific considerations (on
1823 powerpc for example, misaligned stores are more painful than misaligned
1824 loads).
1825
1826 Here are the general steps involved in alignment enhancements:
1827
1828 -- original loop, before alignment analysis:
1829 for (i=0; i<N; i++){
1830 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1831 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1832 }
1833
1834 -- After vect_compute_data_refs_alignment:
1835 for (i=0; i<N; i++){
1836 x = q[i]; # DR_MISALIGNMENT(q) = 3
1837 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1838 }
1839
1840 -- Possibility 1: we do loop versioning:
1841 if (p is aligned) {
1842 for (i=0; i<N; i++){ # loop 1A
1843 x = q[i]; # DR_MISALIGNMENT(q) = 3
1844 p[i] = y; # DR_MISALIGNMENT(p) = 0
1845 }
1846 }
1847 else {
1848 for (i=0; i<N; i++){ # loop 1B
1849 x = q[i]; # DR_MISALIGNMENT(q) = 3
1850 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1851 }
1852 }
1853
1854 -- Possibility 2: we do loop peeling:
1855 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1856 x = q[i];
1857 p[i] = y;
1858 }
1859 for (i = 3; i < N; i++){ # loop 2A
1860 x = q[i]; # DR_MISALIGNMENT(q) = 0
1861 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1862 }
1863
1864 -- Possibility 3: combination of loop peeling and versioning:
1865 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1866 x = q[i];
1867 p[i] = y;
1868 }
1869 if (p is aligned) {
1870 for (i = 3; i<N; i++){ # loop 3A
1871 x = q[i]; # DR_MISALIGNMENT(q) = 0
1872 p[i] = y; # DR_MISALIGNMENT(p) = 0
1873 }
1874 }
1875 else {
1876 for (i = 3; i<N; i++){ # loop 3B
1877 x = q[i]; # DR_MISALIGNMENT(q) = 0
1878 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1879 }
1880 }
1881
1882 These loops are later passed to loop_transform to be vectorized. The
1883 vectorizer will use the alignment information to guide the transformation
1884 (whether to generate regular loads/stores, or with special handling for
1885 misalignment). */
1886
1887opt_result
1888vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1889{
1890 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1891 dr_vec_info *first_store = NULL;
1892 dr_vec_info *dr0_info = NULL;
1893 struct data_reference *dr;
1894 unsigned int i;
1895 bool do_peeling = false;
1896 bool do_versioning = false;
1897 unsigned int npeel = 0;
1898 bool one_misalignment_known = false;
1899 bool one_misalignment_unknown = false;
1900 bool one_dr_unsupportable = false;
1901 dr_vec_info *unsupportable_dr_info = NULL;
1902 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1903 hash_table<peel_info_hasher> peeling_htab (1);
1904
1905 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1906
1907 /* Reset data so we can safely be called multiple times. */
1908 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (size: 0);
1909 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1910
1911 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1912 return opt_result::success ();
1913
1914 /* Sort the vector of datarefs so DRs that have the same or dependent
1915 alignment are next to each other. */
1916 auto_vec<data_reference_p> datarefs
1917 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1918 datarefs.qsort (dr_align_group_sort_cmp);
1919
1920  /* For each dataref, compute the number of other DRs that become
1921     aligned when we peel to align that dataref.  */
1922 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1923 n_same_align_refs.quick_grow_cleared (len: datarefs.length ());
1924 unsigned i0;
1925 for (i0 = 0; i0 < datarefs.length (); ++i0)
1926 if (DR_BASE_ADDRESS (datarefs[i0]))
1927 break;
1928 for (i = i0 + 1; i <= datarefs.length (); ++i)
1929 {
1930 if (i == datarefs.length ()
1931 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1932 DR_BASE_ADDRESS (datarefs[i]), flags: 0)
1933 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1934 DR_OFFSET (datarefs[i]), flags: 0)
1935 || !operand_equal_p (DR_STEP (datarefs[i0]),
1936 DR_STEP (datarefs[i]), flags: 0))
1937 {
1938 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1939 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1940 will get known misalignment if we align one of the refs
1941 with the largest DR_TARGET_ALIGNMENT. */
1942 for (unsigned j = i0; j < i; ++j)
1943 {
1944 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1945 for (unsigned k = i0; k < i; ++k)
1946 {
1947 if (k == j)
1948 continue;
1949 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1950 if (vect_dr_aligned_if_related_peeled_dr_is (dr_info: dr_infok,
1951 dr_peel_info: dr_infoj))
1952 n_same_align_refs[j]++;
1953 }
1954 }
1955 i0 = i;
1956 }
1957 }
1958
1959 /* While cost model enhancements are expected in the future, the high level
1960 view of the code at this time is as follows:
1961
1962 A) If there is a misaligned access then see if peeling to align
1963 this access can make all data references satisfy
1964 vect_supportable_dr_alignment. If so, update data structures
1965 as needed and return true.
1966
1967 B) If peeling wasn't possible and there is a data reference with an
1968 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1969 then see if loop versioning checks can be used to make all data
1970 references satisfy vect_supportable_dr_alignment. If so, update
1971 data structures as needed and return true.
1972
1973 C) If neither peeling nor versioning were successful then return false if
1974 any data reference does not satisfy vect_supportable_dr_alignment.
1975
1976 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1977
1978 Note, Possibility 3 above (which is peeling and versioning together) is not
1979 being done at this time. */
1980
1981 /* (1) Peeling to force alignment. */
1982
1983 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1984 Considerations:
1985 + How many accesses will become aligned due to the peeling
1986 - How many accesses will become unaligned due to the peeling,
1987 and the cost of misaligned accesses.
1988 - The cost of peeling (the extra runtime checks, the increase
1989 in code size). */
1990
1991 FOR_EACH_VEC_ELT (datarefs, i, dr)
1992 {
1993 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1994 if (!vect_relevant_for_alignment_p (dr_info))
1995 continue;
1996
1997 stmt_vec_info stmt_info = dr_info->stmt;
1998 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1999 do_peeling = vector_alignment_reachable_p (dr_info);
2000 if (do_peeling)
2001 {
2002 if (known_alignment_for_access_p (dr_info, vectype))
2003 {
2004 unsigned int npeel_tmp = 0;
2005 bool negative = tree_int_cst_compare (DR_STEP (dr),
2006 size_zero_node) < 0;
2007
2008	      /* If known_alignment_for_access_p then we have set
2009		 DR_MISALIGNMENT, which is only done if we know it at compile
2010		 time, so it is safe to assume the target alignment is constant.
2011	       */
2012 unsigned int target_align =
2013 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2014 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2015 poly_int64 off = 0;
2016 if (negative)
2017 off = (TYPE_VECTOR_SUBPARTS (node: vectype) - 1) * -dr_size;
2018 unsigned int mis = dr_misalignment (dr_info, vectype, offset: off);
2019 mis = negative ? mis : -mis;
2020 if (mis != 0)
2021 npeel_tmp = (mis & (target_align - 1)) / dr_size;
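	      /* For example (illustrative numbers): with a 16-byte target
		 alignment, 4-byte elements and a misalignment of 4 for a
		 positive step, -mis & 15 == 12 and npeel_tmp == 3, i.e.
		 peeling three scalar iterations aligns this access.  */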
2022
2023 /* For multiple types, it is possible that the bigger type access
2024 will have more than one peeling option. E.g., a loop with two
2025 types: one of size (vector size / 4), and the other one of
2026		 size (vector size / 8).  The vectorization factor will be 8.  If both
2027 accesses are misaligned by 3, the first one needs one scalar
2028 iteration to be aligned, and the second one needs 5. But the
2029 first one will be aligned also by peeling 5 scalar
2030 iterations, and in that case both accesses will be aligned.
2031		 Hence, besides the immediate peeling amount, we also want to
2032		 try adding a full vector size, as long as we don't exceed the
2033		 vectorization factor.
2034		 We do this automatically when using the cost model, since we
2035		 calculate the cost of every peeling option.  */
2036 poly_uint64 nscalars = npeel_tmp;
2037 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2038 {
2039 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2040 nscalars = (STMT_SLP_TYPE (stmt_info)
2041 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2042 }
2043
2044 /* Save info about DR in the hash table. Also include peeling
2045 amounts according to the explanation above. Indicate
2046 the alignment status when the ref is not aligned.
2047 ??? Rather than using unknown alignment here we should
2048 prune all entries from the peeling hashtable which cause
2049 DRs to be not supported. */
2050 bool supportable_if_not_aligned
2051 = vect_supportable_dr_alignment
2052 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2053 while (known_le (npeel_tmp, nscalars))
2054 {
2055 vect_peeling_hash_insert (peeling_htab: &peeling_htab, loop_vinfo,
2056 dr_info, npeel: npeel_tmp,
2057 supportable_if_not_aligned);
2058 npeel_tmp += MAX (1, target_align / dr_size);
2059 }
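	      /* E.g. with a 16-byte target alignment, 4-byte elements and
		 an initial NPEEL_TMP of 3, the options 3, 7, ... are
		 recorded while they do not exceed NSCALARS.  */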
2060
2061 one_misalignment_known = true;
2062 }
2063 else
2064 {
2065	      /* If we don't know any misalignment values, we prefer
2066		 peeling for the data-ref that has the maximum number of
2067		 data-refs with the same alignment, unless the target prefers
2068		 aligning stores over loads.  */
2069 unsigned same_align_drs = n_same_align_refs[i];
2070 if (!dr0_info
2071 || dr0_same_align_drs < same_align_drs)
2072 {
2073 dr0_same_align_drs = same_align_drs;
2074 dr0_info = dr_info;
2075 }
2076 /* For data-refs with the same number of related
2077 accesses prefer the one where the misalign
2078 computation will be invariant in the outermost loop. */
2079 else if (dr0_same_align_drs == same_align_drs)
2080 {
2081 class loop *ivloop0, *ivloop;
2082 ivloop0 = outermost_invariant_loop_for_expr
2083 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2084 ivloop = outermost_invariant_loop_for_expr
2085 (loop, DR_BASE_ADDRESS (dr));
2086 if ((ivloop && !ivloop0)
2087 || (ivloop && ivloop0
2088 && flow_loop_nested_p (ivloop, ivloop0)))
2089 dr0_info = dr_info;
2090 }
2091
2092 one_misalignment_unknown = true;
2093
2094 /* Check for data refs with unsupportable alignment that
2095 can be peeled. */
2096 enum dr_alignment_support supportable_dr_alignment
2097 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2098 DR_MISALIGNMENT_UNKNOWN);
2099 if (supportable_dr_alignment == dr_unaligned_unsupported)
2100 {
2101 one_dr_unsupportable = true;
2102 unsupportable_dr_info = dr_info;
2103 }
2104
2105 if (!first_store && DR_IS_WRITE (dr))
2106 {
2107 first_store = dr_info;
2108 first_store_same_align_drs = same_align_drs;
2109 }
2110 }
2111 }
2112 else
2113 {
2114 if (!aligned_access_p (dr_info, vectype))
2115 {
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118 "vector alignment may not be reachable\n");
2119 break;
2120 }
2121 }
2122 }
2123
2124 /* Check if we can possibly peel the loop. */
2125 if (!vect_can_advance_ivs_p (loop_vinfo)
2126 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2127 LOOP_VINFO_IV_EXIT (loop_vinfo))
2128 || loop->inner)
2129 do_peeling = false;
2130
2131 struct _vect_peel_extended_info peel_for_known_alignment;
2132 struct _vect_peel_extended_info peel_for_unknown_alignment;
2133 struct _vect_peel_extended_info best_peel;
2134
2135 peel_for_unknown_alignment.inside_cost = INT_MAX;
2136 peel_for_unknown_alignment.outside_cost = INT_MAX;
2137 peel_for_unknown_alignment.peel_info.count = 0;
2138
2139 if (do_peeling
2140 && one_misalignment_unknown)
2141 {
2142      /* Check whether the target prefers stores over loads, i.e., whether
2143	 misaligned stores are more expensive than misaligned loads (taking
2144	 DRs with the same alignment into account).  */
2145 unsigned int load_inside_cost = 0;
2146 unsigned int load_outside_cost = 0;
2147 unsigned int store_inside_cost = 0;
2148 unsigned int store_outside_cost = 0;
2149 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
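      /* With an unknown misalignment, assume on average half a vector's
	 worth of scalar iterations will be peeled, hence VF / 2.  */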
2150
2151 stmt_vector_for_cost dummy;
2152 dummy.create (nelems: 2);
2153 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2154 inside_cost: &load_inside_cost,
2155 outside_cost: &load_outside_cost,
2156 body_cost_vec: &dummy, prologue_cost_vec: &dummy, npeel: estimated_npeels);
2157 dummy.release ();
2158
2159 if (first_store)
2160 {
2161 dummy.create (nelems: 2);
2162 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info: first_store,
2163 inside_cost: &store_inside_cost,
2164 outside_cost: &store_outside_cost,
2165 body_cost_vec: &dummy, prologue_cost_vec: &dummy,
2166 npeel: estimated_npeels);
2167 dummy.release ();
2168 }
2169 else
2170 {
2171 store_inside_cost = INT_MAX;
2172 store_outside_cost = INT_MAX;
2173 }
2174
2175 if (load_inside_cost > store_inside_cost
2176 || (load_inside_cost == store_inside_cost
2177 && load_outside_cost > store_outside_cost))
2178 {
2179 dr0_info = first_store;
2180 dr0_same_align_drs = first_store_same_align_drs;
2181 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2182 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2183 }
2184 else
2185 {
2186 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2187 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2188 }
2189
2190 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2191 prologue_cost_vec.create (nelems: 2);
2192 epilogue_cost_vec.create (nelems: 2);
2193
2194 int dummy2;
2195 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2196 (loop_vinfo, estimated_npeels, &dummy2,
2197 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2198 &prologue_cost_vec, &epilogue_cost_vec);
2199
2200 prologue_cost_vec.release ();
2201 epilogue_cost_vec.release ();
2202
2203 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2204 }
2205
2206 peel_for_unknown_alignment.peel_info.npeel = 0;
2207 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2208
2209 best_peel = peel_for_unknown_alignment;
2210
2211 peel_for_known_alignment.inside_cost = INT_MAX;
2212 peel_for_known_alignment.outside_cost = INT_MAX;
2213 peel_for_known_alignment.peel_info.count = 0;
2214 peel_for_known_alignment.peel_info.dr_info = NULL;
2215
2216 if (do_peeling && one_misalignment_known)
2217 {
2218 /* Peeling is possible, but there is no data access that is not supported
2219 unless aligned. So we try to choose the best possible peeling from
2220 the hash table. */
2221 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2222 (peeling_htab: &peeling_htab, loop_vinfo);
2223 }
2224
2225 /* Compare costs of peeling for known and unknown alignment. */
2226 if (peel_for_known_alignment.peel_info.dr_info != NULL
2227 && peel_for_unknown_alignment.inside_cost
2228 >= peel_for_known_alignment.inside_cost)
2229 {
2230 best_peel = peel_for_known_alignment;
2231
2232 /* If the best peeling for known alignment has NPEEL == 0, perform no
2233 peeling at all except if there is an unsupportable dr that we can
2234 align. */
2235 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2236 do_peeling = false;
2237 }
2238
2239 /* If there is an unsupportable data ref, prefer this over all choices so far
2240 since we'd have to discard a chosen peeling except when it accidentally
2241 aligned the unsupportable data ref. */
2242 if (one_dr_unsupportable)
2243 dr0_info = unsupportable_dr_info;
2244 else if (do_peeling)
2245 {
2246 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2247 TODO: Use nopeel_outside_cost or get rid of it? */
2248 unsigned nopeel_inside_cost = 0;
2249 unsigned nopeel_outside_cost = 0;
2250
2251 stmt_vector_for_cost dummy;
2252 dummy.create (nelems: 2);
2253 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, inside_cost: &nopeel_inside_cost,
2254 outside_cost: &nopeel_outside_cost, body_cost_vec: &dummy, prologue_cost_vec: &dummy, npeel: 0);
2255 dummy.release ();
2256
2257 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2258 costs will be recorded. */
2259 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2260 prologue_cost_vec.create (nelems: 2);
2261 epilogue_cost_vec.create (nelems: 2);
2262
2263 int dummy2;
2264 nopeel_outside_cost += vect_get_known_peeling_cost
2265 (loop_vinfo, 0, &dummy2,
2266 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2267 &prologue_cost_vec, &epilogue_cost_vec);
2268
2269 prologue_cost_vec.release ();
2270 epilogue_cost_vec.release ();
2271
2272 npeel = best_peel.peel_info.npeel;
2273 dr0_info = best_peel.peel_info.dr_info;
2274
2275      /* If not peeling is no more expensive than the best peeling we
2276	 have found so far, don't perform any peeling.  */
2277 if (nopeel_inside_cost <= best_peel.inside_cost)
2278 do_peeling = false;
2279 }
2280
2281 if (do_peeling)
2282 {
2283 stmt_vec_info stmt_info = dr0_info->stmt;
2284 if (known_alignment_for_access_p (dr_info: dr0_info,
2285 STMT_VINFO_VECTYPE (stmt_info)))
2286 {
2287 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2288 size_zero_node) < 0;
2289 if (!npeel)
2290 {
2291 /* Since it's known at compile time, compute the number of
2292 iterations in the peeled loop (the peeling factor) for use in
2293 updating DR_MISALIGNMENT values. The peeling factor is the
2294 vectorization factor minus the misalignment as an element
2295 count. */
2296 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2297 poly_int64 off = 0;
2298 if (negative)
2299 off = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1)
2300 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2301 unsigned int mis
2302 = dr_misalignment (dr_info: dr0_info, vectype, offset: off);
2303 mis = negative ? mis : -mis;
2304	      /* If known_alignment_for_access_p then we have set
2305		 DR_MISALIGNMENT, which is only done if we know it at compile
2306		 time, so it is safe to assume the target alignment is constant.
2307	       */
2308 unsigned int target_align =
2309 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2310 npeel = ((mis & (target_align - 1))
2311 / vect_get_scalar_dr_size (dr_info: dr0_info));
2312 }
2313
2314 /* For interleaved data access every iteration accesses all the
2315 members of the group, therefore we divide the number of iterations
2316 by the group size. */
2317 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2318 npeel /= DR_GROUP_SIZE (stmt_info);
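	  /* E.g. an NPEEL of 8 elements for a group of 4 interleaved
	     elements means peeling 2 scalar iterations, since each scalar
	     iteration accesses all 4 group members.  */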
2319
2320 if (dump_enabled_p ())
2321 dump_printf_loc (MSG_NOTE, vect_location,
2322 "Try peeling by %d\n", npeel);
2323 }
2324
2325 /* Ensure that all datarefs can be vectorized after the peel. */
2326 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2327 do_peeling = false;
2328
2329 /* Check if all datarefs are supportable and log. */
2330 if (do_peeling
2331 && npeel == 0
2332 && known_alignment_for_access_p (dr_info: dr0_info,
2333 STMT_VINFO_VECTYPE (stmt_info)))
2334 return opt_result::success ();
2335
2336 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2337 if (do_peeling)
2338 {
2339 unsigned max_allowed_peel
2340 = param_vect_max_peeling_for_alignment;
2341 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2342 max_allowed_peel = 0;
2343 if (max_allowed_peel != (unsigned)-1)
2344 {
2345 unsigned max_peel = npeel;
2346 if (max_peel == 0)
2347 {
2348 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2349 unsigned HOST_WIDE_INT target_align_c;
2350 if (target_align.is_constant (const_value: &target_align_c))
2351 max_peel =
2352 target_align_c / vect_get_scalar_dr_size (dr_info: dr0_info) - 1;
2353 else
2354 {
2355 do_peeling = false;
2356 if (dump_enabled_p ())
2357 dump_printf_loc (MSG_NOTE, vect_location,
2358 "Disable peeling, max peels set and vector"
2359 " alignment unknown\n");
2360 }
2361 }
2362 if (max_peel > max_allowed_peel)
2363 {
2364 do_peeling = false;
2365 if (dump_enabled_p ())
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "Disable peeling, max peels reached: %d\n", max_peel);
2368 }
2369 }
2370 }
2371
2372 /* Cost model #2 - if peeling may result in a remaining loop not
2373 iterating enough to be vectorized then do not peel. Since this
2374 is a cost heuristic rather than a correctness decision, use the
2375 most likely runtime value for variable vectorization factors. */
2376 if (do_peeling
2377 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2378 {
2379 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2380 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2381 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2382 < assumed_vf + max_peel)
2383 do_peeling = false;
2384 }
2385
2386 if (do_peeling)
2387 {
2388 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2389 If the misalignment of DR_i is identical to that of dr0 then set
2390 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2391 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2392 by the peeling factor times the element size of DR_i (MOD the
2393 vectorization factor times the size). Otherwise, the
2394 misalignment of DR_i must be set to unknown. */
2395 FOR_EACH_VEC_ELT (datarefs, i, dr)
2396 if (dr != dr0_info->dr)
2397 {
2398 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2399 if (!vect_relevant_for_alignment_p (dr_info))
2400 continue;
2401
2402 vect_update_misalignment_for_peel (dr_info, dr_peel_info: dr0_info, npeel);
2403 }
2404
2405 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2406 if (npeel)
2407 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2408 else
2409 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2410 SET_DR_MISALIGNMENT (dr0_info,
2411 vect_dr_misalign_for_aligned_access (dr0_info));
2412 if (dump_enabled_p ())
2413 {
2414 dump_printf_loc (MSG_NOTE, vect_location,
2415 "Alignment of access forced using peeling.\n");
2416 dump_printf_loc (MSG_NOTE, vect_location,
2417 "Peeling for alignment will be applied.\n");
2418 }
2419
2420 /* The inside-loop cost will be accounted for in vectorizable_load
2421 and vectorizable_store correctly with adjusted alignments.
2422	     Drop the body_cost_vec on the floor here.  */
2423 return opt_result::success ();
2424 }
2425 }
2426
2427 /* (2) Versioning to force alignment. */
2428
2429 /* Try versioning if:
2430     1) the loop is optimized for speed and the cost model is not cheap,
2431 2) there is at least one unsupported misaligned data ref with an unknown
2432 misalignment, and
2433 3) all misaligned data refs with a known misalignment are supported, and
2434 4) the number of runtime alignment checks is within reason. */
2435
2436 do_versioning
2437 = (optimize_loop_nest_for_speed_p (loop)
2438 && !loop->inner /* FORNOW */
2439 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2440
2441 if (do_versioning)
2442 {
2443 FOR_EACH_VEC_ELT (datarefs, i, dr)
2444 {
2445 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2446 if (!vect_relevant_for_alignment_p (dr_info))
2447 continue;
2448
2449 stmt_vec_info stmt_info = dr_info->stmt;
2450 if (STMT_VINFO_STRIDED_P (stmt_info))
2451 {
2452 do_versioning = false;
2453 break;
2454 }
2455
2456 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2457 bool negative = tree_int_cst_compare (DR_STEP (dr),
2458 size_zero_node) < 0;
2459 poly_int64 off = 0;
2460 if (negative)
2461 off = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1)
2462 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2463 int misalignment;
2464 if ((misalignment = dr_misalignment (dr_info, vectype, offset: off)) == 0)
2465 continue;
2466
2467 enum dr_alignment_support supportable_dr_alignment
2468 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2469 misalignment);
2470 if (supportable_dr_alignment == dr_unaligned_unsupported)
2471 {
2472 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2473 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2474 >= (unsigned) param_vect_max_version_for_alignment_checks))
2475 {
2476 do_versioning = false;
2477 break;
2478 }
2479
2480 /* At present we don't support versioning for alignment
2481 with variable VF, since there's no guarantee that the
2482 VF is a power of two. We could relax this if we added
2483 a way of enforcing a power-of-two size. */
2484 unsigned HOST_WIDE_INT size;
2485 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (const_value: &size))
2486 {
2487 do_versioning = false;
2488 break;
2489 }
2490
2491 /* Forcing alignment in the first iteration is no good if
2492 we don't keep it across iterations. For now, just disable
2493 versioning in this case.
2494		     ???  We could actually unroll the loop to achieve the required
2495 overall step alignment, and forcing the alignment could be
2496 done by doing some iterations of the non-vectorized loop. */
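	      /* For instance, with a VF of 4, a DR_STEP_ALIGNMENT of 4 bytes
		 and a 16-byte target alignment, 4 * 4 == 16 is a multiple of
		 the target alignment, so alignment forced for the first
		 iteration is preserved; with DR_STEP_ALIGNMENT 2 it is not,
		 and versioning is rejected below.  */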
2497 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2498 * DR_STEP_ALIGNMENT (dr),
2499 DR_TARGET_ALIGNMENT (dr_info)))
2500 {
2501 do_versioning = false;
2502 break;
2503 }
2504
2505 /* The rightmost bits of an aligned address must be zeros.
2506 Construct the mask needed for this test. For example,
2507 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2508 mask must be 15 = 0xf. */
2509 int mask = size - 1;
2510
2511 /* FORNOW: use the same mask to test all potentially unaligned
2512 references in the loop. */
2513 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2514 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2515 {
2516 do_versioning = false;
2517 break;
2518 }
2519
2520 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2521 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (obj: stmt_info);
2522 }
2523 }
2524
2525 /* Versioning requires at least one misaligned data reference. */
2526 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2527 do_versioning = false;
2528 else if (!do_versioning)
2529 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (size: 0);
2530 }
2531
2532 if (do_versioning)
2533 {
2534 const vec<stmt_vec_info> &may_misalign_stmts
2535 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2536 stmt_vec_info stmt_info;
2537
2538 /* It can now be assumed that the data references in the statements
2539 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2540 of the loop being vectorized. */
2541 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2542 {
2543 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2544 SET_DR_MISALIGNMENT (dr_info,
2545 vect_dr_misalign_for_aligned_access (dr_info));
2546 if (dump_enabled_p ())
2547 dump_printf_loc (MSG_NOTE, vect_location,
2548 "Alignment of access forced using versioning.\n");
2549 }
2550
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_NOTE, vect_location,
2553 "Versioning for alignment will be applied.\n");
2554
2555 /* Peeling and versioning can't be done together at this time. */
2556 gcc_assert (! (do_peeling && do_versioning));
2557
2558 return opt_result::success ();
2559 }
2560
2561 /* This point is reached if neither peeling nor versioning is being done. */
2562 gcc_assert (! (do_peeling || do_versioning));
2563
2564 return opt_result::success ();
2565}
2566
2567
2568/* Function vect_analyze_data_refs_alignment
2569
2570 Analyze the alignment of the data-references in the loop.
2571 Return FALSE if a data reference is found that cannot be vectorized. */
2572
2573opt_result
2574vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2575{
2576 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2577
2578 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2579 struct data_reference *dr;
2580 unsigned int i;
2581
2582 vect_record_base_alignments (vinfo: loop_vinfo);
2583 FOR_EACH_VEC_ELT (datarefs, i, dr)
2584 {
2585 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2586 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2587 {
2588 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2589 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2590 continue;
2591 vect_compute_data_ref_alignment (vinfo: loop_vinfo, dr_info,
2592 STMT_VINFO_VECTYPE (dr_info->stmt));
2593 }
2594 }
2595
2596 return opt_result::success ();
2597}
2598
2599
2600/* Analyze alignment of DRs of stmts in NODE. */
2601
2602static bool
2603vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2604{
2605 /* Alignment is maintained in the first element of the group. */
2606 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2607 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2608 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2609 tree vectype = SLP_TREE_VECTYPE (node);
2610 poly_uint64 vector_alignment
2611 = exact_div (a: targetm.vectorize.preferred_vector_alignment (vectype),
2612 BITS_PER_UNIT);
2613 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2614 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2615 /* Re-analyze alignment when we're facing a vectorization with a bigger
2616 alignment requirement. */
2617 else if (known_lt (dr_info->target_alignment, vector_alignment))
2618 {
2619 poly_uint64 old_target_alignment = dr_info->target_alignment;
2620 int old_misalignment = dr_info->misalignment;
2621 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2622 /* But keep knowledge about a smaller alignment. */
2623 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2624 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2625 {
2626 dr_info->target_alignment = old_target_alignment;
2627 dr_info->misalignment = old_misalignment;
2628 }
2629 }
2630  /* If we ever face unordered target alignments, the first one analyzed
2631     wins and the other becomes unknown in dr_misalignment.  */
2632 return true;
2633}
2634
2635/* Function vect_slp_analyze_instance_alignment
2636
2637 Analyze the alignment of the data-references in the SLP instance.
2638 Return FALSE if a data reference is found that cannot be vectorized. */
2639
2640bool
2641vect_slp_analyze_instance_alignment (vec_info *vinfo,
2642 slp_instance instance)
2643{
2644 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2645
2646 slp_tree node;
2647 unsigned i;
2648 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2649 if (! vect_slp_analyze_node_alignment (vinfo, node))
2650 return false;
2651
2652 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2653 && ! vect_slp_analyze_node_alignment
2654 (vinfo, SLP_INSTANCE_TREE (instance)))
2655 return false;
2656
2657 return true;
2658}
2659
2660
2661/* Analyze groups of accesses: check that DR_INFO belongs to a group of
2662 accesses of legal size, step, etc. Detect gaps, single element
2663 interleaving, and other special cases. Set grouped access info.
2664 Collect groups of strided stores for further use in SLP analysis.
2665 Worker for vect_analyze_group_access. */
2666
2667static bool
2668vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2669{
2670 data_reference *dr = dr_info->dr;
2671 tree step = DR_STEP (dr);
2672 tree scalar_type = TREE_TYPE (DR_REF (dr));
2673 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2674 stmt_vec_info stmt_info = dr_info->stmt;
2675 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
2676 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo);
2677 HOST_WIDE_INT dr_step = -1;
2678 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2679 bool slp_impossible = false;
2680
2681 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2682 size of the interleaving group (including gaps). */
2683 if (tree_fits_shwi_p (step))
2684 {
2685 dr_step = tree_to_shwi (step);
2686 /* Check that STEP is a multiple of type size. Otherwise there is
2687 a non-element-sized gap at the end of the group which we
2688 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2689 ??? As we can handle non-constant step fine here we should
2690 simply remove uses of DR_GROUP_GAP between the last and first
2691 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2692 simply not include that gap. */
2693 if ((dr_step % type_size) != 0)
2694 {
2695 if (dump_enabled_p ())
2696 dump_printf_loc (MSG_NOTE, vect_location,
2697 "Step %T is not a multiple of the element size"
2698 " for %T\n",
2699 step, DR_REF (dr));
2700 return false;
2701 }
2702 groupsize = absu_hwi (x: dr_step) / type_size;
2703 }
2704 else
2705 groupsize = 0;
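  /* For example, a 4-byte access with a constant DR_STEP of 16 bytes gets
     GROUPSIZE 4: the DR touches one out of every four consecutive
     elements.  */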
2706
2707  /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2708 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2709 {
2710      /* Check whether this DR is part of an interleaving group of which
2711	 only a single element is accessed in the loop.  */
2712
2713 /* Gaps are supported only for loads. STEP must be a multiple of the type
2714 size. */
2715 if (DR_IS_READ (dr)
2716 && (dr_step % type_size) == 0
2717 && groupsize > 0
2718 /* This could be UINT_MAX but as we are generating code in a very
2719 inefficient way we have to cap earlier.
2720 See PR91403 for example. */
2721 && groupsize <= 4096)
2722 {
2723 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2724 DR_GROUP_SIZE (stmt_info) = groupsize;
2725 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_NOTE, vect_location,
2728 "Detected single element interleaving %T"
2729 " step %T\n",
2730 DR_REF (dr), step);
2731
2732 return true;
2733 }
2734
2735 if (dump_enabled_p ())
2736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2737 "not consecutive access %G", stmt_info->stmt);
2738
2739 if (bb_vinfo)
2740 {
2741 /* Mark the statement as unvectorizable. */
2742 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2743 return true;
2744 }
2745
2746 if (dump_enabled_p ())
2747 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2748 STMT_VINFO_STRIDED_P (stmt_info) = true;
2749 return true;
2750 }
2751
2752 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2753 {
2754 /* First stmt in the interleaving chain. Check the chain. */
2755 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2756 struct data_reference *data_ref = dr;
2757 unsigned int count = 1;
2758 tree prev_init = DR_INIT (data_ref);
2759 HOST_WIDE_INT diff, gaps = 0;
2760
2761 /* By construction, all group members have INTEGER_CST DR_INITs. */
2762 while (next)
2763 {
2764 /* We never have the same DR multiple times. */
2765 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2766 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2767
2768 data_ref = STMT_VINFO_DATA_REF (next);
2769
2770 /* All group members have the same STEP by construction. */
2771 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2772
2773 /* Check that the distance between two accesses is equal to the type
2774 size. Otherwise, we have gaps. */
2775 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2776 - TREE_INT_CST_LOW (prev_init)) / type_size;
2777 if (diff < 1 || diff > UINT_MAX)
2778 {
2779 /* For artificial testcases with array accesses with large
2780 constant indices we can run into overflow issues which
2781 can end up fooling the groupsize constraint below so
2782 check the individual gaps (which are represented as
2783 unsigned int) as well. */
2784 if (dump_enabled_p ())
2785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2786 "interleaved access with gap larger "
2787 "than representable\n");
2788 return false;
2789 }
2790 if (diff != 1)
2791 {
2792 /* FORNOW: SLP of accesses with gaps is not supported. */
2793 slp_impossible = true;
2794 if (DR_IS_WRITE (data_ref))
2795 {
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2798 "interleaved store with gaps\n");
2799 return false;
2800 }
2801
2802 gaps += diff - 1;
2803 }
2804
2805 last_accessed_element += diff;
2806
2807 /* Store the gap from the previous member of the group. If there is no
2808 gap in the access, DR_GROUP_GAP is always 1. */
2809 DR_GROUP_GAP (next) = diff;
2810
2811 prev_init = DR_INIT (data_ref);
2812 next = DR_GROUP_NEXT_ELEMENT (next);
2813 /* Count the number of data-refs in the chain. */
2814 count++;
2815 }
2816
2817 if (groupsize == 0)
2818 groupsize = count + gaps;
2819
2820 /* This could be UINT_MAX but as we are generating code in a very
2821 inefficient way we have to cap earlier. See PR78699 for example. */
2822 if (groupsize > 4096)
2823 {
2824 if (dump_enabled_p ())
2825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2826 "group is too large\n");
2827 return false;
2828 }
2829
2830 /* Check that the size of the interleaving is equal to count for stores,
2831 i.e., that there are no gaps. */
2832 if (groupsize != count
2833 && !DR_IS_READ (dr))
2834 {
2835 groupsize = count;
2836 STMT_VINFO_STRIDED_P (stmt_info) = true;
2837 }
2838
2839 /* If there is a gap after the last load in the group it is the
2840 difference between the groupsize and the last accessed
2841 element.
2842 When there is no gap, this difference should be 0. */
2843 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
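      /* E.g. two 4-byte loads at DR_INITs 0 and 4 within a 16-byte step
	 give COUNT 2, GROUPSIZE 4 and LAST_ACCESSED_ELEMENT 2, hence a
	 trailing gap of 2 elements.  */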
2844
2845 DR_GROUP_SIZE (stmt_info) = groupsize;
2846 if (dump_enabled_p ())
2847 {
2848 dump_printf_loc (MSG_NOTE, vect_location,
2849 "Detected interleaving ");
2850 if (DR_IS_READ (dr))
2851 dump_printf (MSG_NOTE, "load ");
2852 else if (STMT_VINFO_STRIDED_P (stmt_info))
2853 dump_printf (MSG_NOTE, "strided store ");
2854 else
2855 dump_printf (MSG_NOTE, "store ");
2856 dump_printf (MSG_NOTE, "of size %u\n",
2857 (unsigned)groupsize);
2858 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2859 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2860 while (next)
2861 {
2862 if (DR_GROUP_GAP (next) != 1)
2863 dump_printf_loc (MSG_NOTE, vect_location,
2864 "\t<gap of %d elements>\n",
2865 DR_GROUP_GAP (next) - 1);
2866 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2867 next = DR_GROUP_NEXT_ELEMENT (next);
2868 }
2869 if (DR_GROUP_GAP (stmt_info) != 0)
2870 dump_printf_loc (MSG_NOTE, vect_location,
2871 "\t<gap of %d elements>\n",
2872 DR_GROUP_GAP (stmt_info));
2873 }
2874
2875      /* SLP: create an SLP data structure for every interleaving group of
2876	 stores for further analysis in vect_analyze_slp.  */
2877 if (DR_IS_WRITE (dr) && !slp_impossible)
2878 {
2879 if (loop_vinfo)
2880 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (obj: stmt_info);
2881 if (bb_vinfo)
2882 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (obj: stmt_info);
2883 }
2884 }
2885
2886 return true;
2887}
2888
2889/* Analyze groups of accesses: check that DR_INFO belongs to a group of
2890 accesses of legal size, step, etc. Detect gaps, single element
2891 interleaving, and other special cases. Set grouped access info.
2892 Collect groups of strided stores for further use in SLP analysis. */
2893
2894static bool
2895vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2896{
2897 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2898 {
2899 /* Dissolve the group if present. */
2900 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2901 while (stmt_info)
2902 {
2903 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2904 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2905 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2906 stmt_info = next;
2907 }
2908 return false;
2909 }
2910 return true;
2911}
2912
2913/* Analyze the access pattern of the data-reference DR_INFO.
2914 In case of non-consecutive accesses call vect_analyze_group_access() to
2915 analyze groups of accesses. */
2916
2917static bool
2918vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2919{
2920 data_reference *dr = dr_info->dr;
2921 tree step = DR_STEP (dr);
2922 tree scalar_type = TREE_TYPE (DR_REF (dr));
2923 stmt_vec_info stmt_info = dr_info->stmt;
2924 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
2925 class loop *loop = NULL;
2926
2927 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2928 return true;
2929
2930 if (loop_vinfo)
2931 loop = LOOP_VINFO_LOOP (loop_vinfo);
2932
2933 if (loop_vinfo && !step)
2934 {
2935 if (dump_enabled_p ())
2936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937 "bad data-ref access in loop\n");
2938 return false;
2939 }
2940
2941 /* Allow loads with zero step in inner-loop vectorization. */
2942 if (loop_vinfo && integer_zerop (step))
2943 {
2944 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2945 if (!nested_in_vect_loop_p (loop, stmt_info))
2946 return DR_IS_READ (dr);
2947 /* Allow references with zero step for outer loops marked
2948 with pragma omp simd only - it guarantees absence of
2949 loop-carried dependencies between inner loop iterations. */
2950 if (loop->safelen < 2)
2951 {
2952 if (dump_enabled_p ())
2953 dump_printf_loc (MSG_NOTE, vect_location,
2954 "zero step in inner loop of nest\n");
2955 return false;
2956 }
2957 }
2958
2959 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2960 {
2961 /* Interleaved accesses are not yet supported within outer-loop
2962 vectorization for references in the inner-loop. */
2963 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2964
2965 /* For the rest of the analysis we use the outer-loop step. */
2966 step = STMT_VINFO_DR_STEP (stmt_info);
2967 if (integer_zerop (step))
2968 {
2969 if (dump_enabled_p ())
2970 dump_printf_loc (MSG_NOTE, vect_location,
2971 "zero step in outer loop.\n");
2972 return DR_IS_READ (dr);
2973 }
2974 }
2975
2976 /* Consecutive? */
2977 if (TREE_CODE (step) == INTEGER_CST)
2978 {
2979 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2980 if (!tree_int_cst_compare (t1: step, TYPE_SIZE_UNIT (scalar_type))
2981 || (dr_step < 0
2982 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2983 {
2984 /* Mark that it is not interleaving. */
2985 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2986 return true;
2987 }
2988 }
2989
2990 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2991 {
2992 if (dump_enabled_p ())
2993 dump_printf_loc (MSG_NOTE, vect_location,
2994 "grouped access in outer loop.\n");
2995 return false;
2996 }
2997
2998
2999 /* Assume this is a DR handled by non-constant strided load case. */
3000 if (TREE_CODE (step) != INTEGER_CST)
3001 return (STMT_VINFO_STRIDED_P (stmt_info)
3002 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3003 || vect_analyze_group_access (vinfo, dr_info)));
3004
3005  /* Not a consecutive access - check whether it is part of an interleaving group.  */
3006 return vect_analyze_group_access (vinfo, dr_info);
3007}
3008
3009/* Compare two data-references DRA and DRB to sort them into chunks
3010   suitable for building interleaving groups.  */
3011
3012static int
3013dr_group_sort_cmp (const void *dra_, const void *drb_)
3014{
3015 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3016 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3017 data_reference_p dra = dra_info->dr;
3018 data_reference_p drb = drb_info->dr;
3019 int cmp;
3020
3021 /* Stabilize sort. */
3022 if (dra == drb)
3023 return 0;
3024
3025  /* DRs with different group IDs never belong to the same group.  */
3026 if (dra_info->group != drb_info->group)
3027 return dra_info->group < drb_info->group ? -1 : 1;
3028
3029 /* Ordering of DRs according to base. */
3030 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3031 DR_BASE_ADDRESS (drb));
3032 if (cmp != 0)
3033 return cmp;
3034
3035 /* And according to DR_OFFSET. */
3036 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3037 if (cmp != 0)
3038 return cmp;
3039
3040 /* Put reads before writes. */
3041 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3042 return DR_IS_READ (dra) ? -1 : 1;
3043
3044  /* Then sort by access size.  */
3045 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3046 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3047 if (cmp != 0)
3048 return cmp;
3049
3050  /* And by DR_STEP.  */
3051 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3052 if (cmp != 0)
3053 return cmp;
3054
3055  /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
3056 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3057 if (cmp == 0)
3058 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3059 return cmp;
3060}
3061
3062/* If OP is the result of a conversion, return the unconverted value,
3063 otherwise return null. */
3064
3065static tree
3066strip_conversion (tree op)
3067{
3068 if (TREE_CODE (op) != SSA_NAME)
3069 return NULL_TREE;
3070 gimple *stmt = SSA_NAME_DEF_STMT (op);
3071 if (!is_gimple_assign (gs: stmt)
3072 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3073 return NULL_TREE;
3074 return gimple_assign_rhs1 (gs: stmt);
3075}
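
/* For example, if a mask is defined as _2 = (unsigned char) _1 (SSA names
   purely for illustration), strip_conversion returns _1, which lets
   can_group_stmts_p below compare the unconverted masks.  */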
3076
3077/* Return true if vectorizable_* routines can handle statements STMT1_INFO
3078 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3079 be grouped in SLP mode. */
3080
3081static bool
3082can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3083 bool allow_slp_p)
3084{
3085 if (gimple_assign_single_p (gs: stmt1_info->stmt))
3086 return gimple_assign_single_p (gs: stmt2_info->stmt);
3087
3088 gcall *call1 = dyn_cast <gcall *> (p: stmt1_info->stmt);
3089 if (call1 && gimple_call_internal_p (gs: call1))
3090 {
3091 /* Check for two masked loads or two masked stores. */
3092 gcall *call2 = dyn_cast <gcall *> (p: stmt2_info->stmt);
3093 if (!call2 || !gimple_call_internal_p (gs: call2))
3094 return false;
3095 internal_fn ifn = gimple_call_internal_fn (gs: call1);
3096 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3097 return false;
3098 if (ifn != gimple_call_internal_fn (gs: call2))
3099 return false;
3100
3101 /* Check that the masks are the same. Cope with casts of masks,
3102 like those created by build_mask_conversion. */
3103 tree mask1 = gimple_call_arg (gs: call1, index: 2);
3104 tree mask2 = gimple_call_arg (gs: call2, index: 2);
3105 if (!operand_equal_p (mask1, mask2, flags: 0) && !allow_slp_p)
3106 {
3107 mask1 = strip_conversion (op: mask1);
3108 if (!mask1)
3109 return false;
3110 mask2 = strip_conversion (op: mask2);
3111 if (!mask2)
3112 return false;
3113 if (!operand_equal_p (mask1, mask2, flags: 0))
3114 return false;
3115 }
3116 return true;
3117 }
3118
3119 return false;
3120}
3121
3122/* Function vect_analyze_data_ref_accesses.
3123
3124 Analyze the access pattern of all the data references in the loop.
3125
3126 FORNOW: the only access pattern that is considered vectorizable is a
3127 simple step 1 (consecutive) access.
3128
3129 FORNOW: handle only arrays and pointer accesses. */
3130
3131opt_result
3132vect_analyze_data_ref_accesses (vec_info *vinfo,
3133 vec<int> *dataref_groups)
3134{
3135 unsigned int i;
3136 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3137
3138 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3139
3140 if (datarefs.is_empty ())
3141 return opt_result::success ();
3142
3143 /* Sort the array of datarefs to make building the interleaving chains
3144 linear. Don't modify the original vector's order, it is needed for
3145 determining what dependencies are reversed. */
3146 vec<dr_vec_info *> datarefs_copy;
3147 datarefs_copy.create (nelems: datarefs.length ());
3148 for (unsigned i = 0; i < datarefs.length (); i++)
3149 {
3150 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3151 /* If the caller computed DR grouping use that, otherwise group by
3152 basic blocks. */
3153 if (dataref_groups)
3154 dr_info->group = (*dataref_groups)[i];
3155 else
3156 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3157 datarefs_copy.quick_push (obj: dr_info);
3158 }
3159 datarefs_copy.qsort (dr_group_sort_cmp);
3160 hash_set<stmt_vec_info> to_fixup;
3161
3162 /* Build the interleaving chains. */
3163 for (i = 0; i < datarefs_copy.length () - 1;)
3164 {
3165 dr_vec_info *dr_info_a = datarefs_copy[i];
3166 data_reference_p dra = dr_info_a->dr;
3167 int dra_group_id = dr_info_a->group;
3168 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3169 stmt_vec_info lastinfo = NULL;
3170 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3171 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3172 {
3173 ++i;
3174 continue;
3175 }
3176 for (i = i + 1; i < datarefs_copy.length (); ++i)
3177 {
3178 dr_vec_info *dr_info_b = datarefs_copy[i];
3179 data_reference_p drb = dr_info_b->dr;
3180 int drb_group_id = dr_info_b->group;
3181 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3182 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3183 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3184 break;
3185
3186	  /* ??? Imperfect sorting (non-compatible types, non-modulo
3187	     accesses, same accesses) can lead to a group being artificially
3188	     split here as we don't just skip over those.  If it really
3189	     matters we can push those to a worklist and re-iterate
3190	     over them.  Then we can just skip ahead to the next DR here.  */
3191
3192 /* DRs in a different DR group should not be put into the same
3193 interleaving group. */
3194 if (dra_group_id != drb_group_id)
3195 break;
3196
3197 /* Check that the data-refs have same first location (except init)
3198 and they are both either store or load (not load and store,
3199 not masked loads or stores). */
3200 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3201 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3202 DR_BASE_ADDRESS (drb)) != 0
3203 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3204 || !can_group_stmts_p (stmt1_info: stmtinfo_a, stmt2_info: stmtinfo_b, allow_slp_p: true))
3205 break;
3206
3207 /* Check that the data-refs have the same constant size. */
3208 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3209 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3210 if (!tree_fits_uhwi_p (sza)
3211 || !tree_fits_uhwi_p (szb)
3212 || !tree_int_cst_equal (sza, szb))
3213 break;
3214
3215 /* Check that the data-refs have the same step. */
3216 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3217 break;
3218
3219 /* Check the types are compatible.
3220 ??? We don't distinguish this during sorting. */
3221 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3222 TREE_TYPE (DR_REF (drb))))
3223 break;
3224
3225 /* Check that the DR_INITs are compile-time constants. */
3226 if (!tree_fits_shwi_p (DR_INIT (dra))
3227 || !tree_fits_shwi_p (DR_INIT (drb)))
3228 break;
3229
3230 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3231 just hold extra information. */
3232 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3233 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3234 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3235 break;
3236
3237 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3238 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3239 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3240 HOST_WIDE_INT init_prev
3241 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3242 gcc_assert (init_a <= init_b
3243 && init_a <= init_prev
3244 && init_prev <= init_b);
3245
3246 /* Do not place the same access in the interleaving chain twice. */
3247 if (init_b == init_prev)
3248 {
3249 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3250 < gimple_uid (DR_STMT (drb)));
3251 /* Simply link in duplicates and fix up the chain below. */
3252 }
3253 else
3254 {
3255 /* If init_b == init_a + the size of the type * k, we have an
3256 interleaving, and DRA is accessed before DRB. */
3257 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3258 if (type_size_a == 0
3259 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3260 % type_size_a != 0))
3261 break;
3262
3263 /* If we have a store, the accesses are adjacent. This splits
3264 groups into chunks we support (we don't support vectorization
3265 of stores with gaps). */
3266 if (!DR_IS_READ (dra)
3267 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3268 != type_size_a))
3269 break;
3270
	      /* If the step (when constant and nonzero) is smaller than the
		 difference between the data-refs' inits, this splits groups
		 into suitable sizes.  */
3274 if (tree_fits_shwi_p (DR_STEP (dra)))
3275 {
3276 unsigned HOST_WIDE_INT step
3277 = absu_hwi (x: tree_to_shwi (DR_STEP (dra)));
3278 if (step != 0
3279 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3280 break;
3281 }
3282 }
3283
3284 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_NOTE, vect_location,
3286 DR_IS_READ (dra)
3287 ? "Detected interleaving load %T and %T\n"
3288 : "Detected interleaving store %T and %T\n",
3289 DR_REF (dra), DR_REF (drb));
3290
3291 /* Link the found element into the group list. */
3292 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3293 {
3294 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3295 lastinfo = stmtinfo_a;
3296 }
3297 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3298 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3299 lastinfo = stmtinfo_b;
3300
3301 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3302 = !can_group_stmts_p (stmt1_info: stmtinfo_a, stmt2_info: stmtinfo_b, allow_slp_p: false);
3303
3304 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3305 dump_printf_loc (MSG_NOTE, vect_location,
3306 "Load suitable for SLP vectorization only.\n");
3307
3308 if (init_b == init_prev
3309 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3310 && dump_enabled_p ())
3311 dump_printf_loc (MSG_NOTE, vect_location,
3312 "Queuing group with duplicate access for fixup\n");
3313 }
3314 }
3315
  /* Fixup groups with duplicate entries by splitting them.  */
3317 while (1)
3318 {
3319 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3320 if (!(it != to_fixup.end ()))
3321 break;
3322 stmt_vec_info grp = *it;
3323 to_fixup.remove (k: grp);
3324
3325 /* Find the earliest duplicate group member. */
3326 unsigned first_duplicate = -1u;
3327 stmt_vec_info next, g = grp;
3328 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3329 {
3330 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3331 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3332 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3333 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3334 g = next;
3335 }
3336 if (first_duplicate == -1U)
3337 continue;
3338
3339 /* Then move all stmts after the first duplicate to a new group.
3340 Note this is a heuristic but one with the property that *it
3341 is fixed up completely. */
3342 g = grp;
3343 stmt_vec_info newgroup = NULL, ng = grp;
3344 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3345 {
3346 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3347 {
3348 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3349 if (!newgroup)
3350 newgroup = next;
3351 else
3352 DR_GROUP_NEXT_ELEMENT (ng) = next;
3353 ng = next;
3354 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3355 }
3356 else
3357 g = DR_GROUP_NEXT_ELEMENT (g);
3358 }
3359 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3360
3361 /* Fixup the new group which still may contain duplicates. */
3362 to_fixup.add (k: newgroup);
3363 }
3364
3365 dr_vec_info *dr_info;
3366 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3367 {
3368 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3369 && !vect_analyze_data_ref_access (vinfo, dr_info))
3370 {
3371 if (dump_enabled_p ())
3372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3373 "not vectorized: complicated access pattern.\n");
3374
3375 if (is_a <bb_vec_info> (p: vinfo))
3376 {
3377 /* Mark the statement as not vectorizable. */
3378 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3379 continue;
3380 }
3381 else
3382 {
3383 datarefs_copy.release ();
3384 return opt_result::failure_at (loc: dr_info->stmt->stmt,
3385 fmt: "not vectorized:"
3386 " complicated access pattern.\n");
3387 }
3388 }
3389 }
3390
3391 datarefs_copy.release ();
3392 return opt_result::success ();
3393}
3394
3395/* Function vect_vfa_segment_size.
3396
3397 Input:
3398 DR_INFO: The data reference.
3399 LENGTH_FACTOR: segment length to consider.
3400
3401 Return a value suitable for the dr_with_seg_len::seg_len field.
3402 This is the "distance travelled" by the pointer from the first
3403 iteration in the segment to the last. Note that it does not include
3404 the size of the access; in effect it only describes the first byte. */
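
/* For instance, with a DR_STEP of 4 bytes and a LENGTH_FACTOR of 8, the
   value computed below is 4 * (8 - 1) == 28: the pointer travels 28 bytes
   from the first to the last scalar iteration of the segment, exclusive
   of the access size itself.  */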
3405
3406static tree
3407vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3408{
3409 length_factor = size_binop (MINUS_EXPR,
3410 fold_convert (sizetype, length_factor),
3411 size_one_node);
3412 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3413 length_factor);
3414}
3415
3416/* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3417 gives the worst-case number of bytes covered by the segment. */
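
/* For example, with 4-byte scalar accesses, a DR_GROUP_SIZE of 4 and a
   DR_GROUP_GAP of 1, the access size below is 4 * (4 - 1) == 12 bytes;
   the dr_explicit_realign_optimized case then adds the difference between
   the vector size and the scalar size on top of that.  */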
3418
3419static unsigned HOST_WIDE_INT
3420vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3421{
3422 stmt_vec_info stmt_vinfo = dr_info->stmt;
3423 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3424 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3425 unsigned HOST_WIDE_INT access_size = ref_size;
3426 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3427 {
3428 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3429 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3430 }
3431 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3432 int misalignment;
3433 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3434 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3435 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3436 == dr_explicit_realign_optimized))
3437 {
3438 /* We might access a full vector's worth. */
3439 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3440 }
3441 return access_size;
3442}
3443
3444/* Get the minimum alignment for all the scalar accesses that DR_INFO
3445 describes. */
3446
3447static unsigned int
3448vect_vfa_align (dr_vec_info *dr_info)
3449{
3450 return dr_alignment (dr: dr_info->dr);
3451}
3452
/* Function vect_compile_time_alias.
3454
3455 Given data references A and B with equal base and offset, see whether
3456 the alias relation can be decided at compilation time. Return 1 if
3457 it can and the references alias, 0 if it can and the references do
3458 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3459 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3460 of dr_with_seg_len::{seg_len,access_size} for A and B. */
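
/* For instance, with positive steps, DR_INITs of 0 and 64, segment
   lengths of 32 and access sizes of 4 for both references, the ranges
   compared below are [0, 36) and [64, 100); they cannot overlap, so the
   result is 0 (no alias).  */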
3461
3462static int
3463vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3464 tree segment_length_a, tree segment_length_b,
3465 unsigned HOST_WIDE_INT access_size_a,
3466 unsigned HOST_WIDE_INT access_size_b)
3467{
3468 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3469 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3470 poly_uint64 const_length_a;
3471 poly_uint64 const_length_b;
3472
3473 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3474 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3475 [a, a+12) */
3476 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3477 {
3478 const_length_a = (-wi::to_poly_wide (t: segment_length_a)).force_uhwi ();
3479 offset_a -= const_length_a;
3480 }
3481 else
3482 const_length_a = tree_to_poly_uint64 (segment_length_a);
3483 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3484 {
3485 const_length_b = (-wi::to_poly_wide (t: segment_length_b)).force_uhwi ();
3486 offset_b -= const_length_b;
3487 }
3488 else
3489 const_length_b = tree_to_poly_uint64 (segment_length_b);
3490
3491 const_length_a += access_size_a;
3492 const_length_b += access_size_b;
3493
3494 if (ranges_known_overlap_p (pos1: offset_a, size1: const_length_a,
3495 pos2: offset_b, size2: const_length_b))
3496 return 1;
3497
3498 if (!ranges_maybe_overlap_p (pos1: offset_a, size1: const_length_a,
3499 pos2: offset_b, size2: const_length_b))
3500 return 0;
3501
3502 return -1;
3503}
3504
3505/* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3506 in DDR is >= VF. */
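
/* For example, a distance vector entry of 8 at LOOP_DEPTH together with a
   vectorization factor of 4 means the two references cannot conflict
   within a single vector iteration, so the caller can drop the runtime
   alias check for this DDR.  */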
3507
3508static bool
3509dependence_distance_ge_vf (data_dependence_relation *ddr,
3510 unsigned int loop_depth, poly_uint64 vf)
3511{
3512 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3513 || DDR_NUM_DIST_VECTS (ddr) == 0)
3514 return false;
3515
3516 /* If the dependence is exact, we should have limited the VF instead. */
3517 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3518
3519 unsigned int i;
3520 lambda_vector dist_v;
3521 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3522 {
3523 HOST_WIDE_INT dist = dist_v[loop_depth];
3524 if (dist != 0
3525 && !(dist > 0 && DDR_REVERSED_P (ddr))
3526 && maybe_lt (a: (unsigned HOST_WIDE_INT) abs_hwi (x: dist), b: vf))
3527 return false;
3528 }
3529
3530 if (dump_enabled_p ())
3531 dump_printf_loc (MSG_NOTE, vect_location,
3532 "dependence distance between %T and %T is >= VF\n",
3533 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3534
3535 return true;
3536}
3537
3538/* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3539
3540static void
3541dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3542{
3543 dump_printf (dump_kind, "%s (%T) >= ",
3544 lower_bound.unsigned_p ? "unsigned" : "abs",
3545 lower_bound.expr);
3546 dump_dec (dump_kind, lower_bound.min_value);
3547}
3548
3549/* Record that the vectorized loop requires the vec_lower_bound described
3550 by EXPR, UNSIGNED_P and MIN_VALUE. */
3551
3552static void
3553vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3554 poly_uint64 min_value)
3555{
3556 vec<vec_lower_bound> &lower_bounds
3557 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3558 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3559 if (operand_equal_p (lower_bounds[i].expr, expr, flags: 0))
3560 {
3561 unsigned_p &= lower_bounds[i].unsigned_p;
3562 min_value = upper_bound (a: lower_bounds[i].min_value, b: min_value);
3563 if (lower_bounds[i].unsigned_p != unsigned_p
3564 || maybe_lt (a: lower_bounds[i].min_value, b: min_value))
3565 {
3566 lower_bounds[i].unsigned_p = unsigned_p;
3567 lower_bounds[i].min_value = min_value;
3568 if (dump_enabled_p ())
3569 {
3570 dump_printf_loc (MSG_NOTE, vect_location,
3571 "updating run-time check to ");
3572 dump_lower_bound (dump_kind: MSG_NOTE, lower_bound: lower_bounds[i]);
3573 dump_printf (MSG_NOTE, "\n");
3574 }
3575 }
3576 return;
3577 }
3578
3579 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3580 if (dump_enabled_p ())
3581 {
3582 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3583 dump_lower_bound (dump_kind: MSG_NOTE, lower_bound);
3584 dump_printf (MSG_NOTE, "\n");
3585 }
3586 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (obj: lower_bound);
3587}
3588
3589/* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3590 will span fewer than GAP bytes. */
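
/* For example, with an estimated vectorization factor of 4, a group of
   two 4-byte accesses and a GAP of 16 bytes, the comparison below is
   16 <= 4 * 2 * 4, so the gap is considered small.  */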
3591
3592static bool
3593vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3594 poly_int64 gap)
3595{
3596 stmt_vec_info stmt_info = dr_info->stmt;
3597 HOST_WIDE_INT count
3598 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3599 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3600 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3601 return (estimated_poly_value (x: gap)
3602 <= count * vect_get_scalar_dr_size (dr_info));
3603}
3604
3605/* Return true if we know that there is no alias between DR_INFO_A and
3606 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3607 When returning true, set *LOWER_BOUND_OUT to this N. */
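
/* For example, for 4-byte accesses a[i] and a[i + 2] with the same step,
   the gap between the DR_INITs is 8 bytes and the second access covers 4
   more, so *LOWER_BOUND_OUT becomes 12: the references cannot alias once
   abs (DR_STEP) is at least 12 bytes.  */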
3608
3609static bool
3610vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3611 poly_uint64 *lower_bound_out)
3612{
3613 /* Check that there is a constant gap of known sign between DR_A
3614 and DR_B. */
3615 data_reference *dr_a = dr_info_a->dr;
3616 data_reference *dr_b = dr_info_b->dr;
3617 poly_int64 init_a, init_b;
3618 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), flags: 0)
3619 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), flags: 0)
3620 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), flags: 0)
3621 || !poly_int_tree_p (DR_INIT (dr_a), value: &init_a)
3622 || !poly_int_tree_p (DR_INIT (dr_b), value: &init_b)
3623 || !ordered_p (a: init_a, b: init_b))
3624 return false;
3625
3626 /* Sort DR_A and DR_B by the address they access. */
3627 if (maybe_lt (a: init_b, b: init_a))
3628 {
3629 std::swap (a&: init_a, b&: init_b);
3630 std::swap (a&: dr_info_a, b&: dr_info_b);
3631 std::swap (a&: dr_a, b&: dr_b);
3632 }
3633
3634 /* If the two accesses could be dependent within a scalar iteration,
3635 make sure that we'd retain their order. */
3636 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3637 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3638 return false;
3639
3640 /* There is no alias if abs (DR_STEP) is greater than or equal to
3641 the bytes spanned by the combination of the two accesses. */
3642 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info: dr_info_b) - init_a;
3643 return true;
3644}
3645
3646/* Function vect_prune_runtime_alias_test_list.
3647
3648 Prune a list of ddrs to be tested at run-time by versioning for alias.
3649 Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3652
3653opt_result
3654vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3655{
3656 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3657 hash_set <tree_pair_hash> compared_objects;
3658
3659 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3660 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3661 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3662 const vec<vec_object_pair> &check_unequal_addrs
3663 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3664 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3665 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3666
3667 ddr_p ddr;
3668 unsigned int i;
3669 tree length_factor;
3670
3671 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3672
3673 /* Step values are irrelevant for aliasing if the number of vector
3674 iterations is equal to the number of scalar iterations (which can
3675 happen for fully-SLP loops). */
3676 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3677
3678 if (!vf_one_p)
3679 {
3680 /* Convert the checks for nonzero steps into bound tests. */
3681 tree value;
3682 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3683 vect_check_lower_bound (loop_vinfo, expr: value, unsigned_p: true, min_value: 1);
3684 }
3685
3686 if (may_alias_ddrs.is_empty ())
3687 return opt_result::success ();
3688
3689 comp_alias_ddrs.create (nelems: may_alias_ddrs.length ());
3690
3691 unsigned int loop_depth
3692 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3693 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3694
3695 /* First, we collect all data ref pairs for aliasing checks. */
3696 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3697 {
3698 poly_uint64 lower_bound;
3699 tree segment_length_a, segment_length_b;
3700 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3701 unsigned int align_a, align_b;
3702
3703 /* Ignore the alias if the VF we chose ended up being no greater
3704 than the dependence distance. */
3705 if (dependence_distance_ge_vf (ddr, loop_depth, vf: vect_factor))
3706 continue;
3707
3708 if (DDR_OBJECT_A (ddr))
3709 {
3710 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3711 if (!compared_objects.add (k: new_pair))
3712 {
3713 if (dump_enabled_p ())
3714 dump_printf_loc (MSG_NOTE, vect_location,
3715 "checking that %T and %T"
3716 " have different addresses\n",
3717 new_pair.first, new_pair.second);
3718 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (obj: new_pair);
3719 }
3720 continue;
3721 }
3722
3723 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3724 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3725
3726 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3727 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3728
3729 bool preserves_scalar_order_p
3730 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3731 bool ignore_step_p
3732 = (vf_one_p
3733 && (preserves_scalar_order_p
3734 || operand_equal_p (DR_STEP (dr_info_a->dr),
3735 DR_STEP (dr_info_b->dr))));
3736
3737 /* Skip the pair if inter-iteration dependencies are irrelevant
3738 and intra-iteration dependencies are guaranteed to be honored. */
3739 if (ignore_step_p
3740 && (preserves_scalar_order_p
3741 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3742 lower_bound_out: &lower_bound)))
3743 {
3744 if (dump_enabled_p ())
3745 dump_printf_loc (MSG_NOTE, vect_location,
3746 "no need for alias check between "
3747 "%T and %T when VF is 1\n",
3748 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3749 continue;
3750 }
3751
3752 /* See whether we can handle the alias using a bounds check on
3753 the step, and whether that's likely to be the best approach.
3754 (It might not be, for example, if the minimum step is much larger
3755 than the number of bytes handled by one vector iteration.) */
3756 if (!ignore_step_p
3757 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3758 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3759 lower_bound_out: &lower_bound)
3760 && (vect_small_gap_p (loop_vinfo, dr_info: dr_info_a, gap: lower_bound)
3761 || vect_small_gap_p (loop_vinfo, dr_info: dr_info_b, gap: lower_bound)))
3762 {
3763 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3764 if (dump_enabled_p ())
3765 {
3766 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3767 "%T and %T when the step %T is outside ",
3768 DR_REF (dr_info_a->dr),
3769 DR_REF (dr_info_b->dr),
3770 DR_STEP (dr_info_a->dr));
3771 if (unsigned_p)
3772 dump_printf (MSG_NOTE, "[0");
3773 else
3774 {
3775 dump_printf (MSG_NOTE, "(");
3776 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3777 }
3778 dump_printf (MSG_NOTE, ", ");
3779 dump_dec (MSG_NOTE, lower_bound);
3780 dump_printf (MSG_NOTE, ")\n");
3781 }
3782 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3783 unsigned_p, min_value: lower_bound);
3784 continue;
3785 }
3786
3787 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3788 if (dr_group_first_a)
3789 {
3790 stmt_info_a = dr_group_first_a;
3791 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3792 }
3793
3794 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3795 if (dr_group_first_b)
3796 {
3797 stmt_info_b = dr_group_first_b;
3798 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3799 }
3800
3801 if (ignore_step_p)
3802 {
3803 segment_length_a = size_zero_node;
3804 segment_length_b = size_zero_node;
3805 }
3806 else
3807 {
3808 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3809 DR_STEP (dr_info_b->dr), flags: 0))
3810 length_factor = scalar_loop_iters;
3811 else
3812 length_factor = size_int (vect_factor);
3813 segment_length_a = vect_vfa_segment_size (dr_info: dr_info_a, length_factor);
3814 segment_length_b = vect_vfa_segment_size (dr_info: dr_info_b, length_factor);
3815 }
3816 access_size_a = vect_vfa_access_size (vinfo: loop_vinfo, dr_info: dr_info_a);
3817 access_size_b = vect_vfa_access_size (vinfo: loop_vinfo, dr_info: dr_info_b);
3818 align_a = vect_vfa_align (dr_info: dr_info_a);
3819 align_b = vect_vfa_align (dr_info: dr_info_b);
3820
3821 /* See whether the alias is known at compilation time. */
3822 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3823 DR_BASE_ADDRESS (dr_info_b->dr), flags: 0)
3824 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3825 DR_OFFSET (dr_info_b->dr), flags: 0)
3826 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3827 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3828 && poly_int_tree_p (t: segment_length_a)
3829 && poly_int_tree_p (t: segment_length_b))
3830 {
3831 int res = vect_compile_time_alias (a: dr_info_a, b: dr_info_b,
3832 segment_length_a,
3833 segment_length_b,
3834 access_size_a,
3835 access_size_b);
3836 if (res >= 0 && dump_enabled_p ())
3837 {
3838 dump_printf_loc (MSG_NOTE, vect_location,
3839 "can tell at compile time that %T and %T",
3840 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3841 if (res == 0)
3842 dump_printf (MSG_NOTE, " do not alias\n");
3843 else
3844 dump_printf (MSG_NOTE, " alias\n");
3845 }
3846
3847 if (res == 0)
3848 continue;
3849
3850 if (res == 1)
3851 return opt_result::failure_at (loc: stmt_info_b->stmt,
3852 fmt: "not vectorized:"
3853 " compilation time alias: %G%G",
3854 stmt_info_a->stmt,
3855 stmt_info_b->stmt);
3856 }
3857
3858 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3859 access_size_a, align_a);
3860 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3861 access_size_b, align_b);
3862 /* Canonicalize the order to be the one that's needed for accurate
3863 RAW, WAR and WAW flags, in cases where the data references are
3864 well-ordered. The order doesn't really matter otherwise,
3865 but we might as well be consistent. */
3866 if (get_later_stmt (stmt1_info: stmt_info_a, stmt2_info: stmt_info_b) == stmt_info_a)
3867 std::swap (a&: dr_a, b&: dr_b);
3868
3869 dr_with_seg_len_pair_t dr_with_seg_len_pair
3870 (dr_a, dr_b, (preserves_scalar_order_p
3871 ? dr_with_seg_len_pair_t::WELL_ORDERED
3872 : dr_with_seg_len_pair_t::REORDERED));
3873
3874 comp_alias_ddrs.safe_push (obj: dr_with_seg_len_pair);
3875 }
3876
3877 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3878
3879 unsigned int count = (comp_alias_ddrs.length ()
3880 + check_unequal_addrs.length ());
3881
3882 if (count
3883 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3884 == VECT_COST_MODEL_VERY_CHEAP))
3885 return opt_result::failure_at
3886 (loc: vect_location, fmt: "would need a runtime alias check\n");
3887
3888 if (dump_enabled_p ())
3889 dump_printf_loc (MSG_NOTE, vect_location,
3890 "improved number of alias checks from %d to %d\n",
3891 may_alias_ddrs.length (), count);
3892 unsigned limit = param_vect_max_version_for_alias_checks;
3893 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3894 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3895 if (count > limit)
3896 return opt_result::failure_at
3897 (loc: vect_location,
3898 fmt: "number of versioning for alias run-time tests exceeds %d "
3899 "(--param vect-max-version-for-alias-checks)\n", limit);
3900
3901 return opt_result::success ();
3902}
3903
3904/* Check whether we can use an internal function for a gather load
3905 or scatter store. READ_P is true for loads and false for stores.
3906 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3907 the type of the memory elements being loaded or stored. OFFSET_TYPE
3908 is the type of the offset that is being applied to the invariant
3909 base address. SCALE is the amount by which the offset should
3910 be multiplied *after* it has been converted to address width.
3911
3912 Return true if the function is supported, storing the function id in
3913 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
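
/* For example, if the retry loop below starts with a 16-bit OFFSET_TYPE
   that no supported internal function accepts, the offset type is widened
   to 32 and then 64 bits, giving up once its precision reaches both the
   pointer width and the element width.  */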
3914
3915bool
3916vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3917 tree vectype, tree memory_type, tree offset_type,
3918 int scale, internal_fn *ifn_out,
3919 tree *offset_vectype_out)
3920{
3921 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3922 unsigned int element_bits = vector_element_bits (vectype);
3923 if (element_bits != memory_bits)
3924 /* For now the vector elements must be the same width as the
3925 memory elements. */
3926 return false;
3927
3928 /* Work out which function we need. */
3929 internal_fn ifn, alt_ifn, alt_ifn2;
3930 if (read_p)
3931 {
3932 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3933 alt_ifn = IFN_MASK_GATHER_LOAD;
      /* When the target supports MASK_LEN_GATHER_LOAD, we always
	 use MASK_LEN_GATHER_LOAD regardless of whether len and
	 mask are valid or not.  */
3937 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
3938 }
3939 else
3940 {
3941 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3942 alt_ifn = IFN_MASK_SCATTER_STORE;
      /* When the target supports MASK_LEN_SCATTER_STORE, we always
	 use MASK_LEN_SCATTER_STORE regardless of whether len and
	 mask are valid or not.  */
3946 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
3947 }
3948
3949 for (;;)
3950 {
3951 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3952 if (!offset_vectype)
3953 return false;
3954
3955 /* Test whether the target supports this combination. */
3956 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3957 offset_vectype, scale))
3958 {
3959 *ifn_out = ifn;
3960 *offset_vectype_out = offset_vectype;
3961 return true;
3962 }
3963 else if (!masked_p
3964 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3965 memory_type,
3966 offset_vectype,
3967 scale))
3968 {
3969 *ifn_out = alt_ifn;
3970 *offset_vectype_out = offset_vectype;
3971 return true;
3972 }
3973 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
3974 memory_type,
3975 offset_vectype, scale))
3976 {
3977 *ifn_out = alt_ifn2;
3978 *offset_vectype_out = offset_vectype;
3979 return true;
3980 }
3981
3982 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3983 && TYPE_PRECISION (offset_type) >= element_bits)
3984 return false;
3985
3986 offset_type = build_nonstandard_integer_type
3987 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3988 }
3989}
3990
3991/* STMT_INFO is a call to an internal gather load or scatter store function.
3992 Describe the operation in INFO. */
3993
3994static void
3995vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3996 gather_scatter_info *info)
3997{
3998 gcall *call = as_a <gcall *> (p: stmt_info->stmt);
3999 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4000 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4001
4002 info->ifn = gimple_call_internal_fn (gs: call);
4003 info->decl = NULL_TREE;
4004 info->base = gimple_call_arg (gs: call, index: 0);
4005 info->offset = gimple_call_arg (gs: call, index: 1);
4006 info->offset_dt = vect_unknown_def_type;
4007 info->offset_vectype = NULL_TREE;
4008 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4009 info->element_type = TREE_TYPE (vectype);
4010 info->memory_type = TREE_TYPE (DR_REF (dr));
4011}
4012
4013/* Return true if a non-affine read or write in STMT_INFO is suitable for a
4014 gather load or scatter store. Describe the operation in *INFO if so. */
4015
4016bool
4017vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4018 gather_scatter_info *info)
4019{
4020 HOST_WIDE_INT scale = 1;
4021 poly_int64 pbitpos, pbitsize;
4022 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4023 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4024 tree offtype = NULL_TREE;
4025 tree decl = NULL_TREE, base, off;
4026 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4027 tree memory_type = TREE_TYPE (DR_REF (dr));
4028 machine_mode pmode;
4029 int punsignedp, reversep, pvolatilep = 0;
4030 internal_fn ifn;
4031 tree offset_vectype;
4032 bool masked_p = false;
4033
4034 /* See whether this is already a call to a gather/scatter internal function.
4035 If not, see whether it's a masked load or store. */
4036 gcall *call = dyn_cast <gcall *> (p: stmt_info->stmt);
4037 if (call && gimple_call_internal_p (gs: call))
4038 {
4039 ifn = gimple_call_internal_fn (gs: call);
4040 if (internal_gather_scatter_fn_p (ifn))
4041 {
4042 vect_describe_gather_scatter_call (stmt_info, info);
4043 return true;
4044 }
4045 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4046 }
4047
4048 /* True if we should aim to use internal functions rather than
4049 built-in functions. */
4050 bool use_ifn_p = (DR_IS_READ (dr)
4051 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4052 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4053
4054 base = DR_REF (dr);
4055 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4056 see if we can use the def stmt of the address. */
4057 if (masked_p
4058 && TREE_CODE (base) == MEM_REF
4059 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4060 && integer_zerop (TREE_OPERAND (base, 1))
4061 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4062 {
4063 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4064 if (is_gimple_assign (gs: def_stmt)
4065 && gimple_assign_rhs_code (gs: def_stmt) == ADDR_EXPR)
4066 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4067 }
4068
4069 /* The gather and scatter builtins need address of the form
4070 loop_invariant + vector * {1, 2, 4, 8}
4071 or
4072 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4073 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4074 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4075 multiplications and additions in it. To get a vector, we need
4076 a single SSA_NAME that will be defined in the loop and will
4077 contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
4079 SSA_NAME OFF and put the loop invariants into a tree BASE
4080 that can be gimplified before the loop. */
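  /* For example, for a hypothetical int load x[idx[i] + 4] the code below
     would typically peel the multiplication by the element size into
     SCALE, fold the invariant "+ 4" (times SCALE) into BASE and leave the
     SSA_NAME computing idx[i] as OFF.  */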
4081 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4082 &punsignedp, &reversep, &pvolatilep);
4083 if (reversep)
4084 return false;
4085
4086 /* PR 107346. Packed structs can have fields at offsets that are not
4087 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4088 if (!multiple_p (a: pbitpos, BITS_PER_UNIT))
4089 return false;
4090
4091 poly_int64 pbytepos = exact_div (a: pbitpos, BITS_PER_UNIT);
4092
4093 if (TREE_CODE (base) == MEM_REF)
4094 {
4095 if (!integer_zerop (TREE_OPERAND (base, 1)))
4096 {
4097 if (off == NULL_TREE)
4098 off = wide_int_to_tree (sizetype, cst: mem_ref_offset (base));
4099 else
4100 off = size_binop (PLUS_EXPR, off,
4101 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4102 }
4103 base = TREE_OPERAND (base, 0);
4104 }
4105 else
4106 base = build_fold_addr_expr (base);
4107
4108 if (off == NULL_TREE)
4109 off = size_zero_node;
4110
  /* If base is not loop invariant, then if off is 0 we start with just
     the constant offset in the loop-invariant BASE and continue with base
     as OFF, otherwise give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and using that as off, but for now punt.  */
4116 if (!expr_invariant_in_loop_p (loop, base))
4117 {
4118 if (!integer_zerop (off))
4119 return false;
4120 off = base;
4121 base = size_int (pbytepos);
4122 }
4123 /* Otherwise put base + constant offset into the loop invariant BASE
4124 and continue with OFF. */
4125 else
4126 {
4127 base = fold_convert (sizetype, base);
4128 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4129 }
4130
4131 /* OFF at this point may be either a SSA_NAME or some tree expression
4132 from get_inner_reference. Try to peel off loop invariants from it
4133 into BASE as long as possible. */
4134 STRIP_NOPS (off);
4135 while (offtype == NULL_TREE)
4136 {
4137 enum tree_code code;
4138 tree op0, op1, add = NULL_TREE;
4139
4140 if (TREE_CODE (off) == SSA_NAME)
4141 {
4142 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4143
4144 if (expr_invariant_in_loop_p (loop, off))
4145 return false;
4146
4147 if (gimple_code (g: def_stmt) != GIMPLE_ASSIGN)
4148 break;
4149
4150 op0 = gimple_assign_rhs1 (gs: def_stmt);
4151 code = gimple_assign_rhs_code (gs: def_stmt);
4152 op1 = gimple_assign_rhs2 (gs: def_stmt);
4153 }
4154 else
4155 {
4156 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4157 return false;
4158 code = TREE_CODE (off);
4159 extract_ops_from_tree (expr: off, code: &code, op0: &op0, op1: &op1);
4160 }
4161 switch (code)
4162 {
4163 case POINTER_PLUS_EXPR:
4164 case PLUS_EXPR:
4165 if (expr_invariant_in_loop_p (loop, op0))
4166 {
4167 add = op0;
4168 off = op1;
4169 do_add:
4170 add = fold_convert (sizetype, add);
4171 if (scale != 1)
4172 add = size_binop (MULT_EXPR, add, size_int (scale));
4173 base = size_binop (PLUS_EXPR, base, add);
4174 continue;
4175 }
4176 if (expr_invariant_in_loop_p (loop, op1))
4177 {
4178 add = op1;
4179 off = op0;
4180 goto do_add;
4181 }
4182 break;
4183 case MINUS_EXPR:
4184 if (expr_invariant_in_loop_p (loop, op1))
4185 {
4186 add = fold_convert (sizetype, op1);
4187 add = size_binop (MINUS_EXPR, size_zero_node, add);
4188 off = op0;
4189 goto do_add;
4190 }
4191 break;
4192 case MULT_EXPR:
4193 if (scale == 1 && tree_fits_shwi_p (op1))
4194 {
4195 int new_scale = tree_to_shwi (op1);
4196 /* Only treat this as a scaling operation if the target
4197 supports it for at least some offset type. */
4198 if (use_ifn_p
4199 && !vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr),
4200 masked_p, vectype, memory_type,
4201 signed_char_type_node,
4202 scale: new_scale, ifn_out: &ifn,
4203 offset_vectype_out: &offset_vectype)
4204 && !vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr),
4205 masked_p, vectype, memory_type,
4206 unsigned_char_type_node,
4207 scale: new_scale, ifn_out: &ifn,
4208 offset_vectype_out: &offset_vectype))
4209 break;
4210 scale = new_scale;
4211 off = op0;
4212 continue;
4213 }
4214 break;
4215 case SSA_NAME:
4216 off = op0;
4217 continue;
4218 CASE_CONVERT:
4219 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4220 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4221 break;
4222
4223 /* Don't include the conversion if the target is happy with
4224 the current offset type. */
4225 if (use_ifn_p
4226 && TREE_CODE (off) == SSA_NAME
4227 && !POINTER_TYPE_P (TREE_TYPE (off))
4228 && vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr),
4229 masked_p, vectype, memory_type,
4230 TREE_TYPE (off), scale, ifn_out: &ifn,
4231 offset_vectype_out: &offset_vectype))
4232 break;
4233
4234 if (TYPE_PRECISION (TREE_TYPE (op0))
4235 == TYPE_PRECISION (TREE_TYPE (off)))
4236 {
4237 off = op0;
4238 continue;
4239 }
4240
	  /* Include the conversion if it is widening and we're using
	     the IFN path, or the target can handle the converted-from
	     offset type, or the current offset size is not already the
	     same as the data vector element size.  */
4245 if ((TYPE_PRECISION (TREE_TYPE (op0))
4246 < TYPE_PRECISION (TREE_TYPE (off)))
4247 && (use_ifn_p
4248 || (DR_IS_READ (dr)
4249 ? (targetm.vectorize.builtin_gather
4250 && targetm.vectorize.builtin_gather (vectype,
4251 TREE_TYPE (op0),
4252 scale))
4253 : (targetm.vectorize.builtin_scatter
4254 && targetm.vectorize.builtin_scatter (vectype,
4255 TREE_TYPE (op0),
4256 scale)))
4257 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4258 TYPE_SIZE (TREE_TYPE (vectype)), flags: 0)))
4259 {
4260 off = op0;
4261 offtype = TREE_TYPE (off);
4262 STRIP_NOPS (off);
4263 continue;
4264 }
4265 break;
4266 default:
4267 break;
4268 }
4269 break;
4270 }
4271
4272 /* If at the end OFF still isn't a SSA_NAME or isn't
4273 defined in the loop, punt. */
4274 if (TREE_CODE (off) != SSA_NAME
4275 || expr_invariant_in_loop_p (loop, off))
4276 return false;
4277
4278 if (offtype == NULL_TREE)
4279 offtype = TREE_TYPE (off);
4280
4281 if (use_ifn_p)
4282 {
4283 if (!vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr), masked_p,
4284 vectype, memory_type, offset_type: offtype, scale,
4285 ifn_out: &ifn, offset_vectype_out: &offset_vectype))
4286 ifn = IFN_LAST;
4287 decl = NULL_TREE;
4288 }
4289 else
4290 {
4291 if (DR_IS_READ (dr))
4292 {
4293 if (targetm.vectorize.builtin_gather)
4294 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4295 }
4296 else
4297 {
4298 if (targetm.vectorize.builtin_scatter)
4299 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4300 }
4301 ifn = IFN_LAST;
4302 /* The offset vector type will be read from DECL when needed. */
4303 offset_vectype = NULL_TREE;
4304 }
4305
4306 info->ifn = ifn;
4307 info->decl = decl;
4308 info->base = base;
4309 info->offset = off;
4310 info->offset_dt = vect_unknown_def_type;
4311 info->offset_vectype = offset_vectype;
4312 info->scale = scale;
4313 info->element_type = TREE_TYPE (vectype);
4314 info->memory_type = memory_type;
4315 return true;
4316}
4317
4318/* Find the data references in STMT, analyze them with respect to LOOP and
4319 append them to DATAREFS. Return false if datarefs in this stmt cannot
4320 be handled. */
4321
4322opt_result
4323vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4324 vec<data_reference_p> *datarefs,
4325 vec<int> *dataref_groups, int group_id)
4326{
4327 /* We can ignore clobbers for dataref analysis - they are removed during
4328 loop vectorization and BB vectorization checks dependences with a
4329 stmt walk. */
4330 if (gimple_clobber_p (s: stmt))
4331 return opt_result::success ();
4332
4333 if (gimple_has_volatile_ops (stmt))
4334 return opt_result::failure_at (loc: stmt, fmt: "not vectorized: volatile type: %G",
4335 stmt);
4336
4337 if (stmt_can_throw_internal (cfun, stmt))
4338 return opt_result::failure_at (loc: stmt,
4339 fmt: "not vectorized:"
4340 " statement can throw an exception: %G",
4341 stmt);
4342
4343 auto_vec<data_reference_p, 2> refs;
4344 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4345 if (!res)
4346 return res;
4347
4348 if (refs.is_empty ())
4349 return opt_result::success ();
4350
4351 if (refs.length () > 1)
4352 {
4353 while (!refs.is_empty ())
4354 free_data_ref (refs.pop ());
4355 return opt_result::failure_at (loc: stmt,
4356 fmt: "not vectorized: more than one "
4357 "data ref in stmt: %G", stmt);
4358 }
4359
4360 data_reference_p dr = refs.pop ();
4361 if (gcall *call = dyn_cast <gcall *> (p: stmt))
4362 if (!gimple_call_internal_p (gs: call)
4363 || (gimple_call_internal_fn (gs: call) != IFN_MASK_LOAD
4364 && gimple_call_internal_fn (gs: call) != IFN_MASK_STORE))
4365 {
4366 free_data_ref (dr);
4367 return opt_result::failure_at (loc: stmt,
4368 fmt: "not vectorized: dr in a call %G", stmt);
4369 }
4370
4371 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4372 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4373 {
4374 free_data_ref (dr);
4375 return opt_result::failure_at (loc: stmt,
4376 fmt: "not vectorized:"
4377 " statement is an unsupported"
4378 " bitfield access %G", stmt);
4379 }
4380
4381 if (DR_BASE_ADDRESS (dr)
4382 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4383 {
4384 free_data_ref (dr);
4385 return opt_result::failure_at (loc: stmt,
4386 fmt: "not vectorized:"
4387 " base addr of dr is a constant\n");
4388 }
4389
4390 /* Check whether this may be a SIMD lane access and adjust the
4391 DR to make it easier for us to handle it. */
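  /* For example, a private-array access such as a[GOMP_SIMD_LANE (simduid)]
     in an OpenMP simd loop may leave normal DR analysis without a usable
     base/offset/init/step; the code below recognizes the .GOMP_SIMD_LANE
     call behind the offset and rewrites the DR to use a zero offset and a
     step equal to the element size, marking it as a simd-lane access.  */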
4392 if (loop
4393 && loop->simduid
4394 && (!DR_BASE_ADDRESS (dr)
4395 || !DR_OFFSET (dr)
4396 || !DR_INIT (dr)
4397 || !DR_STEP (dr)))
4398 {
4399 struct data_reference *newdr
4400 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4401 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4402 if (DR_BASE_ADDRESS (newdr)
4403 && DR_OFFSET (newdr)
4404 && DR_INIT (newdr)
4405 && DR_STEP (newdr)
4406 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4407 && integer_zerop (DR_STEP (newdr)))
4408 {
4409 tree base_address = DR_BASE_ADDRESS (newdr);
4410 tree off = DR_OFFSET (newdr);
4411 tree step = ssize_int (1);
4412 if (integer_zerop (off)
4413 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4414 {
4415 off = TREE_OPERAND (base_address, 1);
4416 base_address = TREE_OPERAND (base_address, 0);
4417 }
4418 STRIP_NOPS (off);
4419 if (TREE_CODE (off) == MULT_EXPR
4420 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4421 {
4422 step = TREE_OPERAND (off, 1);
4423 off = TREE_OPERAND (off, 0);
4424 STRIP_NOPS (off);
4425 }
4426 if (CONVERT_EXPR_P (off)
4427 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4428 < TYPE_PRECISION (TREE_TYPE (off))))
4429 off = TREE_OPERAND (off, 0);
4430 if (TREE_CODE (off) == SSA_NAME)
4431 {
4432 gimple *def = SSA_NAME_DEF_STMT (off);
4433 /* Look through widening conversion. */
4434 if (is_gimple_assign (gs: def)
4435 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4436 {
4437 tree rhs1 = gimple_assign_rhs1 (gs: def);
4438 if (TREE_CODE (rhs1) == SSA_NAME
4439 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4440 && (TYPE_PRECISION (TREE_TYPE (off))
4441 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4442 def = SSA_NAME_DEF_STMT (rhs1);
4443 }
4444 if (is_gimple_call (gs: def)
4445 && gimple_call_internal_p (gs: def)
4446 && (gimple_call_internal_fn (gs: def) == IFN_GOMP_SIMD_LANE))
4447 {
4448 tree arg = gimple_call_arg (gs: def, index: 0);
4449 tree reft = TREE_TYPE (DR_REF (newdr));
4450 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4451 arg = SSA_NAME_VAR (arg);
4452 if (arg == loop->simduid
4453 /* For now. */
4454 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4455 {
4456 DR_BASE_ADDRESS (newdr) = base_address;
4457 DR_OFFSET (newdr) = ssize_int (0);
4458 DR_STEP (newdr) = step;
4459 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4460 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4461 /* Mark as simd-lane access. */
4462 tree arg2 = gimple_call_arg (gs: def, index: 1);
4463 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4464 free_data_ref (dr);
4465 datarefs->safe_push (obj: newdr);
4466 if (dataref_groups)
4467 dataref_groups->safe_push (obj: group_id);
4468 return opt_result::success ();
4469 }
4470 }
4471 }
4472 }
4473 free_data_ref (newdr);
4474 }
4475
4476 datarefs->safe_push (obj: dr);
4477 if (dataref_groups)
4478 dataref_groups->safe_push (obj: group_id);
4479 return opt_result::success ();
4480}
4481
4482/* Function vect_analyze_data_refs.
4483
4484 Find all the data references in the loop or basic block.
4485
4486 The general structure of the analysis of data refs in the vectorizer is as
4487 follows:
4488 1- vect_analyze_data_refs(loop/bb): call
4489 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4490 in the loop/bb and their dependences.
4491 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4492 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4493 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4494
4495*/
4496
4497opt_result
4498vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4499{
4500 class loop *loop = NULL;
4501 unsigned int i;
4502 struct data_reference *dr;
4503 tree scalar_type;
4504
4505 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4506
4507 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
4508 loop = LOOP_VINFO_LOOP (loop_vinfo);
4509
4510 /* Go through the data-refs, check that the analysis succeeded. Update
4511 pointer from stmt_vec_info struct to DR and vectype. */
4512
4513 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4514 FOR_EACH_VEC_ELT (datarefs, i, dr)
4515 {
4516 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4517 poly_uint64 vf;
4518
4519 gcc_assert (DR_REF (dr));
4520 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4521 gcc_assert (!stmt_info->dr_aux.dr);
4522 stmt_info->dr_aux.dr = dr;
4523 stmt_info->dr_aux.stmt = stmt_info;
4524
4525 /* Check that analysis of the data-ref succeeded. */
4526 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4527 || !DR_STEP (dr))
4528 {
4529 bool maybe_gather
4530 = DR_IS_READ (dr)
4531 && !TREE_THIS_VOLATILE (DR_REF (dr));
4532 bool maybe_scatter
4533 = DR_IS_WRITE (dr)
4534 && !TREE_THIS_VOLATILE (DR_REF (dr));
4535
	  /* If the target supports vector gather loads or scatter stores,
	     see if they can be used.  */
4538 if (is_a <loop_vec_info> (p: vinfo)
4539 && !nested_in_vect_loop_p (loop, stmt_info))
4540 {
4541 if (maybe_gather || maybe_scatter)
4542 {
4543 if (maybe_gather)
4544 gatherscatter = GATHER;
4545 else
4546 gatherscatter = SCATTER;
4547 }
4548 }
4549
4550 if (gatherscatter == SG_NONE)
4551 {
4552 if (dump_enabled_p ())
4553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4554 "not vectorized: data ref analysis "
4555 "failed %G", stmt_info->stmt);
4556 if (is_a <bb_vec_info> (p: vinfo))
4557 {
4558 /* In BB vectorization the ref can still participate
4559 in dependence analysis, we just can't vectorize it. */
4560 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4561 continue;
4562 }
4563 return opt_result::failure_at (loc: stmt_info->stmt,
4564 fmt: "not vectorized:"
4565 " data ref analysis failed: %G",
4566 stmt_info->stmt);
4567 }
4568 }
4569
4570 /* See if this was detected as SIMD lane access. */
4571 if (dr->aux == (void *)-1
4572 || dr->aux == (void *)-2
4573 || dr->aux == (void *)-3
4574 || dr->aux == (void *)-4)
4575 {
4576 if (nested_in_vect_loop_p (loop, stmt_info))
4577 return opt_result::failure_at (loc: stmt_info->stmt,
4578 fmt: "not vectorized:"
4579 " data ref analysis failed: %G",
4580 stmt_info->stmt);
4581 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4582 = -(uintptr_t) dr->aux;
4583 }
4584
4585 tree base = get_base_address (DR_REF (dr));
4586 if (base && VAR_P (base) && DECL_NONALIASED (base))
4587 {
4588 if (dump_enabled_p ())
4589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4590 "not vectorized: base object not addressable "
4591 "for stmt: %G", stmt_info->stmt);
4592 if (is_a <bb_vec_info> (p: vinfo))
4593 {
4594 /* In BB vectorization the ref can still participate
4595 in dependence analysis, we just can't vectorize it. */
4596 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4597 continue;
4598 }
4599 return opt_result::failure_at (loc: stmt_info->stmt,
4600 fmt: "not vectorized: base object not"
4601 " addressable for stmt: %G",
4602 stmt_info->stmt);
4603 }
4604
4605 if (is_a <loop_vec_info> (p: vinfo)
4606 && DR_STEP (dr)
4607 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4608 {
4609 if (nested_in_vect_loop_p (loop, stmt_info))
4610 return opt_result::failure_at (loc: stmt_info->stmt,
4611 fmt: "not vectorized: "
4612 "not suitable for strided load %G",
4613 stmt_info->stmt);
4614 STMT_VINFO_STRIDED_P (stmt_info) = true;
4615 }
4616
4617 /* Update DR field in stmt_vec_info struct. */
4618
      /* If the dataref is in an inner-loop of the loop that is considered
	 for vectorization, we also want to analyze the access relative to
4621 the outer-loop (DR contains information only relative to the
4622 inner-most enclosing loop). We do that by building a reference to the
4623 first location accessed by the inner-loop, and analyze it relative to
4624 the outer-loop. */
4625 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4626 {
4627 /* Build a reference to the first location accessed by the
4628 inner loop: *(BASE + INIT + OFFSET). By construction,
4629 this address must be invariant in the inner loop, so we
4630 can consider it as being used in the outer loop. */
4631 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4632 tree offset = unshare_expr (DR_OFFSET (dr));
4633 tree init = unshare_expr (DR_INIT (dr));
4634 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4635 init, offset);
4636 tree init_addr = fold_build_pointer_plus (base, init_offset);
4637 tree init_ref = build_fold_indirect_ref (init_addr);
4638
4639 if (dump_enabled_p ())
4640 dump_printf_loc (MSG_NOTE, vect_location,
4641 "analyze in outer loop: %T\n", init_ref);
4642
4643 opt_result res
4644 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4645 init_ref, loop, stmt_info->stmt);
4646 if (!res)
4647 /* dr_analyze_innermost already explained the failure. */
4648 return res;
4649
4650 if (dump_enabled_p ())
4651 dump_printf_loc (MSG_NOTE, vect_location,
4652 "\touter base_address: %T\n"
4653 "\touter offset from base address: %T\n"
4654 "\touter constant offset from base address: %T\n"
4655 "\touter step: %T\n"
4656 "\touter base alignment: %d\n\n"
4657 "\touter base misalignment: %d\n"
4658 "\touter offset alignment: %d\n"
4659 "\touter step alignment: %d\n",
4660 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4661 STMT_VINFO_DR_OFFSET (stmt_info),
4662 STMT_VINFO_DR_INIT (stmt_info),
4663 STMT_VINFO_DR_STEP (stmt_info),
4664 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4665 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4666 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4667 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4668 }
4669
4670 /* Set vectype for STMT. */
4671 scalar_type = TREE_TYPE (DR_REF (dr));
4672 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4673 if (!vectype)
4674 {
4675 if (dump_enabled_p ())
4676 {
4677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4678 "not vectorized: no vectype for stmt: %G",
4679 stmt_info->stmt);
4680 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4681 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4682 scalar_type);
4683 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4684 }
4685
4686 if (is_a <bb_vec_info> (p: vinfo))
4687 {
4688 /* No vector type is fine, the ref can still participate
4689 in dependence analysis, we just can't vectorize it. */
4690 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4691 continue;
4692 }
4693 if (fatal)
4694 *fatal = false;
4695 return opt_result::failure_at (loc: stmt_info->stmt,
4696 fmt: "not vectorized:"
4697 " no vectype for stmt: %G"
4698 " scalar_type: %T\n",
4699 stmt_info->stmt, scalar_type);
4700 }
4701 else
4702 {
4703 if (dump_enabled_p ())
4704 dump_printf_loc (MSG_NOTE, vect_location,
4705 "got vectype for stmt: %G%T\n",
4706 stmt_info->stmt, vectype);
4707 }
4708
4709 /* Adjust the minimal vectorization factor according to the
4710 vector type. */
4711 vf = TYPE_VECTOR_SUBPARTS (node: vectype);
4712 *min_vf = upper_bound (a: *min_vf, b: vf);
4713
4714 /* Leave the BB vectorizer to pick the vector type later, based on
4715 the final dataref group size and SLP node size. */
4716 if (is_a <loop_vec_info> (p: vinfo))
4717 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4718
4719 if (gatherscatter != SG_NONE)
4720 {
4721 gather_scatter_info gs_info;
4722 if (!vect_check_gather_scatter (stmt_info,
4723 loop_vinfo: as_a <loop_vec_info> (p: vinfo),
4724 info: &gs_info)
4725 || !get_vectype_for_scalar_type (vinfo,
4726 TREE_TYPE (gs_info.offset)))
4727 {
4728 if (fatal)
4729 *fatal = false;
4730 return opt_result::failure_at
4731 (loc: stmt_info->stmt,
4732 fmt: (gatherscatter == GATHER)
4733 ? "not vectorized: not suitable for gather load %G"
4734 : "not vectorized: not suitable for scatter store %G",
4735 stmt_info->stmt);
4736 }
4737 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4738 }
4739 }
4740
4741 /* We used to stop processing and prune the list here. Verify we no
4742 longer need to. */
4743 gcc_assert (i == datarefs.length ());
4744
4745 return opt_result::success ();
4746}
4747
4748
4749/* Function vect_get_new_vect_var.
4750
   Returns a name for a new variable.  The current naming scheme prepends
   a prefix such as "vect_", "vectp_", "stmp_" or "mask_" (depending on
   the value of VAR_KIND) to NAME if provided, and otherwise uses the
   prefix alone.  */
4755
4756tree
4757vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4758{
4759 const char *prefix;
4760 tree new_vect_var;
4761
4762 switch (var_kind)
4763 {
4764 case vect_simple_var:
4765 prefix = "vect";
4766 break;
4767 case vect_scalar_var:
4768 prefix = "stmp";
4769 break;
4770 case vect_mask_var:
4771 prefix = "mask";
4772 break;
4773 case vect_pointer_var:
4774 prefix = "vectp";
4775 break;
4776 default:
4777 gcc_unreachable ();
4778 }
4779
4780 if (name)
4781 {
4782 char* tmp = concat (prefix, "_", name, NULL);
4783 new_vect_var = create_tmp_reg (type, tmp);
4784 free (ptr: tmp);
4785 }
4786 else
4787 new_vect_var = create_tmp_reg (type, prefix);
4788
4789 return new_vect_var;
4790}
4791
4792/* Like vect_get_new_vect_var but return an SSA name. */
4793
4794tree
4795vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4796{
4797 const char *prefix;
4798 tree new_vect_var;
4799
4800 switch (var_kind)
4801 {
4802 case vect_simple_var:
4803 prefix = "vect";
4804 break;
4805 case vect_scalar_var:
4806 prefix = "stmp";
4807 break;
4808 case vect_pointer_var:
4809 prefix = "vectp";
4810 break;
4811 default:
4812 gcc_unreachable ();
4813 }
4814
4815 if (name)
4816 {
4817 char* tmp = concat (prefix, "_", name, NULL);
4818 new_vect_var = make_temp_ssa_name (type, NULL, name: tmp);
4819 free (ptr: tmp);
4820 }
4821 else
4822 new_vect_var = make_temp_ssa_name (type, NULL, name: prefix);
4823
4824 return new_vect_var;
4825}
4826
4827/* Duplicate points-to info on NAME from DR_INFO. */
4828
4829static void
4830vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4831{
4832 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4833 /* DR_PTR_INFO is for a base SSA name, not including constant or
4834 variable offsets in the ref so its alignment info does not apply. */
4835 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4836}
4837
4838/* Function vect_create_addr_base_for_vector_ref.
4839
4840 Create an expression that computes the address of the first memory location
4841 that will be accessed for a data reference.
4842
4843 Input:
4844 STMT_INFO: The statement containing the data reference.
4845 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
4847 LOOP: Specify relative to which loop-nest should the address be computed.
4848 For example, when the dataref is in an inner-loop nested in an
4849 outer-loop that is now being vectorized, LOOP can be either the
4850 outer-loop, or the inner-loop. The first memory location accessed
4851 by the following dataref ('in' points to short):
4852
4853 for (i=0; i<N; i++)
4854 for (j=0; j<M; j++)
4855 s += in[i+j]
4856
4857 is as follows:
4858 if LOOP=i_loop: &in (relative to i_loop)
4859 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4860
4861 Output:
4862 1. Return an SSA_NAME whose value is the address of the memory location of
4863 the first vector of the data reference.
4864 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4865 these statement(s) which define the returned SSA_NAME.
4866
4867 FORNOW: We are only handling array accesses with step 1. */
4868
4869tree
4870vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4871 gimple_seq *new_stmt_list,
4872 tree offset)
4873{
4874 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4875 struct data_reference *dr = dr_info->dr;
4876 const char *base_name;
4877 tree addr_base;
4878 tree dest;
4879 gimple_seq seq = NULL;
4880 tree vect_ptr_type;
4881 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
4882 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4883
4884 tree data_ref_base = unshare_expr (drb->base_address);
4885 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, check_outer: true));
4886 tree init = unshare_expr (drb->init);
4887
4888 if (loop_vinfo)
4889 base_name = get_name (data_ref_base);
4890 else
4891 {
4892 base_offset = ssize_int (0);
4893 init = ssize_int (0);
4894 base_name = get_name (DR_REF (dr));
4895 }
4896
4897 /* Create base_offset */
4898 base_offset = size_binop (PLUS_EXPR,
4899 fold_convert (sizetype, base_offset),
4900 fold_convert (sizetype, init));
4901
4902 if (offset)
4903 {
4904 offset = fold_convert (sizetype, offset);
4905 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4906 base_offset, offset);
4907 }
4908
4909 /* base + base_offset */
4910 if (loop_vinfo)
4911 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4912 else
4913 addr_base = build1 (ADDR_EXPR,
4914 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4915 /* Strip zero offset components since we don't need
4916 them and they can confuse late diagnostics if
4917 we CSE them wrongly. See PR106904 for example. */
4918 unshare_expr (strip_zero_offset_components
4919 (DR_REF (dr))));
4920
4921 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4922 dest = vect_get_new_vect_var (type: vect_ptr_type, var_kind: vect_pointer_var, name: base_name);
4923 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4924 gimple_seq_add_seq (new_stmt_list, seq);
4925
4926 if (DR_PTR_INFO (dr)
4927 && TREE_CODE (addr_base) == SSA_NAME
4928 /* We should only duplicate pointer info to newly created SSA names. */
4929 && SSA_NAME_VAR (addr_base) == dest)
4930 {
4931 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4932 vect_duplicate_ssa_name_ptr_info (name: addr_base, dr_info);
4933 }
4934
4935 if (dump_enabled_p ())
4936 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4937
4938 return addr_base;
4939}
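
/* Illustrative sketch only (the SSA names below are invented, not taken
   from a real dump): for a dataref a[i] with int *a and no extra OFFSET,
   the statements appended to NEW_STMT_LIST look roughly like

     vectp_a.7_1 = a_5(D) + <base_offset + init>;

   and vectp_a.7_1 is returned, with points-to information copied from the
   data reference and its alignment marked unknown.  */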
4940
4941
4942/* Function vect_create_data_ref_ptr.
4943
4944 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4945 location accessed in the loop by STMT_INFO, along with the def-use update
4946 chain to appropriately advance the pointer through the loop iterations.
4947 Also set aliasing information for the pointer. This pointer is used by
4948 the callers to this function to create a memory reference expression for
4949 vector load/store access.
4950
4951 Input:
4952 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4953 GIMPLE_ASSIGN <name, data-ref> or
4954 GIMPLE_ASSIGN <data-ref, name>.
4955 2. AGGR_TYPE: the type of the reference, which should be either a vector
4956 or an array.
4957 3. AT_LOOP: the loop where the vector memref is to be created.
4958 4. OFFSET (optional): a byte offset to be added to the initial address
4959 accessed by the data-ref in STMT_INFO.
4960 5. BSI: location where the new stmts are to be placed if there is no loop
4961 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4962 pointing to the initial address.
   7. IV_STEP (optional, defaults to NULL): the amount that should be added
4964 to the IV during each iteration of the loop. NULL says to move
4965 by one copy of AGGR_TYPE up or down, depending on the step of the
4966 data reference.
4967
4968 Output:
4969 1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
4971 For example, for vector of type V8HI, the following code is generated:
4972
4973 v8hi *ap;
4974 ap = (v8hi *)initial_address;
4975
4976 if OFFSET is not supplied:
4977 initial_address = &a[init];
4978 if OFFSET is supplied:
4979 initial_address = &a[init] + OFFSET;
4982
4983 Return the initial_address in INITIAL_ADDRESS.
4984
4985 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4986 update the pointer in each iteration of the loop.
4987
4988 Return the increment stmt that updates the pointer in PTR_INCR.
4989
4990 3. Return the pointer. */
4991
4992tree
4993vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4994 tree aggr_type, class loop *at_loop, tree offset,
4995 tree *initial_address, gimple_stmt_iterator *gsi,
4996 gimple **ptr_incr, bool only_init,
4997 tree iv_step)
4998{
4999 const char *base_name;
5000 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
5001 class loop *loop = NULL;
5002 bool nested_in_vect_loop = false;
5003 class loop *containing_loop = NULL;
5004 tree aggr_ptr_type;
5005 tree aggr_ptr;
5006 tree new_temp;
5007 gimple_seq new_stmt_list = NULL;
5008 edge pe = NULL;
5009 basic_block new_bb;
5010 tree aggr_ptr_init;
5011 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5012 struct data_reference *dr = dr_info->dr;
5013 tree aptr;
5014 gimple_stmt_iterator incr_gsi;
5015 bool insert_after;
5016 tree indx_before_incr, indx_after_incr;
5017 gimple *incr;
5018 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo);
5019
5020 gcc_assert (iv_step != NULL_TREE
5021 || TREE_CODE (aggr_type) == ARRAY_TYPE
5022 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5023
5024 if (loop_vinfo)
5025 {
5026 loop = LOOP_VINFO_LOOP (loop_vinfo);
5027 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5028 containing_loop = (gimple_bb (g: stmt_info->stmt))->loop_father;
5029 pe = loop_preheader_edge (loop);
5030 }
5031 else
5032 {
5033 gcc_assert (bb_vinfo);
5034 only_init = true;
5035 *ptr_incr = NULL;
5036 }
5037
5038 /* Create an expression for the first address accessed by this load
5039 in LOOP. */
5040 base_name = get_name (DR_BASE_ADDRESS (dr));
5041
5042 if (dump_enabled_p ())
5043 {
5044 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5045 dump_printf_loc (MSG_NOTE, vect_location,
5046 "create %s-pointer variable to type: %T",
5047 get_tree_code_name (TREE_CODE (aggr_type)),
5048 aggr_type);
5049 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5050 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5051 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5052 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5053 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5054 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5055 else
5056 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5057 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5058 }
5059
5060 /* (1) Create the new aggregate-pointer variable.
5061 Vector and array types inherit the alias set of their component
5062 type by default so we need to use a ref-all pointer if the data
5063 reference does not conflict with the created aggregated data
5064 reference because it is not addressable. */
5065 bool need_ref_all = false;
5066 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5067 get_alias_set (DR_REF (dr))))
5068 need_ref_all = true;
5069 /* Likewise for any of the data references in the stmt group. */
5070 else if (DR_GROUP_SIZE (stmt_info) > 1)
5071 {
5072 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5073 do
5074 {
5075 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5076 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5077 get_alias_set (DR_REF (sdr))))
5078 {
5079 need_ref_all = true;
5080 break;
5081 }
5082 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5083 }
5084 while (sinfo);
5085 }
5086 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5087 need_ref_all);
5088 aggr_ptr = vect_get_new_vect_var (type: aggr_ptr_type, var_kind: vect_pointer_var, name: base_name);
5089
5090
5091 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5092 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5093 def-use update cycles for the pointer: one relative to the outer-loop
5094 (LOOP), which is what steps (3) and (4) below do. The other is relative
5095 to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.
5097
5098 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5099 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5100 redundant. Steps (3),(4) create the following:
5101
5102 vp0 = &base_addr;
5103 LOOP: vp1 = phi(vp0,vp2)
5104 ...
5105 ...
5106 vp2 = vp1 + step
5107 goto LOOP
5108
5109 If there is an inner-loop nested in loop, then step (5) will also be
5110 applied, and an additional update in the inner-loop will be created:
5111
5112 vp0 = &base_addr;
5113 LOOP: vp1 = phi(vp0,vp2)
5114 ...
5115 inner: vp3 = phi(vp1,vp4)
5116 vp4 = vp3 + inner_step
5117 if () goto inner
5118 ...
5119 vp2 = vp1 + step
5120 if () goto LOOP */
5121
5122 /* (2) Calculate the initial address of the aggregate-pointer, and set
5123 the aggregate-pointer to point to it before the loop. */
5124
5125 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5126
5127 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5128 stmt_info, new_stmt_list: &new_stmt_list,
5129 offset);
5130 if (new_stmt_list)
5131 {
5132 if (pe)
5133 {
5134 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5135 gcc_assert (!new_bb);
5136 }
5137 else
5138 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5139 }
5140
5141 *initial_address = new_temp;
5142 aggr_ptr_init = new_temp;
5143
5144 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5145 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5146 inner-loop nested in LOOP (during outer-loop vectorization). */
5147
5148 /* No update in loop is required. */
5149 if (only_init && (!loop_vinfo || at_loop == loop))
5150 aptr = aggr_ptr_init;
5151 else
5152 {
5153 /* Accesses to invariant addresses should be handled specially
5154 by the caller. */
5155 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5156 gcc_assert (!integer_zerop (step));
5157
5158 if (iv_step == NULL_TREE)
5159 {
5160 /* The step of the aggregate pointer is the type size,
5161 negated for downward accesses. */
5162 iv_step = TYPE_SIZE_UNIT (aggr_type);
5163 if (tree_int_cst_sgn (step) == -1)
5164 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5165 }
5166
5167 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5168
5169 create_iv (aggr_ptr_init, PLUS_EXPR,
5170 fold_convert (aggr_ptr_type, iv_step),
5171 aggr_ptr, loop, &incr_gsi, insert_after,
5172 &indx_before_incr, &indx_after_incr);
5173 incr = gsi_stmt (i: incr_gsi);
5174
5175 /* Copy the points-to information if it exists. */
5176 if (DR_PTR_INFO (dr))
5177 {
5178 vect_duplicate_ssa_name_ptr_info (name: indx_before_incr, dr_info);
5179 vect_duplicate_ssa_name_ptr_info (name: indx_after_incr, dr_info);
5180 }
5181 if (ptr_incr)
5182 *ptr_incr = incr;
5183
5184 aptr = indx_before_incr;
5185 }
5186
5187 if (!nested_in_vect_loop || only_init)
5188 return aptr;
5189
5190
5191 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if it exists.  */
5193
5194 gcc_assert (nested_in_vect_loop);
5195 if (!only_init)
5196 {
5197 standard_iv_increment_position (containing_loop, &incr_gsi,
5198 &insert_after);
5199 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5200 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5201 &indx_before_incr, &indx_after_incr);
5202 incr = gsi_stmt (i: incr_gsi);
5203
5204 /* Copy the points-to information if it exists. */
5205 if (DR_PTR_INFO (dr))
5206 {
5207 vect_duplicate_ssa_name_ptr_info (name: indx_before_incr, dr_info);
5208 vect_duplicate_ssa_name_ptr_info (name: indx_after_incr, dr_info);
5209 }
5210 if (ptr_incr)
5211 *ptr_incr = incr;
5212
5213 return indx_before_incr;
5214 }
5215 else
5216 gcc_unreachable ();
5217}
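
/* Illustrative sketch (invented SSA names): for a V4SI access with a unit
   step and no IV_STEP override, this function creates, roughly,

     # loop preheader
     vectp_a.8_1 = &a[0];
     # loop header
     vectp_a.8_2 = PHI <vectp_a.8_1 (preheader), vectp_a.8_3 (latch)>
     ...
     # increment returned in *PTR_INCR
     vectp_a.8_3 = vectp_a.8_2 + 16;   <-- step = TYPE_SIZE_UNIT (V4SI)

   i.e. the vp0/vp1/vp2 scheme sketched in the comment above, with a
   concrete step of 16 bytes.  */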
5218
5219
5220/* Function bump_vector_ptr
5221
5222 Increment a pointer (to a vector type) by vector-size. If requested,
5223 i.e. if PTR-INCR is given, then also connect the new increment stmt
5224 to the existing def-use update-chain of the pointer, by modifying
5225 the PTR_INCR as illustrated below:
5226
5227 The pointer def-use update-chain before this function:
5228 DATAREF_PTR = phi (p_0, p_2)
5229 ....
5230 PTR_INCR: p_2 = DATAREF_PTR + step
5231
5232 The pointer def-use update-chain after this function:
5233 DATAREF_PTR = phi (p_0, p_2)
5234 ....
5235 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5236 ....
5237 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5238
5239 Input:
5240 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5241 in the loop.
5242 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5243 the loop. The increment amount across iterations is expected
5244 to be vector_size.
5245 BSI - location where the new update stmt is to be placed.
5246 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5247 BUMP - optional. The offset by which to bump the pointer. If not given,
5248 the offset is assumed to be vector_size.
5249
5250 Output: Return NEW_DATAREF_PTR as illustrated above.
5251
5252*/
5253
5254tree
5255bump_vector_ptr (vec_info *vinfo,
5256 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5257 stmt_vec_info stmt_info, tree bump)
5258{
5259 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5260 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5261 tree update = TYPE_SIZE_UNIT (vectype);
5262 gimple *incr_stmt;
5263 ssa_op_iter iter;
5264 use_operand_p use_p;
5265 tree new_dataref_ptr;
5266
5267 if (bump)
5268 update = bump;
5269
5270 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5271 new_dataref_ptr = copy_ssa_name (var: dataref_ptr);
5272 else if (is_gimple_min_invariant (dataref_ptr))
    /* When possible avoid emitting a separate increment stmt that would
       force the addressed object to become addressable.  */
5275 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5276 fold_build2 (MEM_REF,
5277 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5278 dataref_ptr,
5279 fold_convert (ptr_type_node, update)));
5280 else
5281 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5282 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5283 dataref_ptr, update);
5284 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
  /* Fold the increment, avoiding excessive use-def chains that would
     otherwise cause compile-time issues for passes run until the next
     forwprop pass, which would do this folding as well.  */
5288 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5289 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5290 {
5291 incr_stmt = gsi_stmt (i: fold_gsi);
5292 update_stmt (s: incr_stmt);
5293 }
5294
5295 /* Copy the points-to information if it exists. */
5296 if (DR_PTR_INFO (dr))
5297 {
5298 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5299 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5300 }
5301
5302 if (!ptr_incr)
5303 return new_dataref_ptr;
5304
5305 /* Update the vector-pointer's cross-iteration increment. */
5306 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5307 {
5308 tree use = USE_FROM_PTR (use_p);
5309
5310 if (use == dataref_ptr)
5311 SET_USE (use_p, new_dataref_ptr);
5312 else
5313 gcc_assert (operand_equal_p (use, update, 0));
5314 }
5315
5316 return new_dataref_ptr;
5317}
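
/* Illustrative sketch (invented SSA names): for a V8HI access with the
   default BUMP this emits

     vectp_a.8_9 = vectp_a.8_7 + 16;

   at GSI (16 being TYPE_SIZE_UNIT of V8HI) and, if PTR_INCR was given,
   rewrites its use of vectp_a.8_7 to vectp_a.8_9 so that the
   cross-iteration increment starts from the bumped pointer.  */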
5318
5319
5320/* Copy memory reference info such as base/clique from the SRC reference
5321 to the DEST MEM_REF. */
5322
5323void
5324vect_copy_ref_info (tree dest, tree src)
5325{
5326 if (TREE_CODE (dest) != MEM_REF)
5327 return;
5328
5329 tree src_base = src;
5330 while (handled_component_p (t: src_base))
5331 src_base = TREE_OPERAND (src_base, 0);
5332 if (TREE_CODE (src_base) != MEM_REF
5333 && TREE_CODE (src_base) != TARGET_MEM_REF)
5334 return;
5335
5336 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5337 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5338}
5339
5340
5341/* Function vect_create_destination_var.
5342
5343 Create a new temporary of type VECTYPE. */
5344
5345tree
5346vect_create_destination_var (tree scalar_dest, tree vectype)
5347{
5348 tree vec_dest;
5349 const char *name;
5350 char *new_name;
5351 tree type;
5352 enum vect_var_kind kind;
5353
5354 kind = vectype
5355 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5356 ? vect_mask_var
5357 : vect_simple_var
5358 : vect_scalar_var;
5359 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5360
5361 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5362
5363 name = get_name (scalar_dest);
5364 if (name)
5365 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5366 else
5367 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5368 vec_dest = vect_get_new_vect_var (type, var_kind: kind, name: new_name);
5369 free (ptr: new_name);
5370
5371 return vec_dest;
5372}
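
/* For example, for SCALAR_DEST x_5 (SSA version 5) and a V4SI VECTYPE this
   creates a temporary named "vect_x_5"; with a NULL VECTYPE the name would
   be "stmp_x_5", and for a boolean vector type "mask_x_5".  */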
5373
5374/* Function vect_grouped_store_supported.
5375
5376 Returns TRUE if interleave high and interleave low permutations
5377 are supported, and FALSE otherwise. */
5378
5379bool
5380vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5381{
5382 machine_mode mode = TYPE_MODE (vectype);
5383
5384 /* vect_permute_store_chain requires the group size to be equal to 3 or
5385 be a power of two. */
5386 if (count != 3 && exact_log2 (x: count) == -1)
5387 {
5388 if (dump_enabled_p ())
5389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5390 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
5392 return false;
5393 }
5394
5395 /* Check that the permutation is supported. */
5396 if (VECTOR_MODE_P (mode))
5397 {
5398 unsigned int i;
5399 if (count == 3)
5400 {
5401 unsigned int j0 = 0, j1 = 0, j2 = 0;
5402 unsigned int i, j;
5403
5404 unsigned int nelt;
5405 if (!GET_MODE_NUNITS (mode).is_constant (const_value: &nelt))
5406 {
5407 if (dump_enabled_p ())
5408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5409 "cannot handle groups of 3 stores for"
5410 " variable-length vectors\n");
5411 return false;
5412 }
5413
5414 vec_perm_builder sel (nelt, nelt, 1);
5415 sel.quick_grow (len: nelt);
5416 vec_perm_indices indices;
5417 for (j = 0; j < 3; j++)
5418 {
5419 int nelt0 = ((3 - j) * nelt) % 3;
5420 int nelt1 = ((3 - j) * nelt + 1) % 3;
5421 int nelt2 = ((3 - j) * nelt + 2) % 3;
5422 for (i = 0; i < nelt; i++)
5423 {
5424 if (3 * i + nelt0 < nelt)
5425 sel[3 * i + nelt0] = j0++;
5426 if (3 * i + nelt1 < nelt)
5427 sel[3 * i + nelt1] = nelt + j1++;
5428 if (3 * i + nelt2 < nelt)
5429 sel[3 * i + nelt2] = 0;
5430 }
5431 indices.new_vector (sel, 2, nelt);
5432 if (!can_vec_perm_const_p (mode, mode, indices))
5433 {
5434 if (dump_enabled_p ())
5435 dump_printf (MSG_MISSED_OPTIMIZATION,
5436 "permutation op not supported by target.\n");
5437 return false;
5438 }
5439
5440 for (i = 0; i < nelt; i++)
5441 {
5442 if (3 * i + nelt0 < nelt)
5443 sel[3 * i + nelt0] = 3 * i + nelt0;
5444 if (3 * i + nelt1 < nelt)
5445 sel[3 * i + nelt1] = 3 * i + nelt1;
5446 if (3 * i + nelt2 < nelt)
5447 sel[3 * i + nelt2] = nelt + j2++;
5448 }
5449 indices.new_vector (sel, 2, nelt);
5450 if (!can_vec_perm_const_p (mode, mode, indices))
5451 {
5452 if (dump_enabled_p ())
5453 dump_printf (MSG_MISSED_OPTIMIZATION,
5454 "permutation op not supported by target.\n");
5455 return false;
5456 }
5457 }
5458 return true;
5459 }
5460 else
5461 {
5462 /* If length is not equal to 3 then only power of 2 is supported. */
5463 gcc_assert (pow2p_hwi (count));
5464 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5465
5466 /* The encoding has 2 interleaved stepped patterns. */
	  if (!multiple_p (nelt, 2))
5468 return false;
5469 vec_perm_builder sel (nelt, 2, 3);
5470 sel.quick_grow (len: 6);
5471 for (i = 0; i < 3; i++)
5472 {
5473 sel[i * 2] = i;
5474 sel[i * 2 + 1] = i + nelt;
5475 }
5476 vec_perm_indices indices (sel, 2, nelt);
5477 if (can_vec_perm_const_p (mode, mode, indices))
5478 {
5479 for (i = 0; i < 6; i++)
5480 sel[i] += exact_div (a: nelt, b: 2);
5481 indices.new_vector (sel, 2, nelt);
5482 if (can_vec_perm_const_p (mode, mode, indices))
5483 return true;
5484 }
5485 }
5486 }
5487
5488 if (dump_enabled_p ())
5489 dump_printf (MSG_MISSED_OPTIMIZATION,
5490 "permutation op not supported by target.\n");
5491 return false;
5492}
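
/* As a concrete example of the power-of-two case above: for V8HI (8 lanes)
   the first encoded selector { 0, nelt, 1, nelt + 1, 2, nelt + 2 } expands
   to the interleave-high permutation { 0, 8, 1, 9, 2, 10, 3, 11 } and,
   after adding nelt / 2 to each element, to the interleave-low permutation
   { 4, 12, 5, 13, 6, 14, 7, 15 }; both are queried via
   can_vec_perm_const_p.  */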
5493
5494/* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5495 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5496
5497internal_fn
5498vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5499 bool masked_p)
5500{
5501 if (vect_lanes_optab_supported_p (name: "vec_mask_len_store_lanes",
5502 optab: vec_mask_len_store_lanes_optab, vectype,
5503 count))
5504 return IFN_MASK_LEN_STORE_LANES;
5505 else if (masked_p)
5506 {
5507 if (vect_lanes_optab_supported_p (name: "vec_mask_store_lanes",
5508 optab: vec_mask_store_lanes_optab, vectype,
5509 count))
5510 return IFN_MASK_STORE_LANES;
5511 }
5512 else
5513 {
5514 if (vect_lanes_optab_supported_p (name: "vec_store_lanes",
5515 optab: vec_store_lanes_optab, vectype, count))
5516 return IFN_STORE_LANES;
5517 }
5518 return IFN_LAST;
5519}
5520
5521
5522/* Function vect_permute_store_chain.
5523
5524 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5525 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5526 the data correctly for the stores. Return the final references for stores
5527 in RESULT_CHAIN.
5528
5529 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5530 The input is 4 vectors each containing 8 elements. We assign a number to
5531 each element, the input sequence is:
5532
5533 1st vec: 0 1 2 3 4 5 6 7
5534 2nd vec: 8 9 10 11 12 13 14 15
5535 3rd vec: 16 17 18 19 20 21 22 23
5536 4th vec: 24 25 26 27 28 29 30 31
5537
5538 The output sequence should be:
5539
5540 1st vec: 0 8 16 24 1 9 17 25
5541 2nd vec: 2 10 18 26 3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
5543 4th vec: 6 14 22 30 7 15 23 31
5544
5545 i.e., we interleave the contents of the four vectors in their order.
5546
5547 We use interleave_high/low instructions to create such output. The input of
5548 each interleave_high/low operation is two vectors:
5549 1st vec 2nd vec
5550 0 1 2 3 4 5 6 7
5551 the even elements of the result vector are obtained left-to-right from the
5552 high/low elements of the first vector. The odd elements of the result are
5553 obtained left-to-right from the high/low elements of the second vector.
5554 The output of interleave_high will be: 0 4 1 5
5555 and of interleave_low: 2 6 3 7
5556
5557
5558 The permutation is done in log LENGTH stages. In each stage interleave_high
5559 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5560 where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.
5562 In our example,
5563
5564 I1: interleave_high (1st vec, 3rd vec)
5565 I2: interleave_low (1st vec, 3rd vec)
5566 I3: interleave_high (2nd vec, 4th vec)
5567 I4: interleave_low (2nd vec, 4th vec)
5568
5569 The output for the first stage is:
5570
5571 I1: 0 16 1 17 2 18 3 19
5572 I2: 4 20 5 21 6 22 7 23
5573 I3: 8 24 9 25 10 26 11 27
5574 I4: 12 28 13 29 14 30 15 31
5575
5576 The output of the second stage, i.e. the final result is:
5577
5578 I1: 0 8 16 24 1 9 17 25
5579 I2: 2 10 18 26 3 11 19 27
   I3:  4 12 20 28  5 13 21 29
5581 I4: 6 14 22 30 7 15 23 31. */
5582
5583void
5584vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5585 unsigned int length,
5586 stmt_vec_info stmt_info,
5587 gimple_stmt_iterator *gsi,
5588 vec<tree> *result_chain)
5589{
5590 tree vect1, vect2, high, low;
5591 gimple *perm_stmt;
5592 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5593 tree perm_mask_low, perm_mask_high;
5594 tree data_ref;
5595 tree perm3_mask_low, perm3_mask_high;
5596 unsigned int i, j, n, log_length = exact_log2 (x: length);
5597
5598 result_chain->quick_grow (len: length);
5599 memcpy (dest: result_chain->address (), src: dr_chain.address (),
5600 n: length * sizeof (tree));
5601
5602 if (length == 3)
5603 {
5604 /* vect_grouped_store_supported ensures that this is constant. */
5605 unsigned int nelt = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant ();
5606 unsigned int j0 = 0, j1 = 0, j2 = 0;
5607
5608 vec_perm_builder sel (nelt, nelt, 1);
5609 sel.quick_grow (len: nelt);
5610 vec_perm_indices indices;
5611 for (j = 0; j < 3; j++)
5612 {
5613 int nelt0 = ((3 - j) * nelt) % 3;
5614 int nelt1 = ((3 - j) * nelt + 1) % 3;
5615 int nelt2 = ((3 - j) * nelt + 2) % 3;
5616
5617 for (i = 0; i < nelt; i++)
5618 {
5619 if (3 * i + nelt0 < nelt)
5620 sel[3 * i + nelt0] = j0++;
5621 if (3 * i + nelt1 < nelt)
5622 sel[3 * i + nelt1] = nelt + j1++;
5623 if (3 * i + nelt2 < nelt)
5624 sel[3 * i + nelt2] = 0;
5625 }
5626 indices.new_vector (sel, 2, nelt);
5627 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5628
5629 for (i = 0; i < nelt; i++)
5630 {
5631 if (3 * i + nelt0 < nelt)
5632 sel[3 * i + nelt0] = 3 * i + nelt0;
5633 if (3 * i + nelt1 < nelt)
5634 sel[3 * i + nelt1] = 3 * i + nelt1;
5635 if (3 * i + nelt2 < nelt)
5636 sel[3 * i + nelt2] = nelt + j2++;
5637 }
5638 indices.new_vector (sel, 2, nelt);
5639 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5640
5641 vect1 = dr_chain[0];
5642 vect2 = dr_chain[1];
5643
5644 /* Create interleaving stmt:
5645 low = VEC_PERM_EXPR <vect1, vect2,
5646 {j, nelt, *, j + 1, nelt + j + 1, *,
5647 j + 2, nelt + j + 2, *, ...}> */
5648 data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_low");
5649 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5650 vect2, perm3_mask_low);
5651 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5652
5653 vect1 = data_ref;
5654 vect2 = dr_chain[2];
5655 /* Create interleaving stmt:
5656 low = VEC_PERM_EXPR <vect1, vect2,
5657 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5658 6, 7, nelt + j + 2, ...}> */
5659 data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_high");
5660 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5661 vect2, perm3_mask_high);
5662 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5663 (*result_chain)[j] = data_ref;
5664 }
5665 }
5666 else
5667 {
5668 /* If length is not equal to 3 then only power of 2 is supported. */
5669 gcc_assert (pow2p_hwi (length));
5670
5671 /* The encoding has 2 interleaved stepped patterns. */
5672 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (node: vectype);
5673 vec_perm_builder sel (nelt, 2, 3);
5674 sel.quick_grow (len: 6);
5675 for (i = 0; i < 3; i++)
5676 {
5677 sel[i * 2] = i;
5678 sel[i * 2 + 1] = i + nelt;
5679 }
5680 vec_perm_indices indices (sel, 2, nelt);
5681 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5682
5683 for (i = 0; i < 6; i++)
5684 sel[i] += exact_div (a: nelt, b: 2);
5685 indices.new_vector (sel, 2, nelt);
5686 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5687
5688 for (i = 0, n = log_length; i < n; i++)
5689 {
5690 for (j = 0; j < length/2; j++)
5691 {
5692 vect1 = dr_chain[j];
5693 vect2 = dr_chain[j+length/2];
5694
5695 /* Create interleaving stmt:
5696 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5697 ...}> */
5698 high = make_temp_ssa_name (type: vectype, NULL, name: "vect_inter_high");
5699 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5700 vect2, perm_mask_high);
5701 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5702 (*result_chain)[2*j] = high;
5703
5704 /* Create interleaving stmt:
5705 low = VEC_PERM_EXPR <vect1, vect2,
5706 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5707 ...}> */
5708 low = make_temp_ssa_name (type: vectype, NULL, name: "vect_inter_low");
5709 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5710 vect2, perm_mask_low);
5711 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5712 (*result_chain)[2*j+1] = low;
5713 }
5714 memcpy (dest: dr_chain.address (), src: result_chain->address (),
5715 n: length * sizeof (tree));
5716 }
5717 }
5718}
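
/* Illustrative sketch of the stmts emitted by the power-of-two path above
   for V8HI (SSA names invented):

     vect_inter_high_10 = VEC_PERM_EXPR <vect1, vect2,
                                         { 0, 8, 1, 9, 2, 10, 3, 11 }>;
     vect_inter_low_11 = VEC_PERM_EXPR <vect1, vect2,
                                        { 4, 12, 5, 13, 6, 14, 7, 15 }>;

   one such pair is emitted per pair of vectors and per stage.  */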
5719
5720/* Function vect_setup_realignment
5721
5722 This function is called when vectorizing an unaligned load using
5723 the dr_explicit_realign[_optimized] scheme.
5724 This function generates the following code at the loop prolog:
5725
5726 p = initial_addr;
5727 x msq_init = *(floor(p)); # prolog load
5728 realignment_token = call target_builtin;
5729 loop:
5730 x msq = phi (msq_init, ---)
5731
5732 The stmts marked with x are generated only for the case of
5733 dr_explicit_realign_optimized.
5734
5735 The code above sets up a new (vector) pointer, pointing to the first
5736 location accessed by STMT_INFO, and a "floor-aligned" load using that
5737 pointer. It also generates code to compute the "realignment-token"
5738 (if the relevant target hook was defined), and creates a phi-node at the
5739 loop-header bb whose arguments are the result of the prolog-load (created
5740 by this function) and the result of a load that takes place in the loop
5741 (to be created by the caller to this function).
5742
5743 For the case of dr_explicit_realign_optimized:
5744 The caller to this function uses the phi-result (msq) to create the
5745 realignment code inside the loop, and sets up the missing phi argument,
5746 as follows:
5747 loop:
5748 msq = phi (msq_init, lsq)
5749 lsq = *(floor(p')); # load in loop
5750 result = realign_load (msq, lsq, realignment_token);
5751
5752 For the case of dr_explicit_realign:
5753 loop:
5754 msq = *(floor(p)); # load in loop
5755 p' = p + (VS-1);
5756 lsq = *(floor(p')); # load in loop
5757 result = realign_load (msq, lsq, realignment_token);
5758
5759 Input:
5760 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5761 a memory location that may be unaligned.
5762 BSI - place where new code is to be inserted.
5763 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5764 is used.
5765
5766 Output:
5767 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5768 target hook, if defined.
5769 Return value - the result of the loop-header phi node. */
5770
5771tree
5772vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5773 gimple_stmt_iterator *gsi, tree *realignment_token,
5774 enum dr_alignment_support alignment_support_scheme,
5775 tree init_addr,
5776 class loop **at_loop)
5777{
5778 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5779 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
5780 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5781 struct data_reference *dr = dr_info->dr;
5782 class loop *loop = NULL;
5783 edge pe = NULL;
5784 tree scalar_dest = gimple_assign_lhs (gs: stmt_info->stmt);
5785 tree vec_dest;
5786 gimple *inc;
5787 tree ptr;
5788 tree data_ref;
5789 basic_block new_bb;
5790 tree msq_init = NULL_TREE;
5791 tree new_temp;
5792 gphi *phi_stmt;
5793 tree msq = NULL_TREE;
5794 gimple_seq stmts = NULL;
5795 bool compute_in_loop = false;
5796 bool nested_in_vect_loop = false;
5797 class loop *containing_loop = (gimple_bb (g: stmt_info->stmt))->loop_father;
5798 class loop *loop_for_initial_load = NULL;
5799
5800 if (loop_vinfo)
5801 {
5802 loop = LOOP_VINFO_LOOP (loop_vinfo);
5803 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5804 }
5805
5806 gcc_assert (alignment_support_scheme == dr_explicit_realign
5807 || alignment_support_scheme == dr_explicit_realign_optimized);
5808
5809 /* We need to generate three things:
5810 1. the misalignment computation
5811 2. the extra vector load (for the optimized realignment scheme).
5812 3. the phi node for the two vectors from which the realignment is
5813 done (for the optimized realignment scheme). */
5814
5815 /* 1. Determine where to generate the misalignment computation.
5816
5817 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5818 calculation will be generated by this function, outside the loop (in the
5819 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5820 caller, inside the loop.
5821
5822 Background: If the misalignment remains fixed throughout the iterations of
5823 the loop, then both realignment schemes are applicable, and also the
5824 misalignment computation can be done outside LOOP. This is because we are
5825 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5826 are a multiple of VS (the Vector Size), and therefore the misalignment in
5827 different vectorized LOOP iterations is always the same.
5828 The problem arises only if the memory access is in an inner-loop nested
5829 inside LOOP, which is now being vectorized using outer-loop vectorization.
5830 This is the only case when the misalignment of the memory access may not
5831 remain fixed throughout the iterations of the inner-loop (as explained in
5832 detail in vect_supportable_dr_alignment). In this case, not only is the
5833 optimized realignment scheme not applicable, but also the misalignment
5834 computation (and generation of the realignment token that is passed to
5835 REALIGN_LOAD) have to be done inside the loop.
5836
5837 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5838 or not, which in turn determines if the misalignment is computed inside
5839 the inner-loop, or outside LOOP. */
5840
5841 if (init_addr != NULL_TREE || !loop_vinfo)
5842 {
5843 compute_in_loop = true;
5844 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5845 }
5846
5847
5848 /* 2. Determine where to generate the extra vector load.
5849
5850 For the optimized realignment scheme, instead of generating two vector
5851 loads in each iteration, we generate a single extra vector load in the
5852 preheader of the loop, and in each iteration reuse the result of the
5853 vector load from the previous iteration. In case the memory access is in
5854 an inner-loop nested inside LOOP, which is now being vectorized using
5855 outer-loop vectorization, we need to determine whether this initial vector
5856 load should be generated at the preheader of the inner-loop, or can be
5857 generated at the preheader of LOOP. If the memory access has no evolution
5858 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5859 to be generated inside LOOP (in the preheader of the inner-loop). */
5860
5861 if (nested_in_vect_loop)
5862 {
5863 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5864 bool invariant_in_outerloop =
5865 (tree_int_cst_compare (t1: outerloop_step, size_zero_node) == 0);
5866 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5867 }
5868 else
5869 loop_for_initial_load = loop;
5870 if (at_loop)
5871 *at_loop = loop_for_initial_load;
5872
5873 tree vuse = NULL_TREE;
5874 if (loop_for_initial_load)
5875 {
5876 pe = loop_preheader_edge (loop_for_initial_load);
5877 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5878 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5879 }
5880 if (!vuse)
5881 vuse = gimple_vuse (g: gsi_stmt (i: *gsi));
5882
5883 /* 3. For the case of the optimized realignment, create the first vector
5884 load at the loop preheader. */
5885
5886 if (alignment_support_scheme == dr_explicit_realign_optimized)
5887 {
5888 /* Create msq_init = *(floor(p1)) in the loop preheader */
5889 gassign *new_stmt;
5890
5891 gcc_assert (!compute_in_loop);
5892 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5893 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, aggr_type: vectype,
5894 at_loop: loop_for_initial_load, NULL_TREE,
5895 initial_address: &init_addr, NULL, ptr_incr: &inc, only_init: true);
5896 if (TREE_CODE (ptr) == SSA_NAME)
5897 new_temp = copy_ssa_name (var: ptr);
5898 else
5899 new_temp = make_ssa_name (TREE_TYPE (ptr));
5900 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5901 tree type = TREE_TYPE (ptr);
5902 new_stmt = gimple_build_assign
5903 (new_temp, BIT_AND_EXPR, ptr,
5904 fold_build2 (MINUS_EXPR, type,
5905 build_int_cst (type, 0),
5906 build_int_cst (type, align)));
5907 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5908 gcc_assert (!new_bb);
5909 data_ref
5910 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5911 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5912 vect_copy_ref_info (dest: data_ref, DR_REF (dr));
5913 new_stmt = gimple_build_assign (vec_dest, data_ref);
5914 new_temp = make_ssa_name (var: vec_dest, stmt: new_stmt);
5915 gimple_assign_set_lhs (gs: new_stmt, lhs: new_temp);
5916 gimple_set_vuse (g: new_stmt, vuse);
5917 if (pe)
5918 {
5919 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5920 gcc_assert (!new_bb);
5921 }
5922 else
5923 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5924
5925 msq_init = gimple_assign_lhs (gs: new_stmt);
5926 }
5927
5928 /* 4. Create realignment token using a target builtin, if available.
5929 It is done either inside the containing loop, or before LOOP (as
5930 determined above). */
5931
5932 if (targetm.vectorize.builtin_mask_for_load)
5933 {
5934 gcall *new_stmt;
5935 tree builtin_decl;
5936
      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5938 if (!init_addr)
5939 {
5940 /* Generate the INIT_ADDR computation outside LOOP. */
5941 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5942 stmt_info, new_stmt_list: &stmts,
5943 NULL_TREE);
5944 if (loop)
5945 {
5946 pe = loop_preheader_edge (loop);
5947 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5948 gcc_assert (!new_bb);
5949 }
5950 else
5951 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5952 }
5953
5954 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5955 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5956 vec_dest =
5957 vect_create_destination_var (scalar_dest,
5958 vectype: gimple_call_return_type (gs: new_stmt));
5959 new_temp = make_ssa_name (var: vec_dest, stmt: new_stmt);
5960 gimple_call_set_lhs (gs: new_stmt, lhs: new_temp);
5961
5962 if (compute_in_loop)
5963 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5964 else
5965 {
5966 /* Generate the misalignment computation outside LOOP. */
5967 pe = loop_preheader_edge (loop);
5968 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5969 gcc_assert (!new_bb);
5970 }
5971
5972 *realignment_token = gimple_call_lhs (gs: new_stmt);
5973
5974 /* The result of the CALL_EXPR to this builtin is determined from
5975 the value of the parameter and no global variables are touched
5976 which makes the builtin a "const" function. Requiring the
5977 builtin to have the "const" attribute makes it unnecessary
5978 to call mark_call_clobbered. */
5979 gcc_assert (TREE_READONLY (builtin_decl));
5980 }
5981
5982 if (alignment_support_scheme == dr_explicit_realign)
5983 return msq;
5984
5985 gcc_assert (!compute_in_loop);
5986 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5987
5988
5989 /* 5. Create msq = phi <msq_init, lsq> in loop */
5990
5991 pe = loop_preheader_edge (containing_loop);
5992 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5993 msq = make_ssa_name (var: vec_dest);
5994 phi_stmt = create_phi_node (msq, containing_loop->header);
5995 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5996
5997 return msq;
5998}
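
/* Illustrative sketch of the prolog created for the
   dr_explicit_realign_optimized case (invented SSA names, assuming a
   16-byte DR_TARGET_ALIGNMENT and a target that provides a mask_for_load
   builtin):

     # loop preheader
     vectp_a.9_1 = &a + init;
     _2 = vectp_a.9_1 & -16;              <-- floor (p)
     vect__3 = MEM[(int *)_2];            <-- msq_init, the prolog load
     realign_token_4 = <mask_for_load builtin> (vectp_a.9_1);
     # loop header
     msq_5 = PHI <vect__3 (preheader), lsq (latch)>

   matching the scheme documented before the function.  */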
5999
6000
6001/* Function vect_grouped_load_supported.
6002
6003 COUNT is the size of the load group (the number of statements plus the
6004 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6005 only one statement, with a gap of COUNT - 1.
6006
6007 Returns true if a suitable permute exists. */
6008
6009bool
6010vect_grouped_load_supported (tree vectype, bool single_element_p,
6011 unsigned HOST_WIDE_INT count)
6012{
6013 machine_mode mode = TYPE_MODE (vectype);
6014
  /* If this is single-element interleaving with an element distance
     that leaves unused vector loads around, punt - we would at least
     create very sub-optimal code in that case (and blow up memory;
     see PR65518).  */
6019 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6020 {
6021 if (dump_enabled_p ())
6022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6023 "single-element interleaving not supported "
			 "for non-adjacent vector loads\n");
6025 return false;
6026 }
6027
6028 /* vect_permute_load_chain requires the group size to be equal to 3 or
6029 be a power of two. */
6030 if (count != 3 && exact_log2 (x: count) == -1)
6031 {
6032 if (dump_enabled_p ())
6033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6034 "the size of the group of accesses"
6035 " is not a power of 2 or not equal to 3\n");
6036 return false;
6037 }
6038
6039 /* Check that the permutation is supported. */
6040 if (VECTOR_MODE_P (mode))
6041 {
6042 unsigned int i, j;
6043 if (count == 3)
6044 {
6045 unsigned int nelt;
6046 if (!GET_MODE_NUNITS (mode).is_constant (const_value: &nelt))
6047 {
6048 if (dump_enabled_p ())
6049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6050 "cannot handle groups of 3 loads for"
6051 " variable-length vectors\n");
6052 return false;
6053 }
6054
6055 vec_perm_builder sel (nelt, nelt, 1);
6056 sel.quick_grow (len: nelt);
6057 vec_perm_indices indices;
6058 unsigned int k;
6059 for (k = 0; k < 3; k++)
6060 {
6061 for (i = 0; i < nelt; i++)
6062 if (3 * i + k < 2 * nelt)
6063 sel[i] = 3 * i + k;
6064 else
6065 sel[i] = 0;
6066 indices.new_vector (sel, 2, nelt);
6067 if (!can_vec_perm_const_p (mode, mode, indices))
6068 {
6069 if (dump_enabled_p ())
6070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6071 "shuffle of 3 loads is not supported by"
6072 " target\n");
6073 return false;
6074 }
6075 for (i = 0, j = 0; i < nelt; i++)
6076 if (3 * i + k < 2 * nelt)
6077 sel[i] = i;
6078 else
6079 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6080 indices.new_vector (sel, 2, nelt);
6081 if (!can_vec_perm_const_p (mode, mode, indices))
6082 {
6083 if (dump_enabled_p ())
6084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6085 "shuffle of 3 loads is not supported by"
6086 " target\n");
6087 return false;
6088 }
6089 }
6090 return true;
6091 }
6092 else
6093 {
6094 /* If length is not equal to 3 then only power of 2 is supported. */
6095 gcc_assert (pow2p_hwi (count));
6096 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6097
6098 /* The encoding has a single stepped pattern. */
6099 vec_perm_builder sel (nelt, 1, 3);
6100 sel.quick_grow (len: 3);
6101 for (i = 0; i < 3; i++)
6102 sel[i] = i * 2;
6103 vec_perm_indices indices (sel, 2, nelt);
6104 if (can_vec_perm_const_p (mode, mode, indices))
6105 {
6106 for (i = 0; i < 3; i++)
6107 sel[i] = i * 2 + 1;
6108 indices.new_vector (sel, 2, nelt);
6109 if (can_vec_perm_const_p (mode, mode, indices))
6110 return true;
6111 }
6112 }
6113 }
6114
6115 if (dump_enabled_p ())
6116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6117 "extract even/odd not supported by target\n");
6118 return false;
6119}
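
/* As a concrete example of the power-of-two case above: for V8HI the
   single stepped pattern { 0, 2, 4 } expands to the extract-even
   permutation { 0, 2, 4, 6, 8, 10, 12, 14 } and { 1, 3, 5 } to the
   extract-odd permutation { 1, 3, 5, 7, 9, 11, 13, 15 }; both are queried
   via can_vec_perm_const_p.  */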
6120
/* Return FN if vec_{mask_,mask_len_}load_lanes is available for COUNT vectors
6122 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6123
6124internal_fn
6125vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6126 bool masked_p)
6127{
6128 if (vect_lanes_optab_supported_p (name: "vec_mask_len_load_lanes",
6129 optab: vec_mask_len_load_lanes_optab, vectype,
6130 count))
6131 return IFN_MASK_LEN_LOAD_LANES;
6132 else if (masked_p)
6133 {
6134 if (vect_lanes_optab_supported_p (name: "vec_mask_load_lanes",
6135 optab: vec_mask_load_lanes_optab, vectype,
6136 count))
6137 return IFN_MASK_LOAD_LANES;
6138 }
6139 else
6140 {
6141 if (vect_lanes_optab_supported_p (name: "vec_load_lanes", optab: vec_load_lanes_optab,
6142 vectype, count))
6143 return IFN_LOAD_LANES;
6144 }
6145 return IFN_LAST;
6146}
6147
6148/* Function vect_permute_load_chain.
6149
6150 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6151 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6152 the input data correctly. Return the final references for loads in
6153 RESULT_CHAIN.
6154
6155 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6156 The input is 4 vectors each containing 8 elements. We assign a number to each
6157 element, the input sequence is:
6158
6159 1st vec: 0 1 2 3 4 5 6 7
6160 2nd vec: 8 9 10 11 12 13 14 15
6161 3rd vec: 16 17 18 19 20 21 22 23
6162 4th vec: 24 25 26 27 28 29 30 31
6163
6164 The output sequence should be:
6165
6166 1st vec: 0 4 8 12 16 20 24 28
6167 2nd vec: 1 5 9 13 17 21 25 29
6168 3rd vec: 2 6 10 14 18 22 26 30
6169 4th vec: 3 7 11 15 19 23 27 31
6170
6171 i.e., the first output vector should contain the first elements of each
6172 interleaving group, etc.
6173
6174 We use extract_even/odd instructions to create such output. The input of
6175 each extract_even/odd operation is two vectors
6176 1st vec 2nd vec
6177 0 1 2 3 4 5 6 7
6178
6179 and the output is the vector of extracted even/odd elements. The output of
6180 extract_even will be: 0 2 4 6
6181 and of extract_odd: 1 3 5 7
6182
6183
6184 The permutation is done in log LENGTH stages. In each stage extract_even
6185 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6186 their order. In our example,
6187
6188 E1: extract_even (1st vec, 2nd vec)
6189 E2: extract_odd (1st vec, 2nd vec)
6190 E3: extract_even (3rd vec, 4th vec)
6191 E4: extract_odd (3rd vec, 4th vec)
6192
6193 The output for the first stage will be:
6194
6195 E1: 0 2 4 6 8 10 12 14
6196 E2: 1 3 5 7 9 11 13 15
6197 E3: 16 18 20 22 24 26 28 30
6198 E4: 17 19 21 23 25 27 29 31
6199
6200 In order to proceed and create the correct sequence for the next stage (or
6201 for the correct output, if the second stage is the last one, as in our
6202 example), we first put the output of extract_even operation and then the
6203 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6204 The input for the second stage is:
6205
6206 1st vec (E1): 0 2 4 6 8 10 12 14
6207 2nd vec (E3): 16 18 20 22 24 26 28 30
6208 3rd vec (E2): 1 3 5 7 9 11 13 15
6209 4th vec (E4): 17 19 21 23 25 27 29 31
6210
6211 The output of the second stage:
6212
6213 E1: 0 4 8 12 16 20 24 28
6214 E2: 2 6 10 14 18 22 26 30
6215 E3: 1 5 9 13 17 21 25 29
6216 E4: 3 7 11 15 19 23 27 31
6217
6218 And RESULT_CHAIN after reordering:
6219
6220 1st vec (E1): 0 4 8 12 16 20 24 28
6221 2nd vec (E3): 1 5 9 13 17 21 25 29
6222 3rd vec (E2): 2 6 10 14 18 22 26 30
6223 4th vec (E4): 3 7 11 15 19 23 27 31. */
6224
6225static void
6226vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6227 unsigned int length,
6228 stmt_vec_info stmt_info,
6229 gimple_stmt_iterator *gsi,
6230 vec<tree> *result_chain)
6231{
6232 tree data_ref, first_vect, second_vect;
6233 tree perm_mask_even, perm_mask_odd;
6234 tree perm3_mask_low, perm3_mask_high;
6235 gimple *perm_stmt;
6236 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6237 unsigned int i, j, log_length = exact_log2 (x: length);
6238
6239 result_chain->quick_grow (len: length);
6240 memcpy (dest: result_chain->address (), src: dr_chain.address (),
6241 n: length * sizeof (tree));
6242
6243 if (length == 3)
6244 {
6245 /* vect_grouped_load_supported ensures that this is constant. */
6246 unsigned nelt = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant ();
6247 unsigned int k;
6248
6249 vec_perm_builder sel (nelt, nelt, 1);
6250 sel.quick_grow (len: nelt);
6251 vec_perm_indices indices;
6252 for (k = 0; k < 3; k++)
6253 {
6254 for (i = 0; i < nelt; i++)
6255 if (3 * i + k < 2 * nelt)
6256 sel[i] = 3 * i + k;
6257 else
6258 sel[i] = 0;
6259 indices.new_vector (sel, 2, nelt);
6260 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6261
6262 for (i = 0, j = 0; i < nelt; i++)
6263 if (3 * i + k < 2 * nelt)
6264 sel[i] = i;
6265 else
6266 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6267 indices.new_vector (sel, 2, nelt);
6268 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6269
6270 first_vect = dr_chain[0];
6271 second_vect = dr_chain[1];
6272
6273 /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6275 ...}> */
6276 data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_low");
6277 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6278 second_vect, perm3_mask_low);
6279 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6280
6281 /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6283 ...}> */
6284 first_vect = data_ref;
6285 second_vect = dr_chain[2];
6286 data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_high");
6287 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6288 second_vect, perm3_mask_high);
6289 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6290 (*result_chain)[k] = data_ref;
6291 }
6292 }
6293 else
6294 {
6295 /* If length is not equal to 3 then only power of 2 is supported. */
6296 gcc_assert (pow2p_hwi (length));
6297
6298 /* The encoding has a single stepped pattern. */
6299 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (node: vectype);
6300 vec_perm_builder sel (nelt, 1, 3);
6301 sel.quick_grow (len: 3);
6302 for (i = 0; i < 3; ++i)
6303 sel[i] = i * 2;
6304 vec_perm_indices indices (sel, 2, nelt);
6305 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6306
6307 for (i = 0; i < 3; ++i)
6308 sel[i] = i * 2 + 1;
6309 indices.new_vector (sel, 2, nelt);
6310 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6311
6312 for (i = 0; i < log_length; i++)
6313 {
6314 for (j = 0; j < length; j += 2)
6315 {
6316 first_vect = dr_chain[j];
6317 second_vect = dr_chain[j+1];
6318
6319 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6320 data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_perm_even");
6321 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6322 first_vect, second_vect,
6323 perm_mask_even);
6324 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6325 (*result_chain)[j/2] = data_ref;
6326
6327 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6328 data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_perm_odd");
6329 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6330 first_vect, second_vect,
6331 perm_mask_odd);
6332 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6333 (*result_chain)[j/2+length/2] = data_ref;
6334 }
6335 memcpy (dest: dr_chain.address (), src: result_chain->address (),
6336 n: length * sizeof (tree));
6337 }
6338 }
6339}
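
/* Illustrative sketch of the stmts emitted by the power-of-two path above
   for V8HI (SSA names invented):

     vect_perm_even_12 = VEC_PERM_EXPR <first_vect, second_vect,
                                        { 0, 2, 4, 6, 8, 10, 12, 14 }>;
     vect_perm_odd_13 = VEC_PERM_EXPR <first_vect, second_vect,
                                       { 1, 3, 5, 7, 9, 11, 13, 15 }>;

   one such pair is emitted per pair of vectors and per stage.  */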
6340
6341/* Function vect_shift_permute_load_chain.
6342
6343 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6344 sequence of stmts to reorder the input data accordingly.
6345 Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.
6347
6348 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6349 The input is 3 vectors each containing 8 elements. We assign a
6350 number to each element, the input sequence is:
6351
6352 1st vec: 0 1 2 3 4 5 6 7
6353 2nd vec: 8 9 10 11 12 13 14 15
6354 3rd vec: 16 17 18 19 20 21 22 23
6355
6356 The output sequence should be:
6357
6358 1st vec: 0 3 6 9 12 15 18 21
6359 2nd vec: 1 4 7 10 13 16 19 22
6360 3rd vec: 2 5 8 11 14 17 20 23
6361
6362 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6363
6364 First we shuffle all 3 vectors to get correct elements order:
6365
6366 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6367 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6368 3rd vec: (16 19 22) (17 20 23) (18 21)
6369
   Next we combine and shift the vectors 3 times:
6371
6372 1st step:
6373 shift right by 6 the concatenation of:
6374 "1st vec" and "2nd vec"
6375 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6376 "2nd vec" and "3rd vec"
6377 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6378 "3rd vec" and "1st vec"
6379 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6380 | New vectors |
6381
6382 So that now new vectors are:
6383
6384 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6385 2nd vec: (10 13) (16 19 22) (17 20 23)
6386 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6387
6388 2nd step:
6389 shift right by 5 the concatenation of:
6390 "1st vec" and "3rd vec"
6391 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6392 "2nd vec" and "1st vec"
6393 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6394 "3rd vec" and "2nd vec"
6395 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6396 | New vectors |
6397
6398 So that now new vectors are:
6399
6400 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6401 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6402 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6403
6404 3rd step:
6405 shift right by 5 the concatenation of:
6406 "1st vec" and "1st vec"
6407 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6408 shift right by 3 the concatenation of:
6409 "2nd vec" and "2nd vec"
6410 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6411 | New vectors |
6412
6413 So that now all vectors are READY:
6414 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6415 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6416 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6417
   This algorithm is faster than the one in vect_permute_load_chain if:
   1. "shift of a concatenation" is faster than general permutation.
      This is usually so.
   2. The TARGET machine can't execute vector instructions in parallel.
      This is because each step of the algorithm depends on the previous
      one.  The algorithm in vect_permute_load_chain is much more parallel.
6424
6425 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6426*/
6427
6428static bool
6429vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6430 unsigned int length,
6431 stmt_vec_info stmt_info,
6432 gimple_stmt_iterator *gsi,
6433 vec<tree> *result_chain)
6434{
6435 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6436 tree perm2_mask1, perm2_mask2, perm3_mask;
6437 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6438 gimple *perm_stmt;
6439
6440 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6441 machine_mode vmode = TYPE_MODE (vectype);
6442 unsigned int i;
6443 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
6444
6445 unsigned HOST_WIDE_INT nelt, vf;
6446 if (!TYPE_VECTOR_SUBPARTS (node: vectype).is_constant (const_value: &nelt)
6447 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (const_value: &vf))
6448 /* Not supported for variable-length vectors. */
6449 return false;
6450
6451 vec_perm_builder sel (nelt, nelt, 1);
6452 sel.quick_grow (len: nelt);
6453
6454 result_chain->quick_grow (len: length);
6455 memcpy (dest: result_chain->address (), src: dr_chain.address (),
6456 n: length * sizeof (tree));
6457
6458 if (pow2p_hwi (x: length) && vf > 4)
6459 {
6460 unsigned int j, log_length = exact_log2 (x: length);
6461 for (i = 0; i < nelt / 2; ++i)
6462 sel[i] = i * 2;
6463 for (i = 0; i < nelt / 2; ++i)
6464 sel[nelt / 2 + i] = i * 2 + 1;
6465 vec_perm_indices indices (sel, 2, nelt);
6466 if (!can_vec_perm_const_p (vmode, vmode, indices))
6467 {
6468 if (dump_enabled_p ())
6469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470 "shuffle of 2 fields structure is not \
6471 supported by target\n");
6472 return false;
6473 }
6474 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6475
6476 for (i = 0; i < nelt / 2; ++i)
6477 sel[i] = i * 2 + 1;
6478 for (i = 0; i < nelt / 2; ++i)
6479 sel[nelt / 2 + i] = i * 2;
6480 indices.new_vector (sel, 2, nelt);
6481 if (!can_vec_perm_const_p (vmode, vmode, indices))
6482 {
6483 if (dump_enabled_p ())
6484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6485 "shuffle of 2 fields structure is not \
6486 supported by target\n");
6487 return false;
6488 }
6489 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6490
6491 /* Generating permutation constant to shift all elements.
6492 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6493 for (i = 0; i < nelt; i++)
6494 sel[i] = nelt / 2 + i;
6495 indices.new_vector (sel, 2, nelt);
6496 if (!can_vec_perm_const_p (vmode, vmode, indices))
6497 {
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6500 "shift permutation is not supported by target\n");
6501 return false;
6502 }
6503 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6504
6505 /* Generating permutation constant to select vector from 2.
6506 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6507 for (i = 0; i < nelt / 2; i++)
6508 sel[i] = i;
6509 for (i = nelt / 2; i < nelt; i++)
6510 sel[i] = nelt + i;
6511 indices.new_vector (sel, 2, nelt);
6512 if (!can_vec_perm_const_p (vmode, vmode, indices))
6513 {
6514 if (dump_enabled_p ())
6515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6516 "select is not supported by target\n");
6517 return false;
6518 }
6519 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6520
6521 for (i = 0; i < log_length; i++)
6522 {
6523 for (j = 0; j < length; j += 2)
6524 {
6525 first_vect = dr_chain[j];
6526 second_vect = dr_chain[j + 1];
6527
6528 data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle2");
6529 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6530 first_vect, first_vect,
6531 perm2_mask1);
6532 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6533 vect[0] = data_ref;
6534
6535 data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle2");
6536 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6537 second_vect, second_vect,
6538 perm2_mask2);
6539 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6540 vect[1] = data_ref;
6541
6542 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6543 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6544 vect[0], vect[1], shift1_mask);
6545 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6546 (*result_chain)[j/2 + length/2] = data_ref;
6547
6548 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6549 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6550 vect[0], vect[1], select_mask);
6551 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6552 (*result_chain)[j/2] = data_ref;
6553 }
6554 memcpy (dr_chain.address (), result_chain->address (),
6555 length * sizeof (tree));
6556 }
6557 return true;
6558 }
6559 if (length == 3 && vf > 2)
6560 {
6561 unsigned int k = 0, l = 0;
6562
6563 /* Generating permutation constant to get all elements in the right order.
6564 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6565 for (i = 0; i < nelt; i++)
6566 {
6567 if (3 * k + (l % 3) >= nelt)
6568 {
6569 k = 0;
6570 l += (3 - (nelt % 3));
6571 }
6572 sel[i] = 3 * k + (l % 3);
6573 k++;
6574 }
6575 vec_perm_indices indices (sel, 2, nelt);
6576 if (!can_vec_perm_const_p (vmode, vmode, indices))
6577 {
6578 if (dump_enabled_p ())
6579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6580 "shuffle of 3 fields structure is not \
6581 supported by target\n");
6582 return false;
6583 }
6584 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6585
6586 /* Generating permutation constant to shift all elements.
6587 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6588 for (i = 0; i < nelt; i++)
6589 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6590 indices.new_vector (sel, 2, nelt);
6591 if (!can_vec_perm_const_p (vmode, vmode, indices))
6592 {
6593 if (dump_enabled_p ())
6594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6595 "shift permutation is not supported by target\n");
6596 return false;
6597 }
6598 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6599
6600 /* Generating permutation constant to shift all elements.
6601 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6602 for (i = 0; i < nelt; i++)
6603 sel[i] = 2 * (nelt / 3) + 1 + i;
6604 indices.new_vector (sel, 2, nelt);
6605 if (!can_vec_perm_const_p (vmode, vmode, indices))
6606 {
6607 if (dump_enabled_p ())
6608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609 "shift permutation is not supported by target\n");
6610 return false;
6611 }
6612 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6613
6614 /* Generating permutation constant to shift all elements.
6615 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6616 for (i = 0; i < nelt; i++)
6617 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6618 indices.new_vector (sel, 2, nelt);
6619 if (!can_vec_perm_const_p (vmode, vmode, indices))
6620 {
6621 if (dump_enabled_p ())
6622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6623 "shift permutation is not supported by target\n");
6624 return false;
6625 }
6626 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6627
6628 /* Generating permutation constant to shift all elements.
6629 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6630 for (i = 0; i < nelt; i++)
6631 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6632 indices.new_vector (sel, 2, nelt);
6633 if (!can_vec_perm_const_p (vmode, vmode, indices))
6634 {
6635 if (dump_enabled_p ())
6636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6637 "shift permutation is not supported by target\n");
6638 return false;
6639 }
6640 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6641
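      /* The actual transformation is done in two stages: PERM3_MASK first
	 groups the elements of each input vector by field, then the
	 cross-vector shift permutations below recombine the per-field pieces
	 of the three vectors so that each vector written to RESULT_CHAIN
	 holds the elements of a single field.  */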
6642 for (k = 0; k < 3; k++)
6643 {
6644 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6645 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6646 dr_chain[k], dr_chain[k],
6647 perm3_mask);
6648 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6649 vect[k] = data_ref;
6650 }
6651
6652 for (k = 0; k < 3; k++)
6653 {
6654 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6655 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6656 vect[k % 3], vect[(k + 1) % 3],
6657 shift1_mask);
6658 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6659 vect_shift[k] = data_ref;
6660 }
6661
6662 for (k = 0; k < 3; k++)
6663 {
6664 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6665 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6666 vect_shift[(4 - k) % 3],
6667 vect_shift[(3 - k) % 3],
6668 shift2_mask);
6669 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6670 vect[k] = data_ref;
6671 }
6672
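      /* Store the three de-interleaved vectors; the slots they occupy depend
	 on NELT % 3 (for vector length 8 they go to slots 1, 2 and 0 of
	 RESULT_CHAIN respectively).  */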
6673 (*result_chain)[3 - (nelt % 3)] = vect[2];
6674
6675 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6676 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6677 vect[0], shift3_mask);
6678 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6679 (*result_chain)[nelt % 3] = data_ref;
6680
6681 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6682 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6683 vect[1], shift4_mask);
6684 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6685 (*result_chain)[0] = data_ref;
6686 return true;
6687 }
6688 return false;
6689}
6690
6691/* Function vect_transform_grouped_load.
6692
6693 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6694 to perform their permutation and assign the resulting vectorized statements
6695 to the scalar statements.
6696*/
6697
6698void
6699vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6700 vec<tree> dr_chain,
6701 int size, gimple_stmt_iterator *gsi)
6702{
6703 machine_mode mode;
6704 vec<tree> result_chain = vNULL;
6705
6706 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6707 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6708 vectors that are ready for vector computation. */
6709 result_chain.create (size);
6710
6711 /* If the reassociation width for the vector type is 2 or greater, the target
6712 machine can execute 2 or more vector instructions in parallel.  Otherwise try
6713 to get a chain for the group of loads using vect_shift_permute_load_chain. */
6714 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6715 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6716 || pow2p_hwi (size)
6717 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6718 gsi, &result_chain))
6719 vect_permute_load_chain (vinfo, dr_chain,
6720 size, stmt_info, gsi, &result_chain);
6721 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6722 result_chain.release ();
6723}
6724
6725/* RESULT_CHAIN contains the output of a group of grouped loads that were
6726 generated as part of the vectorization of STMT_INFO. Assign the statement
6727 for each vector to the associated scalar statement. */
6728
6729void
6730vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6731 vec<tree> result_chain)
6732{
6733 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6734 unsigned int i, gap_count;
6735 tree tmp_data_ref;
6736
6737 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6738 Since we scan the chain starting from its first node, the order of the
6739 statements corresponds to the order of data-refs in RESULT_CHAIN. */
6740 stmt_vec_info next_stmt_info = first_stmt_info;
6741 gap_count = 1;
6742 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6743 {
6744 if (!next_stmt_info)
6745 break;
6746
6747 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
6748 code elimination pass later.  No need to check for the first stmt in
6749 the group, since it always exists.
6750 DR_GROUP_GAP is the number of steps in elements from the previous
6751 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6752 correspond to the gaps. */
6753 if (next_stmt_info != first_stmt_info
6754 && gap_count < DR_GROUP_GAP (next_stmt_info))
6755 {
6756 gap_count++;
6757 continue;
6758 }
6759
6760 /* ??? The following needs cleanup after the removal of
6761 DR_GROUP_SAME_DR_STMT. */
6762 if (next_stmt_info)
6763 {
6764 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6765 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6766 copies, and we put the new vector statement last. */
6767 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6768
6769 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6770 gap_count = 1;
6771 }
6772 }
6773}
6774
6775/* Function vect_can_force_dr_alignment_p.
6776
6777 Return whether DECL can be forced to be aligned on an ALIGNMENT-bit
6778 boundary. */
6779
6780bool
6781vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6782{
6783 if (!VAR_P (decl))
6784 return false;
6785
6786 if (decl_in_symtab_p (decl)
6787 && !symtab_node::get (decl)->can_increase_alignment_p ())
6788 return false;
6789
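  /* Variables with static storage duration are limited by the largest
     alignment the object file format supports; automatic variables are
     limited by the largest supported stack alignment.  */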
6790 if (TREE_STATIC (decl))
6791 return (known_le (alignment,
6792 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6793 else
6794 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6795}
6796
6797/* Return whether the data reference DR_INFO is supported with respect to its
6798 alignment.
6799 VECTYPE is the vector type used for the access and MISALIGNMENT is its
6800 known misalignment, or DR_MISALIGNMENT_UNKNOWN if the misalignment cannot
6801 be determined at compile time. */
6802
6803enum dr_alignment_support
6804vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6805 tree vectype, int misalignment)
6806{
6807 data_reference *dr = dr_info->dr;
6808 stmt_vec_info stmt_info = dr_info->stmt;
6809 machine_mode mode = TYPE_MODE (vectype);
6810 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6811 class loop *vect_loop = NULL;
6812 bool nested_in_vect_loop = false;
6813
6814 if (misalignment == 0)
6815 return dr_aligned;
6816
6817 /* For now assume all conditional loads/stores support unaligned
6818 access without any special code. */
6819 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6820 if (gimple_call_internal_p (stmt)
6821 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6822 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6823 return dr_unaligned_supported;
6824
6825 if (loop_vinfo)
6826 {
6827 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6828 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6829 }
6830
6831 /* Possibly unaligned access. */
6832
6833 /* We can choose between using the implicit realignment scheme (generating
6834 a misaligned_move stmt) and the explicit realignment scheme (generating
6835 aligned loads with a REALIGN_LOAD). There are two variants to the
6836 explicit realignment scheme: optimized, and unoptimized.
6837 We can optimize the realignment only if the step between consecutive
6838 vector loads is equal to the vector size. Since the vector memory
6839 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6840 is guaranteed that the misalignment amount remains the same throughout the
6841 execution of the vectorized loop. Therefore, we can create the
6842 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6843 at the loop preheader.
6844
6845 However, in the case of outer-loop vectorization, when vectorizing a
6846 memory access in the inner-loop nested within the LOOP that is now being
6847 vectorized, while it is guaranteed that the misalignment of the
6848 vectorized memory access will remain the same in different outer-loop
6849 iterations, it is *not* guaranteed that it will remain the same throughout
6850 the execution of the inner-loop. This is because the inner-loop advances
6851 with the original scalar step (and not in steps of VS). If the inner-loop
6852 step happens to be a multiple of VS, then the misalignment remains fixed
6853 and we can use the optimized realignment scheme. For example:
6854
6855 for (i=0; i<N; i++)
6856 for (j=0; j<M; j++)
6857 s += a[i+j];
6858
6859 When vectorizing the i-loop in the above example, the step between
6860 consecutive vector loads is 1, and so the misalignment does not remain
6861 fixed across the execution of the inner-loop, and the realignment cannot
6862 be optimized (as illustrated in the following pseudo vectorized loop):
6863
6864 for (i=0; i<N; i+=4)
6865 for (j=0; j<M; j++){
6866 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6867 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6868 // (assuming that we start from an aligned address).
6869 }
6870
6871 We therefore have to use the unoptimized realignment scheme:
6872
6873 for (i=0; i<N; i+=4)
6874 for (j=k; j<M; j+=4)
6875 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6876 // that the misalignment of the initial address is
6877 // 0).
6878
6879 The loop can then be vectorized as follows:
6880
6881 for (k=0; k<4; k++){
6882 rt = get_realignment_token (&vp[k]);
6883 for (i=0; i<N; i+=4){
6884 v1 = vp[i+k];
6885 for (j=k; j<M; j+=4){
6886 v2 = vp[i+j+VS-1];
6887 va = REALIGN_LOAD <v1,v2,rt>;
6888 vs += va;
6889 v1 = v2;
6890 }
6891 }
6892 } */
6893
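  /* For loads, prefer the explicit realignment scheme when the target
     provides vec_realign_load; as explained above, the optimized variant is
     only usable when the misalignment is invariant across the vectorized
     loop.  Otherwise fall through to the target's support for plain
     misaligned accesses.  */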
6894 if (DR_IS_READ (dr))
6895 {
6896 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6897 && (!targetm.vectorize.builtin_mask_for_load
6898 || targetm.vectorize.builtin_mask_for_load ()))
6899 {
6900 /* If we are doing SLP then the accesses need not have the
6901 same alignment; instead it depends on the SLP group size. */
6902 if (loop_vinfo
6903 && STMT_SLP_TYPE (stmt_info)
6904 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
6905 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6906 * (DR_GROUP_SIZE
6907 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6908 TYPE_VECTOR_SUBPARTS (vectype))))
6909 ;
6910 else if (!loop_vinfo
6911 || (nested_in_vect_loop
6912 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6913 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6914 return dr_explicit_realign;
6915 else
6916 return dr_explicit_realign_optimized;
6917 }
6918 }
6919
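  /* Otherwise ask the target whether a misaligned vector access in this mode
     is supported.  When the misalignment is not known at compile time, also
     report whether the reference may be packed, i.e. not aligned to the size
     of its type.  */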
6920 bool is_packed = false;
6921 tree type = TREE_TYPE (DR_REF (dr));
6922 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6923 is_packed = not_size_aligned (DR_REF (dr));
6924 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6925 is_packed))
6926 return dr_unaligned_supported;
6927
6928 /* Unsupported. */
6929 return dr_unaligned_unsupported;
6930}
6931
