1 | /* Data References Analysis and Manipulation Utilities for Vectorization. |
2 | Copyright (C) 2003-2023 Free Software Foundation, Inc. |
3 | Contributed by Dorit Naishlos <dorit@il.ibm.com> |
4 | and Ira Rosen <irar@il.ibm.com> |
5 | |
6 | This file is part of GCC. |
7 | |
8 | GCC is free software; you can redistribute it and/or modify it under |
9 | the terms of the GNU General Public License as published by the Free |
10 | Software Foundation; either version 3, or (at your option) any later |
11 | version. |
12 | |
13 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or |
15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
16 | for more details. |
17 | |
18 | You should have received a copy of the GNU General Public License |
19 | along with GCC; see the file COPYING3. If not see |
20 | <http://www.gnu.org/licenses/>. */ |
21 | |
22 | #include "config.h" |
23 | #include "system.h" |
24 | #include "coretypes.h" |
25 | #include "backend.h" |
26 | #include "target.h" |
27 | #include "rtl.h" |
28 | #include "tree.h" |
29 | #include "gimple.h" |
30 | #include "predict.h" |
31 | #include "memmodel.h" |
32 | #include "tm_p.h" |
33 | #include "ssa.h" |
34 | #include "optabs-tree.h" |
35 | #include "cgraph.h" |
36 | #include "dumpfile.h" |
37 | #include "alias.h" |
38 | #include "fold-const.h" |
39 | #include "stor-layout.h" |
40 | #include "tree-eh.h" |
41 | #include "gimplify.h" |
42 | #include "gimple-iterator.h" |
43 | #include "gimplify-me.h" |
44 | #include "tree-ssa-loop-ivopts.h" |
45 | #include "tree-ssa-loop-manip.h" |
46 | #include "tree-ssa-loop.h" |
47 | #include "cfgloop.h" |
48 | #include "tree-scalar-evolution.h" |
49 | #include "tree-vectorizer.h" |
50 | #include "expr.h" |
51 | #include "builtins.h" |
52 | #include "tree-cfg.h" |
53 | #include "tree-hash-traits.h" |
54 | #include "vec-perm-indices.h" |
55 | #include "internal-fn.h" |
56 | #include "gimple-fold.h" |
57 | |
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB and is used
   only for dump messages.

   Emits a NOTE dump on success and a MISSED_OPTIMIZATION dump on
   failure when dumping is enabled.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
			      tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  /* Ask the target for its preferred mode holding COUNT vectors of MODE;
     if it has none, fall back to an integer mode of the combined size.  */
  if (!targetm.array_mode (mode, count).exists (mode: &array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      /* Only cap the fallback integer mode at MAX_FIXED_MODE_SIZE when the
	 target does not explicitly claim support for this array shape.  */
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (size: bits, limit: limit_p).exists (mode: &array_mode))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "no array mode for %s[%wu]\n",
			     GET_MODE_NAME (mode), count);
	  return false;
	}
    }

  /* The optab must provide a conversion between the array mode and the
     vector mode for the lanes operation to be usable.  */
  if (convert_optab_handler (op: optab, to_mode: array_mode, from_mode: mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cannot use %s<%s><%s>\n", name,
			 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
		     GET_MODE_NAME (mode));

  return true;
}
99 | |
100 | /* Helper function to identify a simd clone call. If this is a call to a |
101 | function with simd clones then return the corresponding cgraph_node, |
102 | otherwise return NULL. */ |
103 | |
104 | static cgraph_node* |
105 | simd_clone_call_p (gimple *stmt) |
106 | { |
107 | gcall *call = dyn_cast <gcall *> (p: stmt); |
108 | if (!call) |
109 | return NULL; |
110 | |
111 | tree fndecl = NULL_TREE; |
112 | if (gimple_call_internal_p (gs: call, fn: IFN_MASK_CALL)) |
113 | fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0); |
114 | else |
115 | fndecl = gimple_call_fndecl (gs: stmt); |
116 | |
117 | if (fndecl == NULL_TREE) |
118 | return NULL; |
119 | |
120 | cgraph_node *node = cgraph_node::get (decl: fndecl); |
121 | if (node && node->simd_clones != NULL) |
122 | return node; |
123 | |
124 | return NULL; |
125 | } |
126 | |
127 | |
128 | |
/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   SCALAR_TYPE is the type of the result (lhs); the function returns it
   unchanged unless a strictly smaller rhs scalar type is found.  */

tree
vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
{
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  gassign *assign = dyn_cast <gassign *> (p: stmt_info->stmt);
  if (assign)
    {
      scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
      /* For widening/conversion operations the rhs type may be smaller
	 than the lhs type; prefer the smaller one.  */
      if (gimple_assign_cast_p (s: assign)
	  || gimple_assign_rhs_code (gs: assign) == DOT_PROD_EXPR
	  || gimple_assign_rhs_code (gs: assign) == WIDEN_SUM_EXPR
	  || gimple_assign_rhs_code (gs: assign) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (gs: assign) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (gs: assign) == FLOAT_EXPR)
	{
	  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));

	  rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
	  if (rhs < lhs)
	    scalar_type = rhs_type;
	}
    }
  else if (cgraph_node *node = simd_clone_call_p (stmt: stmt_info->stmt))
    {
      /* For a call with simd clones, scan all vector arguments and keep
	 the smallest element type seen so far.  */
      auto clone = node->simd_clones->simdclone;
      for (unsigned int i = 0; i < clone->nargs; ++i)
	{
	  if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
	    {
	      tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
	      if (rhs < lhs)
		{
		  scalar_type = arg_scalar_type;
		  lhs = rhs;
		}
	    }
	}
    }
  else if (gcall *call = dyn_cast <gcall *> (p: stmt_info->stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (gs: call))
	{
	  internal_fn ifn = gimple_call_internal_fn (gs: call);
	  if (internal_load_fn_p (ifn))
	    /* For loads the LHS type does the trick.  */
	    i = ~0U;
	  else if (internal_store_fn_p (ifn))
	    {
	      /* For stores use the type of the stored value.  */
	      i = internal_fn_stored_value_index (ifn);
	      scalar_type = TREE_TYPE (gimple_call_arg (call, i));
	      i = ~0U;
	    }
	  else if (internal_fn_mask_index (ifn) == 0)
	    /* Skip the mask argument; start with the first data arg.  */
	    i = 1;
	}
      if (i < gimple_call_num_args (gs: call))
	{
	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
	    {
	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
	      if (rhs < lhs)
		scalar_type = rhs_type;
	    }
	}
    }

  return scalar_type;
}
226 | |
227 | |
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static opt_result
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* The user can disable alias versioning entirely via --param.  */
  if ((unsigned) param_vect_max_version_for_alias_checks == 0)
    return opt_result::failure_at (loc: vect_location,
				   fmt: "will not create alias checks, as"
				   " --param vect-max-version-for-alias-checks"
				   " == 0\n");

  /* Check that a runtime alias test is actually possible for this DDR
     (e.g. both refs analyzable, versioning profitable/allowed).  */
  opt_result res
    = runtime_alias_check_p (ddr, loop,
			     optimize_loop_nest_for_speed_p (loop));
  if (!res)
    return res;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (obj: ddr);
  return opt_result::success ();
}
252 | |
253 | /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */ |
254 | |
255 | static void |
256 | vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value) |
257 | { |
258 | const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); |
259 | for (unsigned int i = 0; i < checks.length(); ++i) |
260 | if (checks[i] == value) |
261 | return; |
262 | |
263 | if (dump_enabled_p ()) |
264 | dump_printf_loc (MSG_NOTE, vect_location, |
265 | "need run-time check that %T is nonzero\n" , |
266 | value); |
267 | LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (obj: value); |
268 | } |
269 | |
/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */

static bool
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
{
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads are
     emitted at the position of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.  */
  /* IL_A: effective insertion point for A's group — the last store or
     the first load of the group.  */
  stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (il_a)
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  il_a = get_later_stmt (stmt1_info: il_a, stmt2_info: s);
      else /* DR_IS_READ */
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  if (get_later_stmt (stmt1_info: il_a, stmt2_info: s) == il_a)
	    il_a = s;
    }
  else
    il_a = stmtinfo_a;
  /* Likewise IL_B for B's group.  */
  stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (il_b)
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  il_b = get_later_stmt (stmt1_info: il_b, stmt2_info: s);
      else /* DR_IS_READ */
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  if (get_later_stmt (stmt1_info: il_b, stmt2_info: s) == il_b)
	    il_b = s;
    }
  else
    il_b = stmtinfo_b;
  /* Order is preserved iff the vectorized insertion points compare the
     same way as the original scalar statements.  */
  bool a_after_b = (get_later_stmt (stmt1_info: stmtinfo_a, stmt2_info: stmtinfo_b) == stmtinfo_a);
  return (get_later_stmt (stmt1_info: il_a, stmt2_info: il_b) == il_a) == a_after_b;
}
323 | |
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
				       loop_vec_info loop_vinfo,
				       int loop_depth, unsigned int *max_vf)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
    {
      int dist = dist_v[loop_depth];
      /* Distance 0 and reversed positive distances need no handling here.  */
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
	{
	  /* If the user asserted safelen >= DIST consecutive iterations
	     can be executed concurrently, assume independence.

	     ??? An alternative would be to add the alias check even
	     in this case, and vectorize the fallback loop with the
	     maximum VF set to safelen.  However, if the user has
	     explicitly given a length, it's less likely that that
	     would be a win.  */
	  if (loop->safelen >= 2 && abs_hwi (x: dist) <= loop->safelen)
	    {
	      if ((unsigned int) loop->safelen < *max_vf)
		*max_vf = loop->safelen;
	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	      continue;
	    }

	  /* For dependence distances of 2 or more, we have the option
	     of limiting VF or checking for an alias at runtime.
	     Prefer to check at runtime if we can, to avoid limiting
	     the VF unnecessarily when the bases are in fact independent.

	     Note that the alias checks will be removed if the VF ends up
	     being small enough.  */
	  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
	  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
	  return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
		  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
	}
    }
  return true;
}
378 | |
379 | |
/* Function vect_analyze_data_ref_dependence.

   Analyze the dependence between memory references DRA and DRB described
   by DDR.  Return opt_result::failure if there (might) exist a dependence
   that prevents vectorization; return opt_result::success otherwise,
   possibly after queueing a run-time alias check.  Adjust *MAX_VF
   according to the data dependence.  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
				  loop_vec_info loop_vinfo,
				  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* If user asserted safelen consecutive iterations can be
     executed concurrently, assume independence.  */
  auto apply_safelen = [&]()
    {
      if (loop->safelen >= 2)
	{
	  if ((unsigned int) loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return true;
	}
      return false;
    };

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  /* Self-dependences and read-read pairs are harmless.  */
  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
				 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  /* Gather/scatter accesses cannot be covered by runtime alias checks;
     only safelen can save us here.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
    {
      if (apply_safelen ())
	return opt_result::success ();

      return opt_result::failure_at
	(loc: stmtinfo_a->stmt,
	 fmt: "possible alias involving gather/scatter between %T and %T\n",
	 DR_REF (dra), DR_REF (drb));
    }

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (apply_safelen ())
	return opt_result::success ();

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
			 "versioning for alias required: "
			 "can't determine dependence between %T and %T\n",
			 DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      if (apply_safelen ())
	return opt_result::success ();

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
			 "versioning for alias required: "
			 "bad dist vector for %T and %T\n",
			 DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (var: loop->num, DDR_LOOP_NEST (ddr));

  /* Try the cheaper "possibly independent" analysis first.  */
  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
						loop_depth, max_vf))
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "dependence distance = %d.\n", dist);

      if (dist == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance == 0 between %T and %T\n",
			     DR_REF (dra), DR_REF (drb));

	  /* When we perform grouped accesses and perform implicit CSE
	     by detecting equal accesses and doing disambiguation with
	     runtime alias tests like for
		.. = a[i];
		.. = a[i+1];
		a[i] = ..;
		a[i+1] = ..;
		*p = ..;
		.. = a[i];
		.. = a[i+1];
	     where we will end up loading { a[i], a[i+1] } once, make
	     sure that inserting group loads before the first load and
	     stores after the last store will do the right thing.
	     Similar for groups like
		a[i] = ...;
		... = a[i];
		a[i+1] = ...;
	     where loads from the group interleave with the store.  */
	  if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
	    return opt_result::failure_at (loc: stmtinfo_a->stmt,
					   fmt: "READ_WRITE dependence"
					   " in interleaving.\n");

	  /* A zero-step access aliasing itself every iteration needs a
	     runtime nonzero-step check unless safelen covers it.  */
	  if (loop->safelen < 2)
	    {
	      tree indicator = dr_zero_step_indicator (dra);
	      if (!indicator || integer_zerop (indicator))
		return opt_result::failure_at (loc: stmtinfo_a->stmt,
					       fmt: "access also has a zero step\n");
	      else if (TREE_CODE (indicator) != INTEGER_CST)
		vect_check_nonzero_value (loop_vinfo, value: indicator);
	    }
	  continue;
	}

      if (dist > 0 && DDR_REVERSED_P (ddr))
	{
	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
	     reversed (to make distance vector positive), and the actual
	     distance is negative.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance negative.\n");
	  /* When doing outer loop vectorization, we need to check if there is
	     a backward dependence at the inner loop level if the dependence
	     at the outer loop is reversed.  See PR81740.  */
	  if (nested_in_vect_loop_p (loop, stmt_info: stmtinfo_a)
	      || nested_in_vect_loop_p (loop, stmt_info: stmtinfo_b))
	    {
	      unsigned inner_depth = index_in_loop_nest (var: loop->inner->num,
							 DDR_LOOP_NEST (ddr));
	      if (dist_v[inner_depth] < 0)
		return opt_result::failure_at (loc: stmtinfo_a->stmt,
					       fmt: "not vectorized, dependence "
					       "between data-refs %T and %T\n",
					       DR_REF (dra), DR_REF (drb));
	    }
	  /* Record a negative dependence distance to later limit the
	     amount of stmt copying / unrolling we can perform.
	     Only need to handle read-after-write dependence.  */
	  if (DR_IS_READ (drb)
	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
	  continue;
	}

      unsigned int abs_dist = abs (x: dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
	{
	  /* The dependence distance requires reduction of the maximal
	     vectorization factor.  */
	  *max_vf = abs_dist;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "adjusting maximal vectorization factor to %i\n",
			     *max_vf);
	}

      if (abs_dist >= *max_vf)
	{
	  /* Dependence distance does not create dependence, as far as
	     vectorization is concerned, in this case.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance >= VF.\n");
	  continue;
	}

      return opt_result::failure_at (loc: stmtinfo_a->stmt,
				     fmt: "not vectorized, possible dependence "
				     "between data-refs %T and %T\n",
				     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}
615 | |
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

opt_result
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
				   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");

  /* Lazily compute all dependence relations on first use.  */
  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
    {
      LOOP_VINFO_DDRS (loop_vinfo)
	.create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
		 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
      /* We do not need read-read dependences.  */
      bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
					  &LOOP_VINFO_DDRS (loop_vinfo),
					  LOOP_VINFO_LOOP_NEST (loop_vinfo),
					  false);
      gcc_assert (res);
    }

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    /* Analyze each relation; bail out on the first hard dependence.  */
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      {
	opt_result res
	  = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
	if (!res)
	  return res;
      }

  return opt_result::success ();
}
662 | |
663 | |
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  Return FALSE when the two
   references are known independent (or harmless, e.g. read-read or part
   of the same interleaving chain).  */

static bool
vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
				      struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
      && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
	  == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't determine dependence between %T and %T\n",
			 DR_REF (dra), DR_REF (drb));
    }
  else if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "determined dependence between %T and %T\n",
		     DR_REF (dra), DR_REF (drb));

  /* Either an unknown or a known (non-trivial) dependence — in both
     cases we must assume it prohibits the transform.  */
  return true;
}
716 | |
717 | |
/* Analyze dependences involved in the transform of a store SLP NODE.
   Return true if all scalar stores of the node can be sunk to the
   position of the last store in the group.  */

static bool
vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
{
  /* This walks over all stmts involved in the SLP store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
  gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));

  for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
    {
      stmt_vec_info access_info
	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
      if (access_info == last_access_info)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      ao_ref ref;
      bool ref_initialized_p = false;
      /* Walk forward from this store to the sink position, checking every
	 intervening stmt that touches memory.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
	   gsi_stmt (i: gsi) != last_access_info->stmt; gsi_next (i: &gsi))
	{
	  gimple *stmt = gsi_stmt (i: gsi);
	  if (! gimple_vuse (g: stmt))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to resort to the alias oracle.  */
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
	  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
	  if (!dr_b)
	    {
	      /* We are moving a store - this means
		 we cannot use TBAA for disambiguation.  */
	      if (!ref_initialized_p)
		ao_ref_init (&ref, DR_REF (dr_a));
	      if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
		  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
		return false;
	      continue;
	    }

	  gcc_assert (!gimple_visited_p (stmt));

	  /* Full dependence test between the moved store and the
	     intervening reference.  */
	  ddr_p ddr = initialize_data_dependence_relation (dr_a,
							   dr_b, vNULL);
	  bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
	  free_dependence_relation (ddr);
	  if (dependent)
	    return false;
	}
    }
  return true;
}
773 | |
/* Analyze dependences involved in the transform of a load SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  LAST_STORE_INFO is the position those stores
   will have been sunk to.  Return true if all loads of the node can be
   hoisted to the position of the first load in the group.  */

static bool
vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
				   vec<stmt_vec_info> stores,
				   stmt_vec_info last_store_info)
{
  /* This walks over all stmts involved in the SLP load done
     in NODE verifying we can hoist them up to the first stmt in the
     group.  */
  stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
  gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));

  for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
    {
      stmt_vec_info access_info
	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
      if (access_info == first_access_info)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      ao_ref ref;
      bool ref_initialized_p = false;
      hash_set<stmt_vec_info> grp_visited;
      /* Walk backward from this load to the hoist position, checking every
	 intervening stmt that writes memory.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
	   gsi_stmt (i: gsi) != first_access_info->stmt; gsi_prev (i: &gsi))
	{
	  gimple *stmt = gsi_stmt (i: gsi);
	  if (! gimple_vdef (g: stmt))
	    continue;

	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);

	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verified that we can do that already).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt_info != last_store_info)
		continue;

	      for (stmt_vec_info &store_info : stores)
		{
		  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  bool dependent
		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    return false;
		}
	      continue;
	    }

	  /* Check whether hoisting the load of DR_A past the store in
	     STMT_INFO is safe; returns true if there is no dependence.  */
	  auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
	    {
	      /* We are hoisting a load - this means we can use TBAA for
		 disambiguation.  */
	      if (!ref_initialized_p)
		ao_ref_init (&ref, DR_REF (dr_a));
	      if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
		{
		  /* If we couldn't record a (single) data reference for this
		     stmt we have to give up now.  */
		  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
		  if (!dr_b)
		    return false;
		  ddr_p ddr = initialize_data_dependence_relation (dr_a,
								   dr_b, vNULL);
		  bool dependent
		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    return false;
		}
	      /* No dependence.  */
	      return true;
	    };
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    {
	      /* When we run into a store group we have to honor
		 that earlier stores might be moved here.  We don't
		 know exactly which and where to since we lack a
		 back-mapping from DR to SLP node, so assume all
		 earlier stores are sunk here.  It's enough to
		 consider the last stmt of a group for this.
		 ??? Both this and the fact that we disregard that
		 the conflicting instance might be removed later
		 is overly conservative.  */
	      if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
		for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
		     store_info != NULL;
		     store_info = DR_GROUP_NEXT_ELEMENT (store_info))
		  if ((store_info == stmt_info
		       || get_later_stmt (stmt1_info: store_info, stmt2_info: stmt_info) == stmt_info)
		      && !check_hoist (store_info))
		    return false;
	    }
	  else
	    {
	      if (!check_hoist (stmt_info))
		return false;
	    }
	}
    }
  return true;
}
884 | |
885 | |
/* Function vect_slp_analyze_instance_dependence.

   Analyze the data dependences of SLP instance INSTANCE: verify that the
   stores of the instance can be sunk to the vectorized statement insert
   location and that its loads do not conflict with stores that intervene
   before that location.  Return true if the instance remains vectorizable.  */
891 | |
bool
vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence" );

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = NULL;
  if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
    store = SLP_INSTANCE_TREE (instance);

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  stmt_vec_info last_store_info = NULL;
  if (store)
    {
      if (! vect_slp_analyze_store_dependences (vinfo, node: store))
	return false;

      /* Mark stores in this instance and remember the last one.
	 The visited flag lets the load dependence analysis below
	 recognize stores belonging to this very instance and treat
	 them specially.  */
      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, visited_p: true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
    if (! vect_slp_analyze_load_dependences (vinfo, node: load,
					     stores: store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store_info))
      {
	res = false;
	break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, visited_p: false);

  return res;
}
936 | |
937 | /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET |
938 | applied. */ |
939 | |
int
dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
{
  HOST_WIDE_INT diff = 0;
  /* Alignment is only analyzed for the first element of a DR group,
     use that but adjust misalignment by the offset of the access.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
    {
      dr_vec_info *first_dr
	= STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
      /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
	 INTEGER_CSTs and the first element in the group has the lowest
	 address.  */
      diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
	      - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
      gcc_assert (diff >= 0);
      dr_info = first_dr;
    }

  /* The result is either a byte misalignment relative to the target
     alignment or the special DR_MISALIGNMENT_UNKNOWN value.  */
  int misalign = dr_info->misalignment;
  gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
  if (misalign == DR_MISALIGNMENT_UNKNOWN)
    return misalign;

  /* If the access is only aligned for a vector type with smaller alignment
     requirement the access has unknown misalignment.  */
  if (maybe_lt (a: dr_info->target_alignment * BITS_PER_UNIT,
		b: targetm.vectorize.preferred_vector_alignment (vectype)))
    return DR_MISALIGNMENT_UNKNOWN;

  /* Apply the offset from the DR group start and the externally supplied
     offset which can for example result from a negative stride access.  */
  poly_int64 misalignment = misalign + diff + offset;

  /* vect_compute_data_ref_alignment will have ensured that target_alignment
     is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
  unsigned HOST_WIDE_INT target_alignment_c
    = dr_info->target_alignment.to_constant ();
  if (!known_misalignment (value: misalignment, align: target_alignment_c, misalign: &misalign))
    return DR_MISALIGNMENT_UNKNOWN;
  return misalign;
}
982 | |
983 | /* Record the base alignment guarantee given by DRB, which occurs |
984 | in STMT_INFO. */ |
985 | |
static void
vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
			    innermost_loop_behavior *drb)
{
  bool existed;
  /* The pool is keyed on the base address; keep whichever entry gives
     the strongest base alignment guarantee for that address.  */
  std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
    = vinfo->base_alignments.get_or_insert (k: drb->base_address, existed: &existed);
  if (!existed || entry.second->base_alignment < drb->base_alignment)
    {
      entry = std::make_pair (x&: stmt_info, y&: drb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "recording new base alignment for %T\n"
			 "  alignment:    %d\n"
			 "  misalignment: %d\n"
			 "  based on:     %G" ,
			 drb->base_address,
			 drb->base_alignment,
			 drb->base_misalignment,
			 stmt_info->stmt);
    }
}
1008 | |
1009 | /* If the region we're going to vectorize is reached, all unconditional |
1010 | data references occur at least once. We can therefore pool the base |
1011 | alignment guarantees from each unconditional reference. Do this by |
1012 | going through all the data references in VINFO and checking whether |
1013 | the containing statement makes the reference unconditionally. If so, |
1014 | record the alignment of the base address in VINFO so that it can be |
1015 | used for all other references with the same base. */ |
1016 | |
void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  for (data_reference *dr : vinfo->shared->datarefs)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      /* Only unconditional, vectorizable, non-gather/scatter references
	 provide a guarantee that holds whenever the region executes.  */
      if (!DR_IS_CONDITIONAL_IN_STMT (dr)
	  && STMT_VINFO_VECTORIZABLE (stmt_info)
	  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
	{
	  vect_record_base_alignment (vinfo, stmt_info, drb: &DR_INNERMOST (dr));

	  /* If DR is nested in the loop that is being vectorized, we can also
	     record the alignment of the base wrt the outer loop.  */
	  if (loop && nested_in_vect_loop_p (loop, stmt_info))
	    vect_record_base_alignment
	      (vinfo, stmt_info, drb: &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
	}
    }
}
1040 | |
1041 | /* Function vect_compute_data_ref_alignment |
1042 | |
1043 | Compute the misalignment of the data reference DR_INFO when vectorizing |
1044 | with VECTYPE. |
1045 | |
1046 | Output: |
1047 | 1. initialized misalignment info for DR_INFO |
1048 | |
1049 | FOR NOW: No analysis is actually performed. Misalignment is calculated |
1050 | only for trivial cases. TODO. */ |
1051 | |
static void
vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
				 tree vectype)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  vec_base_alignments *base_alignments = &vinfo->base_alignments;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  class loop *loop = NULL;
  tree ref = DR_REF (dr_info->dr);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_compute_data_ref_alignment:\n" );

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);

  /* Gather/scatter accesses address individual scalars, so vector-level
     alignment analysis does not apply to them.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
  bool step_preserves_misalignment_p;

  /* The target alignment we aim for, in bytes.  */
  poly_uint64 vector_alignment
    = exact_div (a: targetm.vectorize.preferred_vector_alignment (vectype),
		 BITS_PER_UNIT);
  SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);

  /* If the main loop has peeled for alignment we have no way of knowing
     whether the data accesses in the epilogues are aligned.  We can't at
     compile time answer the question whether we have entered the main loop or
     not.  Fixes PR 92351.  */
  if (loop_vinfo)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      if (orig_loop_vinfo
	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
	return;
    }

  unsigned HOST_WIDE_INT vect_align_c;
  if (!vector_alignment.is_constant (const_value: &vect_align_c))
    return;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt_info))
    {
      step_preserves_misalignment_p
	= (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;

      if (dump_enabled_p ())
	{
	  if (step_preserves_misalignment_p)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector alignment.\n" );
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector"
			     " alignment.\n" );
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
	= multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, b: vect_align_c);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step doesn't divide the vector alignment.\n" );
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  For loop vectorization
     the pooled guarantee always applies; for BB vectorization only trust
     it when the recording stmt dominates this one (same-BB ordering is
     decided via the dr_aux.group numbers).  */
  std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
    = base_alignments->get (k: drb->base_address);
  if (entry
      && base_alignment < (*entry).second->base_alignment
      && (loop_vinfo
	  || (dominated_by_p (CDI_DOMINATORS, gimple_bb (g: stmt_info->stmt),
			      gimple_bb (g: entry->first->stmt))
	      && (gimple_bb (g: stmt_info->stmt) != gimple_bb (g: entry->first->stmt)
		  || (entry->first->dr_aux.group <= dr_info->group)))))
    {
      base_alignment = entry->second->base_alignment;
      base_misalignment = entry->second->base_misalignment;
    }

  if (drb->offset_alignment < vect_align_c
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
	 negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown alignment for access: %T\n" , ref);
      return;
    }

  if (base_alignment < vect_align_c)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vect_align_c
	  || !vect_can_force_dr_alignment_p (base,
					     vect_align_c * BITS_PER_UNIT))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "can't force alignment of ref: %T\n" , ref);
	  return;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "force alignment of %T\n" , ref);

      dr_info->base_decl = base;
      dr_info->base_misaligned = true;
      base_misalignment = 0;
    }
  /* Combine the base misalignment with the constant start offset of the
     access to get the misalignment of the first accessed element.  */
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (t: drb->init).force_shwi ();

  unsigned int const_misalignment;
  if (!known_misalignment (value: misalignment, align: vect_align_c, misalign: &const_misalignment))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Non-constant misalignment for access: %T\n" , ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "misalign = %d bytes of ref %T\n" ,
		     const_misalignment, ref);

  return;
}
1221 | |
1222 | /* Return whether DR_INFO, which is related to DR_PEEL_INFO in |
1223 | that it only differs in DR_INIT, is aligned if DR_PEEL_INFO |
1224 | is made aligned via peeling. */ |
1225 | |
1226 | static bool |
1227 | vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info, |
1228 | dr_vec_info *dr_peel_info) |
1229 | { |
1230 | if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info), |
1231 | DR_TARGET_ALIGNMENT (dr_info))) |
1232 | { |
1233 | poly_offset_int diff |
1234 | = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr)) |
1235 | - wi::to_poly_offset (DR_INIT (dr_info->dr))); |
1236 | if (known_eq (diff, 0) |
1237 | || multiple_p (a: diff, DR_TARGET_ALIGNMENT (dr_info))) |
1238 | return true; |
1239 | } |
1240 | return false; |
1241 | } |
1242 | |
1243 | /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made |
1244 | aligned via peeling. */ |
1245 | |
1246 | static bool |
1247 | vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info, |
1248 | dr_vec_info *dr_peel_info) |
1249 | { |
1250 | if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr), |
1251 | DR_BASE_ADDRESS (dr_peel_info->dr), flags: 0) |
1252 | || !operand_equal_p (DR_OFFSET (dr_info->dr), |
1253 | DR_OFFSET (dr_peel_info->dr), flags: 0) |
1254 | || !operand_equal_p (DR_STEP (dr_info->dr), |
1255 | DR_STEP (dr_peel_info->dr), flags: 0)) |
1256 | return false; |
1257 | |
1258 | return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info); |
1259 | } |
1260 | |
1261 | /* Compute the value for dr_info->misalign so that the access appears |
1262 | aligned. This is used by peeling to compensate for dr_misalignment |
1263 | applying the offset for negative step. */ |
1264 | |
int
vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
{
  /* With a non-negative step no offset is applied by dr_misalignment,
     so "aligned" simply means misalignment zero.  */
  if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
    return 0;

  /* With a negative step dr_misalignment is queried with an offset of
     -(nunits - 1) * element_size; compute the matching positive byte
     amount so that offset plus this misalignment cancels out modulo
     the target alignment.  */
  tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
  poly_int64 misalignment
    = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1)
       * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));

  /* The target alignment has to be constant to reduce the poly value
     to a plain int.  */
  unsigned HOST_WIDE_INT target_alignment_c;
  int misalign;
  if (!dr_info->target_alignment.is_constant (const_value: &target_alignment_c)
      || !known_misalignment (value: misalignment, align: target_alignment_c, misalign: &misalign))
    return DR_MISALIGNMENT_UNKNOWN;
  return misalign;
}
1283 | |
1284 | /* Function vect_update_misalignment_for_peel. |
1285 | Sets DR_INFO's misalignment |
1286 | - to 0 if it has the same alignment as DR_PEEL_INFO, |
   - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1288 | - to -1 (unknown) otherwise. |
1289 | |
1290 | DR_INFO - the data reference whose misalignment is to be adjusted. |
1291 | DR_PEEL_INFO - the data reference whose misalignment is being made |
1292 | zero in the vector loop by the peel. |
1293 | NPEEL - the number of iterations in the peel loop if the misalignment |
1294 | of DR_PEEL_INFO is known at compile time. */ |
1295 | |
static void
vect_update_misalignment_for_peel (dr_vec_info *dr_info,
				   dr_vec_info *dr_peel_info, int npeel)
{
  /* If dr_info is aligned if dr_peel_info is, then mark it so.  */
  if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
    {
      SET_DR_MISALIGNMENT (dr_info,
			   vect_dr_misalign_for_aligned_access (dr_peel_info));
      return;
    }

  /* With both misalignments known and a constant target alignment we can
     update DR_INFO's misalignment by the NPEEL scalar iterations that
     are peeled off.  */
  unsigned HOST_WIDE_INT alignment;
  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (const_value: &alignment)
      && known_alignment_for_access_p (dr_info,
				       STMT_VINFO_VECTYPE (dr_info->stmt))
      && known_alignment_for_access_p (dr_info: dr_peel_info,
				       STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
    {
      int misal = dr_info->misalignment;
      /* Each peeled iteration advances the access by DR_STEP bytes;
	 reduce modulo the (power-of-two) target alignment.  */
      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
      misal &= alignment - 1;
      set_dr_misalignment (dr_info, val: misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
		     "to unknown (-1).\n" );
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
}
1327 | |
1328 | /* Return true if alignment is relevant for DR_INFO. */ |
1329 | |
1330 | static bool |
1331 | vect_relevant_for_alignment_p (dr_vec_info *dr_info) |
1332 | { |
1333 | stmt_vec_info stmt_info = dr_info->stmt; |
1334 | |
1335 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
1336 | return false; |
1337 | |
1338 | /* For interleaving, only the alignment of the first access matters. */ |
1339 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info) |
1340 | && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) |
1341 | return false; |
1342 | |
1343 | /* Scatter-gather and invariant accesses continue to address individual |
1344 | scalars, so vector-level alignment is irrelevant. */ |
1345 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) |
1346 | || integer_zerop (DR_STEP (dr_info->dr))) |
1347 | return false; |
1348 | |
1349 | /* Strided accesses perform only component accesses, alignment is |
1350 | irrelevant for them. */ |
1351 | if (STMT_VINFO_STRIDED_P (stmt_info) |
1352 | && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
1353 | return false; |
1354 | |
1355 | return true; |
1356 | } |
1357 | |
/* Given a memory reference EXP, return whether its alignment is less
   than its size.  */
1360 | |
1361 | static bool |
1362 | not_size_aligned (tree exp) |
1363 | { |
1364 | if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp)))) |
1365 | return true; |
1366 | |
1367 | return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp))) |
1368 | > get_object_alignment (exp)); |
1369 | } |
1370 | |
1371 | /* Function vector_alignment_reachable_p |
1372 | |
1373 | Return true if vector alignment for DR_INFO is reachable by peeling |
1374 | a few loop iterations. Return false otherwise. */ |
1375 | |
static bool
vector_alignment_reachable_p (dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr_info, vectype))
	return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (node: vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;

      /* The prolog would run nelements - mis_in_elements scalar
	 iterations; peeling can't help unless that is a whole number
	 of interleave groups.  */
      if (!multiple_p (a: nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr_info, vectype)
      && !aligned_access_p (dr_info, vectype))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "data size = %wd. misalignment = %d.\n" , elmsize,
			   dr_misalignment (dr_info, vectype));
	}
      /* Peeling advances the access by whole elements, so it can only
	 zero a misalignment that is a multiple of the element size.  */
      if (dr_misalignment (dr_info, vectype) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "data size does not divide the misalignment.\n" );
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr_info, vectype))
    {
      /* Unknown misalignment: let the target decide, passing along
	 whether the reference looks packed (alignment < size).  */
      tree type = TREE_TYPE (DR_REF (dr_info->dr));
      bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown misalignment, %snaturally aligned\n" ,
			 is_packed ? "not " : "" );
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
1437 | |
1438 | |
1439 | /* Calculate the cost of the memory access represented by DR_INFO. */ |
1440 | |
static void
vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
			   dr_alignment_support alignment_support_scheme,
			   int misalignment,
			   unsigned int *inside_cost,
			   unsigned int *outside_cost,
			   stmt_vector_for_cost *body_cost_vec,
			   stmt_vector_for_cost *prologue_cost_vec)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  int ncopies;

  /* A pure SLP statement is costed once; otherwise cost one copy per
     vector iteration as determined by the vectorization factor.  */
  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  /* Dispatch to the load or store cost routine; both accumulate into
     *INSIDE_COST / *OUTSIDE_COST and the provided cost vectors.  */
  if (DR_IS_READ (dr_info->dr))
    vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
			misalignment, true, inside_cost,
			outside_cost, prologue_cost_vec, body_cost_vec, false);
  else
    vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
			 misalignment, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_get_data_access_cost: inside_cost = %d, "
		     "outside_cost = %d.\n" , *inside_cost, *outside_cost);
}
1472 | |
1473 | |
typedef struct _vect_peel_info
{
  /* The data reference that this peeling would make aligned.  */
  dr_vec_info *dr_info;
  /* Number of scalar iterations to peel; also the hash-table key.  */
  int npeel;
  /* How often this peel amount was recorded; biased by VECT_MAX_COST
     for DRs unsupportable when misaligned under the unlimited cost
     model (see vect_peeling_hash_insert).  */
  unsigned int count;
} *vect_peel_info;
1480 | |
typedef struct _vect_peel_extended_info
{
  /* The vec_info the traversal operates on.  */
  vec_info *vinfo;
  /* The currently best peeling candidate.  */
  struct _vect_peel_info peel_info;
  /* Costs associated with that candidate.  */
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;
1488 | |
1489 | |
1490 | /* Peeling hashtable helpers. */ |
1491 | |
struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  /* Hash an entry; entries are keyed on their NPEEL value.  */
  static inline hashval_t hash (const _vect_peel_info *);
  /* Compare two entries for equal NPEEL.  */
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};
1497 | |
1498 | inline hashval_t |
1499 | peel_info_hasher::hash (const _vect_peel_info *peel_info) |
1500 | { |
1501 | return (hashval_t) peel_info->npeel; |
1502 | } |
1503 | |
1504 | inline bool |
1505 | peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) |
1506 | { |
1507 | return (a->npeel == b->npeel); |
1508 | } |
1509 | |
1510 | |
1511 | /* Insert DR_INFO into peeling hash table with NPEEL as key. */ |
1512 | |
static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
			  loop_vec_info loop_vinfo, dr_vec_info *dr_info,
			  int npeel, bool supportable_if_not_aligned)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;

  /* Look up an existing entry for this peel amount; if found just bump
     its popularity count, otherwise create a fresh entry recording
     DR_INFO as the reference this peeling would align.  */
  elem.npeel = npeel;
  slot = peeling_htab->find (value: &elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr_info = dr_info;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (value: slot, insert: INSERT);
      *new_slot = slot;
    }

  /* If this DR is not supported with unknown misalignment then bias
     this slot when the cost model is disabled.  */
  if (!supportable_if_not_aligned
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
1541 | |
1542 | |
1543 | /* Traverse peeling hash table to find peeling option that aligns maximum |
1544 | number of data accesses. */ |
1545 | |
1546 | int |
1547 | vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, |
1548 | _vect_peel_extended_info *max) |
1549 | { |
1550 | vect_peel_info elem = *slot; |
1551 | |
1552 | if (elem->count > max->peel_info.count |
1553 | || (elem->count == max->peel_info.count |
1554 | && max->peel_info.npeel > elem->npeel)) |
1555 | { |
1556 | max->peel_info.npeel = elem->npeel; |
1557 | max->peel_info.count = elem->count; |
1558 | max->peel_info.dr_info = elem->dr_info; |
1559 | } |
1560 | |
1561 | return 1; |
1562 | } |
1563 | |
1564 | /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking |
1565 | data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true, |
1566 | npeel is computed at runtime but DR0_INFO's misalignment will be zero |
1567 | after peeling. */ |
1568 | |
static void
vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
				dr_vec_info *dr0_info,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				stmt_vector_for_cost *prologue_cost_vec,
				unsigned int npeel)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);

  bool dr0_alignment_known_p
    = (dr0_info
       && known_alignment_for_access_p (dr_info: dr0_info,
					STMT_VINFO_VECTYPE (dr0_info->stmt)));

  for (data_reference *dr : datarefs)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!vect_relevant_for_alignment_p (dr_info))
	continue;

      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
      dr_alignment_support alignment_support_scheme;
      int misalignment;
      unsigned HOST_WIDE_INT alignment;

      /* For a negative step query dr_misalignment with the offset of
	 the last vector element, -(nunits - 1) * element_size.  */
      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
					    size_zero_node) < 0;
      poly_int64 off = 0;
      if (negative)
	off = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1)
	       * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));

      /* Determine the misalignment after peeling NPEEL iterations:
	 unchanged when not peeling, zero for DR0 and anything mutually
	 aligned with it, otherwise either updated by NPEEL steps or
	 unknown when the needed alignment data is missing.  */
      if (npeel == 0)
	misalignment = dr_misalignment (dr_info, vectype, offset: off);
      else if (dr_info == dr0_info
	       || vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info: dr0_info))
	misalignment = 0;
      else if (!dr0_alignment_known_p
	       || !known_alignment_for_access_p (dr_info, vectype)
	       || !DR_TARGET_ALIGNMENT (dr_info).is_constant (const_value: &alignment))
	misalignment = DR_MISALIGNMENT_UNKNOWN;
      else
	{
	  misalignment = dr_misalignment (dr_info, vectype, offset: off);
	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
	  misalignment &= alignment - 1;
	}
      alignment_support_scheme
	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
					 misalignment);

      vect_get_data_access_cost (vinfo: loop_vinfo, dr_info,
				 alignment_support_scheme, misalignment,
				 inside_cost, outside_cost,
				 body_cost_vec, prologue_cost_vec);
    }
}
1628 | |
1629 | /* Traverse peeling hash table and calculate cost for each peeling option. |
1630 | Find the one with the lowest cost. */ |
1631 | |
int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: min->vinfo);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (nelems: 2);
  body_cost_vec.create (nelems: 2);
  epilogue_cost_vec.create (nelems: 2);

  /* Sum the data access costs over all relevant DRs assuming this
     entry's peel amount.  */
  vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info: elem->dr_info, inside_cost: &inside_cost,
				  outside_cost: &outside_cost, body_cost_vec: &body_cost_vec,
				  prologue_cost_vec: &prologue_cost_vec, npeel: elem->npeel);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  /* Record this candidate if it beats the current minimum on inside
     cost, breaking ties on outside cost.  */
  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr_info = elem->dr_info;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  /* Non-zero keeps the hash-table traversal going.  */
  return 1;
}
1678 | |
1679 | |
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
				       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr_info = NULL;
  res.vinfo = loop_vinfo;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      /* Cost model enabled: seed the running minimum with INT_MAX so
	 the first visited option always wins, then pick the cheapest.  */
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
			      vect_peeling_hash_get_lowest_cost> (argument: &res);
    }
  else
    {
      /* Unlimited cost model: pick the option that aligns the most
	 accesses; the costs are irrelevant and reported as zero.  */
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
			      vect_peeling_hash_get_most_frequent> (argument: &res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}
1711 | |
/* Return true if the new peeling NPEEL is supported, i.e. if after peeling
   NPEEL scalar iterations to align DR0_INFO every other relevant data
   reference in LOOP_VINFO still has a supportable (possibly unknown)
   alignment.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
			  unsigned npeel)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;

  /* If DR0's own misalignment is not known at compile time, peeling
     makes the other DRs' misalignments unknown as well.  */
  bool dr0_alignment_known_p
    = known_alignment_for_access_p (dr_info: dr0_info,
				    STMT_VINFO_VECTYPE (dr0_info->stmt));

  /* Ensure that all data refs can be vectorized after the peel.  */
  for (data_reference *dr : datarefs)
    {
      if (dr == dr0_info->dr)
	continue;

      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      /* Skip DRs that need no alignment handling or that become aligned
	 as a side effect of peeling for DR0.  */
      if (!vect_relevant_for_alignment_p (dr_info)
	  || vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info: dr0_info))
	continue;

      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
      int misalignment;
      unsigned HOST_WIDE_INT alignment;
      if (!dr0_alignment_known_p
	  || !known_alignment_for_access_p (dr_info, vectype)
	  || !DR_TARGET_ALIGNMENT (dr_info).is_constant (const_value: &alignment))
	misalignment = DR_MISALIGNMENT_UNKNOWN;
      else
	{
	  /* Advance the known misalignment by NPEEL scalar steps,
	     modulo the (constant) target alignment.  */
	  misalignment = dr_misalignment (dr_info, vectype);
	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
	  misalignment &= alignment - 1;
	}
      /* One DR with unsupportable post-peel alignment vetoes the peel.  */
      supportable_dr_alignment
	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
					 misalignment);
      if (supportable_dr_alignment == dr_unaligned_unsupported)
	return false;
    }

  return true;
}
1758 | |
1759 | /* Compare two data-references DRA and DRB to group them into chunks |
1760 | with related alignment. */ |
1761 | |
1762 | static int |
1763 | dr_align_group_sort_cmp (const void *dra_, const void *drb_) |
1764 | { |
1765 | data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); |
1766 | data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); |
1767 | int cmp; |
1768 | |
1769 | /* Stabilize sort. */ |
1770 | if (dra == drb) |
1771 | return 0; |
1772 | |
1773 | /* Ordering of DRs according to base. */ |
1774 | cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), |
1775 | DR_BASE_ADDRESS (drb)); |
1776 | if (cmp != 0) |
1777 | return cmp; |
1778 | |
1779 | /* And according to DR_OFFSET. */ |
1780 | cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); |
1781 | if (cmp != 0) |
1782 | return cmp; |
1783 | |
1784 | /* And after step. */ |
1785 | cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); |
1786 | if (cmp != 0) |
1787 | return cmp; |
1788 | |
1789 | /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ |
1790 | cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); |
1791 | if (cmp == 0) |
1792 | return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; |
1793 | return cmp; |
1794 | } |
1795 | |
1796 | /* Function vect_enhance_data_refs_alignment |
1797 | |
1798 | This pass will use loop versioning and loop peeling in order to enhance |
1799 | the alignment of data references in the loop. |
1800 | |
1801 | FOR NOW: we assume that whatever versioning/peeling takes place, only the |
1802 | original loop is to be vectorized. Any other loops that are created by |
1803 | the transformations performed in this pass - are not supposed to be |
1804 | vectorized. This restriction will be relaxed. |
1805 | |
1806 | This pass will require a cost model to guide it whether to apply peeling |
1807 | or versioning or a combination of the two. For example, the scheme that |
1808 | intel uses when given a loop with several memory accesses, is as follows: |
1809 | choose one memory access ('p') which alignment you want to force by doing |
1810 | peeling. Then, either (1) generate a loop in which 'p' is aligned and all |
1811 | other accesses are not necessarily aligned, or (2) use loop versioning to |
1812 | generate one loop in which all accesses are aligned, and another loop in |
1813 | which only 'p' is necessarily aligned. |
1814 | |
1815 | ("Automatic Intra-Register Vectorization for the Intel Architecture", |
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1817 | Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) |
1818 | |
1819 | Devising a cost model is the most critical aspect of this work. It will |
1820 | guide us on which access to peel for, whether to use loop versioning, how |
1821 | many versions to create, etc. The cost model will probably consist of |
1822 | generic considerations as well as target specific considerations (on |
1823 | powerpc for example, misaligned stores are more painful than misaligned |
1824 | loads). |
1825 | |
1826 | Here are the general steps involved in alignment enhancements: |
1827 | |
1828 | -- original loop, before alignment analysis: |
1829 | for (i=0; i<N; i++){ |
1830 | x = q[i]; # DR_MISALIGNMENT(q) = unknown |
1831 | p[i] = y; # DR_MISALIGNMENT(p) = unknown |
1832 | } |
1833 | |
1834 | -- After vect_compute_data_refs_alignment: |
1835 | for (i=0; i<N; i++){ |
1836 | x = q[i]; # DR_MISALIGNMENT(q) = 3 |
1837 | p[i] = y; # DR_MISALIGNMENT(p) = unknown |
1838 | } |
1839 | |
1840 | -- Possibility 1: we do loop versioning: |
1841 | if (p is aligned) { |
1842 | for (i=0; i<N; i++){ # loop 1A |
1843 | x = q[i]; # DR_MISALIGNMENT(q) = 3 |
1844 | p[i] = y; # DR_MISALIGNMENT(p) = 0 |
1845 | } |
1846 | } |
1847 | else { |
1848 | for (i=0; i<N; i++){ # loop 1B |
1849 | x = q[i]; # DR_MISALIGNMENT(q) = 3 |
1850 | p[i] = y; # DR_MISALIGNMENT(p) = unaligned |
1851 | } |
1852 | } |
1853 | |
1854 | -- Possibility 2: we do loop peeling: |
1855 | for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). |
1856 | x = q[i]; |
1857 | p[i] = y; |
1858 | } |
1859 | for (i = 3; i < N; i++){ # loop 2A |
1860 | x = q[i]; # DR_MISALIGNMENT(q) = 0 |
1861 | p[i] = y; # DR_MISALIGNMENT(p) = unknown |
1862 | } |
1863 | |
1864 | -- Possibility 3: combination of loop peeling and versioning: |
1865 | for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). |
1866 | x = q[i]; |
1867 | p[i] = y; |
1868 | } |
1869 | if (p is aligned) { |
1870 | for (i = 3; i<N; i++){ # loop 3A |
1871 | x = q[i]; # DR_MISALIGNMENT(q) = 0 |
1872 | p[i] = y; # DR_MISALIGNMENT(p) = 0 |
1873 | } |
1874 | } |
1875 | else { |
1876 | for (i = 3; i<N; i++){ # loop 3B |
1877 | x = q[i]; # DR_MISALIGNMENT(q) = 0 |
1878 | p[i] = y; # DR_MISALIGNMENT(p) = unaligned |
1879 | } |
1880 | } |
1881 | |
1882 | These loops are later passed to loop_transform to be vectorized. The |
1883 | vectorizer will use the alignment information to guide the transformation |
1884 | (whether to generate regular loads/stores, or with special handling for |
1885 | misalignment). */ |
1886 | |
1887 | opt_result |
1888 | vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) |
1889 | { |
1890 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1891 | dr_vec_info *first_store = NULL; |
1892 | dr_vec_info *dr0_info = NULL; |
1893 | struct data_reference *dr; |
1894 | unsigned int i; |
1895 | bool do_peeling = false; |
1896 | bool do_versioning = false; |
1897 | unsigned int npeel = 0; |
1898 | bool one_misalignment_known = false; |
1899 | bool one_misalignment_unknown = false; |
1900 | bool one_dr_unsupportable = false; |
1901 | dr_vec_info *unsupportable_dr_info = NULL; |
1902 | unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0; |
1903 | hash_table<peel_info_hasher> peeling_htab (1); |
1904 | |
1905 | DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment" ); |
1906 | |
1907 | /* Reset data so we can safely be called multiple times. */ |
1908 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (size: 0); |
1909 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; |
1910 | |
1911 | if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ()) |
1912 | return opt_result::success (); |
1913 | |
1914 | /* Sort the vector of datarefs so DRs that have the same or dependent |
1915 | alignment are next to each other. */ |
1916 | auto_vec<data_reference_p> datarefs |
1917 | = LOOP_VINFO_DATAREFS (loop_vinfo).copy (); |
1918 | datarefs.qsort (dr_align_group_sort_cmp); |
1919 | |
1920 | /* Compute the number of DRs that become aligned when we peel |
1921 | a dataref so it becomes aligned. */ |
1922 | auto_vec<unsigned> n_same_align_refs (datarefs.length ()); |
1923 | n_same_align_refs.quick_grow_cleared (len: datarefs.length ()); |
1924 | unsigned i0; |
1925 | for (i0 = 0; i0 < datarefs.length (); ++i0) |
1926 | if (DR_BASE_ADDRESS (datarefs[i0])) |
1927 | break; |
1928 | for (i = i0 + 1; i <= datarefs.length (); ++i) |
1929 | { |
1930 | if (i == datarefs.length () |
1931 | || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]), |
1932 | DR_BASE_ADDRESS (datarefs[i]), flags: 0) |
1933 | || !operand_equal_p (DR_OFFSET (datarefs[i0]), |
1934 | DR_OFFSET (datarefs[i]), flags: 0) |
1935 | || !operand_equal_p (DR_STEP (datarefs[i0]), |
1936 | DR_STEP (datarefs[i]), flags: 0)) |
1937 | { |
1938 | /* The subgroup [i0, i-1] now only differs in DR_INIT and |
1939 | possibly DR_TARGET_ALIGNMENT. Still the whole subgroup |
1940 | will get known misalignment if we align one of the refs |
1941 | with the largest DR_TARGET_ALIGNMENT. */ |
1942 | for (unsigned j = i0; j < i; ++j) |
1943 | { |
1944 | dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]); |
1945 | for (unsigned k = i0; k < i; ++k) |
1946 | { |
1947 | if (k == j) |
1948 | continue; |
1949 | dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]); |
1950 | if (vect_dr_aligned_if_related_peeled_dr_is (dr_info: dr_infok, |
1951 | dr_peel_info: dr_infoj)) |
1952 | n_same_align_refs[j]++; |
1953 | } |
1954 | } |
1955 | i0 = i; |
1956 | } |
1957 | } |
1958 | |
1959 | /* While cost model enhancements are expected in the future, the high level |
1960 | view of the code at this time is as follows: |
1961 | |
1962 | A) If there is a misaligned access then see if peeling to align |
1963 | this access can make all data references satisfy |
1964 | vect_supportable_dr_alignment. If so, update data structures |
1965 | as needed and return true. |
1966 | |
1967 | B) If peeling wasn't possible and there is a data reference with an |
1968 | unknown misalignment that does not satisfy vect_supportable_dr_alignment |
1969 | then see if loop versioning checks can be used to make all data |
1970 | references satisfy vect_supportable_dr_alignment. If so, update |
1971 | data structures as needed and return true. |
1972 | |
1973 | C) If neither peeling nor versioning were successful then return false if |
1974 | any data reference does not satisfy vect_supportable_dr_alignment. |
1975 | |
1976 | D) Return true (all data references satisfy vect_supportable_dr_alignment). |
1977 | |
1978 | Note, Possibility 3 above (which is peeling and versioning together) is not |
1979 | being done at this time. */ |
1980 | |
1981 | /* (1) Peeling to force alignment. */ |
1982 | |
1983 | /* (1.1) Decide whether to perform peeling, and how many iterations to peel: |
1984 | Considerations: |
1985 | + How many accesses will become aligned due to the peeling |
1986 | - How many accesses will become unaligned due to the peeling, |
1987 | and the cost of misaligned accesses. |
1988 | - The cost of peeling (the extra runtime checks, the increase |
1989 | in code size). */ |
1990 | |
1991 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
1992 | { |
1993 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
1994 | if (!vect_relevant_for_alignment_p (dr_info)) |
1995 | continue; |
1996 | |
1997 | stmt_vec_info stmt_info = dr_info->stmt; |
1998 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
1999 | do_peeling = vector_alignment_reachable_p (dr_info); |
2000 | if (do_peeling) |
2001 | { |
2002 | if (known_alignment_for_access_p (dr_info, vectype)) |
2003 | { |
2004 | unsigned int npeel_tmp = 0; |
2005 | bool negative = tree_int_cst_compare (DR_STEP (dr), |
2006 | size_zero_node) < 0; |
2007 | |
2008 | /* If known_alignment_for_access_p then we have set |
2009 | DR_MISALIGNMENT which is only done if we know it at compiler |
2010 | time, so it is safe to assume target alignment is constant. |
2011 | */ |
2012 | unsigned int target_align = |
2013 | DR_TARGET_ALIGNMENT (dr_info).to_constant (); |
2014 | unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info); |
2015 | poly_int64 off = 0; |
2016 | if (negative) |
2017 | off = (TYPE_VECTOR_SUBPARTS (node: vectype) - 1) * -dr_size; |
2018 | unsigned int mis = dr_misalignment (dr_info, vectype, offset: off); |
2019 | mis = negative ? mis : -mis; |
2020 | if (mis != 0) |
2021 | npeel_tmp = (mis & (target_align - 1)) / dr_size; |
2022 | |
2023 | /* For multiple types, it is possible that the bigger type access |
2024 | will have more than one peeling option. E.g., a loop with two |
2025 | types: one of size (vector size / 4), and the other one of |
2026 | size (vector size / 8). Vectorization factor will 8. If both |
2027 | accesses are misaligned by 3, the first one needs one scalar |
2028 | iteration to be aligned, and the second one needs 5. But the |
2029 | first one will be aligned also by peeling 5 scalar |
2030 | iterations, and in that case both accesses will be aligned. |
2031 | Hence, except for the immediate peeling amount, we also want |
2032 | to try to add full vector size, while we don't exceed |
2033 | vectorization factor. |
2034 | We do this automatically for cost model, since we calculate |
2035 | cost for every peeling option. */ |
2036 | poly_uint64 nscalars = npeel_tmp; |
2037 | if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
2038 | { |
2039 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
2040 | nscalars = (STMT_SLP_TYPE (stmt_info) |
2041 | ? vf * DR_GROUP_SIZE (stmt_info) : vf); |
2042 | } |
2043 | |
2044 | /* Save info about DR in the hash table. Also include peeling |
2045 | amounts according to the explanation above. Indicate |
2046 | the alignment status when the ref is not aligned. |
2047 | ??? Rather than using unknown alignment here we should |
2048 | prune all entries from the peeling hashtable which cause |
2049 | DRs to be not supported. */ |
2050 | bool supportable_if_not_aligned |
2051 | = vect_supportable_dr_alignment |
2052 | (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN); |
2053 | while (known_le (npeel_tmp, nscalars)) |
2054 | { |
2055 | vect_peeling_hash_insert (peeling_htab: &peeling_htab, loop_vinfo, |
2056 | dr_info, npeel: npeel_tmp, |
2057 | supportable_if_not_aligned); |
2058 | npeel_tmp += MAX (1, target_align / dr_size); |
2059 | } |
2060 | |
2061 | one_misalignment_known = true; |
2062 | } |
2063 | else |
2064 | { |
2065 | /* If we don't know any misalignment values, we prefer |
2066 | peeling for data-ref that has the maximum number of data-refs |
2067 | with the same alignment, unless the target prefers to align |
2068 | stores over load. */ |
2069 | unsigned same_align_drs = n_same_align_refs[i]; |
2070 | if (!dr0_info |
2071 | || dr0_same_align_drs < same_align_drs) |
2072 | { |
2073 | dr0_same_align_drs = same_align_drs; |
2074 | dr0_info = dr_info; |
2075 | } |
2076 | /* For data-refs with the same number of related |
2077 | accesses prefer the one where the misalign |
2078 | computation will be invariant in the outermost loop. */ |
2079 | else if (dr0_same_align_drs == same_align_drs) |
2080 | { |
2081 | class loop *ivloop0, *ivloop; |
2082 | ivloop0 = outermost_invariant_loop_for_expr |
2083 | (loop, DR_BASE_ADDRESS (dr0_info->dr)); |
2084 | ivloop = outermost_invariant_loop_for_expr |
2085 | (loop, DR_BASE_ADDRESS (dr)); |
2086 | if ((ivloop && !ivloop0) |
2087 | || (ivloop && ivloop0 |
2088 | && flow_loop_nested_p (ivloop, ivloop0))) |
2089 | dr0_info = dr_info; |
2090 | } |
2091 | |
2092 | one_misalignment_unknown = true; |
2093 | |
2094 | /* Check for data refs with unsupportable alignment that |
2095 | can be peeled. */ |
2096 | enum dr_alignment_support supportable_dr_alignment |
2097 | = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype, |
2098 | DR_MISALIGNMENT_UNKNOWN); |
2099 | if (supportable_dr_alignment == dr_unaligned_unsupported) |
2100 | { |
2101 | one_dr_unsupportable = true; |
2102 | unsupportable_dr_info = dr_info; |
2103 | } |
2104 | |
2105 | if (!first_store && DR_IS_WRITE (dr)) |
2106 | { |
2107 | first_store = dr_info; |
2108 | first_store_same_align_drs = same_align_drs; |
2109 | } |
2110 | } |
2111 | } |
2112 | else |
2113 | { |
2114 | if (!aligned_access_p (dr_info, vectype)) |
2115 | { |
2116 | if (dump_enabled_p ()) |
2117 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2118 | "vector alignment may not be reachable\n" ); |
2119 | break; |
2120 | } |
2121 | } |
2122 | } |
2123 | |
2124 | /* Check if we can possibly peel the loop. */ |
2125 | if (!vect_can_advance_ivs_p (loop_vinfo) |
2126 | || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), |
2127 | LOOP_VINFO_IV_EXIT (loop_vinfo)) |
2128 | || loop->inner) |
2129 | do_peeling = false; |
2130 | |
2131 | struct _vect_peel_extended_info peel_for_known_alignment; |
2132 | struct _vect_peel_extended_info peel_for_unknown_alignment; |
2133 | struct _vect_peel_extended_info best_peel; |
2134 | |
2135 | peel_for_unknown_alignment.inside_cost = INT_MAX; |
2136 | peel_for_unknown_alignment.outside_cost = INT_MAX; |
2137 | peel_for_unknown_alignment.peel_info.count = 0; |
2138 | |
2139 | if (do_peeling |
2140 | && one_misalignment_unknown) |
2141 | { |
2142 | /* Check if the target requires to prefer stores over loads, i.e., if |
2143 | misaligned stores are more expensive than misaligned loads (taking |
2144 | drs with same alignment into account). */ |
2145 | unsigned int load_inside_cost = 0; |
2146 | unsigned int load_outside_cost = 0; |
2147 | unsigned int store_inside_cost = 0; |
2148 | unsigned int store_outside_cost = 0; |
2149 | unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; |
2150 | |
2151 | stmt_vector_for_cost dummy; |
2152 | dummy.create (nelems: 2); |
2153 | vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info, |
2154 | inside_cost: &load_inside_cost, |
2155 | outside_cost: &load_outside_cost, |
2156 | body_cost_vec: &dummy, prologue_cost_vec: &dummy, npeel: estimated_npeels); |
2157 | dummy.release (); |
2158 | |
2159 | if (first_store) |
2160 | { |
2161 | dummy.create (nelems: 2); |
2162 | vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info: first_store, |
2163 | inside_cost: &store_inside_cost, |
2164 | outside_cost: &store_outside_cost, |
2165 | body_cost_vec: &dummy, prologue_cost_vec: &dummy, |
2166 | npeel: estimated_npeels); |
2167 | dummy.release (); |
2168 | } |
2169 | else |
2170 | { |
2171 | store_inside_cost = INT_MAX; |
2172 | store_outside_cost = INT_MAX; |
2173 | } |
2174 | |
2175 | if (load_inside_cost > store_inside_cost |
2176 | || (load_inside_cost == store_inside_cost |
2177 | && load_outside_cost > store_outside_cost)) |
2178 | { |
2179 | dr0_info = first_store; |
2180 | dr0_same_align_drs = first_store_same_align_drs; |
2181 | peel_for_unknown_alignment.inside_cost = store_inside_cost; |
2182 | peel_for_unknown_alignment.outside_cost = store_outside_cost; |
2183 | } |
2184 | else |
2185 | { |
2186 | peel_for_unknown_alignment.inside_cost = load_inside_cost; |
2187 | peel_for_unknown_alignment.outside_cost = load_outside_cost; |
2188 | } |
2189 | |
2190 | stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; |
2191 | prologue_cost_vec.create (nelems: 2); |
2192 | epilogue_cost_vec.create (nelems: 2); |
2193 | |
2194 | int dummy2; |
2195 | peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost |
2196 | (loop_vinfo, estimated_npeels, &dummy2, |
2197 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), |
2198 | &prologue_cost_vec, &epilogue_cost_vec); |
2199 | |
2200 | prologue_cost_vec.release (); |
2201 | epilogue_cost_vec.release (); |
2202 | |
2203 | peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1; |
2204 | } |
2205 | |
2206 | peel_for_unknown_alignment.peel_info.npeel = 0; |
2207 | peel_for_unknown_alignment.peel_info.dr_info = dr0_info; |
2208 | |
2209 | best_peel = peel_for_unknown_alignment; |
2210 | |
2211 | peel_for_known_alignment.inside_cost = INT_MAX; |
2212 | peel_for_known_alignment.outside_cost = INT_MAX; |
2213 | peel_for_known_alignment.peel_info.count = 0; |
2214 | peel_for_known_alignment.peel_info.dr_info = NULL; |
2215 | |
2216 | if (do_peeling && one_misalignment_known) |
2217 | { |
2218 | /* Peeling is possible, but there is no data access that is not supported |
2219 | unless aligned. So we try to choose the best possible peeling from |
2220 | the hash table. */ |
2221 | peel_for_known_alignment = vect_peeling_hash_choose_best_peeling |
2222 | (peeling_htab: &peeling_htab, loop_vinfo); |
2223 | } |
2224 | |
2225 | /* Compare costs of peeling for known and unknown alignment. */ |
2226 | if (peel_for_known_alignment.peel_info.dr_info != NULL |
2227 | && peel_for_unknown_alignment.inside_cost |
2228 | >= peel_for_known_alignment.inside_cost) |
2229 | { |
2230 | best_peel = peel_for_known_alignment; |
2231 | |
2232 | /* If the best peeling for known alignment has NPEEL == 0, perform no |
2233 | peeling at all except if there is an unsupportable dr that we can |
2234 | align. */ |
2235 | if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) |
2236 | do_peeling = false; |
2237 | } |
2238 | |
2239 | /* If there is an unsupportable data ref, prefer this over all choices so far |
2240 | since we'd have to discard a chosen peeling except when it accidentally |
2241 | aligned the unsupportable data ref. */ |
2242 | if (one_dr_unsupportable) |
2243 | dr0_info = unsupportable_dr_info; |
2244 | else if (do_peeling) |
2245 | { |
2246 | /* Calculate the penalty for no peeling, i.e. leaving everything as-is. |
2247 | TODO: Use nopeel_outside_cost or get rid of it? */ |
2248 | unsigned nopeel_inside_cost = 0; |
2249 | unsigned nopeel_outside_cost = 0; |
2250 | |
2251 | stmt_vector_for_cost dummy; |
2252 | dummy.create (nelems: 2); |
2253 | vect_get_peeling_costs_all_drs (loop_vinfo, NULL, inside_cost: &nopeel_inside_cost, |
2254 | outside_cost: &nopeel_outside_cost, body_cost_vec: &dummy, prologue_cost_vec: &dummy, npeel: 0); |
2255 | dummy.release (); |
2256 | |
2257 | /* Add epilogue costs. As we do not peel for alignment here, no prologue |
2258 | costs will be recorded. */ |
2259 | stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; |
2260 | prologue_cost_vec.create (nelems: 2); |
2261 | epilogue_cost_vec.create (nelems: 2); |
2262 | |
2263 | int dummy2; |
2264 | nopeel_outside_cost += vect_get_known_peeling_cost |
2265 | (loop_vinfo, 0, &dummy2, |
2266 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), |
2267 | &prologue_cost_vec, &epilogue_cost_vec); |
2268 | |
2269 | prologue_cost_vec.release (); |
2270 | epilogue_cost_vec.release (); |
2271 | |
2272 | npeel = best_peel.peel_info.npeel; |
2273 | dr0_info = best_peel.peel_info.dr_info; |
2274 | |
2275 | /* If no peeling is not more expensive than the best peeling we |
2276 | have so far, don't perform any peeling. */ |
2277 | if (nopeel_inside_cost <= best_peel.inside_cost) |
2278 | do_peeling = false; |
2279 | } |
2280 | |
2281 | if (do_peeling) |
2282 | { |
2283 | stmt_vec_info stmt_info = dr0_info->stmt; |
2284 | if (known_alignment_for_access_p (dr_info: dr0_info, |
2285 | STMT_VINFO_VECTYPE (stmt_info))) |
2286 | { |
2287 | bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr), |
2288 | size_zero_node) < 0; |
2289 | if (!npeel) |
2290 | { |
2291 | /* Since it's known at compile time, compute the number of |
2292 | iterations in the peeled loop (the peeling factor) for use in |
2293 | updating DR_MISALIGNMENT values. The peeling factor is the |
2294 | vectorization factor minus the misalignment as an element |
2295 | count. */ |
2296 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
2297 | poly_int64 off = 0; |
2298 | if (negative) |
2299 | off = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1) |
2300 | * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))); |
2301 | unsigned int mis |
2302 | = dr_misalignment (dr_info: dr0_info, vectype, offset: off); |
2303 | mis = negative ? mis : -mis; |
2304 | /* If known_alignment_for_access_p then we have set |
2305 | DR_MISALIGNMENT which is only done if we know it at compiler |
2306 | time, so it is safe to assume target alignment is constant. |
2307 | */ |
2308 | unsigned int target_align = |
2309 | DR_TARGET_ALIGNMENT (dr0_info).to_constant (); |
2310 | npeel = ((mis & (target_align - 1)) |
2311 | / vect_get_scalar_dr_size (dr_info: dr0_info)); |
2312 | } |
2313 | |
2314 | /* For interleaved data access every iteration accesses all the |
2315 | members of the group, therefore we divide the number of iterations |
2316 | by the group size. */ |
2317 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
2318 | npeel /= DR_GROUP_SIZE (stmt_info); |
2319 | |
2320 | if (dump_enabled_p ()) |
2321 | dump_printf_loc (MSG_NOTE, vect_location, |
2322 | "Try peeling by %d\n" , npeel); |
2323 | } |
2324 | |
2325 | /* Ensure that all datarefs can be vectorized after the peel. */ |
2326 | if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel)) |
2327 | do_peeling = false; |
2328 | |
2329 | /* Check if all datarefs are supportable and log. */ |
2330 | if (do_peeling |
2331 | && npeel == 0 |
2332 | && known_alignment_for_access_p (dr_info: dr0_info, |
2333 | STMT_VINFO_VECTYPE (stmt_info))) |
2334 | return opt_result::success (); |
2335 | |
2336 | /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ |
2337 | if (do_peeling) |
2338 | { |
2339 | unsigned max_allowed_peel |
2340 | = param_vect_max_peeling_for_alignment; |
2341 | if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP) |
2342 | max_allowed_peel = 0; |
2343 | if (max_allowed_peel != (unsigned)-1) |
2344 | { |
2345 | unsigned max_peel = npeel; |
2346 | if (max_peel == 0) |
2347 | { |
2348 | poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info); |
2349 | unsigned HOST_WIDE_INT target_align_c; |
2350 | if (target_align.is_constant (const_value: &target_align_c)) |
2351 | max_peel = |
2352 | target_align_c / vect_get_scalar_dr_size (dr_info: dr0_info) - 1; |
2353 | else |
2354 | { |
2355 | do_peeling = false; |
2356 | if (dump_enabled_p ()) |
2357 | dump_printf_loc (MSG_NOTE, vect_location, |
2358 | "Disable peeling, max peels set and vector" |
2359 | " alignment unknown\n" ); |
2360 | } |
2361 | } |
2362 | if (max_peel > max_allowed_peel) |
2363 | { |
2364 | do_peeling = false; |
2365 | if (dump_enabled_p ()) |
2366 | dump_printf_loc (MSG_NOTE, vect_location, |
2367 | "Disable peeling, max peels reached: %d\n" , max_peel); |
2368 | } |
2369 | } |
2370 | } |
2371 | |
2372 | /* Cost model #2 - if peeling may result in a remaining loop not |
2373 | iterating enough to be vectorized then do not peel. Since this |
2374 | is a cost heuristic rather than a correctness decision, use the |
2375 | most likely runtime value for variable vectorization factors. */ |
2376 | if (do_peeling |
2377 | && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) |
2378 | { |
2379 | unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
2380 | unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel; |
2381 | if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) |
2382 | < assumed_vf + max_peel) |
2383 | do_peeling = false; |
2384 | } |
2385 | |
2386 | if (do_peeling) |
2387 | { |
2388 | /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. |
2389 | If the misalignment of DR_i is identical to that of dr0 then set |
2390 | DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and |
2391 | dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) |
2392 | by the peeling factor times the element size of DR_i (MOD the |
2393 | vectorization factor times the size). Otherwise, the |
2394 | misalignment of DR_i must be set to unknown. */ |
2395 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
2396 | if (dr != dr0_info->dr) |
2397 | { |
2398 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
2399 | if (!vect_relevant_for_alignment_p (dr_info)) |
2400 | continue; |
2401 | |
2402 | vect_update_misalignment_for_peel (dr_info, dr_peel_info: dr0_info, npeel); |
2403 | } |
2404 | |
2405 | LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info; |
2406 | if (npeel) |
2407 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; |
2408 | else |
2409 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1; |
2410 | SET_DR_MISALIGNMENT (dr0_info, |
2411 | vect_dr_misalign_for_aligned_access (dr0_info)); |
2412 | if (dump_enabled_p ()) |
2413 | { |
2414 | dump_printf_loc (MSG_NOTE, vect_location, |
2415 | "Alignment of access forced using peeling.\n" ); |
2416 | dump_printf_loc (MSG_NOTE, vect_location, |
2417 | "Peeling for alignment will be applied.\n" ); |
2418 | } |
2419 | |
2420 | /* The inside-loop cost will be accounted for in vectorizable_load |
2421 | and vectorizable_store correctly with adjusted alignments. |
2422 | Drop the body_cst_vec on the floor here. */ |
2423 | return opt_result::success (); |
2424 | } |
2425 | } |
2426 | |
2427 | /* (2) Versioning to force alignment. */ |
2428 | |
2429 | /* Try versioning if: |
2430 | 1) optimize loop for speed and the cost-model is not cheap |
2431 | 2) there is at least one unsupported misaligned data ref with an unknown |
2432 | misalignment, and |
2433 | 3) all misaligned data refs with a known misalignment are supported, and |
2434 | 4) the number of runtime alignment checks is within reason. */ |
2435 | |
2436 | do_versioning |
2437 | = (optimize_loop_nest_for_speed_p (loop) |
2438 | && !loop->inner /* FORNOW */ |
2439 | && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP); |
2440 | |
2441 | if (do_versioning) |
2442 | { |
2443 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
2444 | { |
2445 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
2446 | if (!vect_relevant_for_alignment_p (dr_info)) |
2447 | continue; |
2448 | |
2449 | stmt_vec_info stmt_info = dr_info->stmt; |
2450 | if (STMT_VINFO_STRIDED_P (stmt_info)) |
2451 | { |
2452 | do_versioning = false; |
2453 | break; |
2454 | } |
2455 | |
2456 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
2457 | bool negative = tree_int_cst_compare (DR_STEP (dr), |
2458 | size_zero_node) < 0; |
2459 | poly_int64 off = 0; |
2460 | if (negative) |
2461 | off = ((TYPE_VECTOR_SUBPARTS (node: vectype) - 1) |
2462 | * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))); |
2463 | int misalignment; |
2464 | if ((misalignment = dr_misalignment (dr_info, vectype, offset: off)) == 0) |
2465 | continue; |
2466 | |
2467 | enum dr_alignment_support supportable_dr_alignment |
2468 | = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype, |
2469 | misalignment); |
2470 | if (supportable_dr_alignment == dr_unaligned_unsupported) |
2471 | { |
2472 | if (misalignment != DR_MISALIGNMENT_UNKNOWN |
2473 | || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () |
2474 | >= (unsigned) param_vect_max_version_for_alignment_checks)) |
2475 | { |
2476 | do_versioning = false; |
2477 | break; |
2478 | } |
2479 | |
2480 | /* At present we don't support versioning for alignment |
2481 | with variable VF, since there's no guarantee that the |
2482 | VF is a power of two. We could relax this if we added |
2483 | a way of enforcing a power-of-two size. */ |
2484 | unsigned HOST_WIDE_INT size; |
2485 | if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (const_value: &size)) |
2486 | { |
2487 | do_versioning = false; |
2488 | break; |
2489 | } |
2490 | |
2491 | /* Forcing alignment in the first iteration is no good if |
2492 | we don't keep it across iterations. For now, just disable |
2493 | versioning in this case. |
2494 | ?? We could actually unroll the loop to achieve the required |
2495 | overall step alignment, and forcing the alignment could be |
2496 | done by doing some iterations of the non-vectorized loop. */ |
2497 | if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) |
2498 | * DR_STEP_ALIGNMENT (dr), |
2499 | DR_TARGET_ALIGNMENT (dr_info))) |
2500 | { |
2501 | do_versioning = false; |
2502 | break; |
2503 | } |
2504 | |
2505 | /* The rightmost bits of an aligned address must be zeros. |
2506 | Construct the mask needed for this test. For example, |
2507 | GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the |
2508 | mask must be 15 = 0xf. */ |
2509 | int mask = size - 1; |
2510 | |
2511 | /* FORNOW: use the same mask to test all potentially unaligned |
2512 | references in the loop. */ |
2513 | if (LOOP_VINFO_PTR_MASK (loop_vinfo) |
2514 | && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask) |
2515 | { |
2516 | do_versioning = false; |
2517 | break; |
2518 | } |
2519 | |
2520 | LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; |
2521 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (obj: stmt_info); |
2522 | } |
2523 | } |
2524 | |
2525 | /* Versioning requires at least one misaligned data reference. */ |
2526 | if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) |
2527 | do_versioning = false; |
2528 | else if (!do_versioning) |
2529 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (size: 0); |
2530 | } |
2531 | |
2532 | if (do_versioning) |
2533 | { |
2534 | const vec<stmt_vec_info> &may_misalign_stmts |
2535 | = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); |
2536 | stmt_vec_info stmt_info; |
2537 | |
2538 | /* It can now be assumed that the data references in the statements |
2539 | in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version |
2540 | of the loop being vectorized. */ |
2541 | FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info) |
2542 | { |
2543 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
2544 | SET_DR_MISALIGNMENT (dr_info, |
2545 | vect_dr_misalign_for_aligned_access (dr_info)); |
2546 | if (dump_enabled_p ()) |
2547 | dump_printf_loc (MSG_NOTE, vect_location, |
2548 | "Alignment of access forced using versioning.\n" ); |
2549 | } |
2550 | |
2551 | if (dump_enabled_p ()) |
2552 | dump_printf_loc (MSG_NOTE, vect_location, |
2553 | "Versioning for alignment will be applied.\n" ); |
2554 | |
2555 | /* Peeling and versioning can't be done together at this time. */ |
2556 | gcc_assert (! (do_peeling && do_versioning)); |
2557 | |
2558 | return opt_result::success (); |
2559 | } |
2560 | |
2561 | /* This point is reached if neither peeling nor versioning is being done. */ |
2562 | gcc_assert (! (do_peeling || do_versioning)); |
2563 | |
2564 | return opt_result::success (); |
2565 | } |
2566 | |
2567 | |
2568 | /* Function vect_analyze_data_refs_alignment |
2569 | |
2570 | Analyze the alignment of the data-references in the loop. |
2571 | Return FALSE if a data reference is found that cannot be vectorized. */ |
2572 | |
2573 | opt_result |
2574 | vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo) |
2575 | { |
2576 | DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment" ); |
2577 | |
2578 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
2579 | struct data_reference *dr; |
2580 | unsigned int i; |
2581 | |
2582 | vect_record_base_alignments (vinfo: loop_vinfo); |
2583 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
2584 | { |
2585 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
2586 | if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)) |
2587 | { |
2588 | if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt) |
2589 | && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt) |
2590 | continue; |
2591 | vect_compute_data_ref_alignment (vinfo: loop_vinfo, dr_info, |
2592 | STMT_VINFO_VECTYPE (dr_info->stmt)); |
2593 | } |
2594 | } |
2595 | |
2596 | return opt_result::success (); |
2597 | } |
2598 | |
2599 | |
2600 | /* Analyze alignment of DRs of stmts in NODE. */ |
2601 | |
2602 | static bool |
2603 | vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node) |
2604 | { |
2605 | /* Alignment is maintained in the first element of the group. */ |
2606 | stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
2607 | first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info); |
2608 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info); |
2609 | tree vectype = SLP_TREE_VECTYPE (node); |
2610 | poly_uint64 vector_alignment |
2611 | = exact_div (a: targetm.vectorize.preferred_vector_alignment (vectype), |
2612 | BITS_PER_UNIT); |
2613 | if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED) |
2614 | vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node)); |
2615 | /* Re-analyze alignment when we're facing a vectorization with a bigger |
2616 | alignment requirement. */ |
2617 | else if (known_lt (dr_info->target_alignment, vector_alignment)) |
2618 | { |
2619 | poly_uint64 old_target_alignment = dr_info->target_alignment; |
2620 | int old_misalignment = dr_info->misalignment; |
2621 | vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node)); |
2622 | /* But keep knowledge about a smaller alignment. */ |
2623 | if (old_misalignment != DR_MISALIGNMENT_UNKNOWN |
2624 | && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN) |
2625 | { |
2626 | dr_info->target_alignment = old_target_alignment; |
2627 | dr_info->misalignment = old_misalignment; |
2628 | } |
2629 | } |
2630 | /* When we ever face unordered target alignments the first one wins in terms |
2631 | of analyzing and the other will become unknown in dr_misalignment. */ |
2632 | return true; |
2633 | } |
2634 | |
2635 | /* Function vect_slp_analyze_instance_alignment |
2636 | |
2637 | Analyze the alignment of the data-references in the SLP instance. |
2638 | Return FALSE if a data reference is found that cannot be vectorized. */ |
2639 | |
2640 | bool |
2641 | vect_slp_analyze_instance_alignment (vec_info *vinfo, |
2642 | slp_instance instance) |
2643 | { |
2644 | DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment" ); |
2645 | |
2646 | slp_tree node; |
2647 | unsigned i; |
2648 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) |
2649 | if (! vect_slp_analyze_node_alignment (vinfo, node)) |
2650 | return false; |
2651 | |
2652 | if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store |
2653 | && ! vect_slp_analyze_node_alignment |
2654 | (vinfo, SLP_INSTANCE_TREE (instance))) |
2655 | return false; |
2656 | |
2657 | return true; |
2658 | } |
2659 | |
2660 | |
2661 | /* Analyze groups of accesses: check that DR_INFO belongs to a group of |
2662 | accesses of legal size, step, etc. Detect gaps, single element |
2663 | interleaving, and other special cases. Set grouped access info. |
2664 | Collect groups of strided stores for further use in SLP analysis. |
2665 | Worker for vect_analyze_group_access. */ |
2666 | |
static bool
vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
{
  data_reference *dr = dr_info->dr;
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  /* Element size in bytes; group sizes and gaps below are counted in
     multiples of it.  */
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo);
  /* -1 until we know STEP fits a HOST_WIDE_INT.  */
  HOST_WIDE_INT dr_step = -1;
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;

  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  if (tree_fits_shwi_p (step))
    {
      dr_step = tree_to_shwi (step);
      /* Check that STEP is a multiple of type size.  Otherwise there is
	 a non-element-sized gap at the end of the group which we
	 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
	 ???  As we can handle non-constant step fine here we should
	 simply remove uses of DR_GROUP_GAP between the last and first
	 element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
	 simply not include that gap.  */
      if ((dr_step % type_size) != 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Step %T is not a multiple of the element size"
			     " for %T\n" ,
			     step, DR_REF (dr));
	  return false;
	}
      groupsize = absu_hwi (x: dr_step) / type_size;
    }
  else
    /* Non-constant step: group size is unknown for now.  */
    groupsize = 0;

  /* Not consecutive access is possible only if it is a part of interleaving.  */
  if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Check if this DR is a part of interleaving, and is a single
	 element of the group that is accessed in the loop.  */

      /* Gaps are supported only for loads. STEP must be a multiple of the type
	 size.  */
      if (DR_IS_READ (dr)
	  && (dr_step % type_size) == 0
	  && groupsize > 0
	  /* This could be UINT_MAX but as we are generating code in a very
	     inefficient way we have to cap earlier.
	     See PR91403 for example.  */
	  && groupsize <= 4096)
	{
	  /* Single-element interleaving: the stmt is its own group leader
	     and the rest of the step is all gap.  */
	  DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
	  DR_GROUP_SIZE (stmt_info) = groupsize;
	  DR_GROUP_GAP (stmt_info) = groupsize - 1;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Detected single element interleaving %T"
			     " step %T\n" ,
			     DR_REF (dr), step);

	  return true;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not consecutive access %G" , stmt_info->stmt);

      if (bb_vinfo)
	{
	  /* Mark the statement as unvectorizable.  */
	  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
	  return true;
	}

      /* In a loop, fall back to strided accesses.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n" );
      STMT_VINFO_STRIDED_P (stmt_info) = true;
      return true;
    }

  if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
    {
      /* First stmt in the interleaving chain. Check the chain.  */
      stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
      struct data_reference *data_ref = dr;
      unsigned int count = 1;
      tree prev_init = DR_INIT (data_ref);
      /* GAPS accumulates the interior gaps (in elements) seen so far.  */
      HOST_WIDE_INT diff, gaps = 0;

      /* By construction, all group members have INTEGER_CST DR_INITs.  */
      while (next)
	{
	  /* We never have the same DR multiple times.  */
	  gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
				DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);

	  data_ref = STMT_VINFO_DATA_REF (next);

	  /* All group members have the same STEP by construction.  */
	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

	  /* Check that the distance between two accesses is equal to the type
	     size. Otherwise, we have gaps.  */
	  diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
		  - TREE_INT_CST_LOW (prev_init)) / type_size;
	  if (diff < 1 || diff > UINT_MAX)
	    {
	      /* For artificial testcases with array accesses with large
		 constant indices we can run into overflow issues which
		 can end up fooling the groupsize constraint below so
		 check the individual gaps (which are represented as
		 unsigned int) as well.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "interleaved access with gap larger "
				 "than representable\n" );
	      return false;
	    }
	  if (diff != 1)
	    {
	      /* FORNOW: SLP of accesses with gaps is not supported.  */
	      slp_impossible = true;
	      if (DR_IS_WRITE (data_ref))
		{
		  /* Stores with gaps are not supported at all.  */
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "interleaved store with gaps\n" );
		  return false;
		}

	      gaps += diff - 1;
	    }

	  last_accessed_element += diff;

	  /* Store the gap from the previous member of the group. If there is no
	     gap in the access, DR_GROUP_GAP is always 1.  */
	  DR_GROUP_GAP (next) = diff;

	  prev_init = DR_INIT (data_ref);
	  next = DR_GROUP_NEXT_ELEMENT (next);
	  /* Count the number of data-refs in the chain.  */
	  count++;
	}

      /* STEP was not constant: derive the group size from the members
	 seen plus the interior gaps.  */
      if (groupsize == 0)
	groupsize = count + gaps;

      /* This could be UINT_MAX but as we are generating code in a very
	 inefficient way we have to cap earlier.  See PR78699 for example.  */
      if (groupsize > 4096)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "group is too large\n" );
	  return false;
	}

      /* Check that the size of the interleaving is equal to count for stores,
	 i.e., that there are no gaps.  */
      if (groupsize != count
	  && !DR_IS_READ (dr))
	{
	  /* Fall back to handling the store group as strided.  */
	  groupsize = count;
	  STMT_VINFO_STRIDED_P (stmt_info) = true;
	}

      /* If there is a gap after the last load in the group it is the
	 difference between the groupsize and the last accessed
	 element.
	 When there is no gap, this difference should be 0.  */
      DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;

      DR_GROUP_SIZE (stmt_info) = groupsize;
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Detected interleaving " );
	  if (DR_IS_READ (dr))
	    dump_printf (MSG_NOTE, "load " );
	  else if (STMT_VINFO_STRIDED_P (stmt_info))
	    dump_printf (MSG_NOTE, "strided store " );
	  else
	    dump_printf (MSG_NOTE, "store " );
	  dump_printf (MSG_NOTE, "of size %u\n" ,
		       (unsigned)groupsize);
	  dump_printf_loc (MSG_NOTE, vect_location, "\t%G" , stmt_info->stmt);
	  next = DR_GROUP_NEXT_ELEMENT (stmt_info);
	  while (next)
	    {
	      if (DR_GROUP_GAP (next) != 1)
		dump_printf_loc (MSG_NOTE, vect_location,
				 "\t<gap of %d elements>\n" ,
				 DR_GROUP_GAP (next) - 1);
	      dump_printf_loc (MSG_NOTE, vect_location, "\t%G" , next->stmt);
	      next = DR_GROUP_NEXT_ELEMENT (next);
	    }
	  if (DR_GROUP_GAP (stmt_info) != 0)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "\t<gap of %d elements>\n" ,
			     DR_GROUP_GAP (stmt_info));
	}

      /* SLP: create an SLP data structure for every interleaving group of
	 stores for further analysis in vect_analyse_slp.  */
      if (DR_IS_WRITE (dr) && !slp_impossible)
	{
	  if (loop_vinfo)
	    LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (obj: stmt_info);
	  if (bb_vinfo)
	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (obj: stmt_info);
	}
    }

  return true;
}
2888 | |
2889 | /* Analyze groups of accesses: check that DR_INFO belongs to a group of |
2890 | accesses of legal size, step, etc. Detect gaps, single element |
2891 | interleaving, and other special cases. Set grouped access info. |
2892 | Collect groups of strided stores for further use in SLP analysis. */ |
2893 | |
2894 | static bool |
2895 | vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info) |
2896 | { |
2897 | if (!vect_analyze_group_access_1 (vinfo, dr_info)) |
2898 | { |
2899 | /* Dissolve the group if present. */ |
2900 | stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt); |
2901 | while (stmt_info) |
2902 | { |
2903 | stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); |
2904 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
2905 | DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL; |
2906 | stmt_info = next; |
2907 | } |
2908 | return false; |
2909 | } |
2910 | return true; |
2911 | } |
2912 | |
2913 | /* Analyze the access pattern of the data-reference DR_INFO. |
2914 | In case of non-consecutive accesses call vect_analyze_group_access() to |
2915 | analyze groups of accesses. */ |
2916 | |
static bool
vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
{
  data_reference *dr = dr_info->dr;
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  class loop *loop = NULL;

  /* Gather/scatter accesses have no step-based access pattern to
     analyze here.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return true;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* In a loop a DR without a step cannot be analyzed.  */
  if (loop_vinfo && !step)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data-ref access in loop\n" );
      return false;
    }

  /* Allow loads with zero step in inner-loop vectorization.  */
  if (loop_vinfo && integer_zerop (step))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
      if (!nested_in_vect_loop_p (loop, stmt_info))
	return DR_IS_READ (dr);
      /* Allow references with zero step for outer loops marked
	 with pragma omp simd only - it guarantees absence of
	 loop-carried dependencies between inner loop iterations.  */
      if (loop->safelen < 2)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in inner loop of nest\n" );
	  return false;
	}
    }

  if (loop && nested_in_vect_loop_p (loop, stmt_info))
    {
      /* Interleaved accesses are not yet supported within outer-loop
	vectorization for references in the inner-loop.  */
      DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in outer loop.\n" );
	  return DR_IS_READ (dr);
	}
    }

  /* Consecutive?  Step equal to the element size (or, for a negative
     step, its negation) means a simple consecutive access.  */
  if (TREE_CODE (step) == INTEGER_CST)
    {
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      if (!tree_int_cst_compare (t1: step, TYPE_SIZE_UNIT (scalar_type))
	  || (dr_step < 0
	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
	{
	  /* Mark that it is not interleaving.  */
	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
	  return true;
	}
    }

  /* Grouped accesses are not supported in the outer loop of a nest.  */
  if (loop && nested_in_vect_loop_p (loop, stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "grouped access in outer loop.\n" );
      return false;
    }


  /* Assume this is a DR handled by non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return (STMT_VINFO_STRIDED_P (stmt_info)
	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
		|| vect_analyze_group_access (vinfo, dr_info)));

  /* Not consecutive access - check if it's a part of interleaving group.  */
  return vect_analyze_group_access (vinfo, dr_info);
}
3008 | |
3009 | /* Compare two data-references DRA and DRB to group them into chunks |
3010 | suitable for grouping. */ |
3011 | |
3012 | static int |
3013 | dr_group_sort_cmp (const void *dra_, const void *drb_) |
3014 | { |
3015 | dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_); |
3016 | dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_); |
3017 | data_reference_p dra = dra_info->dr; |
3018 | data_reference_p drb = drb_info->dr; |
3019 | int cmp; |
3020 | |
3021 | /* Stabilize sort. */ |
3022 | if (dra == drb) |
3023 | return 0; |
3024 | |
3025 | /* Different group IDs lead never belong to the same group. */ |
3026 | if (dra_info->group != drb_info->group) |
3027 | return dra_info->group < drb_info->group ? -1 : 1; |
3028 | |
3029 | /* Ordering of DRs according to base. */ |
3030 | cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), |
3031 | DR_BASE_ADDRESS (drb)); |
3032 | if (cmp != 0) |
3033 | return cmp; |
3034 | |
3035 | /* And according to DR_OFFSET. */ |
3036 | cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); |
3037 | if (cmp != 0) |
3038 | return cmp; |
3039 | |
3040 | /* Put reads before writes. */ |
3041 | if (DR_IS_READ (dra) != DR_IS_READ (drb)) |
3042 | return DR_IS_READ (dra) ? -1 : 1; |
3043 | |
3044 | /* Then sort after access size. */ |
3045 | cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), |
3046 | TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); |
3047 | if (cmp != 0) |
3048 | return cmp; |
3049 | |
3050 | /* And after step. */ |
3051 | cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); |
3052 | if (cmp != 0) |
3053 | return cmp; |
3054 | |
3055 | /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ |
3056 | cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); |
3057 | if (cmp == 0) |
3058 | return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; |
3059 | return cmp; |
3060 | } |
3061 | |
3062 | /* If OP is the result of a conversion, return the unconverted value, |
3063 | otherwise return null. */ |
3064 | |
3065 | static tree |
3066 | strip_conversion (tree op) |
3067 | { |
3068 | if (TREE_CODE (op) != SSA_NAME) |
3069 | return NULL_TREE; |
3070 | gimple *stmt = SSA_NAME_DEF_STMT (op); |
3071 | if (!is_gimple_assign (gs: stmt) |
3072 | || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) |
3073 | return NULL_TREE; |
3074 | return gimple_assign_rhs1 (gs: stmt); |
3075 | } |
3076 | |
3077 | /* Return true if vectorizable_* routines can handle statements STMT1_INFO |
3078 | and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can |
3079 | be grouped in SLP mode. */ |
3080 | |
3081 | static bool |
3082 | can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info, |
3083 | bool allow_slp_p) |
3084 | { |
3085 | if (gimple_assign_single_p (gs: stmt1_info->stmt)) |
3086 | return gimple_assign_single_p (gs: stmt2_info->stmt); |
3087 | |
3088 | gcall *call1 = dyn_cast <gcall *> (p: stmt1_info->stmt); |
3089 | if (call1 && gimple_call_internal_p (gs: call1)) |
3090 | { |
3091 | /* Check for two masked loads or two masked stores. */ |
3092 | gcall *call2 = dyn_cast <gcall *> (p: stmt2_info->stmt); |
3093 | if (!call2 || !gimple_call_internal_p (gs: call2)) |
3094 | return false; |
3095 | internal_fn ifn = gimple_call_internal_fn (gs: call1); |
3096 | if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE) |
3097 | return false; |
3098 | if (ifn != gimple_call_internal_fn (gs: call2)) |
3099 | return false; |
3100 | |
3101 | /* Check that the masks are the same. Cope with casts of masks, |
3102 | like those created by build_mask_conversion. */ |
3103 | tree mask1 = gimple_call_arg (gs: call1, index: 2); |
3104 | tree mask2 = gimple_call_arg (gs: call2, index: 2); |
3105 | if (!operand_equal_p (mask1, mask2, flags: 0) && !allow_slp_p) |
3106 | { |
3107 | mask1 = strip_conversion (op: mask1); |
3108 | if (!mask1) |
3109 | return false; |
3110 | mask2 = strip_conversion (op: mask2); |
3111 | if (!mask2) |
3112 | return false; |
3113 | if (!operand_equal_p (mask1, mask2, flags: 0)) |
3114 | return false; |
3115 | } |
3116 | return true; |
3117 | } |
3118 | |
3119 | return false; |
3120 | } |
3121 | |
3122 | /* Function vect_analyze_data_ref_accesses. |
3123 | |
3124 | Analyze the access pattern of all the data references in the loop. |
3125 | |
3126 | FORNOW: the only access pattern that is considered vectorizable is a |
3127 | simple step 1 (consecutive) access. |
3128 | |
3129 | FORNOW: handle only arrays and pointer accesses. */ |
3130 | |
3131 | opt_result |
3132 | vect_analyze_data_ref_accesses (vec_info *vinfo, |
3133 | vec<int> *dataref_groups) |
3134 | { |
3135 | unsigned int i; |
3136 | vec<data_reference_p> datarefs = vinfo->shared->datarefs; |
3137 | |
3138 | DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses" ); |
3139 | |
3140 | if (datarefs.is_empty ()) |
3141 | return opt_result::success (); |
3142 | |
3143 | /* Sort the array of datarefs to make building the interleaving chains |
3144 | linear. Don't modify the original vector's order, it is needed for |
3145 | determining what dependencies are reversed. */ |
3146 | vec<dr_vec_info *> datarefs_copy; |
3147 | datarefs_copy.create (nelems: datarefs.length ()); |
3148 | for (unsigned i = 0; i < datarefs.length (); i++) |
3149 | { |
3150 | dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]); |
3151 | /* If the caller computed DR grouping use that, otherwise group by |
3152 | basic blocks. */ |
3153 | if (dataref_groups) |
3154 | dr_info->group = (*dataref_groups)[i]; |
3155 | else |
3156 | dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index; |
3157 | datarefs_copy.quick_push (obj: dr_info); |
3158 | } |
3159 | datarefs_copy.qsort (dr_group_sort_cmp); |
3160 | hash_set<stmt_vec_info> to_fixup; |
3161 | |
3162 | /* Build the interleaving chains. */ |
3163 | for (i = 0; i < datarefs_copy.length () - 1;) |
3164 | { |
3165 | dr_vec_info *dr_info_a = datarefs_copy[i]; |
3166 | data_reference_p dra = dr_info_a->dr; |
3167 | int dra_group_id = dr_info_a->group; |
3168 | stmt_vec_info stmtinfo_a = dr_info_a->stmt; |
3169 | stmt_vec_info lastinfo = NULL; |
3170 | if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) |
3171 | || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)) |
3172 | { |
3173 | ++i; |
3174 | continue; |
3175 | } |
3176 | for (i = i + 1; i < datarefs_copy.length (); ++i) |
3177 | { |
3178 | dr_vec_info *dr_info_b = datarefs_copy[i]; |
3179 | data_reference_p drb = dr_info_b->dr; |
3180 | int drb_group_id = dr_info_b->group; |
3181 | stmt_vec_info stmtinfo_b = dr_info_b->stmt; |
3182 | if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b) |
3183 | || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) |
3184 | break; |
3185 | |
3186 | /* ??? Imperfect sorting (non-compatible types, non-modulo |
3187 | accesses, same accesses) can lead to a group to be artificially |
3188 | split here as we don't just skip over those. If it really |
3189 | matters we can push those to a worklist and re-iterate |
3190 | over them. The we can just skip ahead to the next DR here. */ |
3191 | |
3192 | /* DRs in a different DR group should not be put into the same |
3193 | interleaving group. */ |
3194 | if (dra_group_id != drb_group_id) |
3195 | break; |
3196 | |
3197 | /* Check that the data-refs have same first location (except init) |
3198 | and they are both either store or load (not load and store, |
3199 | not masked loads or stores). */ |
3200 | if (DR_IS_READ (dra) != DR_IS_READ (drb) |
3201 | || data_ref_compare_tree (DR_BASE_ADDRESS (dra), |
3202 | DR_BASE_ADDRESS (drb)) != 0 |
3203 | || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 |
3204 | || !can_group_stmts_p (stmt1_info: stmtinfo_a, stmt2_info: stmtinfo_b, allow_slp_p: true)) |
3205 | break; |
3206 | |
3207 | /* Check that the data-refs have the same constant size. */ |
3208 | tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); |
3209 | tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); |
3210 | if (!tree_fits_uhwi_p (sza) |
3211 | || !tree_fits_uhwi_p (szb) |
3212 | || !tree_int_cst_equal (sza, szb)) |
3213 | break; |
3214 | |
3215 | /* Check that the data-refs have the same step. */ |
3216 | if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0) |
3217 | break; |
3218 | |
3219 | /* Check the types are compatible. |
3220 | ??? We don't distinguish this during sorting. */ |
3221 | if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), |
3222 | TREE_TYPE (DR_REF (drb)))) |
3223 | break; |
3224 | |
3225 | /* Check that the DR_INITs are compile-time constants. */ |
3226 | if (!tree_fits_shwi_p (DR_INIT (dra)) |
3227 | || !tree_fits_shwi_p (DR_INIT (drb))) |
3228 | break; |
3229 | |
3230 | /* Different .GOMP_SIMD_LANE calls still give the same lane, |
3231 | just hold extra information. */ |
3232 | if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a) |
3233 | && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b) |
3234 | && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0) |
3235 | break; |
3236 | |
3237 | /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */ |
3238 | HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); |
3239 | HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); |
3240 | HOST_WIDE_INT init_prev |
3241 | = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr)); |
3242 | gcc_assert (init_a <= init_b |
3243 | && init_a <= init_prev |
3244 | && init_prev <= init_b); |
3245 | |
3246 | /* Do not place the same access in the interleaving chain twice. */ |
3247 | if (init_b == init_prev) |
3248 | { |
3249 | gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr)) |
3250 | < gimple_uid (DR_STMT (drb))); |
3251 | /* Simply link in duplicates and fix up the chain below. */ |
3252 | } |
3253 | else |
3254 | { |
3255 | /* If init_b == init_a + the size of the type * k, we have an |
3256 | interleaving, and DRA is accessed before DRB. */ |
3257 | unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); |
3258 | if (type_size_a == 0 |
3259 | || (((unsigned HOST_WIDE_INT)init_b - init_a) |
3260 | % type_size_a != 0)) |
3261 | break; |
3262 | |
3263 | /* If we have a store, the accesses are adjacent. This splits |
3264 | groups into chunks we support (we don't support vectorization |
3265 | of stores with gaps). */ |
3266 | if (!DR_IS_READ (dra) |
3267 | && (((unsigned HOST_WIDE_INT)init_b - init_prev) |
3268 | != type_size_a)) |
3269 | break; |
3270 | |
3271 | /* If the step (if not zero or non-constant) is smaller than the |
3272 | difference between data-refs' inits this splits groups into |
3273 | suitable sizes. */ |
3274 | if (tree_fits_shwi_p (DR_STEP (dra))) |
3275 | { |
3276 | unsigned HOST_WIDE_INT step |
3277 | = absu_hwi (x: tree_to_shwi (DR_STEP (dra))); |
3278 | if (step != 0 |
3279 | && step <= ((unsigned HOST_WIDE_INT)init_b - init_a)) |
3280 | break; |
3281 | } |
3282 | } |
3283 | |
3284 | if (dump_enabled_p ()) |
3285 | dump_printf_loc (MSG_NOTE, vect_location, |
3286 | DR_IS_READ (dra) |
3287 | ? "Detected interleaving load %T and %T\n" |
3288 | : "Detected interleaving store %T and %T\n" , |
3289 | DR_REF (dra), DR_REF (drb)); |
3290 | |
3291 | /* Link the found element into the group list. */ |
3292 | if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) |
3293 | { |
3294 | DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a; |
3295 | lastinfo = stmtinfo_a; |
3296 | } |
3297 | DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a; |
3298 | DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b; |
3299 | lastinfo = stmtinfo_b; |
3300 | |
3301 | STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a) |
3302 | = !can_group_stmts_p (stmt1_info: stmtinfo_a, stmt2_info: stmtinfo_b, allow_slp_p: false); |
3303 | |
3304 | if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)) |
3305 | dump_printf_loc (MSG_NOTE, vect_location, |
3306 | "Load suitable for SLP vectorization only.\n" ); |
3307 | |
3308 | if (init_b == init_prev |
3309 | && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) |
3310 | && dump_enabled_p ()) |
3311 | dump_printf_loc (MSG_NOTE, vect_location, |
3312 | "Queuing group with duplicate access for fixup\n" ); |
3313 | } |
3314 | } |
3315 | |
3316 | /* Fixup groups with duplicate entries by splitting it. */ |
3317 | while (1) |
3318 | { |
3319 | hash_set<stmt_vec_info>::iterator it = to_fixup.begin (); |
3320 | if (!(it != to_fixup.end ())) |
3321 | break; |
3322 | stmt_vec_info grp = *it; |
3323 | to_fixup.remove (k: grp); |
3324 | |
3325 | /* Find the earliest duplicate group member. */ |
3326 | unsigned first_duplicate = -1u; |
3327 | stmt_vec_info next, g = grp; |
3328 | while ((next = DR_GROUP_NEXT_ELEMENT (g))) |
3329 | { |
3330 | if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr), |
3331 | DR_INIT (STMT_VINFO_DR_INFO (g)->dr)) |
3332 | && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate) |
3333 | first_duplicate = gimple_uid (STMT_VINFO_STMT (next)); |
3334 | g = next; |
3335 | } |
3336 | if (first_duplicate == -1U) |
3337 | continue; |
3338 | |
3339 | /* Then move all stmts after the first duplicate to a new group. |
3340 | Note this is a heuristic but one with the property that *it |
3341 | is fixed up completely. */ |
3342 | g = grp; |
3343 | stmt_vec_info newgroup = NULL, ng = grp; |
3344 | while ((next = DR_GROUP_NEXT_ELEMENT (g))) |
3345 | { |
3346 | if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate) |
3347 | { |
3348 | DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next); |
3349 | if (!newgroup) |
3350 | newgroup = next; |
3351 | else |
3352 | DR_GROUP_NEXT_ELEMENT (ng) = next; |
3353 | ng = next; |
3354 | DR_GROUP_FIRST_ELEMENT (ng) = newgroup; |
3355 | } |
3356 | else |
3357 | g = DR_GROUP_NEXT_ELEMENT (g); |
3358 | } |
3359 | DR_GROUP_NEXT_ELEMENT (ng) = NULL; |
3360 | |
3361 | /* Fixup the new group which still may contain duplicates. */ |
3362 | to_fixup.add (k: newgroup); |
3363 | } |
3364 | |
3365 | dr_vec_info *dr_info; |
3366 | FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info) |
3367 | { |
3368 | if (STMT_VINFO_VECTORIZABLE (dr_info->stmt) |
3369 | && !vect_analyze_data_ref_access (vinfo, dr_info)) |
3370 | { |
3371 | if (dump_enabled_p ()) |
3372 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3373 | "not vectorized: complicated access pattern.\n" ); |
3374 | |
3375 | if (is_a <bb_vec_info> (p: vinfo)) |
3376 | { |
3377 | /* Mark the statement as not vectorizable. */ |
3378 | STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false; |
3379 | continue; |
3380 | } |
3381 | else |
3382 | { |
3383 | datarefs_copy.release (); |
3384 | return opt_result::failure_at (loc: dr_info->stmt->stmt, |
3385 | fmt: "not vectorized:" |
3386 | " complicated access pattern.\n" ); |
3387 | } |
3388 | } |
3389 | } |
3390 | |
3391 | datarefs_copy.release (); |
3392 | return opt_result::success (); |
3393 | } |
3394 | |
3395 | /* Function vect_vfa_segment_size. |
3396 | |
3397 | Input: |
3398 | DR_INFO: The data reference. |
3399 | LENGTH_FACTOR: segment length to consider. |
3400 | |
3401 | Return a value suitable for the dr_with_seg_len::seg_len field. |
3402 | This is the "distance travelled" by the pointer from the first |
3403 | iteration in the segment to the last. Note that it does not include |
3404 | the size of the access; in effect it only describes the first byte. */ |
3405 | |
3406 | static tree |
3407 | vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor) |
3408 | { |
3409 | length_factor = size_binop (MINUS_EXPR, |
3410 | fold_convert (sizetype, length_factor), |
3411 | size_one_node); |
3412 | return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)), |
3413 | length_factor); |
3414 | } |
3415 | |
3416 | /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)), |
3417 | gives the worst-case number of bytes covered by the segment. */ |
3418 | |
3419 | static unsigned HOST_WIDE_INT |
3420 | vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info) |
3421 | { |
3422 | stmt_vec_info stmt_vinfo = dr_info->stmt; |
3423 | tree ref_type = TREE_TYPE (DR_REF (dr_info->dr)); |
3424 | unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); |
3425 | unsigned HOST_WIDE_INT access_size = ref_size; |
3426 | if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo)) |
3427 | { |
3428 | gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo); |
3429 | access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo); |
3430 | } |
3431 | tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); |
3432 | int misalignment; |
3433 | if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists () |
3434 | && ((misalignment = dr_misalignment (dr_info, vectype)), true) |
3435 | && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment) |
3436 | == dr_explicit_realign_optimized)) |
3437 | { |
3438 | /* We might access a full vector's worth. */ |
3439 | access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; |
3440 | } |
3441 | return access_size; |
3442 | } |
3443 | |
3444 | /* Get the minimum alignment for all the scalar accesses that DR_INFO |
3445 | describes. */ |
3446 | |
3447 | static unsigned int |
3448 | vect_vfa_align (dr_vec_info *dr_info) |
3449 | { |
3450 | return dr_alignment (dr: dr_info->dr); |
3451 | } |
3452 | |
/* Function vect_compile_time_alias.
3454 | |
3455 | Given data references A and B with equal base and offset, see whether |
3456 | the alias relation can be decided at compilation time. Return 1 if |
3457 | it can and the references alias, 0 if it can and the references do |
3458 | not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A, |
3459 | SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent |
3460 | of dr_with_seg_len::{seg_len,access_size} for A and B. */ |
3461 | |
static int
vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
			 tree segment_length_a, tree segment_length_b,
			 unsigned HOST_WIDE_INT access_size_a,
			 unsigned HOST_WIDE_INT access_size_b)
{
  /* The caller guarantees equal DR_BASE_ADDRESS and DR_OFFSET, so the
     two byte ranges can be compared using the constant DR_INITs alone.  */
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12) */
  if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
    {
      const_length_a = (-wi::to_poly_wide (t: segment_length_a)).force_uhwi ();
      offset_a -= const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (t: segment_length_b)).force_uhwi ();
      offset_b -= const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  /* The segment lengths only describe how far the first byte moves;
     add the access sizes to obtain the full extents.  */
  const_length_a += access_size_a;
  const_length_b += access_size_b;

  /* Overlap for every possible runtime value -> definite alias.  */
  if (ranges_known_overlap_p (pos1: offset_a, size1: const_length_a,
			      pos2: offset_b, size2: const_length_b))
    return 1;

  /* Disjoint for every possible runtime value -> definitely no alias.  */
  if (!ranges_maybe_overlap_p (pos1: offset_a, size1: const_length_a,
			       pos2: offset_b, size2: const_length_b))
    return 0;

  /* Otherwise the answer depends on runtime values.  */
  return -1;
}
3504 | |
3505 | /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH |
3506 | in DDR is >= VF. */ |
3507 | |
3508 | static bool |
3509 | dependence_distance_ge_vf (data_dependence_relation *ddr, |
3510 | unsigned int loop_depth, poly_uint64 vf) |
3511 | { |
3512 | if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE |
3513 | || DDR_NUM_DIST_VECTS (ddr) == 0) |
3514 | return false; |
3515 | |
3516 | /* If the dependence is exact, we should have limited the VF instead. */ |
3517 | gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr)); |
3518 | |
3519 | unsigned int i; |
3520 | lambda_vector dist_v; |
3521 | FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) |
3522 | { |
3523 | HOST_WIDE_INT dist = dist_v[loop_depth]; |
3524 | if (dist != 0 |
3525 | && !(dist > 0 && DDR_REVERSED_P (ddr)) |
3526 | && maybe_lt (a: (unsigned HOST_WIDE_INT) abs_hwi (x: dist), b: vf)) |
3527 | return false; |
3528 | } |
3529 | |
3530 | if (dump_enabled_p ()) |
3531 | dump_printf_loc (MSG_NOTE, vect_location, |
3532 | "dependence distance between %T and %T is >= VF\n" , |
3533 | DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr))); |
3534 | |
3535 | return true; |
3536 | } |
3537 | |
3538 | /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */ |
3539 | |
3540 | static void |
3541 | dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound) |
3542 | { |
3543 | dump_printf (dump_kind, "%s (%T) >= " , |
3544 | lower_bound.unsigned_p ? "unsigned" : "abs" , |
3545 | lower_bound.expr); |
3546 | dump_dec (dump_kind, lower_bound.min_value); |
3547 | } |
3548 | |
3549 | /* Record that the vectorized loop requires the vec_lower_bound described |
3550 | by EXPR, UNSIGNED_P and MIN_VALUE. */ |
3551 | |
static void
vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
			poly_uint64 min_value)
{
  vec<vec_lower_bound> &lower_bounds
    = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  /* If a bound for EXPR is already recorded, merge the new requirement
     into it instead of adding a duplicate entry.  */
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    if (operand_equal_p (lower_bounds[i].expr, expr, flags: 0))
      {
	/* The merged check may treat EXPR as unsigned only if both
	   requirements allow it; the merged bound is the larger one.  */
	unsigned_p &= lower_bounds[i].unsigned_p;
	min_value = upper_bound (a: lower_bounds[i].min_value, b: min_value);
	if (lower_bounds[i].unsigned_p != unsigned_p
	    || maybe_lt (a: lower_bounds[i].min_value, b: min_value))
	  {
	    lower_bounds[i].unsigned_p = unsigned_p;
	    lower_bounds[i].min_value = min_value;
	    if (dump_enabled_p ())
	      {
		dump_printf_loc (MSG_NOTE, vect_location,
				 "updating run-time check to " );
		dump_lower_bound (dump_kind: MSG_NOTE, lower_bound: lower_bounds[i]);
		dump_printf (MSG_NOTE, "\n" );
	      }
	  }
	return;
      }

  /* No existing check for EXPR; record a fresh one.  */
  vec_lower_bound lower_bound (expr, unsigned_p, min_value);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that " );
      dump_lower_bound (dump_kind: MSG_NOTE, lower_bound);
      dump_printf (MSG_NOTE, "\n" );
    }
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (obj: lower_bound);
}
3588 | |
3589 | /* Return true if it's unlikely that the step of the vectorized form of DR_INFO |
3590 | will span fewer than GAP bytes. */ |
3591 | |
3592 | static bool |
3593 | vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info, |
3594 | poly_int64 gap) |
3595 | { |
3596 | stmt_vec_info stmt_info = dr_info->stmt; |
3597 | HOST_WIDE_INT count |
3598 | = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); |
3599 | if (DR_GROUP_FIRST_ELEMENT (stmt_info)) |
3600 | count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info)); |
3601 | return (estimated_poly_value (x: gap) |
3602 | <= count * vect_get_scalar_dr_size (dr_info)); |
3603 | } |
3604 | |
3605 | /* Return true if we know that there is no alias between DR_INFO_A and |
3606 | DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N. |
3607 | When returning true, set *LOWER_BOUND_OUT to this N. */ |
3608 | |
static bool
vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
				poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B.  */
  data_reference *dr_a = dr_info_a->dr;
  data_reference *dr_b = dr_info_b->dr;
  poly_int64 init_a, init_b;
  /* The accesses must share base, offset and step, so that the only
     difference between their addresses is the compile-time DR_INIT.  */
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), flags: 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), flags: 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), flags: 0)
      || !poly_int_tree_p (DR_INIT (dr_a), value: &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), value: &init_b)
      || !ordered_p (a: init_a, b: init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (a: init_b, b: init_a))
    {
      std::swap (a&: init_a, b&: init_b);
      std::swap (a&: dr_info_a, b&: dr_info_b);
      std::swap (a&: dr_a, b&: dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
      && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info: dr_info_b) - init_a;
  return true;
}
3645 | |
3646 | /* Function vect_prune_runtime_alias_test_list. |
3647 | |
3648 | Prune a list of ddrs to be tested at run-time by versioning for alias. |
3649 | Merge several alias checks into one if possible. |
   Return FALSE if resulting list of ddrs is longer than allowed by
3651 | PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ |
3652 | |
opt_result
vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
  /* Remember object pairs already queued for an address-inequality
     check so each pair is tested only once.  */
  typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
  hash_set <tree_pair_hash> compared_objects;

  const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
    = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
  const vec<vec_object_pair> &check_unequal_addrs
    = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);

  ddr_p ddr;
  unsigned int i;
  tree length_factor;

  DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list" );

  /* Step values are irrelevant for aliasing if the number of vector
     iterations is equal to the number of scalar iterations (which can
     happen for fully-SLP loops).  */
  bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);

  if (!vf_one_p)
    {
      /* Convert the checks for nonzero steps into bound tests.  */
      tree value;
      FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
	vect_check_lower_bound (loop_vinfo, expr: value, unsigned_p: true, min_value: 1);
    }

  if (may_alias_ddrs.is_empty ())
    return opt_result::success ();

  comp_alias_ddrs.create (nelems: may_alias_ddrs.length ());

  unsigned int loop_depth
    = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
			  LOOP_VINFO_LOOP_NEST (loop_vinfo));

  /* First, we collect all data ref pairs for aliasing checks.  */
  FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
    {
      poly_uint64 lower_bound;
      tree segment_length_a, segment_length_b;
      unsigned HOST_WIDE_INT access_size_a, access_size_b;
      unsigned int align_a, align_b;

      /* Ignore the alias if the VF we chose ended up being no greater
	 than the dependence distance.  */
      if (dependence_distance_ge_vf (ddr, loop_depth, vf: vect_factor))
	continue;

      /* A dependence expressed at object level only needs the objects'
	 addresses to differ; queue an inequality check instead of a
	 segment-overlap check.  */
      if (DDR_OBJECT_A (ddr))
	{
	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
	  if (!compared_objects.add (k: new_pair))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "checking that %T and %T"
				 " have different addresses\n" ,
				 new_pair.first, new_pair.second);
	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (obj: new_pair);
	    }
	  continue;
	}

      dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
      stmt_vec_info stmt_info_a = dr_info_a->stmt;

      dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
      stmt_vec_info stmt_info_b = dr_info_b->stmt;

      bool preserves_scalar_order_p
	= vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
      bool ignore_step_p
	= (vf_one_p
	   && (preserves_scalar_order_p
	       || operand_equal_p (DR_STEP (dr_info_a->dr),
				   DR_STEP (dr_info_b->dr))));

      /* Skip the pair if inter-iteration dependencies are irrelevant
	 and intra-iteration dependencies are guaranteed to be honored.  */
      if (ignore_step_p
	  && (preserves_scalar_order_p
	      || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
						 lower_bound_out: &lower_bound)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "no need for alias check between "
			     "%T and %T when VF is 1\n" ,
			     DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
	  continue;
	}

      /* See whether we can handle the alias using a bounds check on
	 the step, and whether that's likely to be the best approach.
	 (It might not be, for example, if the minimum step is much larger
	 than the number of bytes handled by one vector iteration.)  */
      if (!ignore_step_p
	  && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
	  && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
					     lower_bound_out: &lower_bound)
	  && (vect_small_gap_p (loop_vinfo, dr_info: dr_info_a, gap: lower_bound)
	      || vect_small_gap_p (loop_vinfo, dr_info: dr_info_b, gap: lower_bound)))
	{
	  bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
			       "%T and %T when the step %T is outside " ,
			       DR_REF (dr_info_a->dr),
			       DR_REF (dr_info_b->dr),
			       DR_STEP (dr_info_a->dr));
	      if (unsigned_p)
		dump_printf (MSG_NOTE, "[0" );
	      else
		{
		  dump_printf (MSG_NOTE, "(" );
		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
		}
	      dump_printf (MSG_NOTE, ", " );
	      dump_dec (MSG_NOTE, lower_bound);
	      dump_printf (MSG_NOTE, ")\n" );
	    }
	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
				  unsigned_p, min_value: lower_bound);
	  continue;
	}

      /* For grouped accesses, use the first element of the group as
	 the representative for segment computation.  */
      stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
      if (dr_group_first_a)
	{
	  stmt_info_a = dr_group_first_a;
	  dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
	}

      stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
      if (dr_group_first_b)
	{
	  stmt_info_b = dr_group_first_b;
	  dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
	}

      /* When steps are irrelevant, each segment degenerates to a single
	 scalar access.  */
      if (ignore_step_p)
	{
	  segment_length_a = size_zero_node;
	  segment_length_b = size_zero_node;
	}
      else
	{
	  if (!operand_equal_p (DR_STEP (dr_info_a->dr),
				DR_STEP (dr_info_b->dr), flags: 0))
	    length_factor = scalar_loop_iters;
	  else
	    length_factor = size_int (vect_factor);
	  segment_length_a = vect_vfa_segment_size (dr_info: dr_info_a, length_factor);
	  segment_length_b = vect_vfa_segment_size (dr_info: dr_info_b, length_factor);
	}
      access_size_a = vect_vfa_access_size (vinfo: loop_vinfo, dr_info: dr_info_a);
      access_size_b = vect_vfa_access_size (vinfo: loop_vinfo, dr_info: dr_info_b);
      align_a = vect_vfa_align (dr_info: dr_info_a);
      align_b = vect_vfa_align (dr_info: dr_info_b);

      /* See whether the alias is known at compilation time.  */
      if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
			   DR_BASE_ADDRESS (dr_info_b->dr), flags: 0)
	  && operand_equal_p (DR_OFFSET (dr_info_a->dr),
			      DR_OFFSET (dr_info_b->dr), flags: 0)
	  && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
	  && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
	  && poly_int_tree_p (t: segment_length_a)
	  && poly_int_tree_p (t: segment_length_b))
	{
	  int res = vect_compile_time_alias (a: dr_info_a, b: dr_info_b,
					     segment_length_a,
					     segment_length_b,
					     access_size_a,
					     access_size_b);
	  if (res >= 0 && dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can tell at compile time that %T and %T" ,
			       DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
	      if (res == 0)
		dump_printf (MSG_NOTE, " do not alias\n" );
	      else
		dump_printf (MSG_NOTE, " alias\n" );
	    }

	  /* Proven no alias: no runtime check needed.  */
	  if (res == 0)
	    continue;

	  /* Proven alias: versioning cannot help; fail outright.  */
	  if (res == 1)
	    return opt_result::failure_at (loc: stmt_info_b->stmt,
					   fmt: "not vectorized:"
					   " compilation time alias: %G%G" ,
					   stmt_info_a->stmt,
					   stmt_info_b->stmt);
	}

      dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
			    access_size_a, align_a);
      dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
			    access_size_b, align_b);
      /* Canonicalize the order to be the one that's needed for accurate
	 RAW, WAR and WAW flags, in cases where the data references are
	 well-ordered.  The order doesn't really matter otherwise,
	 but we might as well be consistent.  */
      if (get_later_stmt (stmt1_info: stmt_info_a, stmt2_info: stmt_info_b) == stmt_info_a)
	std::swap (a&: dr_a, b&: dr_b);

      dr_with_seg_len_pair_t dr_with_seg_len_pair
	(dr_a, dr_b, (preserves_scalar_order_p
		      ? dr_with_seg_len_pair_t::WELL_ORDERED
		      : dr_with_seg_len_pair_t::REORDERED));

      comp_alias_ddrs.safe_push (obj: dr_with_seg_len_pair);
    }

  /* Merge and minimize the collected segment checks.  */
  prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);

  unsigned int count = (comp_alias_ddrs.length ()
			+ check_unequal_addrs.length ());

  /* Under the very-cheap cost model any runtime check is too expensive.  */
  if (count
      && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
	  == VECT_COST_MODEL_VERY_CHEAP))
    return opt_result::failure_at
      (loc: vect_location, fmt: "would need a runtime alias check\n" );

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "improved number of alias checks from %d to %d\n" ,
		     may_alias_ddrs.length (), count);
  /* The cheap cost model gets a reduced check budget (60%).  */
  unsigned limit = param_vect_max_version_for_alias_checks;
  if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
    limit = param_vect_max_version_for_alias_checks * 6 / 10;
  if (count > limit)
    return opt_result::failure_at
      (loc: vect_location,
       fmt: "number of versioning for alias run-time tests exceeds %d "
       "(--param vect-max-version-for-alias-checks)\n" , limit);

  return opt_result::success ();
}
3903 | |
3904 | /* Check whether we can use an internal function for a gather load |
3905 | or scatter store. READ_P is true for loads and false for stores. |
3906 | MASKED_P is true if the load or store is conditional. MEMORY_TYPE is |
3907 | the type of the memory elements being loaded or stored. OFFSET_TYPE |
3908 | is the type of the offset that is being applied to the invariant |
3909 | base address. SCALE is the amount by which the offset should |
3910 | be multiplied *after* it has been converted to address width. |
3911 | |
3912 | Return true if the function is supported, storing the function id in |
3913 | *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */ |
3914 | |
bool
vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
			  tree vectype, tree memory_type, tree offset_type,
			  int scale, internal_fn *ifn_out,
			  tree *offset_vectype_out)
{
  unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
  unsigned int element_bits = vector_element_bits (vectype);
  if (element_bits != memory_bits)
    /* For now the vector elements must be the same width as the
       memory elements.  */
    return false;

  /* Work out which function we need.  */
  internal_fn ifn, alt_ifn, alt_ifn2;
  if (read_p)
    {
      ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
      alt_ifn = IFN_MASK_GATHER_LOAD;
      /* When target supports MASK_LEN_GATHER_LOAD, we always
	 use MASK_LEN_GATHER_LOAD regardless whether len and
	 mask are valid or not.  */
      alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
    }
  else
    {
      ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
      alt_ifn = IFN_MASK_SCATTER_STORE;
      /* When target supports MASK_LEN_SCATTER_STORE, we always
	 use MASK_LEN_SCATTER_STORE regardless whether len and
	 mask are valid or not.  */
      alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
    }

  /* Try the given offset type first, then keep doubling its precision
     until either the target supports the combination or widening can
     no longer help.  */
  for (;;)
    {
      tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
      if (!offset_vectype)
	return false;

      /* Test whether the target supports this combination.  */
      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
						  offset_vectype, scale))
	{
	  *ifn_out = ifn;
	  *offset_vectype_out = offset_vectype;
	  return true;
	}
      /* An unmasked operation can also use the masked variant (with an
	 all-true mask supplied later).  */
      else if (!masked_p
	       && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
							  memory_type,
							  offset_vectype,
							  scale))
	{
	  *ifn_out = alt_ifn;
	  *offset_vectype_out = offset_vectype;
	  return true;
	}
      else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
						       memory_type,
						       offset_vectype, scale))
	{
	  *ifn_out = alt_ifn2;
	  *offset_vectype_out = offset_vectype;
	  return true;
	}

      /* Give up once the offset is at least pointer-wide and at least
	 element-wide; widening further cannot enable new combinations.  */
      if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
	  && TYPE_PRECISION (offset_type) >= element_bits)
	return false;

      offset_type = build_nonstandard_integer_type
	(TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
    }
}
3990 | |
3991 | /* STMT_INFO is a call to an internal gather load or scatter store function. |
3992 | Describe the operation in INFO. */ |
3993 | |
3994 | static void |
3995 | vect_describe_gather_scatter_call (stmt_vec_info stmt_info, |
3996 | gather_scatter_info *info) |
3997 | { |
3998 | gcall *call = as_a <gcall *> (p: stmt_info->stmt); |
3999 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
4000 | data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
4001 | |
4002 | info->ifn = gimple_call_internal_fn (gs: call); |
4003 | info->decl = NULL_TREE; |
4004 | info->base = gimple_call_arg (gs: call, index: 0); |
4005 | info->offset = gimple_call_arg (gs: call, index: 1); |
4006 | info->offset_dt = vect_unknown_def_type; |
4007 | info->offset_vectype = NULL_TREE; |
4008 | info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); |
4009 | info->element_type = TREE_TYPE (vectype); |
4010 | info->memory_type = TREE_TYPE (DR_REF (dr)); |
4011 | } |
4012 | |
/* Return true if a non-affine read or write in STMT_INFO is suitable for a
   gather load or scatter store.  Describe the operation in *INFO if so.
   LOOP_VINFO describes the loop being vectorized; the analysis decomposes
   the address of the data reference into a loop-invariant BASE plus a
   loop-variant SSA_NAME offset scaled by a constant, which is the form
   gather/scatter instructions require.  */

bool
vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
			   gather_scatter_info *info)
{
  HOST_WIDE_INT scale = 1;
  poly_int64 pbitpos, pbitsize;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree offtype = NULL_TREE;
  tree decl = NULL_TREE, base, off;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree memory_type = TREE_TYPE (DR_REF (dr));
  machine_mode pmode;
  int punsignedp, reversep, pvolatilep = 0;
  internal_fn ifn;
  tree offset_vectype;
  bool masked_p = false;

  /* See whether this is already a call to a gather/scatter internal function.
     If not, see whether it's a masked load or store.  */
  gcall *call = dyn_cast <gcall *> (p: stmt_info->stmt);
  if (call && gimple_call_internal_p (gs: call))
    {
      ifn = gimple_call_internal_fn (gs: call);
      if (internal_gather_scatter_fn_p (ifn))
	{
	  vect_describe_gather_scatter_call (stmt_info, info);
	  return true;
	}
      masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
    }

  /* True if we should aim to use internal functions rather than
     built-in functions.  */
  bool use_ifn_p = (DR_IS_READ (dr)
		    ? supports_vec_gather_load_p (TYPE_MODE (vectype))
		    : supports_vec_scatter_store_p (TYPE_MODE (vectype)));

  base = DR_REF (dr);
  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
     see if we can use the def stmt of the address.  */
  if (masked_p
      && TREE_CODE (base) == MEM_REF
      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
      && integer_zerop (TREE_OPERAND (base, 1))
      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
    {
      gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
      if (is_gimple_assign (gs: def_stmt)
	  && gimple_assign_rhs_code (gs: def_stmt) == ADDR_EXPR)
	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
    }

  /* The gather and scatter builtins need address of the form
     loop_invariant + vector * {1, 2, 4, 8}
     or
     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
  base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
			      &punsignedp, &reversep, &pvolatilep);
  if (reversep)
    return false;

  /* PR 107346.  Packed structs can have fields at offsets that are not
     multiples of BITS_PER_UNIT.  Do not use gather/scatters in such cases.  */
  if (!multiple_p (a: pbitpos, BITS_PER_UNIT))
    return false;

  poly_int64 pbytepos = exact_div (a: pbitpos, BITS_PER_UNIT);

  /* Fold any constant MEM_REF offset into OFF and strip the MEM_REF,
     so BASE becomes a plain address.  */
  if (TREE_CODE (base) == MEM_REF)
    {
      if (!integer_zerop (TREE_OPERAND (base, 1)))
	{
	  if (off == NULL_TREE)
	    off = wide_int_to_tree (sizetype, cst: mem_ref_offset (base));
	  else
	    off = size_binop (PLUS_EXPR, off,
			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
	}
      base = TREE_OPERAND (base, 0);
    }
  else
    base = build_fold_addr_expr (base);

  if (off == NULL_TREE)
    off = size_zero_node;

  /* If base is not loop invariant, either off is 0, then we start with just
     the constant offset in the loop invariant BASE and continue with base
     as OFF, otherwise give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and use that as off, but for now punt.  */
  if (!expr_invariant_in_loop_p (loop, base))
    {
      if (!integer_zerop (off))
	return false;
      off = base;
      base = size_int (pbytepos);
    }
  /* Otherwise put base + constant offset into the loop invariant BASE
     and continue with OFF.  */
  else
    {
      base = fold_convert (sizetype, base);
      base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
    }

  /* OFF at this point may be either a SSA_NAME or some tree expression
     from get_inner_reference.  Try to peel off loop invariants from it
     into BASE as long as possible.  Each iteration either strips one
     operation from OFF and continues, or breaks out when no further
     peeling is possible; reaching a CASE_CONVERT can also terminate the
     loop by setting OFFTYPE.  */
  STRIP_NOPS (off);
  while (offtype == NULL_TREE)
    {
      enum tree_code code;
      tree op0, op1, add = NULL_TREE;

      if (TREE_CODE (off) == SSA_NAME)
	{
	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);

	  /* A fully loop-invariant offset leaves nothing to vectorize.  */
	  if (expr_invariant_in_loop_p (loop, off))
	    return false;

	  if (gimple_code (g: def_stmt) != GIMPLE_ASSIGN)
	    break;

	  op0 = gimple_assign_rhs1 (gs: def_stmt);
	  code = gimple_assign_rhs_code (gs: def_stmt);
	  op1 = gimple_assign_rhs2 (gs: def_stmt);
	}
      else
	{
	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
	    return false;
	  code = TREE_CODE (off);
	  extract_ops_from_tree (expr: off, code: &code, op0: &op0, op1: &op1);
	}
      switch (code)
	{
	case POINTER_PLUS_EXPR:
	case PLUS_EXPR:
	  /* Move the invariant addend (scaled by the current SCALE) into
	     BASE and keep peeling the variant operand.  */
	  if (expr_invariant_in_loop_p (loop, op0))
	    {
	      add = op0;
	      off = op1;
	    do_add:
	      add = fold_convert (sizetype, add);
	      if (scale != 1)
		add = size_binop (MULT_EXPR, add, size_int (scale));
	      base = size_binop (PLUS_EXPR, base, add);
	      continue;
	    }
	  if (expr_invariant_in_loop_p (loop, op1))
	    {
	      add = op1;
	      off = op0;
	      goto do_add;
	    }
	  break;
	case MINUS_EXPR:
	  /* A subtracted invariant becomes a negated addend.  */
	  if (expr_invariant_in_loop_p (loop, op1))
	    {
	      add = fold_convert (sizetype, op1);
	      add = size_binop (MINUS_EXPR, size_zero_node, add);
	      off = op0;
	      goto do_add;
	    }
	  break;
	case MULT_EXPR:
	  if (scale == 1 && tree_fits_shwi_p (op1))
	    {
	      int new_scale = tree_to_shwi (op1);
	      /* Only treat this as a scaling operation if the target
		 supports it for at least some offset type.  */
	      if (use_ifn_p
		  && !vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr),
						masked_p, vectype, memory_type,
						signed_char_type_node,
						scale: new_scale, ifn_out: &ifn,
						offset_vectype_out: &offset_vectype)
		  && !vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr),
						masked_p, vectype, memory_type,
						unsigned_char_type_node,
						scale: new_scale, ifn_out: &ifn,
						offset_vectype_out: &offset_vectype))
		break;
	      scale = new_scale;
	      off = op0;
	      continue;
	    }
	  break;
	case SSA_NAME:
	  off = op0;
	  continue;
	CASE_CONVERT:
	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
	    break;

	  /* Don't include the conversion if the target is happy with
	     the current offset type.  */
	  if (use_ifn_p
	      && TREE_CODE (off) == SSA_NAME
	      && !POINTER_TYPE_P (TREE_TYPE (off))
	      && vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr),
					   masked_p, vectype, memory_type,
					   TREE_TYPE (off), scale, ifn_out: &ifn,
					   offset_vectype_out: &offset_vectype))
	    break;

	  /* Same-precision conversions are transparent; look through.  */
	  if (TYPE_PRECISION (TREE_TYPE (op0))
	      == TYPE_PRECISION (TREE_TYPE (off)))
	    {
	      off = op0;
	      continue;
	    }

	  /* Include the conversion if it is widening and we're using
	     the IFN path or the target can handle the converted from
	     offset or the current size is not already the same as the
	     data vector element size.  */
	  if ((TYPE_PRECISION (TREE_TYPE (op0))
	       < TYPE_PRECISION (TREE_TYPE (off)))
	      && (use_ifn_p
		  || (DR_IS_READ (dr)
		      ? (targetm.vectorize.builtin_gather
			 && targetm.vectorize.builtin_gather (vectype,
							      TREE_TYPE (op0),
							      scale))
		      : (targetm.vectorize.builtin_scatter
			 && targetm.vectorize.builtin_scatter (vectype,
							       TREE_TYPE (op0),
							       scale)))
		  || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
				       TYPE_SIZE (TREE_TYPE (vectype)), flags: 0)))
	    {
	      off = op0;
	      offtype = TREE_TYPE (off);
	      STRIP_NOPS (off);
	      continue;
	    }
	  break;
	default:
	  break;
	}
      break;
    }

  /* If at the end OFF still isn't a SSA_NAME or isn't
     defined in the loop, punt.  */
  if (TREE_CODE (off) != SSA_NAME
      || expr_invariant_in_loop_p (loop, off))
    return false;

  if (offtype == NULL_TREE)
    offtype = TREE_TYPE (off);

  /* Ask the target how the decomposed access can be implemented:
     either an internal function or a target builtin decl.  */
  if (use_ifn_p)
    {
      if (!vect_gather_scatter_fn_p (vinfo: loop_vinfo, DR_IS_READ (dr), masked_p,
				     vectype, memory_type, offset_type: offtype, scale,
				     ifn_out: &ifn, offset_vectype_out: &offset_vectype))
	ifn = IFN_LAST;
      decl = NULL_TREE;
    }
  else
    {
      if (DR_IS_READ (dr))
	{
	  if (targetm.vectorize.builtin_gather)
	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
	}
      else
	{
	  if (targetm.vectorize.builtin_scatter)
	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
	}
      ifn = IFN_LAST;
      /* The offset vector type will be read from DECL when needed.  */
      offset_vectype = NULL_TREE;
    }

  info->ifn = ifn;
  info->decl = decl;
  info->base = base;
  info->offset = off;
  info->offset_dt = vect_unknown_def_type;
  info->offset_vectype = offset_vectype;
  info->scale = scale;
  info->element_type = TREE_TYPE (vectype);
  info->memory_type = memory_type;
  return true;
}
4317 | |
/* Find the data references in STMT, analyze them with respect to LOOP and
   append them to DATAREFS.  Return false if datarefs in this stmt cannot
   be handled.  GROUP_ID is recorded in DATAREF_GROUPS (when non-null) for
   each pushed dataref so callers can associate refs with their source
   statement group.  */

opt_result
vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
			       vec<data_reference_p> *datarefs,
			       vec<int> *dataref_groups, int group_id)
{
  /* We can ignore clobbers for dataref analysis - they are removed during
     loop vectorization and BB vectorization checks dependences with a
     stmt walk.  */
  if (gimple_clobber_p (s: stmt))
    return opt_result::success ();

  if (gimple_has_volatile_ops (stmt))
    return opt_result::failure_at (loc: stmt, fmt: "not vectorized: volatile type: %G",
				   stmt);

  if (stmt_can_throw_internal (cfun, stmt))
    return opt_result::failure_at (loc: stmt,
				   fmt: "not vectorized:"
				   " statement can throw an exception: %G",
				   stmt);

  auto_vec<data_reference_p, 2> refs;
  opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
  if (!res)
    return res;

  /* A statement without memory references contributes nothing.  */
  if (refs.is_empty ())
    return opt_result::success ();

  /* Only a single data reference per statement is supported.  */
  if (refs.length () > 1)
    {
      while (!refs.is_empty ())
	free_data_ref (refs.pop ());
      return opt_result::failure_at (loc: stmt,
				     fmt: "not vectorized: more than one "
				     "data ref in stmt: %G", stmt);
    }

  data_reference_p dr = refs.pop ();
  /* Calls can only carry a dataref when they are the masked load/store
     internal functions.  */
  if (gcall *call = dyn_cast <gcall *> (p: stmt))
    if (!gimple_call_internal_p (gs: call)
	|| (gimple_call_internal_fn (gs: call) != IFN_MASK_LOAD
	    && gimple_call_internal_fn (gs: call) != IFN_MASK_STORE))
      {
	free_data_ref (dr);
	return opt_result::failure_at (loc: stmt,
				       fmt: "not vectorized: dr in a call %G", stmt);
      }

  if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
      && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
    {
      free_data_ref (dr);
      return opt_result::failure_at (loc: stmt,
				     fmt: "not vectorized:"
				     " statement is an unsupported"
				     " bitfield access %G", stmt);
    }

  if (DR_BASE_ADDRESS (dr)
      && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
    {
      free_data_ref (dr);
      return opt_result::failure_at (loc: stmt,
				     fmt: "not vectorized:"
				     " base addr of dr is a constant\n");
    }

  /* Check whether this may be a SIMD lane access and adjust the
     DR to make it easier for us to handle it.  */
  if (loop
      && loop->simduid
      && (!DR_BASE_ADDRESS (dr)
	  || !DR_OFFSET (dr)
	  || !DR_INIT (dr)
	  || !DR_STEP (dr)))
    {
      /* Re-analyze the ref relative to the innermost loop containing
	 STMT; a SIMD lane access has a constant-zero step there.  */
      struct data_reference *newdr
	= create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
			   DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
      if (DR_BASE_ADDRESS (newdr)
	  && DR_OFFSET (newdr)
	  && DR_INIT (newdr)
	  && DR_STEP (newdr)
	  && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
	  && integer_zerop (DR_STEP (newdr)))
	{
	  tree base_address = DR_BASE_ADDRESS (newdr);
	  tree off = DR_OFFSET (newdr);
	  tree step = ssize_int (1);
	  if (integer_zerop (off)
	      && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
	    {
	      off = TREE_OPERAND (base_address, 1);
	      base_address = TREE_OPERAND (base_address, 0);
	    }
	  STRIP_NOPS (off);
	  /* Peel off a constant element-size factor from the offset.  */
	  if (TREE_CODE (off) == MULT_EXPR
	      && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
	    {
	      step = TREE_OPERAND (off, 1);
	      off = TREE_OPERAND (off, 0);
	      STRIP_NOPS (off);
	    }
	  if (CONVERT_EXPR_P (off)
	      && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
		  < TYPE_PRECISION (TREE_TYPE (off))))
	    off = TREE_OPERAND (off, 0);
	  if (TREE_CODE (off) == SSA_NAME)
	    {
	      gimple *def = SSA_NAME_DEF_STMT (off);
	      /* Look through widening conversion.  */
	      if (is_gimple_assign (gs: def)
		  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
		{
		  tree rhs1 = gimple_assign_rhs1 (gs: def);
		  if (TREE_CODE (rhs1) == SSA_NAME
		      && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
		      && (TYPE_PRECISION (TREE_TYPE (off))
			  > TYPE_PRECISION (TREE_TYPE (rhs1))))
		    def = SSA_NAME_DEF_STMT (rhs1);
		}
	      /* The lane index must come from .GOMP_SIMD_LANE of the
		 loop's own simduid.  */
	      if (is_gimple_call (gs: def)
		  && gimple_call_internal_p (gs: def)
		  && (gimple_call_internal_fn (gs: def) == IFN_GOMP_SIMD_LANE))
		{
		  tree arg = gimple_call_arg (gs: def, index: 0);
		  tree reft = TREE_TYPE (DR_REF (newdr));
		  gcc_assert (TREE_CODE (arg) == SSA_NAME);
		  arg = SSA_NAME_VAR (arg);
		  if (arg == loop->simduid
		      /* For now.  */
		      && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
		    {
		      DR_BASE_ADDRESS (newdr) = base_address;
		      DR_OFFSET (newdr) = ssize_int (0);
		      DR_STEP (newdr) = step;
		      DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
		      DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
		      /* Mark as simd-lane access; the second argument of
			 .GOMP_SIMD_LANE is encoded as -1 - arg2 in aux.  */
		      tree arg2 = gimple_call_arg (gs: def, index: 1);
		      newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
		      free_data_ref (dr);
		      datarefs->safe_push (obj: newdr);
		      if (dataref_groups)
			dataref_groups->safe_push (obj: group_id);
		      return opt_result::success ();
		    }
		}
	    }
	}
      free_data_ref (newdr);
    }

  datarefs->safe_push (obj: dr);
  if (dataref_groups)
    dataref_groups->safe_push (obj: group_id);
  return opt_result::success ();
}
4481 | |
/* Function vect_analyze_data_refs.

  Find all the data references in the loop or basic block.

   The general structure of the analysis of data refs in the vectorizer is as
   follows:
   1- vect_analyze_data_refs(loop/bb): call
      compute_data_dependences_for_loop/bb to find and analyze all data-refs
      in the loop/bb and their dependences.
   2- vect_analyze_dependences(): apply dependence testing using ddrs.
   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.

   On success *MIN_VF holds the largest vectorization factor required by any
   dataref's vector type.  On failure, when FATAL is non-null, *FATAL is set
   to false for failures a different vector size might still overcome.  */

opt_result
vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
{
  class loop *loop = NULL;
  unsigned int i;
  struct data_reference *dr;
  tree scalar_type;

  DUMP_VECT_SCOPE ("vect_analyze_data_refs");

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Go through the data-refs, check that the analysis succeeded.  Update
     pointer from stmt_vec_info struct to DR and vectype.  */

  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
      poly_uint64 vf;

      gcc_assert (DR_REF (dr));
      stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
      gcc_assert (!stmt_info->dr_aux.dr);
      stmt_info->dr_aux.dr = dr;
      stmt_info->dr_aux.stmt = stmt_info;

      /* Check that analysis of the data-ref succeeded.  */
      if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
	  || !DR_STEP (dr))
	{
	  bool maybe_gather
	    = DR_IS_READ (dr)
	      && !TREE_THIS_VOLATILE (DR_REF (dr));
	  bool maybe_scatter
	    = DR_IS_WRITE (dr)
	      && !TREE_THIS_VOLATILE (DR_REF (dr));

	  /* If target supports vector gather loads or scatter stores,
	     see if they can be used.  Gather/scatter is only attempted
	     for loop vectorization of non-nested accesses.  */
	  if (is_a <loop_vec_info> (p: vinfo)
	      && !nested_in_vect_loop_p (loop, stmt_info))
	    {
	      if (maybe_gather || maybe_scatter)
		{
		  if (maybe_gather)
		    gatherscatter = GATHER;
		  else
		    gatherscatter = SCATTER;
		}
	    }

	  if (gatherscatter == SG_NONE)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "not vectorized: data ref analysis "
				 "failed %G", stmt_info->stmt);
	      if (is_a <bb_vec_info> (p: vinfo))
		{
		  /* In BB vectorization the ref can still participate
		     in dependence analysis, we just can't vectorize it.  */
		  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
		  continue;
		}
	      return opt_result::failure_at (loc: stmt_info->stmt,
					     fmt: "not vectorized:"
					     " data ref analysis failed: %G",
					     stmt_info->stmt);
	    }
	}

      /* See if this was detected as SIMD lane access.  The aux values
	 -1..-4 are the encodings stored by vect_find_stmt_data_reference.  */
      if (dr->aux == (void *)-1
	  || dr->aux == (void *)-2
	  || dr->aux == (void *)-3
	  || dr->aux == (void *)-4)
	{
	  if (nested_in_vect_loop_p (loop, stmt_info))
	    return opt_result::failure_at (loc: stmt_info->stmt,
					   fmt: "not vectorized:"
					   " data ref analysis failed: %G",
					   stmt_info->stmt);
	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
	    = -(uintptr_t) dr->aux;
	}

      tree base = get_base_address (DR_REF (dr));
      if (base && VAR_P (base) && DECL_NONALIASED (base))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: base object not addressable "
			     "for stmt: %G", stmt_info->stmt);
	  if (is_a <bb_vec_info> (p: vinfo))
	    {
	      /* In BB vectorization the ref can still participate
		 in dependence analysis, we just can't vectorize it.  */
	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
	      continue;
	    }
	  return opt_result::failure_at (loc: stmt_info->stmt,
					 fmt: "not vectorized: base object not"
					 " addressable for stmt: %G",
					 stmt_info->stmt);
	}

      /* A non-constant step means a strided access in the loop case.  */
      if (is_a <loop_vec_info> (p: vinfo)
	  && DR_STEP (dr)
	  && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
	{
	  if (nested_in_vect_loop_p (loop, stmt_info))
	    return opt_result::failure_at (loc: stmt_info->stmt,
					   fmt: "not vectorized: "
					   "not suitable for strided load %G",
					   stmt_info->stmt);
	  STMT_VINFO_STRIDED_P (stmt_info) = true;
	}

      /* Update DR field in stmt_vec_info struct.  */

      /* If the dataref is in an inner-loop of the loop that is considered
	 for vectorization, we also want to analyze the access relative to
	 the outer-loop (DR contains information only relative to the
	 inner-most enclosing loop).  We do that by building a reference to the
	 first location accessed by the inner-loop, and analyze it relative to
	 the outer-loop.  */
      if (loop && nested_in_vect_loop_p (loop, stmt_info))
	{
	  /* Build a reference to the first location accessed by the
	     inner loop: *(BASE + INIT + OFFSET).  By construction,
	     this address must be invariant in the inner loop, so we
	     can consider it as being used in the outer loop.  */
	  tree base = unshare_expr (DR_BASE_ADDRESS (dr));
	  tree offset = unshare_expr (DR_OFFSET (dr));
	  tree init = unshare_expr (DR_INIT (dr));
	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
					  init, offset);
	  tree init_addr = fold_build_pointer_plus (base, init_offset);
	  tree init_ref = build_fold_indirect_ref (init_addr);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "analyze in outer loop: %T\n", init_ref);

	  opt_result res
	    = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
				    init_ref, loop, stmt_info->stmt);
	  if (!res)
	    /* dr_analyze_innermost already explained the failure.  */
	    return res;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "\touter base_address: %T\n"
			     "\touter offset from base address: %T\n"
			     "\touter constant offset from base address: %T\n"
			     "\touter step: %T\n"
			     "\touter base alignment: %d\n\n"
			     "\touter base misalignment: %d\n"
			     "\touter offset alignment: %d\n"
			     "\touter step alignment: %d\n",
			     STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
			     STMT_VINFO_DR_OFFSET (stmt_info),
			     STMT_VINFO_DR_INIT (stmt_info),
			     STMT_VINFO_DR_STEP (stmt_info),
			     STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
			     STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
			     STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
			     STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
	}

      /* Set vectype for STMT.  */
      scalar_type = TREE_TYPE (DR_REF (dr));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vectype)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: no vectype for stmt: %G",
			       stmt_info->stmt);
	      dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
				 scalar_type);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  if (is_a <bb_vec_info> (p: vinfo))
	    {
	      /* No vector type is fine, the ref can still participate
		 in dependence analysis, we just can't vectorize it.  */
	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
	      continue;
	    }
	  /* A different vector size might still provide a vectype,
	     so this failure is not fatal.  */
	  if (fatal)
	    *fatal = false;
	  return opt_result::failure_at (loc: stmt_info->stmt,
					 fmt: "not vectorized:"
					 " no vectype for stmt: %G"
					 " scalar_type: %T\n",
					 stmt_info->stmt, scalar_type);
	}
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "got vectype for stmt: %G%T\n",
			     stmt_info->stmt, vectype);
	}

      /* Adjust the minimal vectorization factor according to the
	 vector type.  */
      vf = TYPE_VECTOR_SUBPARTS (node: vectype);
      *min_vf = upper_bound (a: *min_vf, b: vf);

      /* Leave the BB vectorizer to pick the vector type later, based on
	 the final dataref group size and SLP node size.  */
      if (is_a <loop_vec_info> (p: vinfo))
	STMT_VINFO_VECTYPE (stmt_info) = vectype;

      if (gatherscatter != SG_NONE)
	{
	  gather_scatter_info gs_info;
	  if (!vect_check_gather_scatter (stmt_info,
					  loop_vinfo: as_a <loop_vec_info> (p: vinfo),
					  info: &gs_info)
	      || !get_vectype_for_scalar_type (vinfo,
					       TREE_TYPE (gs_info.offset)))
	    {
	      if (fatal)
		*fatal = false;
	      return opt_result::failure_at
			(loc: stmt_info->stmt,
			 fmt: (gatherscatter == GATHER)
			 ? "not vectorized: not suitable for gather load %G"
			 : "not vectorized: not suitable for scatter store %G",
			 stmt_info->stmt);
	    }
	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
	}
    }

  /* We used to stop processing and prune the list here.  Verify we no
     longer need to.  */
  gcc_assert (i == datarefs.length ());

  return opt_result::success ();
}
4747 | |
4748 | |
4749 | /* Function vect_get_new_vect_var. |
4750 | |
4751 | Returns a name for a new variable. The current naming scheme appends the |
4752 | prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to |
4753 | the name of vectorizer generated variables, and appends that to NAME if |
4754 | provided. */ |
4755 | |
4756 | tree |
4757 | vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) |
4758 | { |
4759 | const char *prefix; |
4760 | tree new_vect_var; |
4761 | |
4762 | switch (var_kind) |
4763 | { |
4764 | case vect_simple_var: |
4765 | prefix = "vect" ; |
4766 | break; |
4767 | case vect_scalar_var: |
4768 | prefix = "stmp" ; |
4769 | break; |
4770 | case vect_mask_var: |
4771 | prefix = "mask" ; |
4772 | break; |
4773 | case vect_pointer_var: |
4774 | prefix = "vectp" ; |
4775 | break; |
4776 | default: |
4777 | gcc_unreachable (); |
4778 | } |
4779 | |
4780 | if (name) |
4781 | { |
4782 | char* tmp = concat (prefix, "_" , name, NULL); |
4783 | new_vect_var = create_tmp_reg (type, tmp); |
4784 | free (ptr: tmp); |
4785 | } |
4786 | else |
4787 | new_vect_var = create_tmp_reg (type, prefix); |
4788 | |
4789 | return new_vect_var; |
4790 | } |
4791 | |
4792 | /* Like vect_get_new_vect_var but return an SSA name. */ |
4793 | |
4794 | tree |
4795 | vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) |
4796 | { |
4797 | const char *prefix; |
4798 | tree new_vect_var; |
4799 | |
4800 | switch (var_kind) |
4801 | { |
4802 | case vect_simple_var: |
4803 | prefix = "vect" ; |
4804 | break; |
4805 | case vect_scalar_var: |
4806 | prefix = "stmp" ; |
4807 | break; |
4808 | case vect_pointer_var: |
4809 | prefix = "vectp" ; |
4810 | break; |
4811 | default: |
4812 | gcc_unreachable (); |
4813 | } |
4814 | |
4815 | if (name) |
4816 | { |
4817 | char* tmp = concat (prefix, "_" , name, NULL); |
4818 | new_vect_var = make_temp_ssa_name (type, NULL, name: tmp); |
4819 | free (ptr: tmp); |
4820 | } |
4821 | else |
4822 | new_vect_var = make_temp_ssa_name (type, NULL, name: prefix); |
4823 | |
4824 | return new_vect_var; |
4825 | } |
4826 | |
/* Duplicate points-to info on NAME from DR_INFO's data reference.  The
   alignment recorded in the copied info is deliberately invalidated
   because it described only the base pointer, not the full address
   NAME computes.  */

static void
vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
{
  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
  /* DR_PTR_INFO is for a base SSA name, not including constant or
     variable offsets in the ref so its alignment info does not apply.  */
  mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
}
4837 | |
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT_INFO: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is to be added to the initial address.
   LOOP: Specify relative to which loop-nest should the address be computed.
         For example, when the dataref is in an inner-loop nested in an
	 outer-loop that is now being vectorized, LOOP can be either the
	 outer-loop, or the inner-loop.  The first memory location accessed
	 by the following dataref ('in' points to short):

		for (i=0; i<N; i++)
		   for (j=0; j<M; j++)
		     s += in[i+j]

	 is as follows:
	 if LOOP=i_loop:	&in		(relative to i_loop)
	 if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

tree
vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
				      gimple_seq *new_stmt_list,
				      tree offset)
{
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  const char *base_name;
  tree addr_base;
  tree dest;
  gimple_seq seq = NULL;
  tree vect_ptr_type;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);

  tree data_ref_base = unshare_expr (drb->base_address);
  tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, check_outer: true));
  tree init = unshare_expr (drb->init);

  /* For BB vectorization the address is taken directly from the ref
     below, so offset and init are not used.  */
  if (loop_vinfo)
    base_name = get_name (data_ref_base);
  else
    {
      base_offset = ssize_int (0);
      init = ssize_int (0);
      base_name = get_name (DR_REF (dr));
    }

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR,
			    fold_convert (sizetype, base_offset),
			    fold_convert (sizetype, init));

  if (offset)
    {
      offset = fold_convert (sizetype, offset);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, offset);
    }

  /* base + base_offset */
  if (loop_vinfo)
    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
  else
    addr_base = build1 (ADDR_EXPR,
			build_pointer_type (TREE_TYPE (DR_REF (dr))),
			/* Strip zero offset components since we don't need
			   them and they can confuse late diagnostics if
			   we CSE them wrongly.  See PR106904 for example.  */
			unshare_expr (strip_zero_offset_components
								(DR_REF (dr))));

  /* Gimplify the address expression into DEST, appending any new
     statements to NEW_STMT_LIST for the caller to insert.  */
  vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
  dest = vect_get_new_vect_var (type: vect_ptr_type, var_kind: vect_pointer_var, name: base_name);
  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (DR_PTR_INFO (dr)
      && TREE_CODE (addr_base) == SSA_NAME
      /* We should only duplicate pointer info to newly created SSA names.  */
      && SSA_NAME_VAR (addr_base) == dest)
    {
      gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
      vect_duplicate_ssa_name_ptr_info (name: addr_base, dr_info);
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);

  return addr_base;
}
4940 | |
4941 | |
4942 | /* Function vect_create_data_ref_ptr. |
4943 | |
4944 | Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first |
4945 | location accessed in the loop by STMT_INFO, along with the def-use update |
4946 | chain to appropriately advance the pointer through the loop iterations. |
4947 | Also set aliasing information for the pointer. This pointer is used by |
4948 | the callers to this function to create a memory reference expression for |
4949 | vector load/store access. |
4950 | |
4951 | Input: |
4952 | 1. STMT_INFO: a stmt that references memory. Expected to be of the form |
4953 | GIMPLE_ASSIGN <name, data-ref> or |
4954 | GIMPLE_ASSIGN <data-ref, name>. |
4955 | 2. AGGR_TYPE: the type of the reference, which should be either a vector |
4956 | or an array. |
4957 | 3. AT_LOOP: the loop where the vector memref is to be created. |
4958 | 4. OFFSET (optional): a byte offset to be added to the initial address |
4959 | accessed by the data-ref in STMT_INFO. |
4960 | 5. BSI: location where the new stmts are to be placed if there is no loop |
4961 | 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain |
4962 | pointing to the initial address. |
   7. IV_STEP (optional, defaults to NULL): the amount that should be added
        to the IV during each iteration of the loop.  NULL says to move
        by one copy of AGGR_TYPE up or down, depending on the step of the
        data reference.
4967 | |
4968 | Output: |
4969 | 1. Declare a new ptr to vector_type, and have it point to the base of the |
4970 | data reference (initial addressed accessed by the data reference). |
4971 | For example, for vector of type V8HI, the following code is generated: |
4972 | |
4973 | v8hi *ap; |
4974 | ap = (v8hi *)initial_address; |
4975 | |
      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init] + OFFSET;
4982 | |
4983 | Return the initial_address in INITIAL_ADDRESS. |
4984 | |
4985 | 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also |
4986 | update the pointer in each iteration of the loop. |
4987 | |
4988 | Return the increment stmt that updates the pointer in PTR_INCR. |
4989 | |
4990 | 3. Return the pointer. */ |
4991 | |
tree
vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
			  tree aggr_type, class loop *at_loop, tree offset,
			  tree *initial_address, gimple_stmt_iterator *gsi,
			  gimple **ptr_incr, bool only_init,
			  tree iv_step)
{
  const char *base_name;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  class loop *loop = NULL;
  bool nested_in_vect_loop = false;
  class loop *containing_loop = NULL;
  tree aggr_ptr_type;
  tree aggr_ptr;
  tree new_temp;
  gimple_seq new_stmt_list = NULL;
  edge pe = NULL;
  basic_block new_bb;
  tree aggr_ptr_init;
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  tree aptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple *incr;
  bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo);

  /* When no IV_STEP is supplied the step is derived from
     TYPE_SIZE_UNIT (AGGR_TYPE) below, so the type must be an array
     or a vector for that size to be meaningful.  */
  gcc_assert (iv_step != NULL_TREE
	      || TREE_CODE (aggr_type) == ARRAY_TYPE
	      || TREE_CODE (aggr_type) == VECTOR_TYPE);

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
      containing_loop = (gimple_bb (g: stmt_info->stmt))->loop_father;
      pe = loop_preheader_edge (loop);
    }
  else
    {
      /* Basic-block vectorization: there is no loop to advance the
	 pointer in, so force ONLY_INIT and report no increment stmt.  */
      gcc_assert (bb_vinfo);
      only_init = true;
      *ptr_incr = NULL;
    }

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = get_name (DR_BASE_ADDRESS (dr));

  if (dump_enabled_p ())
    {
      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
      dump_printf_loc (MSG_NOTE, vect_location,
		       "create %s-pointer variable to type: %T" ,
		       get_tree_code_name (TREE_CODE (aggr_type)),
		       aggr_type);
      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing an array ref: " );
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a vector ref: " );
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a record based array ref: " );
      else
	dump_printf (MSG_NOTE, "  vectorizing a pointer ref: " );
      dump_printf (MSG_NOTE, "%T\n" , DR_BASE_OBJECT (dr));
    }

  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
			      get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (DR_GROUP_SIZE (stmt_info) > 1)
    {
      stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
      do
	{
	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
				      get_alias_set (DR_REF (sdr))))
	    {
	      need_ref_all = true;
	      break;
	    }
	  sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
	}
      while (sinfo);
    }
  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
					       need_ref_all);
  aggr_ptr = vect_get_new_vect_var (type: aggr_ptr_type, var_kind: vect_pointer_var, name: base_name);


  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done be step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:

	vp0 = &base_addr;
	LOOP:	vp1 = phi(vp0,vp2)
		...
		...
		vp2 = vp1 + step
		goto LOOP

     If there is an inner-loop nested in loop, then step (5) will also be
     applied, and an additional update in the inner-loop will be created:

	vp0 = &base_addr;
	LOOP:   vp1 = phi(vp0,vp2)
		...
        inner:     vp3 = phi(vp1,vp4)
	           vp4 = vp3 + inner_step
	           if () goto inner
		...
		vp2 = vp1 + step
		if () goto LOOP   */

  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val]+offset) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (vinfo,
						   stmt_info, new_stmt_list: &new_stmt_list,
						   offset);
  if (new_stmt_list)
    {
      if (pe)
	{
	  /* Loop case: materialize the address computation on the
	     preheader edge; this must not split the edge into a new
	     basic block.  */
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
    }

  *initial_address = new_temp;
  aggr_ptr_init = new_temp;

  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
  else
    {
      /* Accesses to invariant addresses should be handled specially
	 by the caller.  */
      tree step = vect_dr_behavior (vinfo, dr_info)->step;
      gcc_assert (!integer_zerop (step));

      if (iv_step == NULL_TREE)
	{
	  /* The step of the aggregate pointer is the type size,
	     negated for downward accesses.  */
	  iv_step = TYPE_SIZE_UNIT (aggr_type);
	  if (tree_int_cst_sgn (step) == -1)
	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
	}

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init, PLUS_EXPR,
		 fold_convert (aggr_ptr_type, iv_step),
		 aggr_ptr, loop, &incr_gsi, insert_after,
		 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (i: incr_gsi);

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (name: indx_before_incr, dr_info);
	  vect_duplicate_ssa_name_ptr_info (name: indx_after_incr, dr_info);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      /* Use the value of the IV before the increment inside the loop.  */
      aptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return aptr;


  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if exists.  */

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
				      &insert_after);
      create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
		 aggr_ptr, containing_loop, &incr_gsi, insert_after,
		 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (i: incr_gsi);

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (name: indx_before_incr, dr_info);
	  vect_duplicate_ssa_name_ptr_info (name: indx_after_incr, dr_info);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      return indx_before_incr;
    }
  else
    /* Not reachable: the ONLY_INIT case returned above.  */
    gcc_unreachable ();
}
5218 | |
5219 | |
5220 | /* Function bump_vector_ptr |
5221 | |
5222 | Increment a pointer (to a vector type) by vector-size. If requested, |
5223 | i.e. if PTR-INCR is given, then also connect the new increment stmt |
5224 | to the existing def-use update-chain of the pointer, by modifying |
5225 | the PTR_INCR as illustrated below: |
5226 | |
5227 | The pointer def-use update-chain before this function: |
5228 | DATAREF_PTR = phi (p_0, p_2) |
5229 | .... |
5230 | PTR_INCR: p_2 = DATAREF_PTR + step |
5231 | |
5232 | The pointer def-use update-chain after this function: |
5233 | DATAREF_PTR = phi (p_0, p_2) |
5234 | .... |
5235 | NEW_DATAREF_PTR = DATAREF_PTR + BUMP |
5236 | .... |
5237 | PTR_INCR: p_2 = NEW_DATAREF_PTR + step |
5238 | |
5239 | Input: |
5240 | DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated |
5241 | in the loop. |
5242 | PTR_INCR - optional. The stmt that updates the pointer in each iteration of |
5243 | the loop. The increment amount across iterations is expected |
5244 | to be vector_size. |
5245 | BSI - location where the new update stmt is to be placed. |
5246 | STMT_INFO - the original scalar memory-access stmt that is being vectorized. |
5247 | BUMP - optional. The offset by which to bump the pointer. If not given, |
5248 | the offset is assumed to be vector_size. |
5249 | |
5250 | Output: Return NEW_DATAREF_PTR as illustrated above. |
5251 | |
5252 | */ |
5253 | |
5254 | tree |
5255 | bump_vector_ptr (vec_info *vinfo, |
5256 | tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, |
5257 | stmt_vec_info stmt_info, tree bump) |
5258 | { |
5259 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
5260 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
5261 | tree update = TYPE_SIZE_UNIT (vectype); |
5262 | gimple *incr_stmt; |
5263 | ssa_op_iter iter; |
5264 | use_operand_p use_p; |
5265 | tree new_dataref_ptr; |
5266 | |
5267 | if (bump) |
5268 | update = bump; |
5269 | |
5270 | if (TREE_CODE (dataref_ptr) == SSA_NAME) |
5271 | new_dataref_ptr = copy_ssa_name (var: dataref_ptr); |
5272 | else if (is_gimple_min_invariant (dataref_ptr)) |
5273 | /* When possible avoid emitting a separate increment stmt that will |
5274 | force the addressed object addressable. */ |
5275 | return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr), |
5276 | fold_build2 (MEM_REF, |
5277 | TREE_TYPE (TREE_TYPE (dataref_ptr)), |
5278 | dataref_ptr, |
5279 | fold_convert (ptr_type_node, update))); |
5280 | else |
5281 | new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); |
5282 | incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, |
5283 | dataref_ptr, update); |
5284 | vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi); |
5285 | /* Fold the increment, avoiding excessive chains use-def chains of |
5286 | those, leading to compile-time issues for passes until the next |
5287 | forwprop pass which would do this as well. */ |
5288 | gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt); |
5289 | if (fold_stmt (&fold_gsi, follow_all_ssa_edges)) |
5290 | { |
5291 | incr_stmt = gsi_stmt (i: fold_gsi); |
5292 | update_stmt (s: incr_stmt); |
5293 | } |
5294 | |
5295 | /* Copy the points-to information if it exists. */ |
5296 | if (DR_PTR_INFO (dr)) |
5297 | { |
5298 | duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); |
5299 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); |
5300 | } |
5301 | |
5302 | if (!ptr_incr) |
5303 | return new_dataref_ptr; |
5304 | |
5305 | /* Update the vector-pointer's cross-iteration increment. */ |
5306 | FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) |
5307 | { |
5308 | tree use = USE_FROM_PTR (use_p); |
5309 | |
5310 | if (use == dataref_ptr) |
5311 | SET_USE (use_p, new_dataref_ptr); |
5312 | else |
5313 | gcc_assert (operand_equal_p (use, update, 0)); |
5314 | } |
5315 | |
5316 | return new_dataref_ptr; |
5317 | } |
5318 | |
5319 | |
5320 | /* Copy memory reference info such as base/clique from the SRC reference |
5321 | to the DEST MEM_REF. */ |
5322 | |
5323 | void |
5324 | vect_copy_ref_info (tree dest, tree src) |
5325 | { |
5326 | if (TREE_CODE (dest) != MEM_REF) |
5327 | return; |
5328 | |
5329 | tree src_base = src; |
5330 | while (handled_component_p (t: src_base)) |
5331 | src_base = TREE_OPERAND (src_base, 0); |
5332 | if (TREE_CODE (src_base) != MEM_REF |
5333 | && TREE_CODE (src_base) != TARGET_MEM_REF) |
5334 | return; |
5335 | |
5336 | MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base); |
5337 | MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base); |
5338 | } |
5339 | |
5340 | |
5341 | /* Function vect_create_destination_var. |
5342 | |
5343 | Create a new temporary of type VECTYPE. */ |
5344 | |
5345 | tree |
5346 | vect_create_destination_var (tree scalar_dest, tree vectype) |
5347 | { |
5348 | tree vec_dest; |
5349 | const char *name; |
5350 | char *new_name; |
5351 | tree type; |
5352 | enum vect_var_kind kind; |
5353 | |
5354 | kind = vectype |
5355 | ? VECTOR_BOOLEAN_TYPE_P (vectype) |
5356 | ? vect_mask_var |
5357 | : vect_simple_var |
5358 | : vect_scalar_var; |
5359 | type = vectype ? vectype : TREE_TYPE (scalar_dest); |
5360 | |
5361 | gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); |
5362 | |
5363 | name = get_name (scalar_dest); |
5364 | if (name) |
5365 | new_name = xasprintf ("%s_%u" , name, SSA_NAME_VERSION (scalar_dest)); |
5366 | else |
5367 | new_name = xasprintf ("_%u" , SSA_NAME_VERSION (scalar_dest)); |
5368 | vec_dest = vect_get_new_vect_var (type, var_kind: kind, name: new_name); |
5369 | free (ptr: new_name); |
5370 | |
5371 | return vec_dest; |
5372 | } |
5373 | |
5374 | /* Function vect_grouped_store_supported. |
5375 | |
5376 | Returns TRUE if interleave high and interleave low permutations |
5377 | are supported, and FALSE otherwise. */ |
5378 | |
5379 | bool |
5380 | vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) |
5381 | { |
5382 | machine_mode mode = TYPE_MODE (vectype); |
5383 | |
5384 | /* vect_permute_store_chain requires the group size to be equal to 3 or |
5385 | be a power of two. */ |
5386 | if (count != 3 && exact_log2 (x: count) == -1) |
5387 | { |
5388 | if (dump_enabled_p ()) |
5389 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
5390 | "the size of the group of accesses" |
5391 | " is not a power of 2 or not eqaul to 3\n" ); |
5392 | return false; |
5393 | } |
5394 | |
5395 | /* Check that the permutation is supported. */ |
5396 | if (VECTOR_MODE_P (mode)) |
5397 | { |
5398 | unsigned int i; |
5399 | if (count == 3) |
5400 | { |
5401 | unsigned int j0 = 0, j1 = 0, j2 = 0; |
5402 | unsigned int i, j; |
5403 | |
5404 | unsigned int nelt; |
5405 | if (!GET_MODE_NUNITS (mode).is_constant (const_value: &nelt)) |
5406 | { |
5407 | if (dump_enabled_p ()) |
5408 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
5409 | "cannot handle groups of 3 stores for" |
5410 | " variable-length vectors\n" ); |
5411 | return false; |
5412 | } |
5413 | |
5414 | vec_perm_builder sel (nelt, nelt, 1); |
5415 | sel.quick_grow (len: nelt); |
5416 | vec_perm_indices indices; |
5417 | for (j = 0; j < 3; j++) |
5418 | { |
5419 | int nelt0 = ((3 - j) * nelt) % 3; |
5420 | int nelt1 = ((3 - j) * nelt + 1) % 3; |
5421 | int nelt2 = ((3 - j) * nelt + 2) % 3; |
5422 | for (i = 0; i < nelt; i++) |
5423 | { |
5424 | if (3 * i + nelt0 < nelt) |
5425 | sel[3 * i + nelt0] = j0++; |
5426 | if (3 * i + nelt1 < nelt) |
5427 | sel[3 * i + nelt1] = nelt + j1++; |
5428 | if (3 * i + nelt2 < nelt) |
5429 | sel[3 * i + nelt2] = 0; |
5430 | } |
5431 | indices.new_vector (sel, 2, nelt); |
5432 | if (!can_vec_perm_const_p (mode, mode, indices)) |
5433 | { |
5434 | if (dump_enabled_p ()) |
5435 | dump_printf (MSG_MISSED_OPTIMIZATION, |
5436 | "permutation op not supported by target.\n" ); |
5437 | return false; |
5438 | } |
5439 | |
5440 | for (i = 0; i < nelt; i++) |
5441 | { |
5442 | if (3 * i + nelt0 < nelt) |
5443 | sel[3 * i + nelt0] = 3 * i + nelt0; |
5444 | if (3 * i + nelt1 < nelt) |
5445 | sel[3 * i + nelt1] = 3 * i + nelt1; |
5446 | if (3 * i + nelt2 < nelt) |
5447 | sel[3 * i + nelt2] = nelt + j2++; |
5448 | } |
5449 | indices.new_vector (sel, 2, nelt); |
5450 | if (!can_vec_perm_const_p (mode, mode, indices)) |
5451 | { |
5452 | if (dump_enabled_p ()) |
5453 | dump_printf (MSG_MISSED_OPTIMIZATION, |
5454 | "permutation op not supported by target.\n" ); |
5455 | return false; |
5456 | } |
5457 | } |
5458 | return true; |
5459 | } |
5460 | else |
5461 | { |
5462 | /* If length is not equal to 3 then only power of 2 is supported. */ |
5463 | gcc_assert (pow2p_hwi (count)); |
5464 | poly_uint64 nelt = GET_MODE_NUNITS (mode); |
5465 | |
5466 | /* The encoding has 2 interleaved stepped patterns. */ |
5467 | if(!multiple_p (a: nelt, b: 2)) |
5468 | return false; |
5469 | vec_perm_builder sel (nelt, 2, 3); |
5470 | sel.quick_grow (len: 6); |
5471 | for (i = 0; i < 3; i++) |
5472 | { |
5473 | sel[i * 2] = i; |
5474 | sel[i * 2 + 1] = i + nelt; |
5475 | } |
5476 | vec_perm_indices indices (sel, 2, nelt); |
5477 | if (can_vec_perm_const_p (mode, mode, indices)) |
5478 | { |
5479 | for (i = 0; i < 6; i++) |
5480 | sel[i] += exact_div (a: nelt, b: 2); |
5481 | indices.new_vector (sel, 2, nelt); |
5482 | if (can_vec_perm_const_p (mode, mode, indices)) |
5483 | return true; |
5484 | } |
5485 | } |
5486 | } |
5487 | |
5488 | if (dump_enabled_p ()) |
5489 | dump_printf (MSG_MISSED_OPTIMIZATION, |
5490 | "permutation op not supported by target.\n" ); |
5491 | return false; |
5492 | } |
5493 | |
5494 | /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors |
5495 | of type VECTYPE. MASKED_P says whether the masked form is needed. */ |
5496 | |
5497 | internal_fn |
5498 | vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, |
5499 | bool masked_p) |
5500 | { |
5501 | if (vect_lanes_optab_supported_p (name: "vec_mask_len_store_lanes" , |
5502 | optab: vec_mask_len_store_lanes_optab, vectype, |
5503 | count)) |
5504 | return IFN_MASK_LEN_STORE_LANES; |
5505 | else if (masked_p) |
5506 | { |
5507 | if (vect_lanes_optab_supported_p (name: "vec_mask_store_lanes" , |
5508 | optab: vec_mask_store_lanes_optab, vectype, |
5509 | count)) |
5510 | return IFN_MASK_STORE_LANES; |
5511 | } |
5512 | else |
5513 | { |
5514 | if (vect_lanes_optab_supported_p (name: "vec_store_lanes" , |
5515 | optab: vec_store_lanes_optab, vectype, count)) |
5516 | return IFN_STORE_LANES; |
5517 | } |
5518 | return IFN_LAST; |
5519 | } |
5520 | |
5521 | |
5522 | /* Function vect_permute_store_chain. |
5523 | |
5524 | Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be |
5525 | a power of 2 or equal to 3, generate interleave_high/low stmts to reorder |
5526 | the data correctly for the stores. Return the final references for stores |
5527 | in RESULT_CHAIN. |
5528 | |
5529 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. |
5530 | The input is 4 vectors each containing 8 elements. We assign a number to |
5531 | each element, the input sequence is: |
5532 | |
5533 | 1st vec: 0 1 2 3 4 5 6 7 |
5534 | 2nd vec: 8 9 10 11 12 13 14 15 |
5535 | 3rd vec: 16 17 18 19 20 21 22 23 |
5536 | 4th vec: 24 25 26 27 28 29 30 31 |
5537 | |
5538 | The output sequence should be: |
5539 | |
5540 | 1st vec: 0 8 16 24 1 9 17 25 |
5541 | 2nd vec: 2 10 18 26 3 11 19 27 |
   3rd vec:   4 12 20 28  5 13 21 29
5543 | 4th vec: 6 14 22 30 7 15 23 31 |
5544 | |
5545 | i.e., we interleave the contents of the four vectors in their order. |
5546 | |
5547 | We use interleave_high/low instructions to create such output. The input of |
5548 | each interleave_high/low operation is two vectors: |
5549 | 1st vec 2nd vec |
5550 | 0 1 2 3 4 5 6 7 |
5551 | the even elements of the result vector are obtained left-to-right from the |
5552 | high/low elements of the first vector. The odd elements of the result are |
5553 | obtained left-to-right from the high/low elements of the second vector. |
5554 | The output of interleave_high will be: 0 4 1 5 |
5555 | and of interleave_low: 2 6 3 7 |
5556 | |
5557 | |
5558 | The permutation is done in log LENGTH stages. In each stage interleave_high |
5559 | and interleave_low stmts are created for each pair of vectors in DR_CHAIN, |
5560 | where the first argument is taken from the first half of DR_CHAIN and the |
   second argument from its second half.
5562 | In our example, |
5563 | |
5564 | I1: interleave_high (1st vec, 3rd vec) |
5565 | I2: interleave_low (1st vec, 3rd vec) |
5566 | I3: interleave_high (2nd vec, 4th vec) |
5567 | I4: interleave_low (2nd vec, 4th vec) |
5568 | |
5569 | The output for the first stage is: |
5570 | |
5571 | I1: 0 16 1 17 2 18 3 19 |
5572 | I2: 4 20 5 21 6 22 7 23 |
5573 | I3: 8 24 9 25 10 26 11 27 |
5574 | I4: 12 28 13 29 14 30 15 31 |
5575 | |
5576 | The output of the second stage, i.e. the final result is: |
5577 | |
5578 | I1: 0 8 16 24 1 9 17 25 |
5579 | I2: 2 10 18 26 3 11 19 27 |
   I3:  4 12 20 28  5 13 21 29
5581 | I4: 6 14 22 30 7 15 23 31. */ |
5582 | |
void
vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
			  unsigned int length,
			  stmt_vec_info stmt_info,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  unsigned int i, j, n, log_length = exact_log2 (x: length);

  /* Start with the result chain mirroring the input chain; the
     power-of-two case below then permutes it in place over
     LOG_LENGTH stages.  */
  result_chain->quick_grow (len: length);
  memcpy (dest: result_chain->address (), src: dr_chain.address (),
	  n: length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_store_supported ensures that this is constant.  */
      unsigned int nelt = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant ();
      /* J0/J1/J2 track how many elements have been selected so far
	 from each of the three input vectors; they carry over between
	 the three output vectors.  */
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (len: nelt);
      vec_perm_indices indices;
      for (j = 0; j < 3; j++)
	{
	  /* Rotation offsets selecting which of the 3 interleave slots
	     each of the input vectors feeds for output vector J.  */
	  int nelt0 = ((3 - j) * nelt) % 3;
	  int nelt1 = ((3 - j) * nelt + 1) % 3;
	  int nelt2 = ((3 - j) * nelt + 2) % 3;

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = j0++;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = nelt + j1++;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = 0;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = 3 * i + nelt0;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = 3 * i + nelt1;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = nelt + j2++;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  vect1 = dr_chain[0];
	  vect2 = dr_chain[1];

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {j, nelt, *, j + 1, nelt + j + 1, *,
				   j + 2, nelt + j + 2, *, ...}>  */
	  data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_low" );
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_low);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);

	  /* Merge the third input vector into the intermediate result.  */
	  vect1 = data_ref;
	  vect2 = dr_chain[2];
	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
				   6, 7, nelt + j + 2, ...}>  */
	  data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_high" );
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_high);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	  (*result_chain)[j] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The interleave-high/low masks are the same in every stage,
	 so build them once up front.  */
      /* The encoding has 2 interleaved stepped patterns.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (node: vectype);
      vec_perm_builder sel (nelt, 2, 3);
      sel.quick_grow (len: 6);
      for (i = 0; i < 3; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
	vec_perm_indices indices (sel, 2, nelt);
	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	for (i = 0; i < 6; i++)
	  sel[i] += exact_div (a: nelt, b: 2);
	indices.new_vector (sel, 2, nelt);
	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	for (i = 0, n = log_length; i < n; i++)
	  {
	    for (j = 0; j < length/2; j++)
	      {
		vect1 = dr_chain[j];
		vect2 = dr_chain[j+length/2];

		/* Create interleaving stmt:
		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
							...}>  */
		high = make_temp_ssa_name (type: vectype, NULL, name: "vect_inter_high" );
		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
						 vect2, perm_mask_high);
		vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
		(*result_chain)[2*j] = high;

		/* Create interleaving stmt:
		   low = VEC_PERM_EXPR <vect1, vect2,
					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
					 ...}>  */
		low = make_temp_ssa_name (type: vectype, NULL, name: "vect_inter_low" );
		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
						 vect2, perm_mask_low);
		vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
		(*result_chain)[2*j+1] = low;
	      }
	    /* This stage's outputs become the next stage's inputs.  */
	    memcpy (dest: dr_chain.address (), src: result_chain->address (),
		    n: length * sizeof (tree));
	  }
    }
}
5719 | |
5720 | /* Function vect_setup_realignment |
5721 | |
5722 | This function is called when vectorizing an unaligned load using |
5723 | the dr_explicit_realign[_optimized] scheme. |
5724 | This function generates the following code at the loop prolog: |
5725 | |
5726 | p = initial_addr; |
5727 | x msq_init = *(floor(p)); # prolog load |
5728 | realignment_token = call target_builtin; |
5729 | loop: |
5730 | x msq = phi (msq_init, ---) |
5731 | |
5732 | The stmts marked with x are generated only for the case of |
5733 | dr_explicit_realign_optimized. |
5734 | |
5735 | The code above sets up a new (vector) pointer, pointing to the first |
5736 | location accessed by STMT_INFO, and a "floor-aligned" load using that |
5737 | pointer. It also generates code to compute the "realignment-token" |
5738 | (if the relevant target hook was defined), and creates a phi-node at the |
5739 | loop-header bb whose arguments are the result of the prolog-load (created |
5740 | by this function) and the result of a load that takes place in the loop |
5741 | (to be created by the caller to this function). |
5742 | |
5743 | For the case of dr_explicit_realign_optimized: |
5744 | The caller to this function uses the phi-result (msq) to create the |
5745 | realignment code inside the loop, and sets up the missing phi argument, |
5746 | as follows: |
5747 | loop: |
5748 | msq = phi (msq_init, lsq) |
5749 | lsq = *(floor(p')); # load in loop |
5750 | result = realign_load (msq, lsq, realignment_token); |
5751 | |
5752 | For the case of dr_explicit_realign: |
5753 | loop: |
5754 | msq = *(floor(p)); # load in loop |
5755 | p' = p + (VS-1); |
5756 | lsq = *(floor(p')); # load in loop |
5757 | result = realign_load (msq, lsq, realignment_token); |
5758 | |
5759 | Input: |
5760 | STMT_INFO - (scalar) load stmt to be vectorized. This load accesses |
5761 | a memory location that may be unaligned. |
5762 | BSI - place where new code is to be inserted. |
5763 | ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes |
5764 | is used. |
5765 | |
5766 | Output: |
5767 | REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load |
5768 | target hook, if defined. |
5769 | Return value - the result of the loop-header phi node. */ |
5770 | |
tree
vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
			gimple_stmt_iterator *gsi, tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			class loop **at_loop)
{
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  class loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (gs: stmt_info->stmt);
  tree vec_dest;
  gimple *inc;
  tree ptr;
  tree data_ref;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gphi *phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  class loop *containing_loop = (gimple_bb (g: stmt_info->stmt))->loop_father;
  class loop *loop_for_initial_load = NULL;

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
    }

  /* This helper only implements the two explicit-realignment schemes.  */
  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }


  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (t1: outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  /* Determine the virtual operand for the preheader load: prefer the
     preheader argument of the loop-header virtual PHI if there is one,
     otherwise fall back to the VUSE of the statement at the insertion
     point.  */
  tree vuse = NULL_TREE;
  if (loop_for_initial_load)
    {
      pe = loop_preheader_edge (loop_for_initial_load);
      if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
	vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
    }
  if (!vuse)
    vuse = gimple_vuse (g: gsi_stmt (i: *gsi));

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader.  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (vinfo, stmt_info, aggr_type: vectype,
				      at_loop: loop_for_initial_load, NULL_TREE,
				      initial_address: &init_addr, NULL, ptr_incr: &inc, only_init: true);
      if (TREE_CODE (ptr) == SSA_NAME)
	new_temp = copy_ssa_name (var: ptr);
      else
	new_temp = make_ssa_name (TREE_TYPE (ptr));
      poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
      tree type = TREE_TYPE (ptr);
      /* Floor-align the address: new_temp = ptr & -align (0 - align is
	 the all-ones mask above the alignment bits).  */
      new_stmt = gimple_build_assign
		   (new_temp, BIT_AND_EXPR, ptr,
		    fold_build2 (MINUS_EXPR, type,
				 build_int_cst (type, 0),
				 build_int_cst (type, align)));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      vect_copy_ref_info (dest: data_ref, DR_REF (dr));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (var: vec_dest, stmt: new_stmt);
      gimple_assign_set_lhs (gs: new_stmt, lhs: new_temp);
      gimple_set_vuse (g: new_stmt, vuse);
      /* Insert on the preheader edge when we have one, otherwise just
	 before the load at GSI.  */
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (gs: new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (vinfo,
							    stmt_info, new_stmt_list: &stmts,
							    NULL_TREE);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     vectype: gimple_call_return_type (gs: new_stmt));
      new_temp = make_ssa_name (var: vec_dest, stmt: new_stmt);
      gimple_call_set_lhs (gs: new_stmt, lhs: new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (gs: new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
	 the value of the parameter and no global variables are touched
	 which makes the builtin a "const" function.  Requiring the
	 builtin to have the "const" attribute makes it unnecessary
	 to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  /* For dr_explicit_realign everything else happens in the loop body;
     the caller only needed the realignment token computed above.  */
  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);


  /* 5. Create msq = phi <msq_init, lsq> in loop.  The loop-latch argument
     (lsq) is filled in later by the caller.  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (var: vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
5999 | |
6000 | |
6001 | /* Function vect_grouped_load_supported. |
6002 | |
6003 | COUNT is the size of the load group (the number of statements plus the |
6004 | number of gaps). SINGLE_ELEMENT_P is true if there is actually |
6005 | only one statement, with a gap of COUNT - 1. |
6006 | |
6007 | Returns true if a suitable permute exists. */ |
6008 | |
6009 | bool |
6010 | vect_grouped_load_supported (tree vectype, bool single_element_p, |
6011 | unsigned HOST_WIDE_INT count) |
6012 | { |
6013 | machine_mode mode = TYPE_MODE (vectype); |
6014 | |
6015 | /* If this is single-element interleaving with an element distance |
6016 | that leaves unused vector loads around punt - we at least create |
6017 | very sub-optimal code in that case (and blow up memory, |
6018 | see PR65518). */ |
6019 | if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) |
6020 | { |
6021 | if (dump_enabled_p ()) |
6022 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6023 | "single-element interleaving not supported " |
6024 | "for not adjacent vector loads\n" ); |
6025 | return false; |
6026 | } |
6027 | |
6028 | /* vect_permute_load_chain requires the group size to be equal to 3 or |
6029 | be a power of two. */ |
6030 | if (count != 3 && exact_log2 (x: count) == -1) |
6031 | { |
6032 | if (dump_enabled_p ()) |
6033 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6034 | "the size of the group of accesses" |
6035 | " is not a power of 2 or not equal to 3\n" ); |
6036 | return false; |
6037 | } |
6038 | |
6039 | /* Check that the permutation is supported. */ |
6040 | if (VECTOR_MODE_P (mode)) |
6041 | { |
6042 | unsigned int i, j; |
6043 | if (count == 3) |
6044 | { |
6045 | unsigned int nelt; |
6046 | if (!GET_MODE_NUNITS (mode).is_constant (const_value: &nelt)) |
6047 | { |
6048 | if (dump_enabled_p ()) |
6049 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6050 | "cannot handle groups of 3 loads for" |
6051 | " variable-length vectors\n" ); |
6052 | return false; |
6053 | } |
6054 | |
6055 | vec_perm_builder sel (nelt, nelt, 1); |
6056 | sel.quick_grow (len: nelt); |
6057 | vec_perm_indices indices; |
6058 | unsigned int k; |
6059 | for (k = 0; k < 3; k++) |
6060 | { |
6061 | for (i = 0; i < nelt; i++) |
6062 | if (3 * i + k < 2 * nelt) |
6063 | sel[i] = 3 * i + k; |
6064 | else |
6065 | sel[i] = 0; |
6066 | indices.new_vector (sel, 2, nelt); |
6067 | if (!can_vec_perm_const_p (mode, mode, indices)) |
6068 | { |
6069 | if (dump_enabled_p ()) |
6070 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6071 | "shuffle of 3 loads is not supported by" |
6072 | " target\n" ); |
6073 | return false; |
6074 | } |
6075 | for (i = 0, j = 0; i < nelt; i++) |
6076 | if (3 * i + k < 2 * nelt) |
6077 | sel[i] = i; |
6078 | else |
6079 | sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); |
6080 | indices.new_vector (sel, 2, nelt); |
6081 | if (!can_vec_perm_const_p (mode, mode, indices)) |
6082 | { |
6083 | if (dump_enabled_p ()) |
6084 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6085 | "shuffle of 3 loads is not supported by" |
6086 | " target\n" ); |
6087 | return false; |
6088 | } |
6089 | } |
6090 | return true; |
6091 | } |
6092 | else |
6093 | { |
6094 | /* If length is not equal to 3 then only power of 2 is supported. */ |
6095 | gcc_assert (pow2p_hwi (count)); |
6096 | poly_uint64 nelt = GET_MODE_NUNITS (mode); |
6097 | |
6098 | /* The encoding has a single stepped pattern. */ |
6099 | vec_perm_builder sel (nelt, 1, 3); |
6100 | sel.quick_grow (len: 3); |
6101 | for (i = 0; i < 3; i++) |
6102 | sel[i] = i * 2; |
6103 | vec_perm_indices indices (sel, 2, nelt); |
6104 | if (can_vec_perm_const_p (mode, mode, indices)) |
6105 | { |
6106 | for (i = 0; i < 3; i++) |
6107 | sel[i] = i * 2 + 1; |
6108 | indices.new_vector (sel, 2, nelt); |
6109 | if (can_vec_perm_const_p (mode, mode, indices)) |
6110 | return true; |
6111 | } |
6112 | } |
6113 | } |
6114 | |
6115 | if (dump_enabled_p ()) |
6116 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6117 | "extract even/odd not supported by target\n" ); |
6118 | return false; |
6119 | } |
6120 | |
6121 | /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors |
6122 | of type VECTYPE. MASKED_P says whether the masked form is needed. */ |
6123 | |
6124 | internal_fn |
6125 | vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, |
6126 | bool masked_p) |
6127 | { |
6128 | if (vect_lanes_optab_supported_p (name: "vec_mask_len_load_lanes" , |
6129 | optab: vec_mask_len_load_lanes_optab, vectype, |
6130 | count)) |
6131 | return IFN_MASK_LEN_LOAD_LANES; |
6132 | else if (masked_p) |
6133 | { |
6134 | if (vect_lanes_optab_supported_p (name: "vec_mask_load_lanes" , |
6135 | optab: vec_mask_load_lanes_optab, vectype, |
6136 | count)) |
6137 | return IFN_MASK_LOAD_LANES; |
6138 | } |
6139 | else |
6140 | { |
6141 | if (vect_lanes_optab_supported_p (name: "vec_load_lanes" , optab: vec_load_lanes_optab, |
6142 | vectype, count)) |
6143 | return IFN_LOAD_LANES; |
6144 | } |
6145 | return IFN_LAST; |
6146 | } |
6147 | |
6148 | /* Function vect_permute_load_chain. |
6149 | |
6150 | Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be |
6151 | a power of 2 or equal to 3, generate extract_even/odd stmts to reorder |
6152 | the input data correctly. Return the final references for loads in |
6153 | RESULT_CHAIN. |
6154 | |
6155 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. |
6156 | The input is 4 vectors each containing 8 elements. We assign a number to each |
6157 | element, the input sequence is: |
6158 | |
6159 | 1st vec: 0 1 2 3 4 5 6 7 |
6160 | 2nd vec: 8 9 10 11 12 13 14 15 |
6161 | 3rd vec: 16 17 18 19 20 21 22 23 |
6162 | 4th vec: 24 25 26 27 28 29 30 31 |
6163 | |
6164 | The output sequence should be: |
6165 | |
6166 | 1st vec: 0 4 8 12 16 20 24 28 |
6167 | 2nd vec: 1 5 9 13 17 21 25 29 |
6168 | 3rd vec: 2 6 10 14 18 22 26 30 |
6169 | 4th vec: 3 7 11 15 19 23 27 31 |
6170 | |
6171 | i.e., the first output vector should contain the first elements of each |
6172 | interleaving group, etc. |
6173 | |
6174 | We use extract_even/odd instructions to create such output. The input of |
6175 | each extract_even/odd operation is two vectors |
6176 | 1st vec 2nd vec |
6177 | 0 1 2 3 4 5 6 7 |
6178 | |
6179 | and the output is the vector of extracted even/odd elements. The output of |
6180 | extract_even will be: 0 2 4 6 |
6181 | and of extract_odd: 1 3 5 7 |
6182 | |
6183 | |
6184 | The permutation is done in log LENGTH stages. In each stage extract_even |
6185 | and extract_odd stmts are created for each pair of vectors in DR_CHAIN in |
6186 | their order. In our example, |
6187 | |
6188 | E1: extract_even (1st vec, 2nd vec) |
6189 | E2: extract_odd (1st vec, 2nd vec) |
6190 | E3: extract_even (3rd vec, 4th vec) |
6191 | E4: extract_odd (3rd vec, 4th vec) |
6192 | |
6193 | The output for the first stage will be: |
6194 | |
6195 | E1: 0 2 4 6 8 10 12 14 |
6196 | E2: 1 3 5 7 9 11 13 15 |
6197 | E3: 16 18 20 22 24 26 28 30 |
6198 | E4: 17 19 21 23 25 27 29 31 |
6199 | |
6200 | In order to proceed and create the correct sequence for the next stage (or |
6201 | for the correct output, if the second stage is the last one, as in our |
6202 | example), we first put the output of extract_even operation and then the |
6203 | output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). |
6204 | The input for the second stage is: |
6205 | |
6206 | 1st vec (E1): 0 2 4 6 8 10 12 14 |
6207 | 2nd vec (E3): 16 18 20 22 24 26 28 30 |
6208 | 3rd vec (E2): 1 3 5 7 9 11 13 15 |
6209 | 4th vec (E4): 17 19 21 23 25 27 29 31 |
6210 | |
6211 | The output of the second stage: |
6212 | |
6213 | E1: 0 4 8 12 16 20 24 28 |
6214 | E2: 2 6 10 14 18 22 26 30 |
6215 | E3: 1 5 9 13 17 21 25 29 |
6216 | E4: 3 7 11 15 19 23 27 31 |
6217 | |
6218 | And RESULT_CHAIN after reordering: |
6219 | |
6220 | 1st vec (E1): 0 4 8 12 16 20 24 28 |
6221 | 2nd vec (E3): 1 5 9 13 17 21 25 29 |
6222 | 3rd vec (E2): 2 6 10 14 18 22 26 30 |
6223 | 4th vec (E4): 3 7 11 15 19 23 27 31. */ |
6224 | |
static void
vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
			 unsigned int length,
			 stmt_vec_info stmt_info,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int i, j, log_length = exact_log2 (x: length);

  /* Seed RESULT_CHAIN with the input vectors; the stages below overwrite
     it in place.  */
  result_chain->quick_grow (len: length);
  memcpy (dest: result_chain->address (), src: dr_chain.address (),
	  n: length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.  */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (len: nelt);
      vec_perm_indices indices;
      /* Build output vector K (K = 0, 1, 2) out of elements K, 3+K,
	 6+K, ... using two successive permutes.  */
      for (k = 0; k < 3; k++)
	{
	  /* Mask gathering elements K, 3+K, ... that live in the first
	     two input vectors; slots beyond them are don't-care (0).  */
	  for (i = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = 3 * i + k;
	    else
	      sel[i] = 0;
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  /* Mask that keeps the elements gathered above and fills the
	     remaining slots from the third input vector.  */
	  for (i = 0, j = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = i;
	    else
	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  first_vect = dr_chain[0];
	  second_vect = dr_chain[1];

	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
								 ...}>  */
	  data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_low" );
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_low);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);

	  /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
								  ...}>  */
	  first_vect = data_ref;
	  second_vect = dr_chain[2];
	  data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3_high" );
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_high);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	  (*result_chain)[k] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (node: vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (len: 3);
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 3; ++i)
	sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      /* log2(LENGTH) stages; each stage halves the interleaving stride
	 by splitting every vector pair into its even and odd elements.  */
      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_perm_even" );
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_perm_odd" );
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  /* Feed this stage's output back in as the next stage's input.  */
	  memcpy (dest: dr_chain.address (), src: result_chain->address (),
		  n: length * sizeof (tree));
	}
    }
}
6340 | |
6341 | /* Function vect_shift_permute_load_chain. |
6342 | |
6343 | Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate |
6344 | sequence of stmts to reorder the input data accordingly. |
6345 | Return the final references for loads in RESULT_CHAIN. |
   Return true if successful, false otherwise.
6347 | |
6348 | E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. |
6349 | The input is 3 vectors each containing 8 elements. We assign a |
6350 | number to each element, the input sequence is: |
6351 | |
6352 | 1st vec: 0 1 2 3 4 5 6 7 |
6353 | 2nd vec: 8 9 10 11 12 13 14 15 |
6354 | 3rd vec: 16 17 18 19 20 21 22 23 |
6355 | |
6356 | The output sequence should be: |
6357 | |
6358 | 1st vec: 0 3 6 9 12 15 18 21 |
6359 | 2nd vec: 1 4 7 10 13 16 19 22 |
6360 | 3rd vec: 2 5 8 11 14 17 20 23 |
6361 | |
6362 | We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. |
6363 | |
6364 | First we shuffle all 3 vectors to get correct elements order: |
6365 | |
6366 | 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) |
6367 | 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) |
6368 | 3rd vec: (16 19 22) (17 20 23) (18 21) |
6369 | |
6370 | Next we unite and shift vector 3 times: |
6371 | |
6372 | 1st step: |
6373 | shift right by 6 the concatenation of: |
6374 | "1st vec" and "2nd vec" |
6375 | ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) |
6376 | "2nd vec" and "3rd vec" |
6377 | ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) |
6378 | "3rd vec" and "1st vec" |
6379 | (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) |
6380 | | New vectors | |
6381 | |
6382 | So that now new vectors are: |
6383 | |
6384 | 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) |
6385 | 2nd vec: (10 13) (16 19 22) (17 20 23) |
6386 | 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) |
6387 | |
6388 | 2nd step: |
6389 | shift right by 5 the concatenation of: |
6390 | "1st vec" and "3rd vec" |
6391 | ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) |
6392 | "2nd vec" and "1st vec" |
6393 | (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) |
6394 | "3rd vec" and "2nd vec" |
6395 | (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) |
6396 | | New vectors | |
6397 | |
6398 | So that now new vectors are: |
6399 | |
6400 | 1st vec: ( 9 12 15) (18 21) ( 0 3 6) |
6401 | 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) |
6402 | 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY |
6403 | |
6404 | 3rd step: |
6405 | shift right by 5 the concatenation of: |
6406 | "1st vec" and "1st vec" |
6407 | ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) |
6408 | shift right by 3 the concatenation of: |
6409 | "2nd vec" and "2nd vec" |
6410 | (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) |
6411 | | New vectors | |
6412 | |
6413 | So that now all vectors are READY: |
6414 | 1st vec: ( 0 3 6) ( 9 12 15) (18 21) |
6415 | 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) |
6416 | 3rd vec: ( 1 4 7) (10 13) (16 19 22) |
6417 | |
6418 | This algorithm is faster than one in vect_permute_load_chain if: |
   1. "shift of a concatenation" is faster than general permutation.
6420 | This is usually so. |
6421 | 2. The TARGET machine can't execute vector instructions in parallel. |
6422 | This is because each step of the algorithm depends on previous. |
6423 | The algorithm in vect_permute_load_chain is much more parallel. |
6424 | |
6425 | The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. |
6426 | */ |
6427 | |
6428 | static bool |
6429 | vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, |
6430 | unsigned int length, |
6431 | stmt_vec_info stmt_info, |
6432 | gimple_stmt_iterator *gsi, |
6433 | vec<tree> *result_chain) |
6434 | { |
6435 | tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; |
6436 | tree perm2_mask1, perm2_mask2, perm3_mask; |
6437 | tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; |
6438 | gimple *perm_stmt; |
6439 | |
6440 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
6441 | machine_mode vmode = TYPE_MODE (vectype); |
6442 | unsigned int i; |
6443 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo); |
6444 | |
6445 | unsigned HOST_WIDE_INT nelt, vf; |
6446 | if (!TYPE_VECTOR_SUBPARTS (node: vectype).is_constant (const_value: &nelt) |
6447 | || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (const_value: &vf)) |
6448 | /* Not supported for variable-length vectors. */ |
6449 | return false; |
6450 | |
6451 | vec_perm_builder sel (nelt, nelt, 1); |
6452 | sel.quick_grow (len: nelt); |
6453 | |
6454 | result_chain->quick_grow (len: length); |
6455 | memcpy (dest: result_chain->address (), src: dr_chain.address (), |
6456 | n: length * sizeof (tree)); |
6457 | |
6458 | if (pow2p_hwi (x: length) && vf > 4) |
6459 | { |
6460 | unsigned int j, log_length = exact_log2 (x: length); |
6461 | for (i = 0; i < nelt / 2; ++i) |
6462 | sel[i] = i * 2; |
6463 | for (i = 0; i < nelt / 2; ++i) |
6464 | sel[nelt / 2 + i] = i * 2 + 1; |
6465 | vec_perm_indices indices (sel, 2, nelt); |
6466 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6467 | { |
6468 | if (dump_enabled_p ()) |
6469 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6470 | "shuffle of 2 fields structure is not \ |
6471 | supported by target\n" ); |
6472 | return false; |
6473 | } |
6474 | perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); |
6475 | |
6476 | for (i = 0; i < nelt / 2; ++i) |
6477 | sel[i] = i * 2 + 1; |
6478 | for (i = 0; i < nelt / 2; ++i) |
6479 | sel[nelt / 2 + i] = i * 2; |
6480 | indices.new_vector (sel, 2, nelt); |
6481 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6482 | { |
6483 | if (dump_enabled_p ()) |
6484 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6485 | "shuffle of 2 fields structure is not \ |
6486 | supported by target\n" ); |
6487 | return false; |
6488 | } |
6489 | perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); |
6490 | |
6491 | /* Generating permutation constant to shift all elements. |
6492 | For vector length 8 it is {4 5 6 7 8 9 10 11}. */ |
6493 | for (i = 0; i < nelt; i++) |
6494 | sel[i] = nelt / 2 + i; |
6495 | indices.new_vector (sel, 2, nelt); |
6496 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6497 | { |
6498 | if (dump_enabled_p ()) |
6499 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6500 | "shift permutation is not supported by target\n" ); |
6501 | return false; |
6502 | } |
6503 | shift1_mask = vect_gen_perm_mask_checked (vectype, indices); |
6504 | |
6505 | /* Generating permutation constant to select vector from 2. |
6506 | For vector length 8 it is {0 1 2 3 12 13 14 15}. */ |
6507 | for (i = 0; i < nelt / 2; i++) |
6508 | sel[i] = i; |
6509 | for (i = nelt / 2; i < nelt; i++) |
6510 | sel[i] = nelt + i; |
6511 | indices.new_vector (sel, 2, nelt); |
6512 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6513 | { |
6514 | if (dump_enabled_p ()) |
6515 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6516 | "select is not supported by target\n" ); |
6517 | return false; |
6518 | } |
6519 | select_mask = vect_gen_perm_mask_checked (vectype, indices); |
6520 | |
6521 | for (i = 0; i < log_length; i++) |
6522 | { |
6523 | for (j = 0; j < length; j += 2) |
6524 | { |
6525 | first_vect = dr_chain[j]; |
6526 | second_vect = dr_chain[j + 1]; |
6527 | |
6528 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle2" ); |
6529 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6530 | first_vect, first_vect, |
6531 | perm2_mask1); |
6532 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6533 | vect[0] = data_ref; |
6534 | |
6535 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle2" ); |
6536 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6537 | second_vect, second_vect, |
6538 | perm2_mask2); |
6539 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6540 | vect[1] = data_ref; |
6541 | |
6542 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift" ); |
6543 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6544 | vect[0], vect[1], shift1_mask); |
6545 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6546 | (*result_chain)[j/2 + length/2] = data_ref; |
6547 | |
6548 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_select" ); |
6549 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6550 | vect[0], vect[1], select_mask); |
6551 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6552 | (*result_chain)[j/2] = data_ref; |
6553 | } |
6554 | memcpy (dest: dr_chain.address (), src: result_chain->address (), |
6555 | n: length * sizeof (tree)); |
6556 | } |
6557 | return true; |
6558 | } |
6559 | if (length == 3 && vf > 2) |
6560 | { |
6561 | unsigned int k = 0, l = 0; |
6562 | |
6563 | /* Generating permutation constant to get all elements in rigth order. |
6564 | For vector length 8 it is {0 3 6 1 4 7 2 5}. */ |
6565 | for (i = 0; i < nelt; i++) |
6566 | { |
6567 | if (3 * k + (l % 3) >= nelt) |
6568 | { |
6569 | k = 0; |
6570 | l += (3 - (nelt % 3)); |
6571 | } |
6572 | sel[i] = 3 * k + (l % 3); |
6573 | k++; |
6574 | } |
6575 | vec_perm_indices indices (sel, 2, nelt); |
6576 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6577 | { |
6578 | if (dump_enabled_p ()) |
6579 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6580 | "shuffle of 3 fields structure is not \ |
6581 | supported by target\n" ); |
6582 | return false; |
6583 | } |
6584 | perm3_mask = vect_gen_perm_mask_checked (vectype, indices); |
6585 | |
6586 | /* Generating permutation constant to shift all elements. |
6587 | For vector length 8 it is {6 7 8 9 10 11 12 13}. */ |
6588 | for (i = 0; i < nelt; i++) |
6589 | sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; |
6590 | indices.new_vector (sel, 2, nelt); |
6591 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6592 | { |
6593 | if (dump_enabled_p ()) |
6594 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6595 | "shift permutation is not supported by target\n" ); |
6596 | return false; |
6597 | } |
6598 | shift1_mask = vect_gen_perm_mask_checked (vectype, indices); |
6599 | |
6600 | /* Generating permutation constant to shift all elements. |
6601 | For vector length 8 it is {5 6 7 8 9 10 11 12}. */ |
6602 | for (i = 0; i < nelt; i++) |
6603 | sel[i] = 2 * (nelt / 3) + 1 + i; |
6604 | indices.new_vector (sel, 2, nelt); |
6605 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6606 | { |
6607 | if (dump_enabled_p ()) |
6608 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6609 | "shift permutation is not supported by target\n" ); |
6610 | return false; |
6611 | } |
6612 | shift2_mask = vect_gen_perm_mask_checked (vectype, indices); |
6613 | |
6614 | /* Generating permutation constant to shift all elements. |
6615 | For vector length 8 it is {3 4 5 6 7 8 9 10}. */ |
6616 | for (i = 0; i < nelt; i++) |
6617 | sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; |
6618 | indices.new_vector (sel, 2, nelt); |
6619 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6620 | { |
6621 | if (dump_enabled_p ()) |
6622 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6623 | "shift permutation is not supported by target\n" ); |
6624 | return false; |
6625 | } |
6626 | shift3_mask = vect_gen_perm_mask_checked (vectype, indices); |
6627 | |
6628 | /* Generating permutation constant to shift all elements. |
6629 | For vector length 8 it is {5 6 7 8 9 10 11 12}. */ |
6630 | for (i = 0; i < nelt; i++) |
6631 | sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; |
6632 | indices.new_vector (sel, 2, nelt); |
6633 | if (!can_vec_perm_const_p (vmode, vmode, indices)) |
6634 | { |
6635 | if (dump_enabled_p ()) |
6636 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6637 | "shift permutation is not supported by target\n" ); |
6638 | return false; |
6639 | } |
6640 | shift4_mask = vect_gen_perm_mask_checked (vectype, indices); |
6641 | |
6642 | for (k = 0; k < 3; k++) |
6643 | { |
6644 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shuffle3" ); |
6645 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6646 | dr_chain[k], dr_chain[k], |
6647 | perm3_mask); |
6648 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6649 | vect[k] = data_ref; |
6650 | } |
6651 | |
6652 | for (k = 0; k < 3; k++) |
6653 | { |
6654 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift1" ); |
6655 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6656 | vect[k % 3], vect[(k + 1) % 3], |
6657 | shift1_mask); |
6658 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6659 | vect_shift[k] = data_ref; |
6660 | } |
6661 | |
6662 | for (k = 0; k < 3; k++) |
6663 | { |
6664 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift2" ); |
6665 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6666 | vect_shift[(4 - k) % 3], |
6667 | vect_shift[(3 - k) % 3], |
6668 | shift2_mask); |
6669 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6670 | vect[k] = data_ref; |
6671 | } |
6672 | |
6673 | (*result_chain)[3 - (nelt % 3)] = vect[2]; |
6674 | |
6675 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift3" ); |
6676 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], |
6677 | vect[0], shift3_mask); |
6678 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6679 | (*result_chain)[nelt % 3] = data_ref; |
6680 | |
6681 | data_ref = make_temp_ssa_name (type: vectype, NULL, name: "vect_shift4" ); |
6682 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], |
6683 | vect[1], shift4_mask); |
6684 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
6685 | (*result_chain)[0] = data_ref; |
6686 | return true; |
6687 | } |
6688 | return false; |
6689 | } |
6690 | |
6691 | /* Function vect_transform_grouped_load. |
6692 | |
6693 | Given a chain of input interleaved data-refs (in DR_CHAIN), build statements |
6694 | to perform their permutation and ascribe the result vectorized statements to |
6695 | the scalar statements. |
6696 | */ |
6697 | |
6698 | void |
6699 | vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info, |
6700 | vec<tree> dr_chain, |
6701 | int size, gimple_stmt_iterator *gsi) |
6702 | { |
6703 | machine_mode mode; |
6704 | vec<tree> result_chain = vNULL; |
6705 | |
6706 | /* DR_CHAIN contains input data-refs that are a part of the interleaving. |
6707 | RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted |
6708 | vectors, that are ready for vector computation. */ |
6709 | result_chain.create (nelems: size); |
6710 | |
6711 | /* If reassociation width for vector type is 2 or greater target machine can |
6712 | execute 2 or more vector instructions in parallel. Otherwise try to |
6713 | get chain for loads group using vect_shift_permute_load_chain. */ |
6714 | mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); |
6715 | if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 |
6716 | || pow2p_hwi (x: size) |
6717 | || !vect_shift_permute_load_chain (vinfo, dr_chain, length: size, stmt_info, |
6718 | gsi, result_chain: &result_chain)) |
6719 | vect_permute_load_chain (vinfo, dr_chain, |
6720 | length: size, stmt_info, gsi, result_chain: &result_chain); |
6721 | vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain); |
6722 | result_chain.release (); |
6723 | } |
6724 | |
6725 | /* RESULT_CHAIN contains the output of a group of grouped loads that were |
6726 | generated as part of the vectorization of STMT_INFO. Assign the statement |
6727 | for each vector to the associated scalar statement. */ |
6728 | |
void
vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
				  vec<tree> result_chain)
{
  /* The group leader; the chain of group members hangs off it via
     DR_GROUP_NEXT_ELEMENT.  */
  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  stmt_vec_info next_stmt_info = first_stmt_info;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      /* All group members have been assigned a vector statement;
	 any remaining results are not needed.  */
      if (!next_stmt_info)
	break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
	 code elimination pass later.  No need to check for the first stmt in
	 the group, since it always exists.
	 DR_GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
      if (next_stmt_info != first_stmt_info
	  && gap_count < DR_GROUP_GAP (next_stmt_info))
	{
	  gap_count++;
	  continue;
	}

      /* ??? The following needs cleanup after the removal of
	 DR_GROUP_SAME_DR_STMT.  NEXT_STMT_INFO is known non-null here
	 (checked at the top of the loop), so this test is redundant.  */
      if (next_stmt_info)
	{
	  gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
	     copies, and we put the new vector statement last.  */
	  STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (obj: new_stmt);

	  next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
	  gap_count = 1;
	}
    }
}
6774 | |
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */
6779 | |
6780 | bool |
6781 | vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment) |
6782 | { |
6783 | if (!VAR_P (decl)) |
6784 | return false; |
6785 | |
6786 | if (decl_in_symtab_p (decl) |
6787 | && !symtab_node::get (decl)->can_increase_alignment_p ()) |
6788 | return false; |
6789 | |
6790 | if (TREE_STATIC (decl)) |
6791 | return (known_le (alignment, |
6792 | (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT)); |
6793 | else |
6794 | return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT)); |
6795 | } |
6796 | |
/* Return whether the data reference DR_INFO is supported with respect to its
   alignment, when vectorized with vector type VECTYPE under the given
   MISALIGNMENT (DR_MISALIGNMENT_UNKNOWN when the misalignment is not
   known at compile time).  */
6802 | |
enum dr_alignment_support
vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
			       tree vectype, int misalignment)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo);
  class loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  /* A known misalignment of zero means the access is aligned.  */
  if (misalignment == 0)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (gcall *stmt = dyn_cast <gcall *> (p: stmt_info->stmt))
    if (gimple_call_internal_p (gs: stmt)
	&& (gimple_call_internal_fn (gs: stmt) == IFN_MASK_LOAD
	    || gimple_call_internal_fn (gs: stmt) == IFN_MASK_STORE))
      return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop: vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that is will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

  if (DR_IS_READ (dr))
    {
      /* Consider the explicit realignment scheme: it requires target support
	 for vec_realign_load and, when the hook is defined, a mask-for-load
	 builtin.  */
      if (optab_handler (op: vec_realign_load_optab, mode) != CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  /* If we are doing SLP then the accesses need not have the
	     same alignment, instead it depends on the SLP group size.
	     In that case neither realignment variant applies; fall
	     through to the misalignment query below.  */
	  if (loop_vinfo
	      && STMT_SLP_TYPE (stmt_info)
	      && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
		  || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
				  * (DR_GROUP_SIZE
				     (DR_GROUP_FIRST_ELEMENT (stmt_info))),
				  b: TYPE_VECTOR_SUBPARTS (node: vectype)))) 
	    ;
	  /* Outside a loop, or in a nested loop whose scalar step differs
	     from the vector size, the misalignment is not invariant, so
	     only the unoptimized explicit scheme is usable (see the big
	     comment above).  */
	  else if (!loop_vinfo
		   || (nested_in_vect_loop
		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
				    b: GET_MODE_SIZE (TYPE_MODE (vectype)))))
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
    }

  /* Finally ask the target whether a plain misaligned vector access of
     this mode is supported.  */
  bool is_packed = false;
  tree type = TREE_TYPE (DR_REF (dr));
  if (misalignment == DR_MISALIGNMENT_UNKNOWN)
    /* With unknown misalignment the reference may additionally be packed,
       i.e. not aligned to its own size.  */
    is_packed = not_size_aligned (DR_REF (dr));
  if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
						     is_packed))
    return dr_unaligned_supported;

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
6931 | |