/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2023 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#define INCLUDE_ALGORITHM
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"
#include "sreal.h"
#include "predict.h"

static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
					    load_permutation_t &,
					    const vec<tree> &,
					    gimple_stmt_iterator *,
					    poly_uint64, bool, bool,
					    unsigned *,
					    unsigned * = nullptr,
					    bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
					   slp_tree, lane_permutation_t &,
					   vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);

static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}


/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  SLP_TREE_SIMD_CLONE_INFO (this).release ();
  if (this->failed)
    free (failed);
}

/* Push the single SSA definition in DEF to the vector of vector defs.  */

void
_slp_tree::push_vec_def (gimple *def)
{
  if (gphi *phi = dyn_cast <gphi *> (def))
    vec_defs.quick_push (gimple_phi_result (phi));
  else
    {
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
    }
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}

/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}


/* Create an SLP node with room for NOPS children, performing
   operation CODE.  */

slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement: its vector def-type, its
     type, the operand itself if it is constant, whether it is a pattern
     stmt, and gather/scatter info.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
  bool first_gs_p;
  gather_scatter_info first_gs_info;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnd_info->first_gs_p = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Return the execution frequency of NODE (so that a higher value indicates
   a "more important" node when optimizing for speed).  */

static sreal
vect_slp_node_weight (slp_tree node)
{
  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
  basic_block bb = gimple_bb (stmt_info->stmt);
  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
}
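
/* An illustrative note (not in the original sources): with the scaling
   above, a returned weight of 8.0 means the representative stmt's block
   is expected to execute roughly eight times per function invocation.  */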

/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
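
/* Illustrative example (not from the original sources): for a group
   loading a[0], a[1] and a[3], and assuming the usual convention that
   DR_GROUP_GAP counts the distance from the previous group element
   (1 for a[1], 2 for a[3]), the places returned for the three stmts
   are 0, 1 and 3 respectively.  */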

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  poly_int64 half_nelts;
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode))
	      && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
			     2, &half_nelts))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);

	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      machine_mode vmode = TYPE_MODE (vector_type);
	      if (can_vec_perm_const_p (vmode, vmode, indices1)
		  && can_vec_perm_const_p (vmode, vmode, indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
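
/* Illustrative example (not from the original sources): for COUNT == 4
   "int" elements on a target with 128-bit vectors, the loop above looks
   for progressively wider integer types that can hold fused groups of
   elements; the two stepped masks select the interleave-low lanes
   (0, NELTS, 1, NELTS + 1, ...) and the interleave-high lanes
   (HALF_NELTS, HALF_NELTS + NELTS, ...) of the two input vectors.  */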

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}

static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
static const int off_map[] = { 1, -3 };
static const int off_op0_map[] = { 2, -3, 0 };
static const int off_arg2_map[] = { 2, -3, 2 };
static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
static const int mask_call_maps[6][7] = {
  { 1, 1, },
  { 2, 1, 2, },
  { 3, 1, 2, 3, },
  { 4, 1, 2, 3, 4, },
  { 5, 1, 2, 3, 4, 5, },
  { 6, 1, 2, 3, 4, 5, 6 },
};
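
/* An illustrative reading (not in the original sources) of the tables
   above: the first entry is the number of SLP children and the rest are
   gimple argument indices, so arg1_arg4_map == { 2, 1, 4 } means "two
   children, taken from call arguments 1 and 4" -- for
   IFN_MASK_GATHER_LOAD below these are the offset and the mask.  */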

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.
     The special index -3 is the offset of a gather as analyzed by
     vect_check_gather_scatter.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
		      unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
	return cond_expr_maps[swap];
      if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
	  && swap)
	return op1_op0_map;
      if (gather_scatter_p)
	return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
		? off_op0_map : off_map);
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_MASK_LOAD:
	    return gather_scatter_p ? off_arg2_map : arg2_map;

	  case IFN_GATHER_LOAD:
	    return arg1_map;

	  case IFN_MASK_GATHER_LOAD:
	  case IFN_MASK_LEN_GATHER_LOAD:
	    return arg1_arg4_map;

	  case IFN_MASK_STORE:
	    return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

	  case IFN_MASK_CALL:
	    {
	      unsigned nargs = gimple_call_num_args (call);
	      if (nargs >= 2 && nargs <= 7)
		return mask_call_maps[nargs-2];
	      else
		return nullptr;
	    }

	  default:
	    break;
	  }
    }
  return nullptr;
}

/* Return the SLP node child index for operand OP of STMT.  */

int
vect_slp_child_index_for_operand (const gimple *stmt, int op,
				  bool gather_scatter_p)
{
  const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
  if (!opmap)
    return op;
  for (int i = 1; i < 1 + opmap[0]; ++i)
    if (opmap[i] == op)
      return i - 1;
  gcc_unreachable ();
}
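
/* Illustrative usage (not from the original sources): for an
   IFN_MASK_LOAD call, whose operand map is arg2_map == { 1, 2 },
   vect_slp_child_index_for_operand (stmt, 2, false) returns 0, i.e.
   the mask argument is the single SLP child of the load node.  */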

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  gather_scatter_info gs_info;
  unsigned int gs_op = -1u;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map
    = vect_get_operand_map (stmt_info->stmt,
			    STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
	commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      oprnd_info = (*oprnds_info)[i];
      int opno = map ? map[i] : int (i);
      if (opno == -3)
	{
	  gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
	  if (!is_a <loop_vec_info> (vinfo)
	      || !vect_check_gather_scatter (stmt_info,
					     as_a <loop_vec_info> (vinfo),
					     first ? &oprnd_info->first_gs_info
					     : &gs_info))
	    return -1;

	  if (first)
	    {
	      oprnd_info->first_gs_p = true;
	      oprnd = oprnd_info->first_gs_info.offset;
	    }
	  else
	    {
	      gs_op = i;
	      oprnd = gs_info.offset;
	    }
	}
      else if (opno < 0)
	oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
	{
	  oprnd = gimple_arg (stmt_info->stmt, opno);
	  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	    {
	      edge e = gimple_phi_arg_edge (stmt, opno);
	      backedge = (is_a <bb_vec_info> (vinfo)
			  ? e->flags & EDGE_DFS_BACK
			  : dominated_by_p (CDI_DOMINATORS, e->src,
					    gimple_bb (stmt_info->stmt)));
	    }
	}
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
	  && is_pattern_stmt_p (def_stmt_info))
	{
	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
	      != def_stmt_info)
	    oprnd_info->any_pattern = true;
	  else
	    /* If we promote this to external use the original stmt def.  */
	    oprnd_info->ops.last ()
	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
	}

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && TREE_CODE (oprnd) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS,
			      as_a <bb_vec_info> (vinfo)->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && TREE_CODE (type) != BOOLEAN_TYPE
	      && !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						  type))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in an SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	    case vect_first_order_recurrence:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      if ((gs_op == i) != oprnd_info->first_gs_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: mixed gather and non-gather\n");
	  return 1;
	}
      else if (gs_op == i)
	{
	  if (!operand_equal_p (oprnd_info->first_gs_info.base,
				gs_info.base))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different gather base\n");
	      return 1;
	    }
	  if (oprnd_info->first_gs_info.scale != gs_info.scale)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different gather scale\n");
	      return 1;
	    }
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand matching.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For an SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
	if (mapi < nkept && map[mapi] == int (i))
	  mapi += 1;
	else if (!operand_equal_p (gimple_call_arg (call1, i),
				   gimple_call_arg (call2, i)))
	  return false;
    }

  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper rhs_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, ldst_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, that
	 can throw, or that have volatile operands.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though, but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE
	  && (!call_stmt
	      || !gimple_call_internal_p (stmt)
	      || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}
      /* Record nunits required but continue analysis, producing matches[]
	 as if nunits was not an issue.  This allows splitting of groups
	 to happen.  */
      if (nunits_vectype
	  && !vect_record_max_nunits (vinfo, stmt_info, group_size,
				      nunits_vectype, max_nunits))
	{
	  gcc_assert (is_a <bb_vec_info> (vinfo));
	  maybe_soft_fail = true;
	  soft_fail_nunits_vectype = nunits_vectype;
	}

      gcc_assert (vectype);

      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_MASK_LOAD
	      || cfn == CFN_GATHER_LOAD
	      || cfn == CFN_MASK_GATHER_LOAD
	      || cfn == CFN_MASK_LEN_GATHER_LOAD)
	    ldst_p = true;
	  else if (cfn == CFN_MASK_STORE)
	    {
	      ldst_p = true;
	      rhs_code = CFN_MASK_STORE;
	    }
	  else if ((cfn != CFN_LAST
		    && cfn != CFN_MASK_CALL
		    && internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 (gimple *) call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_ldst_p = ldst_p;
	  first_stmt_phi_p = phi_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!ldst_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  /* When the element types are not compatible we pun the
		     source to the target vectype which requires equal size.  */
		  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
		       || !types_compatible_p (TREE_TYPE (vectype),
					       TREE_TYPE (TREE_TYPE (vec))))
		      && !operand_equal_p (TYPE_SIZE (vectype),
					   TYPE_SIZE (TREE_TYPE (vec)))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(first_stmt_code.is_tree_code ()
		    && rhs_code.is_tree_code ()
		    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
			== tcc_comparison)
		    && (swap_tree_comparison (tree_code (first_stmt_code))
			== tree_code (rhs_code)))
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)
		    && (rhs_code == ARRAY_REF
			|| rhs_code == BIT_FIELD_REF
			|| rhs_code == INDIRECT_REF
			|| rhs_code == COMPONENT_REF
			|| rhs_code == MEM_REF)))
	      || (ldst_p
		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
	      || (ldst_p
		  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
		      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
	      || first_stmt_ldst_p != ldst_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!ldst_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt
	      && first_stmt_code != CFN_MASK_LOAD
	      && first_stmt_code != CFN_MASK_STORE)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       call_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  gcc_assert (ldst_p);
	  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	    {
	      /* Store.  */
	      gcc_assert (rhs_code == CFN_MASK_STORE
			  || REFERENCE_CLASS_P (lhs)
			  || DECL_P (lhs));
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}
      /* Non-grouped store or load.  */
      else if (ldst_p)
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD
	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
	      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      /* Not grouped loads are handled as externals for BB
		 vectorization.  For loop vectorization we can handle
		 splats the same way we handle single element interleaving.  */
	      && (is_a <bb_vec_info> (vinfo)
		  || stmt_info != first_stmt_info))
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      if (i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      /* Not memory operation.  */
      else
	{
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
	      && (swap_tree_comparison ((tree_code)first_stmt_code)
		  == (tree_code)rhs_code))
	    swap[i] = 1;
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
	      && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	     (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}
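
/* An illustrative note (not in the original sources) on the two-operator
   case accepted above: a group such as
     c[0] = a[0] + b[0];  c[1] = a[1] - b[1];
     c[2] = a[2] + b[2];  c[3] = a[3] - b[3];
   matches with first_stmt_code PLUS_EXPR and alt_stmt_code MINUS_EXPR;
   both operations are then code-generated on whole vectors and the
   desired lanes are blended with the permute checked for above.  */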

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};

/* Comparator for sorting associatable chains.  */

static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int)op1->dt - (int)op2->dt;
  return (int)op1->code - (int)op2->code;
}
1612 | |
1613 | /* Linearize the associatable expression chain at START with the |
1614 | associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR), |
1615 | filling CHAIN with the result and using WORKLIST as intermediate storage. |
1616 | CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE |
1617 | or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation |
1618 | stmts, starting with START. */ |
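
/* For example, with CODE PLUS_EXPR the scalar code
_1 = a + b;
_2 = _1 - c;
_3 = _2 + d;
starting at _3 linearizes to a CHAIN recording a, b and d with
PLUS_EXPR and c with MINUS_EXPR. */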
1619 | |
1620 | static void |
1621 | vect_slp_linearize_chain (vec_info *vinfo, |
1622 | vec<std::pair<tree_code, gimple *> > &worklist, |
1623 | vec<chain_op_t> &chain, |
1624 | enum tree_code code, gimple *start, |
1625 | gimple *&code_stmt, gimple *&alt_code_stmt, |
1626 | vec<gimple *> *chain_stmts) |
1627 | { |
1628 | /* For each lane linearize the addition/subtraction (or other |
1629 | uniform associatable operation) expression tree. */ |
worklist.safe_push (std::make_pair (code, start));
while (!worklist.is_empty ())
{
auto entry = worklist.pop ();
gassign *stmt = as_a <gassign *> (entry.second);
enum tree_code in_code = entry.first;
enum tree_code this_code = gimple_assign_rhs_code (stmt);
/* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
if (!code_stmt
&& gimple_assign_rhs_code (stmt) == code)
code_stmt = stmt;
else if (!alt_code_stmt
&& gimple_assign_rhs_code (stmt) == MINUS_EXPR)
alt_code_stmt = stmt;
if (chain_stmts)
chain_stmts->safe_push (stmt);
for (unsigned opnum = 1; opnum <= 2; ++opnum)
{
tree op = gimple_op (stmt, opnum);
vect_def_type dt;
stmt_vec_info def_stmt_info;
bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
gcc_assert (res);
if (dt == vect_internal_def
&& is_pattern_stmt_p (def_stmt_info))
op = gimple_get_lhs (def_stmt_info->stmt);
gimple *use_stmt;
use_operand_p use_p;
if (dt == vect_internal_def
&& single_imm_use (op, &use_p, &use_stmt)
&& is_gimple_assign (def_stmt_info->stmt)
&& (gimple_assign_rhs_code (def_stmt_info->stmt) == code
|| (code == PLUS_EXPR
&& (gimple_assign_rhs_code (def_stmt_info->stmt)
== MINUS_EXPR))))
{
tree_code op_def_code = this_code;
if (op_def_code == MINUS_EXPR && opnum == 1)
op_def_code = PLUS_EXPR;
if (in_code == MINUS_EXPR)
op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
worklist.safe_push (std::make_pair (op_def_code,
def_stmt_info->stmt));
}
else
{
tree_code op_def_code = this_code;
if (op_def_code == MINUS_EXPR && opnum == 1)
op_def_code = PLUS_EXPR;
if (in_code == MINUS_EXPR)
op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
chain.safe_push (chain_op_t (op_def_code, dt, op));
1682 | } |
1683 | } |
1684 | } |
1685 | } |
1686 | |
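/* Map from a set of scalar stmts to the SLP node discovered for it,
which may also be a node recorded as failed. */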
1687 | typedef hash_map <vec <stmt_vec_info>, slp_tree, |
1688 | simple_hashmap_traits <bst_traits, slp_tree> > |
1689 | scalar_stmts_to_slp_tree_map_t; |
1690 | |
1691 | static slp_tree |
1692 | vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, |
1693 | vec<stmt_vec_info> stmts, unsigned int group_size, |
1694 | poly_uint64 *max_nunits, |
1695 | bool *matches, unsigned *limit, unsigned *tree_size, |
1696 | scalar_stmts_to_slp_tree_map_t *bst_map); |
1697 | |
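/* Build the SLP node for the GROUP_SIZE scalar stmts STMTS, re-using or
recording the result in BST_MAP. On failure NULL is returned and
MATCHES reflects the lanes that failed to match; the failure is also
recorded in BST_MAP so further attempts on the same stmt set fail
quickly. *LIMIT is decremented for each discovery attempt started. */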
1698 | static slp_tree |
1699 | vect_build_slp_tree (vec_info *vinfo, |
1700 | vec<stmt_vec_info> stmts, unsigned int group_size, |
1701 | poly_uint64 *max_nunits, |
1702 | bool *matches, unsigned *limit, unsigned *tree_size, |
1703 | scalar_stmts_to_slp_tree_map_t *bst_map) |
1704 | { |
if (slp_tree *leader = bst_map->get (stmts))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
!(*leader)->failed ? "" : "failed ",
(void *) *leader);
if (!(*leader)->failed)
{
SLP_TREE_REF_COUNT (*leader)++;
vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
stmts.release ();
return *leader;
}
memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1719 | return NULL; |
1720 | } |
1721 | |
1722 | /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2 |
1723 | so we can pick up backedge destinations during discovery. */ |
1724 | slp_tree res = new _slp_tree; |
1725 | SLP_TREE_DEF_TYPE (res) = vect_internal_def; |
1726 | SLP_TREE_SCALAR_STMTS (res) = stmts; |
bst_map->put (stmts.copy (), res);
1728 | |
1729 | if (*limit == 0) |
1730 | { |
1731 | if (dump_enabled_p ()) |
1732 | dump_printf_loc (MSG_NOTE, vect_location, |
1733 | "SLP discovery limit exceeded\n" ); |
1734 | /* Mark the node invalid so we can detect those when still in use |
1735 | as backedge destinations. */ |
1736 | SLP_TREE_SCALAR_STMTS (res) = vNULL; |
1737 | SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def; |
1738 | res->failed = XNEWVEC (bool, group_size); |
memset (res->failed, 0, sizeof (bool) * group_size);
memset (matches, 0, sizeof (bool) * group_size);
1741 | return NULL; |
1742 | } |
1743 | --*limit; |
1744 | |
1745 | if (dump_enabled_p ()) |
1746 | dump_printf_loc (MSG_NOTE, vect_location, |
1747 | "starting SLP discovery for node %p\n" , (void *) res); |
1748 | |
1749 | poly_uint64 this_max_nunits = 1; |
slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
&this_max_nunits,
matches, limit, tree_size, bst_map);
1753 | if (!res_) |
1754 | { |
1755 | if (dump_enabled_p ()) |
1756 | dump_printf_loc (MSG_NOTE, vect_location, |
1757 | "SLP discovery for node %p failed\n" , (void *) res); |
1758 | /* Mark the node invalid so we can detect those when still in use |
1759 | as backedge destinations. */ |
1760 | SLP_TREE_SCALAR_STMTS (res) = vNULL; |
1761 | SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def; |
1762 | res->failed = XNEWVEC (bool, group_size); |
1763 | if (flag_checking) |
1764 | { |
1765 | unsigned i; |
1766 | for (i = 0; i < group_size; ++i) |
1767 | if (!matches[i]) |
1768 | break; |
1769 | gcc_assert (i < group_size); |
1770 | } |
memcpy (res->failed, matches, sizeof (bool) * group_size);
1772 | } |
1773 | else |
1774 | { |
1775 | if (dump_enabled_p ()) |
1776 | dump_printf_loc (MSG_NOTE, vect_location, |
1777 | "SLP discovery for node %p succeeded\n" , |
1778 | (void *) res); |
1779 | gcc_assert (res_ == res); |
1780 | res->max_nunits = this_max_nunits; |
vect_update_max_nunits (max_nunits, this_max_nunits);
1782 | /* Keep a reference for the bst_map use. */ |
1783 | SLP_TREE_REF_COUNT (res)++; |
1784 | } |
1785 | return res_; |
1786 | } |
1787 | |
1788 | /* Helper for building an associated SLP node chain. */ |
1789 | |
1790 | static void |
1791 | vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype, |
1792 | slp_tree op0, slp_tree op1, |
1793 | stmt_vec_info oper1, stmt_vec_info oper2, |
1794 | vec<std::pair<unsigned, unsigned> > lperm) |
1795 | { |
1796 | unsigned group_size = SLP_TREE_LANES (op1); |
1797 | |
slp_tree child1 = new _slp_tree;
SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
SLP_TREE_VECTYPE (child1) = vectype;
SLP_TREE_LANES (child1) = group_size;
SLP_TREE_CHILDREN (child1).create (2);
SLP_TREE_CHILDREN (child1).quick_push (op0);
SLP_TREE_CHILDREN (child1).quick_push (op1);
SLP_TREE_REPRESENTATIVE (child1) = oper1;

slp_tree child2 = new _slp_tree;
SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
SLP_TREE_VECTYPE (child2) = vectype;
SLP_TREE_LANES (child2) = group_size;
SLP_TREE_CHILDREN (child2).create (2);
SLP_TREE_CHILDREN (child2).quick_push (op0);
SLP_TREE_REF_COUNT (op0)++;
SLP_TREE_CHILDREN (child2).quick_push (op1);
SLP_TREE_REF_COUNT (op1)++;
SLP_TREE_REPRESENTATIVE (child2) = oper2;

SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
SLP_TREE_VECTYPE (perm) = vectype;
SLP_TREE_LANES (perm) = group_size;
/* ??? We should set this NULL but that's not expected. */
SLP_TREE_REPRESENTATIVE (perm) = oper1;
SLP_TREE_LANE_PERMUTATION (perm) = lperm;
SLP_TREE_CHILDREN (perm).quick_push (child1);
SLP_TREE_CHILDREN (perm).quick_push (child2);
1827 | } |
1828 | |
/* Recursively build an SLP tree starting from NODE. Fail (and return
NULL) if the def-stmts are not isomorphic, require data permutation
or are of unsupported types of operation, recording in MATCHES the
lanes at which a mismatch was found. On success return the built
node. */
1835 | |
1836 | static slp_tree |
1837 | vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, |
1838 | vec<stmt_vec_info> stmts, unsigned int group_size, |
1839 | poly_uint64 *max_nunits, |
1840 | bool *matches, unsigned *limit, unsigned *tree_size, |
1841 | scalar_stmts_to_slp_tree_map_t *bst_map) |
1842 | { |
1843 | unsigned nops, i, this_tree_size = 0; |
1844 | poly_uint64 this_max_nunits = *max_nunits; |
1845 | |
1846 | matches[0] = false; |
1847 | |
1848 | stmt_vec_info stmt_info = stmts[0]; |
if (!is_a<gcall *> (stmt_info->stmt)
&& !is_a<gassign *> (stmt_info->stmt)
&& !is_a<gphi *> (stmt_info->stmt))
return NULL;

nops = gimple_num_args (stmt_info->stmt);
if (const int *map = vect_get_operand_map (stmt_info->stmt,
STMT_VINFO_GATHER_SCATTER_P
(stmt_info)))
nops = map[0];
1859 | |
1860 | /* If the SLP node is a PHI (induction or reduction), terminate |
1861 | the recursion. */ |
1862 | bool *skip_args = XALLOCAVEC (bool, nops); |
memset (skip_args, 0, sizeof (bool) * nops);
if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1866 | { |
1867 | tree scalar_type = TREE_TYPE (PHI_RESULT (stmt)); |
1868 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, |
1869 | group_size); |
1870 | if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype, |
1871 | max_nunits)) |
1872 | return NULL; |
1873 | |
1874 | vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info); |
1875 | if (def_type == vect_induction_def) |
1876 | { |
/* Induction PHIs are not cycles but walk the initial
value. Only for inner loops though, for outer loops
we need to pick up the value from the actual PHIs
to more easily support peeling and epilogue vectorization. */
1881 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
1882 | if (!nested_in_vect_loop_p (loop, stmt_info)) |
1883 | skip_args[loop_preheader_edge (loop)->dest_idx] = true; |
1884 | else |
1885 | loop = loop->inner; |
1886 | skip_args[loop_latch_edge (loop)->dest_idx] = true; |
1887 | } |
1888 | else if (def_type == vect_reduction_def |
1889 | || def_type == vect_double_reduction_def |
1890 | || def_type == vect_nested_cycle |
1891 | || def_type == vect_first_order_recurrence) |
1892 | { |
1893 | /* Else def types have to match. */ |
1894 | stmt_vec_info other_info; |
1895 | bool all_same = true; |
1896 | FOR_EACH_VEC_ELT (stmts, i, other_info) |
1897 | { |
1898 | if (STMT_VINFO_DEF_TYPE (other_info) != def_type) |
1899 | return NULL; |
1900 | if (other_info != stmt_info) |
1901 | all_same = false; |
1902 | } |
1903 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
/* Reduction initial values are not explicitly represented. */
1905 | if (def_type != vect_first_order_recurrence |
1906 | && !nested_in_vect_loop_p (loop, stmt_info)) |
1907 | skip_args[loop_preheader_edge (loop)->dest_idx] = true; |
1908 | /* Reduction chain backedge defs are filled manually. |
1909 | ??? Need a better way to identify a SLP reduction chain PHI. |
1910 | Or a better overall way to SLP match those. */ |
1911 | if (all_same && def_type == vect_reduction_def) |
1912 | skip_args[loop_latch_edge (loop)->dest_idx] = true; |
1913 | } |
1914 | else if (def_type != vect_internal_def) |
1915 | return NULL; |
1916 | } |
1917 | |
1918 | |
1919 | bool two_operators = false; |
1920 | unsigned char *swap = XALLOCAVEC (unsigned char, group_size); |
1921 | tree vectype = NULL_TREE; |
if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
&this_max_nunits, matches, &two_operators,
&vectype))
1925 | return NULL; |
1926 | |
1927 | /* If the SLP node is a load, terminate the recursion unless masked. */ |
1928 | if (STMT_VINFO_DATA_REF (stmt_info) |
1929 | && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
1930 | { |
if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1932 | gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) |
1933 | || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) |
1934 | || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) |
1935 | || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); |
1936 | else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) |
1937 | gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))); |
1938 | else |
1939 | { |
1940 | *max_nunits = this_max_nunits; |
1941 | (*tree_size)++; |
node = vect_create_new_slp_node (node, stmts, 0);
1943 | SLP_TREE_VECTYPE (node) = vectype; |
1944 | /* And compute the load permutation. Whether it is actually |
1945 | a permutation depends on the unrolling factor which is |
1946 | decided later. */ |
1947 | vec<unsigned> load_permutation; |
1948 | int j; |
1949 | stmt_vec_info load_info; |
load_permutation.create (group_size);
1951 | stmt_vec_info first_stmt_info |
1952 | = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); |
1953 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
1954 | { |
1955 | int load_place; |
1956 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
load_place = vect_get_place_in_interleaving_chain
(load_info, first_stmt_info);
else
load_place = 0;
gcc_assert (load_place != -1);
load_permutation.safe_push (load_place);
1963 | } |
1964 | SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; |
1965 | return node; |
1966 | } |
1967 | } |
else if (gimple_assign_single_p (stmt_info->stmt)
&& !gimple_vuse (stmt_info->stmt)
&& gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
{
/* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
the same SSA name vector of a type compatible with VECTYPE. */
1974 | vec<std::pair<unsigned, unsigned> > lperm = vNULL; |
1975 | tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0); |
1976 | stmt_vec_info estmt_info; |
1977 | FOR_EACH_VEC_ELT (stmts, i, estmt_info) |
1978 | { |
gassign *estmt = as_a <gassign *> (estmt_info->stmt);
tree bfref = gimple_assign_rhs1 (estmt);
HOST_WIDE_INT lane;
if (!known_eq (bit_field_size (bfref),
tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
|| !constant_multiple_p (bit_field_offset (bfref),
bit_field_size (bfref), &lane))
1986 | { |
1987 | lperm.release (); |
1988 | matches[0] = false; |
1989 | return NULL; |
1990 | } |
lperm.safe_push (std::make_pair (0, (unsigned)lane));
}
slp_tree vnode = vect_create_new_slp_node (vNULL);
1994 | if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec)))) |
1995 | /* ??? We record vectype here but we hide eventually necessary |
1996 | punning and instead rely on code generation to materialize |
1997 | VIEW_CONVERT_EXPRs as necessary. We instead should make |
1998 | this explicit somehow. */ |
1999 | SLP_TREE_VECTYPE (vnode) = vectype; |
2000 | else |
2001 | { |
2002 | /* For different size but compatible elements we can still |
2003 | use VEC_PERM_EXPR without punning. */ |
2004 | gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec)) |
2005 | && types_compatible_p (TREE_TYPE (vectype), |
2006 | TREE_TYPE (TREE_TYPE (vec)))); |
2007 | SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec); |
2008 | } |
2009 | auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode)); |
2010 | unsigned HOST_WIDE_INT const_nunits; |
if (nunits.is_constant (&const_nunits))
SLP_TREE_LANES (vnode) = const_nunits;
SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2014 | /* We are always building a permutation node even if it is an identity |
2015 | permute to shield the rest of the vectorizer from the odd node |
2016 | representing an actual vector without any scalar ops. |
2017 | ??? We could hide it completely with making the permute node |
2018 | external? */ |
node = vect_create_new_slp_node (node, stmts, 1);
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
SLP_TREE_LANE_PERMUTATION (node) = lperm;
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CHILDREN (node).quick_push (vnode);
2024 | return node; |
2025 | } |
2026 | /* When discovery reaches an associatable operation see whether we can |
2027 | improve that to match up lanes in a way superior to the operand |
2028 | swapping code which at most looks at two defs. |
2029 | ??? For BB vectorization we cannot do the brute-force search |
2030 | for matching as we can succeed by means of builds from scalars |
2031 | and have no good way to "cost" one build against another. */ |
else if (is_a <loop_vec_info> (vinfo)
/* ??? We don't handle !vect_internal_def defs below. */
&& STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
&& is_gimple_assign (stmt_info->stmt)
&& (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
|| gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2038 | && ((FLOAT_TYPE_P (vectype) && flag_associative_math) |
2039 | || (INTEGRAL_TYPE_P (TREE_TYPE (vectype)) |
2040 | && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype))))) |
2041 | { |
2042 | /* See if we have a chain of (mixed) adds or subtracts or other |
2043 | associatable ops. */ |
enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2045 | if (code == MINUS_EXPR) |
2046 | code = PLUS_EXPR; |
2047 | stmt_vec_info other_op_stmt_info = NULL; |
2048 | stmt_vec_info op_stmt_info = NULL; |
2049 | unsigned chain_len = 0; |
2050 | auto_vec<chain_op_t> chain; |
2051 | auto_vec<std::pair<tree_code, gimple *> > worklist; |
2052 | auto_vec<vec<chain_op_t> > chains (group_size); |
2053 | auto_vec<slp_tree, 4> children; |
2054 | bool hard_fail = true; |
2055 | for (unsigned lane = 0; lane < group_size; ++lane) |
2056 | { |
2057 | /* For each lane linearize the addition/subtraction (or other |
2058 | uniform associatable operation) expression tree. */ |
2059 | gimple *op_stmt = NULL, *other_op_stmt = NULL; |
vect_slp_linearize_chain (vinfo, worklist, chain, code,
stmts[lane]->stmt, op_stmt, other_op_stmt,
NULL);
2063 | if (!op_stmt_info && op_stmt) |
2064 | op_stmt_info = vinfo->lookup_stmt (op_stmt); |
2065 | if (!other_op_stmt_info && other_op_stmt) |
2066 | other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt); |
2067 | if (chain.length () == 2) |
2068 | { |
2069 | /* In a chain of just two elements resort to the regular |
2070 | operand swapping scheme. If we run into a length |
2071 | mismatch still hard-FAIL. */ |
2072 | if (chain_len == 0) |
2073 | hard_fail = false; |
2074 | else |
2075 | { |
2076 | matches[lane] = false; |
2077 | /* ??? We might want to process the other lanes, but |
2078 | make sure to not give false matching hints to the |
2079 | caller for lanes we did not process. */ |
2080 | if (lane != group_size - 1) |
2081 | matches[0] = false; |
2082 | } |
2083 | break; |
2084 | } |
2085 | else if (chain_len == 0) |
2086 | chain_len = chain.length (); |
2087 | else if (chain.length () != chain_len) |
2088 | { |
2089 | /* ??? Here we could slip in magic to compensate with |
2090 | neutral operands. */ |
2091 | matches[lane] = false; |
2092 | if (lane != group_size - 1) |
2093 | matches[0] = false; |
2094 | break; |
2095 | } |
chains.quick_push (chain.copy ());
chain.truncate (0);
2098 | } |
2099 | if (chains.length () == group_size) |
2100 | { |
2101 | /* We cannot yet use SLP_TREE_CODE to communicate the operation. */ |
2102 | if (!op_stmt_info) |
2103 | { |
2104 | hard_fail = false; |
2105 | goto out; |
2106 | } |
2107 | /* Now we have a set of chains with the same length. */ |
2108 | /* 1. pre-sort according to def_type and operation. */ |
2109 | for (unsigned lane = 0; lane < group_size; ++lane) |
chains[lane].stablesort (dt_sort_cmp, vinfo);
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
"pre-sorted chains of %s\n",
get_tree_code_name (code));
for (unsigned lane = 0; lane < group_size; ++lane)
{
for (unsigned opnum = 0; opnum < chain_len; ++opnum)
dump_printf (MSG_NOTE, "%s %T ",
get_tree_code_name (chains[lane][opnum].code),
chains[lane][opnum].op);
dump_printf (MSG_NOTE, "\n");
2123 | } |
2124 | } |
2125 | /* 2. try to build children nodes, associating as necessary. */ |
2126 | for (unsigned n = 0; n < chain_len; ++n) |
2127 | { |
2128 | vect_def_type dt = chains[0][n].dt; |
2129 | unsigned lane; |
2130 | for (lane = 0; lane < group_size; ++lane) |
2131 | if (chains[lane][n].dt != dt) |
2132 | { |
2133 | if (dt == vect_constant_def |
2134 | && chains[lane][n].dt == vect_external_def) |
2135 | dt = vect_external_def; |
2136 | else if (dt == vect_external_def |
2137 | && chains[lane][n].dt == vect_constant_def) |
2138 | ; |
2139 | else |
2140 | break; |
2141 | } |
2142 | if (lane != group_size) |
2143 | { |
2144 | if (dump_enabled_p ()) |
2145 | dump_printf_loc (MSG_NOTE, vect_location, |
2146 | "giving up on chain due to mismatched " |
2147 | "def types\n" ); |
2148 | matches[lane] = false; |
2149 | if (lane != group_size - 1) |
2150 | matches[0] = false; |
2151 | goto out; |
2152 | } |
2153 | if (dt == vect_constant_def |
2154 | || dt == vect_external_def) |
2155 | { |
2156 | /* Check whether we can build the invariant. If we can't |
2157 | we never will be able to. */ |
2158 | tree type = TREE_TYPE (chains[0][n].op); |
if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
&& (TREE_CODE (type) == BOOLEAN_TYPE
|| !can_duplicate_and_interleave_p (vinfo, group_size,
type)))
{
matches[0] = false;
goto out;
}
vec<tree> ops;
ops.create (group_size);
for (lane = 0; lane < group_size; ++lane)
ops.quick_push (chains[lane][n].op);
slp_tree child = vect_create_new_slp_node (ops);
SLP_TREE_DEF_TYPE (child) = dt;
children.safe_push (child);
2174 | } |
2175 | else if (dt != vect_internal_def) |
2176 | { |
2177 | /* Not sure, we might need sth special. |
2178 | gcc.dg/vect/pr96854.c, |
2179 | gfortran.dg/vect/fast-math-pr37021.f90 |
2180 | and gfortran.dg/vect/pr61171.f trigger. */ |
2181 | /* Soft-fail for now. */ |
2182 | hard_fail = false; |
2183 | goto out; |
2184 | } |
2185 | else |
2186 | { |
2187 | vec<stmt_vec_info> op_stmts; |
op_stmts.create (group_size);
2189 | slp_tree child = NULL; |
2190 | /* Brute-force our way. We have to consider a lane |
2191 | failing after fixing an earlier fail up in the |
2192 | SLP discovery recursion. So track the current |
2193 | permute per lane. */ |
2194 | unsigned *perms = XALLOCAVEC (unsigned, group_size); |
memset (perms, 0, sizeof (unsigned) * group_size);
do
{
op_stmts.truncate (0);
for (lane = 0; lane < group_size; ++lane)
op_stmts.quick_push
(vinfo->lookup_def (chains[lane][n].op));
child = vect_build_slp_tree (vinfo, op_stmts,
group_size, &this_max_nunits,
matches, limit,
&this_tree_size, bst_map);
2206 | /* ??? We're likely getting too many fatal mismatches |
2207 | here so maybe we want to ignore them (but then we |
2208 | have no idea which lanes fatally mismatched). */ |
2209 | if (child || !matches[0]) |
2210 | break; |
2211 | /* Swap another lane we have not yet matched up into |
2212 | lanes that did not match. If we run out of |
2213 | permute possibilities for a lane terminate the |
2214 | search. */ |
2215 | bool term = false; |
2216 | for (lane = 1; lane < group_size; ++lane) |
2217 | if (!matches[lane]) |
2218 | { |
2219 | if (n + perms[lane] + 1 == chain_len) |
2220 | { |
2221 | term = true; |
2222 | break; |
2223 | } |
2224 | std::swap (a&: chains[lane][n], |
2225 | b&: chains[lane][n + perms[lane] + 1]); |
2226 | perms[lane]++; |
2227 | } |
2228 | if (term) |
2229 | break; |
2230 | } |
2231 | while (1); |
2232 | if (!child) |
2233 | { |
2234 | if (dump_enabled_p ()) |
2235 | dump_printf_loc (MSG_NOTE, vect_location, |
2236 | "failed to match up op %d\n" , n); |
2237 | op_stmts.release (); |
2238 | if (lane != group_size - 1) |
2239 | matches[0] = false; |
2240 | else |
2241 | matches[lane] = false; |
2242 | goto out; |
2243 | } |
2244 | if (dump_enabled_p ()) |
2245 | { |
2246 | dump_printf_loc (MSG_NOTE, vect_location, |
2247 | "matched up op %d to\n" , n); |
2248 | vect_print_slp_tree (MSG_NOTE, vect_location, child); |
2249 | } |
2250 | children.safe_push (obj: child); |
2251 | } |
2252 | } |
2253 | /* 3. build SLP nodes to combine the chain. */ |
2254 | for (unsigned lane = 0; lane < group_size; ++lane) |
2255 | if (chains[lane][0].code != code) |
2256 | { |
2257 | /* See if there's any alternate all-PLUS entry. */ |
2258 | unsigned n; |
2259 | for (n = 1; n < chain_len; ++n) |
2260 | { |
2261 | for (lane = 0; lane < group_size; ++lane) |
2262 | if (chains[lane][n].code != code) |
2263 | break; |
2264 | if (lane == group_size) |
2265 | break; |
2266 | } |
2267 | if (n != chain_len) |
2268 | { |
2269 | /* Swap that in at first position. */ |
2270 | std::swap (a&: children[0], b&: children[n]); |
2271 | for (lane = 0; lane < group_size; ++lane) |
2272 | std::swap (a&: chains[lane][0], b&: chains[lane][n]); |
2273 | } |
2274 | else |
2275 | { |
2276 | /* ??? When this triggers and we end up with two |
2277 | vect_constant/external_def up-front things break (ICE) |
2278 | spectacularly finding an insertion place for the |
2279 | all-constant op. We should have a fully |
2280 | vect_internal_def operand though(?) so we can swap |
2281 | that into first place and then prepend the all-zero |
2282 | constant. */ |
2283 | if (dump_enabled_p ()) |
2284 | dump_printf_loc (MSG_NOTE, vect_location, |
2285 | "inserting constant zero to compensate " |
2286 | "for (partially) negated first " |
2287 | "operand\n" ); |
2288 | chain_len++; |
2289 | for (lane = 0; lane < group_size; ++lane) |
2290 | chains[lane].safe_insert |
2291 | (ix: 0, obj: chain_op_t (code, vect_constant_def, NULL_TREE)); |
2292 | vec<tree> zero_ops; |
2293 | zero_ops.create (nelems: group_size); |
2294 | zero_ops.quick_push (obj: build_zero_cst (TREE_TYPE (vectype))); |
2295 | for (lane = 1; lane < group_size; ++lane) |
2296 | zero_ops.quick_push (obj: zero_ops[0]); |
2297 | slp_tree zero = vect_create_new_slp_node (ops: zero_ops); |
2298 | SLP_TREE_DEF_TYPE (zero) = vect_constant_def; |
2299 | children.safe_insert (ix: 0, obj: zero); |
2300 | } |
2301 | break; |
2302 | } |
2303 | for (unsigned i = 1; i < children.length (); ++i) |
2304 | { |
2305 | slp_tree op0 = children[i - 1]; |
2306 | slp_tree op1 = children[i]; |
2307 | bool this_two_op = false; |
2308 | for (unsigned lane = 0; lane < group_size; ++lane) |
2309 | if (chains[lane][i].code != chains[0][i].code) |
2310 | { |
2311 | this_two_op = true; |
2312 | break; |
2313 | } |
2314 | slp_tree child; |
2315 | if (i == children.length () - 1) |
child = vect_create_new_slp_node (node, stmts, 2);
else
child = vect_create_new_slp_node (2, ERROR_MARK);
if (this_two_op)
{
vec<std::pair<unsigned, unsigned> > lperm;
lperm.create (group_size);
for (unsigned lane = 0; lane < group_size; ++lane)
lperm.quick_push (std::make_pair
(chains[lane][i].code != chains[0][i].code, lane));
vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
(chains[0][i].code == code
? op_stmt_info
: other_op_stmt_info),
(chains[0][i].code == code
? other_op_stmt_info
: op_stmt_info),
lperm);
2334 | } |
2335 | else |
2336 | { |
2337 | SLP_TREE_DEF_TYPE (child) = vect_internal_def; |
2338 | SLP_TREE_VECTYPE (child) = vectype; |
2339 | SLP_TREE_LANES (child) = group_size; |
SLP_TREE_CHILDREN (child).quick_push (op0);
SLP_TREE_CHILDREN (child).quick_push (op1);
2342 | SLP_TREE_REPRESENTATIVE (child) |
2343 | = (chains[0][i].code == code |
2344 | ? op_stmt_info : other_op_stmt_info); |
2345 | } |
2346 | children[i] = child; |
2347 | } |
2348 | *tree_size += this_tree_size + 1; |
2349 | *max_nunits = this_max_nunits; |
2350 | while (!chains.is_empty ()) |
2351 | chains.pop ().release (); |
2352 | return node; |
2353 | } |
2354 | out: |
2355 | while (!children.is_empty ()) |
vect_free_slp_tree (children.pop ());
2357 | while (!chains.is_empty ()) |
2358 | chains.pop ().release (); |
2359 | /* Hard-fail, otherwise we might run into quadratic processing of the |
2360 | chains starting one stmt into the chain again. */ |
2361 | if (hard_fail) |
2362 | return NULL; |
2363 | /* Fall thru to normal processing. */ |
2364 | } |
2365 | |
2366 | /* Get at the operands, verifying they are compatible. */ |
2367 | vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size); |
2368 | slp_oprnd_info oprnd_info; |
2369 | FOR_EACH_VEC_ELT (stmts, i, stmt_info) |
2370 | { |
int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
stmts, i, &oprnds_info);
2373 | if (res != 0) |
2374 | matches[(res == -1) ? 0 : i] = false; |
2375 | if (!matches[0]) |
2376 | break; |
2377 | } |
2378 | for (i = 0; i < group_size; ++i) |
2379 | if (!matches[i]) |
2380 | { |
2381 | vect_free_oprnd_info (oprnds_info); |
2382 | return NULL; |
2383 | } |
2384 | swap = NULL; |
2385 | |
2386 | auto_vec<slp_tree, 4> children; |
2387 | |
2388 | stmt_info = stmts[0]; |
2389 | |
2390 | /* Create SLP_TREE nodes for the definition node/s. */ |
2391 | FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) |
2392 | { |
2393 | slp_tree child; |
2394 | unsigned int j; |
2395 | |
2396 | /* We're skipping certain operands from processing, for example |
2397 | outer loop reduction initial defs. */ |
2398 | if (skip_args[i]) |
2399 | { |
2400 | children.safe_push (NULL); |
2401 | continue; |
2402 | } |
2403 | |
2404 | if (oprnd_info->first_dt == vect_uninitialized_def) |
2405 | { |
/* COND_EXPRs have one operand too many eventually if the
condition is an SSA name. */
2408 | gcc_assert (i == 3 && nops == 4); |
2409 | continue; |
2410 | } |
2411 | |
if (is_a <bb_vec_info> (vinfo)
2413 | && oprnd_info->first_dt == vect_internal_def |
2414 | && !oprnd_info->any_pattern) |
2415 | { |
2416 | /* For BB vectorization, if all defs are the same do not |
2417 | bother to continue the build along the single-lane |
2418 | graph but use a splat of the scalar value. */ |
2419 | stmt_vec_info first_def = oprnd_info->def_stmts[0]; |
2420 | for (j = 1; j < group_size; ++j) |
2421 | if (oprnd_info->def_stmts[j] != first_def) |
2422 | break; |
2423 | if (j == group_size |
2424 | /* But avoid doing this for loads where we may be |
2425 | able to CSE things, unless the stmt is not |
2426 | vectorizable. */ |
2427 | && (!STMT_VINFO_VECTORIZABLE (first_def) |
|| !gimple_vuse (first_def->stmt)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Using a splat of the uniform operand %G",
first_def->stmt);
2434 | oprnd_info->first_dt = vect_external_def; |
2435 | } |
2436 | } |
2437 | |
2438 | if (oprnd_info->first_dt == vect_external_def |
2439 | || oprnd_info->first_dt == vect_constant_def) |
2440 | { |
slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
oprnd_info->ops = vNULL;
children.safe_push (invnode);
2445 | continue; |
2446 | } |
2447 | |
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
group_size, &this_max_nunits,
matches, limit,
&this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
2455 | continue; |
2456 | } |
2457 | |
/* If the SLP build for operand zero failed and operand zero
and one can be commuted try that for the scalar stmts
that failed the match. */
2461 | if (i == 0 |
2462 | /* A first scalar stmt mismatch signals a fatal mismatch. */ |
2463 | && matches[0] |
2464 | /* ??? For COND_EXPRs we can swap the comparison operands |
2465 | as well as the arms under some constraints. */ |
2466 | && nops == 2 |
2467 | && oprnds_info[1]->first_dt == vect_internal_def |
&& is_gimple_assign (stmt_info->stmt)
2469 | /* Swapping operands for reductions breaks assumptions later on. */ |
2470 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def |
2471 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) |
2472 | { |
2473 | /* See whether we can swap the matching or the non-matching |
2474 | stmt operands. */ |
2475 | bool swap_not_matching = true; |
2476 | do |
2477 | { |
2478 | for (j = 0; j < group_size; ++j) |
2479 | { |
2480 | if (matches[j] != !swap_not_matching) |
2481 | continue; |
2482 | stmt_vec_info stmt_info = stmts[j]; |
2483 | /* Verify if we can swap operands of this stmt. */ |
gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
if (!stmt
|| !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2487 | { |
2488 | if (!swap_not_matching) |
2489 | goto fail; |
2490 | swap_not_matching = false; |
2491 | break; |
2492 | } |
2493 | } |
2494 | } |
2495 | while (j != group_size); |
2496 | |
2497 | /* Swap mismatched definition stmts. */ |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Re-trying with swapped operands of stmts ");
for (j = 0; j < group_size; ++j)
if (matches[j] == !swap_not_matching)
{
std::swap (oprnds_info[0]->def_stmts[j],
oprnds_info[1]->def_stmts[j]);
std::swap (oprnds_info[0]->ops[j],
oprnds_info[1]->ops[j]);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "%d ", j);
}
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "\n");
2513 | /* After swapping some operands we lost track whether an |
2514 | operand has any pattern defs so be conservative here. */ |
2515 | if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern) |
2516 | oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true; |
2517 | /* And try again with scratch 'matches' ... */ |
2518 | bool *tem = XALLOCAVEC (bool, group_size); |
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
group_size, &this_max_nunits,
tem, limit,
&this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
2526 | continue; |
2527 | } |
2528 | } |
2529 | fail: |
2530 | |
2531 | /* If the SLP build failed and we analyze a basic-block |
2532 | simply treat nodes we fail to build as externally defined |
2533 | (and thus build vectors from the scalar defs). |
2534 | The cost model will reject outright expensive cases. |
??? This doesn't treat cases where permutation ultimately
2536 | fails (or we don't try permutation below). Ideally we'd |
2537 | even compute a permutation that will end up with the maximum |
2538 | SLP tree size... */ |
if (is_a <bb_vec_info> (vinfo)
2540 | /* ??? Rejecting patterns this way doesn't work. We'd have to |
2541 | do extra work to cancel the pattern so the uses see the |
2542 | scalar version. */ |
2543 | && !is_pattern_stmt_p (stmt_info) |
2544 | && !oprnd_info->any_pattern) |
2545 | { |
2546 | /* But if there's a leading vector sized set of matching stmts |
2547 | fail here so we can split the group. This matches the condition |
2548 | vect_analyze_slp_instance uses. */ |
2549 | /* ??? We might want to split here and combine the results to support |
2550 | multiple vector sizes better. */ |
2551 | for (j = 0; j < group_size; ++j) |
2552 | if (!matches[j]) |
2553 | break; |
2554 | if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))) |
2555 | { |
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Building vector operands from scalars\n");
this_tree_size++;
child = vect_create_new_slp_node (oprnd_info->ops);
children.safe_push (child);
2562 | oprnd_info->ops = vNULL; |
2563 | continue; |
2564 | } |
2565 | } |
2566 | |
2567 | gcc_assert (child == NULL); |
2568 | FOR_EACH_VEC_ELT (children, j, child) |
2569 | if (child) |
vect_free_slp_tree (child);
2571 | vect_free_oprnd_info (oprnds_info); |
2572 | return NULL; |
2573 | } |
2574 | |
2575 | vect_free_oprnd_info (oprnds_info); |
2576 | |
/* If all children of this node are built up from uniform scalars,
or building it would require more than one possibly expensive
vector construction, just throw the node away so it is built up
from scalars instead. The exception is the SLP node for the
vector store. */
if (is_a <bb_vec_info> (vinfo)
2582 | && !STMT_VINFO_GROUPED_ACCESS (stmt_info) |
2583 | /* ??? Rejecting patterns this way doesn't work. We'd have to |
2584 | do extra work to cancel the pattern so the uses see the |
2585 | scalar version. */ |
2586 | && !is_pattern_stmt_p (stmt_info)) |
2587 | { |
2588 | slp_tree child; |
2589 | unsigned j; |
2590 | bool all_uniform_p = true; |
2591 | unsigned n_vector_builds = 0; |
2592 | FOR_EACH_VEC_ELT (children, j, child) |
2593 | { |
2594 | if (!child) |
2595 | ; |
2596 | else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
2597 | all_uniform_p = false; |
else if (!vect_slp_tree_uniform_p (child))
2599 | { |
2600 | all_uniform_p = false; |
2601 | if (SLP_TREE_DEF_TYPE (child) == vect_external_def) |
2602 | n_vector_builds++; |
2603 | } |
2604 | } |
2605 | if (all_uniform_p |
2606 | || n_vector_builds > 1 |
2607 | || (n_vector_builds == children.length () |
&& is_a <gphi *> (stmt_info->stmt)))
2609 | { |
2610 | /* Roll back. */ |
2611 | matches[0] = false; |
2612 | FOR_EACH_VEC_ELT (children, j, child) |
2613 | if (child) |
vect_free_slp_tree (child);

if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Building parent vector operands from "
"scalars instead\n");
2620 | return NULL; |
2621 | } |
2622 | } |
2623 | |
2624 | *tree_size += this_tree_size + 1; |
2625 | *max_nunits = this_max_nunits; |
2626 | |
2627 | if (two_operators) |
2628 | { |
2629 | /* ??? We'd likely want to either cache in bst_map sth like |
2630 | { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or |
2631 | the true { a+b, a+b, a+b, a+b } ... but there we don't have |
2632 | explicit stmts to put in so the keying on 'stmts' doesn't |
2633 | work (but we have the same issue with nodes that use 'ops'). */ |
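/* The classical two-operator case is mixed plus/minus as in
a[0] + b[0]; a[1] - b[1]; a[2] + b[2]; a[3] - b[3];
where node ONE computes all lanes with one code, node TWO with the
other, and the VEC_PERM_EXPR built here selects each lane from the
appropriate one. */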
2634 | slp_tree one = new _slp_tree; |
2635 | slp_tree two = new _slp_tree; |
2636 | SLP_TREE_DEF_TYPE (one) = vect_internal_def; |
2637 | SLP_TREE_DEF_TYPE (two) = vect_internal_def; |
2638 | SLP_TREE_VECTYPE (one) = vectype; |
2639 | SLP_TREE_VECTYPE (two) = vectype; |
SLP_TREE_CHILDREN (one).safe_splice (children);
SLP_TREE_CHILDREN (two).safe_splice (children);
2642 | slp_tree child; |
2643 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child) |
2644 | SLP_TREE_REF_COUNT (child)++; |
2645 | |
2646 | /* Here we record the original defs since this |
2647 | node represents the final lane configuration. */ |
node = vect_create_new_slp_node (node, stmts, 2);
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
SLP_TREE_CHILDREN (node).quick_push (one);
SLP_TREE_CHILDREN (node).quick_push (two);
gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
enum tree_code code0 = gimple_assign_rhs_code (stmt);
2655 | enum tree_code ocode = ERROR_MARK; |
2656 | stmt_vec_info ostmt_info; |
2657 | unsigned j = 0; |
2658 | FOR_EACH_VEC_ELT (stmts, i, ostmt_info) |
2659 | { |
gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
if (gimple_assign_rhs_code (ostmt) != code0)
{
SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
ocode = gimple_assign_rhs_code (ostmt);
j = i;
}
else
SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2669 | } |
2670 | SLP_TREE_CODE (one) = code0; |
2671 | SLP_TREE_CODE (two) = ocode; |
2672 | SLP_TREE_LANES (one) = stmts.length (); |
2673 | SLP_TREE_LANES (two) = stmts.length (); |
2674 | SLP_TREE_REPRESENTATIVE (one) = stmts[0]; |
2675 | SLP_TREE_REPRESENTATIVE (two) = stmts[j]; |
2676 | return node; |
2677 | } |
2678 | |
node = vect_create_new_slp_node (node, stmts, nops);
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CHILDREN (node).splice (children);
2682 | return node; |
2683 | } |
2684 | |
2685 | /* Dump a single SLP tree NODE. */ |
2686 | |
2687 | static void |
2688 | vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, |
2689 | slp_tree node) |
2690 | { |
2691 | unsigned i, j; |
2692 | slp_tree child; |
2693 | stmt_vec_info stmt_info; |
2694 | tree op; |
2695 | |
2696 | dump_metadata_t metadata (dump_kind, loc.get_impl_location ()); |
2697 | dump_user_location_t user_loc = loc.get_user_location (); |
2698 | dump_printf_loc (metadata, user_loc, |
2699 | "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED |
2700 | ", refcnt=%u)" , |
2701 | SLP_TREE_DEF_TYPE (node) == vect_external_def |
2702 | ? " (external)" |
2703 | : (SLP_TREE_DEF_TYPE (node) == vect_constant_def |
2704 | ? " (constant)" |
2705 | : "" ), (void *) node, |
2706 | estimated_poly_value (x: node->max_nunits), |
2707 | SLP_TREE_REF_COUNT (node)); |
2708 | if (SLP_TREE_VECTYPE (node)) |
2709 | dump_printf (metadata, " %T" , SLP_TREE_VECTYPE (node)); |
2710 | dump_printf (metadata, "\n" ); |
2711 | if (SLP_TREE_DEF_TYPE (node) == vect_internal_def) |
2712 | { |
2713 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
2714 | dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n" ); |
2715 | else |
2716 | dump_printf_loc (metadata, user_loc, "op template: %G" , |
2717 | SLP_TREE_REPRESENTATIVE (node)->stmt); |
2718 | } |
2719 | if (SLP_TREE_SCALAR_STMTS (node).exists ()) |
2720 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
2721 | dump_printf_loc (metadata, user_loc, "\tstmt %u %G" , i, stmt_info->stmt); |
2722 | else |
2723 | { |
2724 | dump_printf_loc (metadata, user_loc, "\t{ " ); |
2725 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) |
2726 | dump_printf (metadata, "%T%s " , op, |
2727 | i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "" ); |
2728 | dump_printf (metadata, "}\n" ); |
2729 | } |
2730 | if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
2731 | { |
2732 | dump_printf_loc (metadata, user_loc, "\tload permutation {" ); |
2733 | FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j) |
2734 | dump_printf (dump_kind, " %u" , j); |
2735 | dump_printf (dump_kind, " }\n" ); |
2736 | } |
2737 | if (SLP_TREE_LANE_PERMUTATION (node).exists ()) |
2738 | { |
2739 | dump_printf_loc (metadata, user_loc, "\tlane permutation {" ); |
2740 | for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i) |
2741 | dump_printf (dump_kind, " %u[%u]" , |
2742 | SLP_TREE_LANE_PERMUTATION (node)[i].first, |
2743 | SLP_TREE_LANE_PERMUTATION (node)[i].second); |
2744 | dump_printf (dump_kind, " }\n" ); |
2745 | } |
2746 | if (SLP_TREE_CHILDREN (node).is_empty ()) |
2747 | return; |
2748 | dump_printf_loc (metadata, user_loc, "\tchildren" ); |
2749 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2750 | dump_printf (dump_kind, " %p" , (void *)child); |
2751 | dump_printf (dump_kind, "\n" ); |
2752 | } |
2753 | |
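/* Dump a single SLP tree NODE to stderr. */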
2754 | DEBUG_FUNCTION void |
2755 | debug (slp_tree node) |
2756 | { |
2757 | debug_dump_context ctx; |
vect_print_slp_tree (MSG_NOTE,
dump_location_t::from_location_t (UNKNOWN_LOCATION),
node);
2761 | } |
2762 | |
2763 | /* Recursive helper for the dot producer below. */ |
2764 | |
2765 | static void |
2766 | dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited) |
2767 | { |
if (visited.add (node))
return;

fprintf (f, "\"%p\" [label=\"", (void *)node);
vect_print_slp_tree (MSG_NOTE,
dump_location_t::from_location_t (UNKNOWN_LOCATION),
node);
fprintf (f, "\"];\n");

for (slp_tree child : SLP_TREE_CHILDREN (node))
fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);

for (slp_tree child : SLP_TREE_CHILDREN (node))
if (child)
dot_slp_tree (f, child, visited);
2784 | } |
2785 | |
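/* Produce a dot graph of the SLP graph rooted at NODE in file FNAME. */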
2786 | DEBUG_FUNCTION void |
2787 | dot_slp_tree (const char *fname, slp_tree node) |
2788 | { |
FILE *f = fopen (fname, "w");
fprintf (f, "digraph {\n");
fflush (f);
{
debug_dump_context ctx (f);
hash_set<slp_tree> visited;
dot_slp_tree (f, node, visited);
}
fflush (f);
fprintf (f, "}\n");
fclose (f);
2800 | } |
2801 | |
/* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
2803 | |
2804 | static void |
2805 | vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc, |
2806 | slp_tree node, hash_set<slp_tree> &visited) |
2807 | { |
2808 | unsigned i; |
2809 | slp_tree child; |
2810 | |
if (visited.add (node))
2812 | return; |
2813 | |
2814 | vect_print_slp_tree (dump_kind, loc, node); |
2815 | |
2816 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2817 | if (child) |
vect_print_slp_graph (dump_kind, loc, child, visited);
2819 | } |
2820 | |
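/* Dump the SLP graph rooted at ENTRY using flags specified in DUMP_KIND. */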
2821 | static void |
2822 | vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc, |
2823 | slp_tree entry) |
2824 | { |
2825 | hash_set<slp_tree> visited; |
vect_print_slp_graph (dump_kind, loc, entry, visited);
2827 | } |
2828 | |
2829 | /* Mark the tree rooted at NODE with PURE_SLP. */ |
2830 | |
2831 | static void |
2832 | vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited) |
2833 | { |
2834 | int i; |
2835 | stmt_vec_info stmt_info; |
2836 | slp_tree child; |
2837 | |
2838 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
2839 | return; |
2840 | |
if (visited.add (node))
2842 | return; |
2843 | |
2844 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
2845 | STMT_SLP_TYPE (stmt_info) = pure_slp; |
2846 | |
2847 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2848 | if (child) |
vect_mark_slp_stmts (child, visited);
2850 | } |
2851 | |
2852 | static void |
2853 | vect_mark_slp_stmts (slp_tree node) |
2854 | { |
2855 | hash_set<slp_tree> visited; |
2856 | vect_mark_slp_stmts (node, visited); |
2857 | } |
2858 | |
2859 | /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */ |
2860 | |
2861 | static void |
2862 | vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited) |
2863 | { |
2864 | int i; |
2865 | stmt_vec_info stmt_info; |
2866 | slp_tree child; |
2867 | |
2868 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
2869 | return; |
2870 | |
if (visited.add (node))
2872 | return; |
2873 | |
2874 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
2875 | { |
2876 | gcc_assert (!STMT_VINFO_RELEVANT (stmt_info) |
2877 | || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope); |
2878 | STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope; |
2879 | } |
2880 | |
2881 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
2882 | if (child) |
vect_mark_slp_stmts_relevant (child, visited);
2884 | } |
2885 | |
2886 | static void |
2887 | vect_mark_slp_stmts_relevant (slp_tree node) |
2888 | { |
2889 | hash_set<slp_tree> visited; |
2890 | vect_mark_slp_stmts_relevant (node, visited); |
2891 | } |
2892 | |
2893 | |
/* Gather loads in the SLP graph NODE and populate the LOADS array. */
2895 | |
2896 | static void |
2897 | vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node, |
2898 | hash_set<slp_tree> &visited) |
2899 | { |
if (!node || visited.add (node))
2901 | return; |
2902 | |
2903 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
2904 | return; |
2905 | |
2906 | if (SLP_TREE_CODE (node) != VEC_PERM_EXPR) |
2907 | { |
2908 | stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); |
2909 | if (STMT_VINFO_DATA_REF (stmt_info) |
2910 | && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
loads.safe_push (node);
2912 | } |
2913 | |
2914 | unsigned i; |
2915 | slp_tree child; |
2916 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
vect_gather_slp_loads (loads, child, visited);
2918 | } |
2919 | |
2920 | |
/* Find the last scalar stmt in NODE. */
2922 | |
2923 | stmt_vec_info |
2924 | vect_find_last_scalar_stmt_in_slp (slp_tree node) |
2925 | { |
2926 | stmt_vec_info last = NULL; |
2927 | stmt_vec_info stmt_vinfo; |
2928 | |
for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
{
stmt_vinfo = vect_orig_stmt (stmt_vinfo);
last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2933 | } |
2934 | |
2935 | return last; |
2936 | } |
2937 | |
2938 | /* Find the first stmt in NODE. */ |
2939 | |
2940 | stmt_vec_info |
2941 | vect_find_first_scalar_stmt_in_slp (slp_tree node) |
2942 | { |
2943 | stmt_vec_info first = NULL; |
2944 | stmt_vec_info stmt_vinfo; |
2945 | |
for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
{
stmt_vinfo = vect_orig_stmt (stmt_vinfo);
if (!first
|| get_later_stmt (stmt_vinfo, first) == first)
2951 | first = stmt_vinfo; |
2952 | } |
2953 | |
2954 | return first; |
2955 | } |
2956 | |
2957 | /* Splits a group of stores, currently beginning at FIRST_VINFO, into |
2958 | two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE |
2959 | (also containing the first GROUP1_SIZE stmts, since stores are |
2960 | consecutive), the second containing the remainder. |
2961 | Return the first stmt in the second group. */ |
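/* For example, splitting a group of seven stores with GROUP1_SIZE four
yields one group of four stores and one of three; the DR_GROUP_GAP of
the second group then additionally skips the first four elements. */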
2962 | |
2963 | static stmt_vec_info |
2964 | vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size) |
2965 | { |
2966 | gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo); |
2967 | gcc_assert (group1_size > 0); |
2968 | int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size; |
2969 | gcc_assert (group2_size > 0); |
2970 | DR_GROUP_SIZE (first_vinfo) = group1_size; |
2971 | |
2972 | stmt_vec_info stmt_info = first_vinfo; |
2973 | for (unsigned i = group1_size; i > 1; i--) |
2974 | { |
2975 | stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info); |
2976 | gcc_assert (DR_GROUP_GAP (stmt_info) == 1); |
2977 | } |
2978 | /* STMT is now the last element of the first group. */ |
2979 | stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info); |
2980 | DR_GROUP_NEXT_ELEMENT (stmt_info) = 0; |
2981 | |
2982 | DR_GROUP_SIZE (group2) = group2_size; |
2983 | for (stmt_info = group2; stmt_info; |
2984 | stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) |
2985 | { |
2986 | DR_GROUP_FIRST_ELEMENT (stmt_info) = group2; |
2987 | gcc_assert (DR_GROUP_GAP (stmt_info) == 1); |
2988 | } |
2989 | |
2990 | /* For the second group, the DR_GROUP_GAP is that before the original group, |
2991 | plus skipping over the first vector. */ |
2992 | DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size; |
2993 | |
2994 | /* DR_GROUP_GAP of the first group now has to skip over the second group too. */ |
2995 | DR_GROUP_GAP (first_vinfo) += group2_size; |
2996 | |
2997 | if (dump_enabled_p ()) |
2998 | dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n" , |
2999 | group1_size, group2_size); |
3000 | |
3001 | return group2; |
3002 | } |
3003 | |
3004 | /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE |
3005 | statements and a vector of NUNITS elements. */ |
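/* For example, a group of six stmts with V4SI vectors gets an unrolling
factor of common_multiple (4, 6) / 6 == 2; two copies of the group
fill exactly three V4SI vectors. */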
3006 | |
3007 | static poly_uint64 |
3008 | calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) |
3009 | { |
return exact_div (common_multiple (nunits, group_size), group_size);
3011 | } |
3012 | |
3013 | /* Helper that checks to see if a node is a load node. */ |
3014 | |
3015 | static inline bool |
3016 | vect_is_slp_load_node (slp_tree root) |
3017 | { |
3018 | return SLP_TREE_DEF_TYPE (root) == vect_internal_def |
3019 | && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root)) |
3020 | && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))); |
3021 | } |
3022 | |
3023 | |
3024 | /* Helper function of optimize_load_redistribution that performs the operation |
3025 | recursively. */ |
3026 | |
3027 | static slp_tree |
3028 | optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map, |
3029 | vec_info *vinfo, unsigned int group_size, |
3030 | hash_map<slp_tree, slp_tree> *load_map, |
3031 | slp_tree root) |
3032 | { |
if (slp_tree *leader = load_map->get (root))
3034 | return *leader; |
3035 | |
3036 | slp_tree node; |
3037 | unsigned i; |
3038 | |
3039 | /* For now, we don't know anything about externals so do not do anything. */ |
3040 | if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def) |
3041 | return NULL; |
3042 | else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR) |
3043 | { |
3044 | /* First convert this node into a load node and add it to the leaves |
3045 | list and flatten the permute from a lane to a load one. If it's |
3046 | unneeded it will be elided later. */ |
3047 | vec<stmt_vec_info> stmts; |
3048 | stmts.create (SLP_TREE_LANES (root)); |
3049 | lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root); |
3050 | for (unsigned j = 0; j < lane_perm.length (); j++) |
3051 | { |
3052 | std::pair<unsigned, unsigned> perm = lane_perm[j]; |
3053 | node = SLP_TREE_CHILDREN (root)[perm.first]; |
3054 | |
if (!vect_is_slp_load_node (node)
3056 | || SLP_TREE_CHILDREN (node).exists ()) |
3057 | { |
3058 | stmts.release (); |
3059 | goto next; |
3060 | } |
3061 | |
3062 | stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]); |
3063 | } |
3064 | |
3065 | if (dump_enabled_p ()) |
3066 | dump_printf_loc (MSG_NOTE, vect_location, |
3067 | "converting stmts on permute node %p\n" , |
3068 | (void *) root); |
3069 | |
3070 | bool *matches = XALLOCAVEC (bool, group_size); |
3071 | poly_uint64 max_nunits = 1; |
3072 | unsigned tree_size = 0, limit = 1; |
      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
				  matches, &limit, &tree_size, bst_map);
3075 | if (!node) |
3076 | stmts.release (); |
3077 | |
      load_map->put (root, node);
3079 | return node; |
3080 | } |
3081 | |
3082 | next: |
  load_map->put (root, NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
3090 | if (value) |
3091 | { |
3092 | SLP_TREE_REF_COUNT (value)++; |
3093 | SLP_TREE_CHILDREN (root)[i] = value; |
	  /* ??? We know the original leaves of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
3097 | if (SLP_TREE_REF_COUNT (node) == 1) |
	    load_map->remove (node);
3099 | vect_free_slp_tree (node); |
3100 | } |
3101 | } |
3102 | |
3103 | return NULL; |
3104 | } |
3105 | |
3106 | /* Temporary workaround for loads not being CSEd during SLP build. This |
3107 | function will traverse the SLP tree rooted in ROOT for INSTANCE and find |
3108 | VEC_PERM nodes that blend vectors from multiple nodes that all read from the |
3109 | same DR such that the final operation is equal to a permuted load. Such |
3110 | NODES are then directly converted into LOADS themselves. The nodes are |
3111 | CSEd using BST_MAP. */ |
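
/* For example, a VEC_PERM_EXPR that selects lane 0 of a load node
   { a[0], a[1] } and lane 1 of a load node { a[2], a[3] }, both reading
   from the same DR, computes { a[0], a[3] } and can be rebuilt directly
   as a (permuted) load of those two scalars.  */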
3112 | |
3113 | static void |
3114 | optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map, |
3115 | vec_info *vinfo, unsigned int group_size, |
3116 | hash_map<slp_tree, slp_tree> *load_map, |
3117 | slp_tree root) |
3118 | { |
3119 | slp_tree node; |
3120 | unsigned i; |
3121 | |
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
3127 | if (value) |
3128 | { |
3129 | SLP_TREE_REF_COUNT (value)++; |
3130 | SLP_TREE_CHILDREN (root)[i] = value; |
	  /* ??? We know the original leaves of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
3134 | if (SLP_TREE_REF_COUNT (node) == 1) |
	    load_map->remove (node);
3136 | vect_free_slp_tree (node); |
3137 | } |
3138 | } |
3139 | } |
3140 | |
3141 | /* Helper function of vect_match_slp_patterns. |
3142 | |
3143 | Attempts to match patterns against the slp tree rooted in REF_NODE using |
3144 | VINFO. Patterns are matched in post-order traversal. |
3145 | |
   Returns true if a pattern was matched; on success the SLP tree
   referenced by REF_NODE is updated in place.  */
3148 | |
3149 | static bool |
3150 | vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo, |
3151 | slp_tree_to_load_perm_map_t *perm_cache, |
3152 | slp_compat_nodes_map_t *compat_cache, |
3153 | hash_set<slp_tree> *visited) |
3154 | { |
3155 | unsigned i; |
3156 | slp_tree node = *ref_node; |
3157 | bool found_p = false; |
  if (!node || visited->add (node))
3159 | return false; |
3160 | |
3161 | slp_tree child; |
3162 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3164 | vinfo, perm_cache, compat_cache, |
3165 | visited); |
3166 | |
3167 | for (unsigned x = 0; x < num__slp_patterns; x++) |
3168 | { |
3169 | vect_pattern *pattern |
3170 | = slp_patterns[x] (perm_cache, compat_cache, ref_node); |
3171 | if (pattern) |
3172 | { |
3173 | pattern->build (vinfo); |
3174 | delete pattern; |
3175 | found_p = true; |
3176 | } |
3177 | } |
3178 | |
3179 | return found_p; |
3180 | } |
3181 | |
/* Applies pattern matching to the SLP tree rooted in INSTANCE, using
   vec_info VINFO.

   Returns true if any pattern matched.  Patterns are tried in order and
   multiple patterns may match; the tree is modified in place.  */
3187 | |
3188 | static bool |
3189 | vect_match_slp_patterns (slp_instance instance, vec_info *vinfo, |
3190 | hash_set<slp_tree> *visited, |
3191 | slp_tree_to_load_perm_map_t *perm_cache, |
3192 | slp_compat_nodes_map_t *compat_cache) |
3193 | { |
3194 | DUMP_VECT_SCOPE ("vect_match_slp_patterns" ); |
3195 | slp_tree *ref_node = &SLP_INSTANCE_TREE (instance); |
3196 | |
3197 | if (dump_enabled_p ()) |
3198 | dump_printf_loc (MSG_NOTE, vect_location, |
3199 | "Analyzing SLP tree %p for patterns\n" , |
3200 | (void *) SLP_INSTANCE_TREE (instance)); |
3201 | |
3202 | return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache, |
3203 | visited); |
3204 | } |
3205 | |
3206 | /* STMT_INFO is a store group of size GROUP_SIZE that we are considering |
3207 | splitting into two, with the first split group having size NEW_GROUP_SIZE. |
3208 | Return true if we could use IFN_STORE_LANES instead and if that appears |
3209 | to be the better approach. */ |
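
/* For example, splitting GROUP_SIZE = 4 into NEW_GROUP_SIZE = 2 with a
   two-element vector makes both halves operate on whole vectors within
   an iteration, so we return false and allow the split; with a
   four-element vector neither multiple_p test holds and the answer
   depends on whether IFN_STORE_LANES is available.  */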
3210 | |
3211 | static bool |
3212 | vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info, |
3213 | unsigned int group_size, |
3214 | unsigned int new_group_size) |
3215 | { |
3216 | tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); |
3217 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); |
3218 | if (!vectype) |
3219 | return false; |
3220 | /* Allow the split if one of the two new groups would operate on full |
3221 | vectors *within* rather than across one scalar loop iteration. |
3222 | This is purely a heuristic, but it should work well for group |
3223 | sizes of 3 and 4, where the possible splits are: |
3224 | |
3225 | 3->2+1: OK if the vector has exactly two elements |
3226 | 4->2+2: Likewise |
3227 | 4->3+1: Less clear-cut. */ |
  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3230 | return false; |
3231 | return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST; |
3232 | } |
3233 | |
3234 | /* Analyze an SLP instance starting from a group of grouped stores. Call |
3235 | vect_build_slp_tree to build a tree of packed stmts if possible. |
3236 | Return FALSE if it's impossible to SLP any stmt in the loop. */ |
3237 | |
3238 | static bool |
3239 | vect_analyze_slp_instance (vec_info *vinfo, |
3240 | scalar_stmts_to_slp_tree_map_t *bst_map, |
3241 | stmt_vec_info stmt_info, slp_instance_kind kind, |
3242 | unsigned max_tree_size, unsigned *limit); |
3243 | |
3244 | /* Analyze an SLP instance starting from SCALAR_STMTS which are a group |
3245 | of KIND. Return true if successful. */ |
3246 | |
3247 | static bool |
3248 | vect_build_slp_instance (vec_info *vinfo, |
3249 | slp_instance_kind kind, |
3250 | vec<stmt_vec_info> &scalar_stmts, |
3251 | vec<stmt_vec_info> &root_stmt_infos, |
3252 | vec<tree> &remain, |
3253 | unsigned max_tree_size, unsigned *limit, |
3254 | scalar_stmts_to_slp_tree_map_t *bst_map, |
3255 | /* ??? We need stmt_info for group splitting. */ |
3256 | stmt_vec_info stmt_info_) |
3257 | { |
3258 | if (kind == slp_inst_kind_ctor) |
3259 | { |
3260 | if (dump_enabled_p ()) |
3261 | dump_printf_loc (MSG_NOTE, vect_location, |
3262 | "Analyzing vectorizable constructor: %G\n" , |
3263 | root_stmt_infos[0]->stmt); |
3264 | } |
3265 | |
3266 | if (dump_enabled_p ()) |
3267 | { |
3268 | dump_printf_loc (MSG_NOTE, vect_location, |
3269 | "Starting SLP discovery for\n" ); |
3270 | for (unsigned i = 0; i < scalar_stmts.length (); ++i) |
3271 | dump_printf_loc (MSG_NOTE, vect_location, |
3272 | " %G" , scalar_stmts[i]->stmt); |
3273 | } |
3274 | |
3275 | /* When a BB reduction doesn't have an even number of lanes |
3276 | strip it down, treating the remaining lane as scalar. |
3277 | ??? Selecting the optimal set of lanes to vectorize would be nice |
3278 | but SLP build for all lanes will fail quickly because we think |
3279 | we're going to need unrolling. */ |
3280 | if (kind == slp_inst_kind_bb_reduc |
3281 | && (scalar_stmts.length () & 1)) |
    remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
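  /* For example, with five lanes the last statement is popped and its
     scalar result is queued in REMAIN, leaving four lanes for SLP.  */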
3283 | |
3284 | /* Build the tree for the SLP instance. */ |
3285 | unsigned int group_size = scalar_stmts.length (); |
3286 | bool *matches = XALLOCAVEC (bool, group_size); |
3287 | poly_uint64 max_nunits = 1; |
3288 | unsigned tree_size = 0; |
3289 | unsigned i; |
  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
				       &max_nunits, matches, limit,
				       &tree_size, bst_map);
3293 | if (node != NULL) |
3294 | { |
3295 | /* Calculate the unrolling factor based on the smallest type. */ |
3296 | poly_uint64 unrolling_factor |
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
3304 | || const_max_nunits > group_size) |
3305 | { |
3306 | if (dump_enabled_p ()) |
3307 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3308 | "Build SLP failed: store group " |
3309 | "size not a multiple of the vector size " |
3310 | "in basic block SLP\n" ); |
3311 | vect_free_slp_tree (node); |
3312 | return false; |
3313 | } |
3314 | /* Fatal mismatch. */ |
3315 | if (dump_enabled_p ()) |
3316 | dump_printf_loc (MSG_NOTE, vect_location, |
3317 | "SLP discovery succeeded but node needs " |
3318 | "splitting\n" ); |
3319 | memset (s: matches, c: true, n: group_size); |
3320 | matches[group_size / const_max_nunits * const_max_nunits] = false; |
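	  /* E.g. GROUP_SIZE = 7 with CONST_MAX_NUNITS = 4 records the
	     mismatch at lane 7 / 4 * 4 = 4, the last full-vector
	     boundary.  */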
3321 | vect_free_slp_tree (node); |
3322 | } |
3323 | else |
3324 | { |
3325 | /* Create a new SLP instance. */ |
3326 | slp_instance new_instance = XNEW (class _slp_instance); |
3327 | SLP_INSTANCE_TREE (new_instance) = node; |
3328 | SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; |
3329 | SLP_INSTANCE_LOADS (new_instance) = vNULL; |
3330 | SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos; |
3331 | SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain; |
3332 | SLP_INSTANCE_KIND (new_instance) = kind; |
3333 | new_instance->reduc_phis = NULL; |
3334 | new_instance->cost_vec = vNULL; |
3335 | new_instance->subgraph_entries = vNULL; |
3336 | |
3337 | if (dump_enabled_p ()) |
3338 | dump_printf_loc (MSG_NOTE, vect_location, |
3339 | "SLP size %u vs. limit %u.\n" , |
3340 | tree_size, max_tree_size); |
3341 | |
3342 | /* Fixup SLP reduction chains. */ |
3343 | if (kind == slp_inst_kind_reduc_chain) |
3344 | { |
3345 | /* If this is a reduction chain with a conversion in front |
3346 | amend the SLP tree with a node for that. */ |
3347 | gimple *scalar_def |
	    = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3349 | if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def) |
3350 | { |
3351 | /* Get at the conversion stmt - we know it's the single use |
3352 | of the last stmt of the reduction chain. */ |
3353 | use_operand_p use_p; |
	      bool r = single_imm_use (gimple_assign_lhs (scalar_def),
				       &use_p, &scalar_def);
	      gcc_assert (r);
	      stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
	      next_info = vect_stmt_to_vectorize (next_info);
	      scalar_stmts = vNULL;
	      scalar_stmts.create (group_size);
	      for (unsigned i = 0; i < group_size; ++i)
		scalar_stmts.quick_push (next_info);
	      slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
	      SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
	      SLP_TREE_CHILDREN (conv).quick_push (node);
3366 | SLP_INSTANCE_TREE (new_instance) = conv; |
3367 | /* We also have to fake this conversion stmt as SLP reduction |
3368 | group so we don't have to mess with too much code |
3369 | elsewhere. */ |
3370 | REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info; |
3371 | REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL; |
3372 | } |
3373 | /* Fill the backedge child of the PHI SLP node. The |
3374 | general matching code cannot find it because the |
3375 | scalar code does not reflect how we vectorize the |
3376 | reduction. */ |
3377 | use_operand_p use_p; |
3378 | imm_use_iterator imm_iter; |
3379 | class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo)); |
3380 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, |
3381 | gimple_get_lhs (scalar_def)) |
3382 | /* There are exactly two non-debug uses, the reduction |
3383 | PHI and the loop-closed PHI node. */ |
3384 | if (!is_gimple_debug (USE_STMT (use_p)) |
3385 | && gimple_bb (USE_STMT (use_p)) == loop->header) |
3386 | { |
3387 | auto_vec<stmt_vec_info, 64> phis (group_size); |
3388 | stmt_vec_info phi_info |
3389 | = vinfo->lookup_stmt (USE_STMT (use_p)); |
3390 | for (unsigned i = 0; i < group_size; ++i) |
		    phis.quick_push (phi_info);
		  slp_tree *phi_node = bst_map->get (phis);
3393 | unsigned dest_idx = loop_latch_edge (loop)->dest_idx; |
3394 | SLP_TREE_CHILDREN (*phi_node)[dest_idx] |
3395 | = SLP_INSTANCE_TREE (new_instance); |
3396 | SLP_INSTANCE_TREE (new_instance)->refcnt++; |
3397 | } |
3398 | } |
3399 | |
	  vinfo->slp_instances.safe_push (new_instance);
3401 | |
3402 | /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with |
3403 | the number of scalar stmts in the root in a few places. |
3404 | Verify that assumption holds. */ |
3405 | gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance)) |
3406 | .length () == group_size); |
3407 | |
3408 | if (dump_enabled_p ()) |
3409 | { |
3410 | dump_printf_loc (MSG_NOTE, vect_location, |
3411 | "Final SLP tree for instance %p:\n" , |
3412 | (void *) new_instance); |
3413 | vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location, |
3414 | SLP_INSTANCE_TREE (new_instance)); |
3415 | } |
3416 | |
3417 | return true; |
3418 | } |
3419 | } |
3420 | else |
3421 | { |
3422 | /* Failed to SLP. */ |
3423 | /* Free the allocated memory. */ |
3424 | scalar_stmts.release (); |
3425 | } |
3426 | |
3427 | stmt_vec_info stmt_info = stmt_info_; |
3428 | /* Try to break the group up into pieces. */ |
3429 | if (kind == slp_inst_kind_store) |
3430 | { |
3431 | /* ??? We could delay all the actual splitting of store-groups |
3432 | until after SLP discovery of the original group completed. |
3433 | Then we can recurse to vect_build_slp_instance directly. */ |
3434 | for (i = 0; i < group_size; i++) |
3435 | if (!matches[i]) |
3436 | break; |
3437 | |
3438 | /* For basic block SLP, try to break the group up into multiples of |
3439 | a vector size. */ |
      if (is_a <bb_vec_info> (vinfo)
3441 | && (i > 1 && i < group_size)) |
3442 | { |
3443 | tree scalar_type |
3444 | = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); |
	  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						      1 << floor_log2 (i));
	  unsigned HOST_WIDE_INT const_nunits;
	  if (vectype
	      && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3450 | { |
3451 | /* Split into two groups at the first vector boundary. */ |
3452 | gcc_assert ((const_nunits & (const_nunits - 1)) == 0); |
3453 | unsigned group1_size = i & ~(const_nunits - 1); |
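	      /* For example, a first mismatch at I = 6 with CONST_NUNITS = 4
		 yields GROUP1_SIZE = 6 & ~3 = 4.  */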
3454 | |
3455 | if (dump_enabled_p ()) |
3456 | dump_printf_loc (MSG_NOTE, vect_location, |
3457 | "Splitting SLP group at stmt %u\n" , i); |
3458 | stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info, |
3459 | group1_size); |
3460 | bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info, |
3461 | kind, max_tree_size, |
3462 | limit); |
3463 | /* Split the rest at the failure point and possibly |
3464 | re-analyze the remaining matching part if it has |
3465 | at least two lanes. */ |
3466 | if (group1_size < i |
3467 | && (i + 1 < group_size |
3468 | || i - group1_size > 1)) |
3469 | { |
3470 | stmt_vec_info rest2 = rest; |
		  rest = vect_split_slp_store_group (rest, i - group1_size);
		  if (i - group1_size > 1)
		    res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3474 | kind, max_tree_size, |
3475 | limit); |
3476 | } |
3477 | /* Re-analyze the non-matching tail if it has at least |
3478 | two lanes. */ |
3479 | if (i + 1 < group_size) |
3480 | res |= vect_analyze_slp_instance (vinfo, bst_map, |
						  rest, kind, max_tree_size,
3482 | limit); |
3483 | return res; |
3484 | } |
3485 | } |
3486 | |
3487 | /* For loop vectorization split into arbitrary pieces of size > 1. */ |
      if (is_a <loop_vec_info> (vinfo)
	  && (i > 1 && i < group_size)
	  && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3491 | { |
3492 | unsigned group1_size = i; |
3493 | |
3494 | if (dump_enabled_p ()) |
3495 | dump_printf_loc (MSG_NOTE, vect_location, |
3496 | "Splitting SLP group at stmt %u\n" , i); |
3497 | |
3498 | stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info, |
3499 | group1_size); |
	  /* Loop vectorization cannot handle gaps in stores, so make sure
	     the split group appears as strided.  */
3502 | STMT_VINFO_STRIDED_P (rest) = 1; |
3503 | DR_GROUP_GAP (rest) = 0; |
3504 | STMT_VINFO_STRIDED_P (stmt_info) = 1; |
3505 | DR_GROUP_GAP (stmt_info) = 0; |
3506 | |
3507 | bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info, |
3508 | kind, max_tree_size, limit); |
3509 | if (i + 1 < group_size) |
3510 | res |= vect_analyze_slp_instance (vinfo, bst_map, |
					      rest, kind, max_tree_size, limit);
3512 | |
3513 | return res; |
3514 | } |
3515 | |
      /* Even though the first vector did not all match, we might still be
	 able to SLP (some of) the remainder.  FORNOW ignore this
	 possibility.  */
3518 | } |
3519 | |
3520 | /* Failed to SLP. */ |
3521 | if (dump_enabled_p ()) |
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3523 | return false; |
3524 | } |
3525 | |
3526 | |
3527 | /* Analyze an SLP instance starting from a group of grouped stores. Call |
3528 | vect_build_slp_tree to build a tree of packed stmts if possible. |
3529 | Return FALSE if it's impossible to SLP any stmt in the loop. */ |
3530 | |
3531 | static bool |
3532 | vect_analyze_slp_instance (vec_info *vinfo, |
3533 | scalar_stmts_to_slp_tree_map_t *bst_map, |
3534 | stmt_vec_info stmt_info, |
3535 | slp_instance_kind kind, |
3536 | unsigned max_tree_size, unsigned *limit) |
3537 | { |
3538 | unsigned int i; |
3539 | vec<stmt_vec_info> scalar_stmts; |
3540 | |
  if (is_a <bb_vec_info> (vinfo))
3542 | vect_location = stmt_info->stmt; |
3543 | |
3544 | stmt_vec_info next_info = stmt_info; |
3545 | if (kind == slp_inst_kind_store) |
3546 | { |
3547 | /* Collect the stores and store them in scalar_stmts. */ |
3548 | scalar_stmts.create (DR_GROUP_SIZE (stmt_info)); |
3549 | while (next_info) |
3550 | { |
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3552 | next_info = DR_GROUP_NEXT_ELEMENT (next_info); |
3553 | } |
3554 | } |
3555 | else if (kind == slp_inst_kind_reduc_chain) |
3556 | { |
3557 | /* Collect the reduction stmts and store them in scalar_stmts. */ |
3558 | scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info)); |
3559 | while (next_info) |
3560 | { |
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3562 | next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); |
3563 | } |
3564 | /* Mark the first element of the reduction chain as reduction to properly |
3565 | transform the node. In the reduction analysis phase only the last |
3566 | element of the chain is marked as reduction. */ |
3567 | STMT_VINFO_DEF_TYPE (stmt_info) |
3568 | = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); |
3569 | STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) |
3570 | = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); |
3571 | } |
3572 | else if (kind == slp_inst_kind_reduc_group) |
3573 | { |
3574 | /* Collect reduction statements. */ |
3575 | const vec<stmt_vec_info> &reductions |
	= as_a <loop_vec_info> (vinfo)->reductions;
      scalar_stmts.create (reductions.length ());
      for (i = 0; reductions.iterate (i, &next_info); i++)
3579 | if ((STMT_VINFO_RELEVANT_P (next_info) |
3580 | || STMT_VINFO_LIVE_P (next_info)) |
3581 | /* ??? Make sure we didn't skip a conversion around a reduction |
3582 | path. In that case we'd have to reverse engineer that conversion |
3583 | stmt following the chain using reduc_idx and from the PHI |
3584 | using reduc_def. */ |
3585 | && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def) |
	  scalar_stmts.quick_push (next_info);
      /* If fewer than two were relevant/live there's nothing to SLP.  */
3588 | if (scalar_stmts.length () < 2) |
3589 | return false; |
3590 | } |
3591 | else |
3592 | gcc_unreachable (); |
3593 | |
3594 | vec<stmt_vec_info> roots = vNULL; |
3595 | vec<tree> remain = vNULL; |
3596 | /* Build the tree for the SLP instance. */ |
  bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
				      roots, remain,
				      max_tree_size, limit, bst_map,
				      kind == slp_inst_kind_store
				      ? stmt_info : NULL);
3602 | |
3603 | /* ??? If this is slp_inst_kind_store and the above succeeded here's |
3604 | where we should do store group splitting. */ |
3605 | |
3606 | return res; |
3607 | } |
3608 | |
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */
3611 | |
3612 | opt_result |
3613 | vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) |
3614 | { |
3615 | unsigned int i; |
3616 | stmt_vec_info first_element; |
3617 | slp_instance instance; |
3618 | |
3619 | DUMP_VECT_SCOPE ("vect_analyze_slp" ); |
3620 | |
3621 | unsigned limit = max_tree_size; |
3622 | |
3623 | scalar_stmts_to_slp_tree_map_t *bst_map |
3624 | = new scalar_stmts_to_slp_tree_map_t (); |
3625 | |
3626 | /* Find SLP sequences starting from groups of grouped stores. */ |
3627 | FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) |
    vect_analyze_slp_instance (vinfo, bst_map, first_element,
			       slp_inst_kind_store, max_tree_size, &limit);
3630 | |
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3632 | { |
3633 | for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i) |
3634 | { |
3635 | vect_location = bb_vinfo->roots[i].roots[0]->stmt; |
	  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
				       bb_vinfo->roots[i].stmts,
				       bb_vinfo->roots[i].roots,
				       bb_vinfo->roots[i].remain,
				       max_tree_size, &limit, bst_map, NULL))
3641 | { |
3642 | bb_vinfo->roots[i].stmts = vNULL; |
3643 | bb_vinfo->roots[i].roots = vNULL; |
3644 | bb_vinfo->roots[i].remain = vNULL; |
3645 | } |
3646 | } |
3647 | } |
3648 | |
  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3650 | { |
3651 | /* Find SLP sequences starting from reduction chains. */ |
3652 | FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element) |
3653 | if (! STMT_VINFO_RELEVANT_P (first_element) |
3654 | && ! STMT_VINFO_LIVE_P (first_element)) |
3655 | ; |
      else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
					    slp_inst_kind_reduc_chain,
					    max_tree_size, &limit))
3659 | { |
3660 | /* Dissolve reduction chain group. */ |
3661 | stmt_vec_info vinfo = first_element; |
3662 | stmt_vec_info last = NULL; |
3663 | while (vinfo) |
3664 | { |
3665 | stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo); |
3666 | REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL; |
3667 | REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL; |
3668 | last = vinfo; |
3669 | vinfo = next; |
3670 | } |
3671 | STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def; |
	  /* It can still be vectorized as part of an SLP reduction.  */
	  loop_vinfo->reductions.safe_push (last);
3674 | } |
3675 | |
3676 | /* Find SLP sequences starting from groups of reductions. */ |
3677 | if (loop_vinfo->reductions.length () > 1) |
      vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
				 slp_inst_kind_reduc_group, max_tree_size,
				 &limit);
3681 | } |
3682 | |
3683 | hash_set<slp_tree> visited_patterns; |
3684 | slp_tree_to_load_perm_map_t perm_cache; |
3685 | slp_compat_nodes_map_t compat_cache; |
3686 | |
3687 | /* See if any patterns can be found in the SLP tree. */ |
3688 | bool pattern_found = false; |
3689 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) |
3690 | pattern_found |= vect_match_slp_patterns (instance, vinfo, |
					      &visited_patterns, &perm_cache,
					      &compat_cache);
3693 | |
3694 | /* If any were found optimize permutations of loads. */ |
3695 | if (pattern_found) |
3696 | { |
3697 | hash_map<slp_tree, slp_tree> load_map; |
3698 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) |
3699 | { |
3700 | slp_tree root = SLP_INSTANCE_TREE (instance); |
	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
					&load_map, root);
3703 | } |
3704 | } |
3705 | |
3706 | |
3707 | |
3708 | /* The map keeps a reference on SLP nodes built, release that. */ |
3709 | for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); |
3710 | it != bst_map->end (); ++it) |
3711 | if ((*it).second) |
      vect_free_slp_tree ((*it).second);
3713 | delete bst_map; |
3714 | |
3715 | if (pattern_found && dump_enabled_p ()) |
3716 | { |
3717 | dump_printf_loc (MSG_NOTE, vect_location, |
3718 | "Pattern matched SLP tree\n" ); |
3719 | hash_set<slp_tree> visited; |
3720 | FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) |
      vect_print_slp_graph (MSG_NOTE, vect_location,
			    SLP_INSTANCE_TREE (instance), visited);
3723 | } |
3724 | |
3725 | return opt_result::success (); |
3726 | } |
3727 | |
3728 | /* Estimates the cost of inserting layout changes into the SLP graph. |
3729 | It can also say that the insertion is impossible. */ |
3730 | |
3731 | struct slpg_layout_cost |
3732 | { |
3733 | slpg_layout_cost () = default; |
3734 | slpg_layout_cost (sreal, bool); |
3735 | |
3736 | static slpg_layout_cost impossible () { return { sreal::max (), 0 }; } |
3737 | bool is_possible () const { return depth != sreal::max (); } |
3738 | |
3739 | bool operator== (const slpg_layout_cost &) const; |
3740 | bool operator!= (const slpg_layout_cost &) const; |
3741 | |
3742 | bool is_better_than (const slpg_layout_cost &, bool) const; |
3743 | |
3744 | void add_parallel_cost (const slpg_layout_cost &); |
3745 | void add_serial_cost (const slpg_layout_cost &); |
3746 | void split (unsigned int); |
3747 | |
3748 | /* The longest sequence of layout changes needed during any traversal |
3749 | of the partition dag, weighted by execution frequency. |
3750 | |
3751 | This is the most important metric when optimizing for speed, since |
3752 | it helps to ensure that we keep the number of operations on |
3753 | critical paths to a minimum. */ |
3754 | sreal depth = 0; |
3755 | |
3756 | /* An estimate of the total number of operations needed. It is weighted by |
3757 | execution frequency when optimizing for speed but not when optimizing for |
3758 | size. In order to avoid double-counting, a node with a fanout of N will |
3759 | distribute 1/N of its total cost to each successor. |
3760 | |
3761 | This is the most important metric when optimizing for size, since |
     it helps to keep the total number of operations to a minimum.  */
3763 | sreal total = 0; |
3764 | }; |
3765 | |
3766 | /* Construct costs for a node with weight WEIGHT. A higher weight |
3767 | indicates more frequent execution. IS_FOR_SIZE is true if we are |
3768 | optimizing for size rather than speed. */ |
3769 | |
3770 | slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size) |
3771 | : depth (weight), total (is_for_size && weight > 0 ? 1 : weight) |
3772 | { |
3773 | } |
3774 | |
3775 | bool |
3776 | slpg_layout_cost::operator== (const slpg_layout_cost &other) const |
3777 | { |
3778 | return depth == other.depth && total == other.total; |
3779 | } |
3780 | |
3781 | bool |
3782 | slpg_layout_cost::operator!= (const slpg_layout_cost &other) const |
3783 | { |
3784 | return !operator== (other); |
3785 | } |
3786 | |
3787 | /* Return true if these costs are better than OTHER. IS_FOR_SIZE is |
3788 | true if we are optimizing for size rather than speed. */ |
3789 | |
3790 | bool |
3791 | slpg_layout_cost::is_better_than (const slpg_layout_cost &other, |
3792 | bool is_for_size) const |
3793 | { |
3794 | if (is_for_size) |
3795 | { |
3796 | if (total != other.total) |
3797 | return total < other.total; |
3798 | return depth < other.depth; |
3799 | } |
3800 | else |
3801 | { |
3802 | if (depth != other.depth) |
3803 | return depth < other.depth; |
3804 | return total < other.total; |
3805 | } |
3806 | } |
3807 | |
3808 | /* Increase the costs to account for something with cost INPUT_COST |
3809 | happening in parallel with the current costs. */ |
3810 | |
3811 | void |
3812 | slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost) |
3813 | { |
  depth = std::max (depth, input_cost.depth);
3815 | total += input_cost.total; |
3816 | } |
3817 | |
3818 | /* Increase the costs to account for something with cost INPUT_COST |
3819 | happening in series with the current costs. */ |
3820 | |
3821 | void |
3822 | slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other) |
3823 | { |
3824 | depth += other.depth; |
3825 | total += other.total; |
3826 | } |
3827 | |
3828 | /* Split the total cost among TIMES successors or predecessors. */ |
3829 | |
3830 | void |
3831 | slpg_layout_cost::split (unsigned int times) |
3832 | { |
3833 | if (times > 1) |
3834 | total /= times; |
3835 | } |
3836 | |
3837 | /* Information about one node in the SLP graph, for use during |
3838 | vect_optimize_slp_pass. */ |
3839 | |
3840 | struct slpg_vertex |
3841 | { |
3842 | slpg_vertex (slp_tree node_) : node (node_) {} |
3843 | |
3844 | /* The node itself. */ |
3845 | slp_tree node; |
3846 | |
3847 | /* Which partition the node belongs to, or -1 if none. Nodes outside of |
3848 | partitions are flexible; they can have whichever layout consumers |
3849 | want them to have. */ |
3850 | int partition = -1; |
3851 | |
3852 | /* The number of nodes that directly use the result of this one |
3853 | (i.e. the number of nodes that count this one as a child). */ |
3854 | unsigned int out_degree = 0; |
3855 | |
3856 | /* The execution frequency of the node. */ |
3857 | sreal weight = 0; |
3858 | |
3859 | /* The total execution frequency of all nodes that directly use the |
3860 | result of this one. */ |
3861 | sreal out_weight = 0; |
3862 | }; |
3863 | |
3864 | /* Information about one partition of the SLP graph, for use during |
3865 | vect_optimize_slp_pass. */ |
3866 | |
3867 | struct slpg_partition_info |
3868 | { |
3869 | /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END) |
3870 | of m_partitioned_nodes. */ |
3871 | unsigned int node_begin = 0; |
3872 | unsigned int node_end = 0; |
3873 | |
3874 | /* Which layout we've chosen to use for this partition, or -1 if |
3875 | we haven't picked one yet. */ |
3876 | int layout = -1; |
3877 | |
3878 | /* The number of predecessors and successors in the partition dag. |
3879 | The predecessors always have lower partition numbers and the |
3880 | successors always have higher partition numbers. |
3881 | |
3882 | Note that the directions of these edges are not necessarily the |
3883 | same as in the data flow graph. For example, if an SCC has separate |
3884 | partitions for an inner loop and an outer loop, the inner loop's |
3885 | partition will have at least two incoming edges from the outer loop's |
3886 | partition: one for a live-in value and one for a live-out value. |
3887 | In data flow terms, one of these edges would also be from the outer loop |
3888 | to the inner loop, but the other would be in the opposite direction. */ |
3889 | unsigned int in_degree = 0; |
3890 | unsigned int out_degree = 0; |
3891 | }; |
3892 | |
3893 | /* Information about the costs of using a particular layout for a |
3894 | particular partition. It can also say that the combination is |
3895 | impossible. */ |
3896 | |
3897 | struct slpg_partition_layout_costs |
3898 | { |
3899 | bool is_possible () const { return internal_cost.is_possible (); } |
3900 | void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); } |
3901 | |
3902 | /* The costs inherited from predecessor partitions. */ |
3903 | slpg_layout_cost in_cost; |
3904 | |
3905 | /* The inherent cost of the layout within the node itself. For example, |
3906 | this is nonzero for a load if choosing a particular layout would require |
3907 | the load to permute the loaded elements. It is nonzero for a |
3908 | VEC_PERM_EXPR if the permutation cannot be eliminated or converted |
3909 | to full-vector moves. */ |
3910 | slpg_layout_cost internal_cost; |
3911 | |
3912 | /* The costs inherited from successor partitions. */ |
3913 | slpg_layout_cost out_cost; |
3914 | }; |
3915 | |
3916 | /* This class tries to optimize the layout of vectors in order to avoid |
   unnecessary shuffling.  At the moment, the set of possible layouts is
   restricted to bijective permutations.
3919 | |
3920 | The goal of the pass depends on whether we're optimizing for size or |
3921 | for speed. When optimizing for size, the goal is to reduce the overall |
3922 | number of layout changes (including layout changes implied by things |
3923 | like load permutations). When optimizing for speed, the goal is to |
3924 | reduce the maximum latency attributable to layout changes on any |
3925 | non-cyclical path through the data flow graph. |
3926 | |
3927 | For example, when optimizing a loop nest for speed, we will prefer |
3928 | to make layout changes outside of a loop rather than inside of a loop, |
3929 | and will prefer to make layout changes in parallel rather than serially, |
3930 | even if that increases the overall number of layout changes. |
3931 | |
3932 | The high-level procedure is: |
3933 | |
3934 | (1) Build a graph in which edges go from uses (parents) to definitions |
3935 | (children). |
3936 | |
3937 | (2) Divide the graph into a dag of strongly-connected components (SCCs). |
3938 | |
3939 | (3) When optimizing for speed, partition the nodes in each SCC based |
3940 | on their containing cfg loop. When optimizing for size, treat |
3941 | each SCC as a single partition. |
3942 | |
3943 | This gives us a dag of partitions. The goal is now to assign a |
3944 | layout to each partition. |
3945 | |
3946 | (4) Construct a set of vector layouts that are worth considering. |
3947 | Record which nodes must keep their current layout. |
3948 | |
3949 | (5) Perform a forward walk over the partition dag (from loads to stores) |
3950 | accumulating the "forward" cost of using each layout. When visiting |
3951 | each partition, assign a tentative choice of layout to the partition |
3952 | and use that choice when calculating the cost of using a different |
3953 | layout in successor partitions. |
3954 | |
3955 | (6) Perform a backward walk over the partition dag (from stores to loads), |
3956 | accumulating the "backward" cost of using each layout. When visiting |
3957 | each partition, make a final choice of layout for that partition based |
3958 | on the accumulated forward costs (from (5)) and backward costs |
3959 | (from (6)). |
3960 | |
3961 | (7) Apply the chosen layouts to the SLP graph. |
3962 | |
3963 | For example, consider the SLP statements: |
3964 | |
3965 | S1: a_1 = load |
3966 | loop: |
3967 | S2: a_2 = PHI<a_1, a_3> |
3968 | S3: b_1 = load |
3969 | S4: a_3 = a_2 + b_1 |
3970 | exit: |
3971 | S5: a_4 = PHI<a_3> |
3972 | S6: store a_4 |
3973 | |
3974 | S2 and S4 form an SCC and are part of the same loop. Every other |
3975 | statement is in a singleton SCC. In this example there is a one-to-one |
   mapping between SCCs and partitions and the partition dag looks like this:
3977 | |
3978 | S1 S3 |
3979 | \ / |
3980 | S2+S4 |
3981 | | |
3982 | S5 |
3983 | | |
3984 | S6 |
3985 | |
3986 | S2, S3 and S4 will have a higher execution frequency than the other |
3987 | statements, so when optimizing for speed, the goal is to avoid any |
3988 | layout changes: |
3989 | |
3990 | - within S3 |
3991 | - within S2+S4 |
3992 | - on the S3->S2+S4 edge |
3993 | |
3994 | For example, if S3 was originally a reversing load, the goal of the |
3995 | pass is to make it an unreversed load and change the layout on the |
3996 | S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout |
3997 | on S1->S2+S4 and S5->S6 would also be acceptable.) |
3998 | |
3999 | The difference between SCCs and partitions becomes important if we |
4000 | add an outer loop: |
4001 | |
4002 | S1: a_1 = ... |
4003 | loop1: |
4004 | S2: a_2 = PHI<a_1, a_6> |
4005 | S3: b_1 = load |
4006 | S4: a_3 = a_2 + b_1 |
4007 | loop2: |
4008 | S5: a_4 = PHI<a_3, a_5> |
4009 | S6: c_1 = load |
4010 | S7: a_5 = a_4 + c_1 |
4011 | exit2: |
4012 | S8: a_6 = PHI<a_5> |
4013 | S9: store a_6 |
4014 | exit1: |
4015 | |
4016 | Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing |
4017 | for speed, we usually do not want restrictions in the outer loop to "infect" |
4018 | the decision for the inner loop. For example, if an outer-loop node |
4019 | in the SCC contains a statement with a fixed layout, that should not |
4020 | prevent the inner loop from using a different layout. Conversely, |
4021 | the inner loop should not dictate a layout to the outer loop: if the |
4022 | outer loop does a lot of computation, then it may not be efficient to |
4023 | do all of that computation in the inner loop's preferred layout. |
4024 | |
4025 | So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer) |
4026 | and S5+S7 (inner). We also try to arrange partitions so that: |
4027 | |
4028 | - the partition for an outer loop comes before the partition for |
4029 | an inner loop |
4030 | |
4031 | - if a sibling loop A dominates a sibling loop B, A's partition |
4032 | comes before B's |
4033 | |
4034 | This gives the following partition dag for the example above: |
4035 | |
4036 | S1 S3 |
4037 | \ / |
4038 | S2+S4+S8 S6 |
4039 | | \\ / |
4040 | | S5+S7 |
4041 | | |
4042 | S9 |
4043 | |
4044 | There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and |
4045 | one for a reversal of the edge S7->S8. |
4046 | |
4047 | The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice |
4048 | for S2+S4+S8 therefore has to balance the cost of using the outer loop's |
4049 | preferred layout against the cost of changing the layout on entry to the |
4050 | inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed). |
4051 | |
4052 | Although this works well when optimizing for speed, it has the downside |
4053 | when optimizing for size that the choice of layout for S5+S7 is completely |
4054 | independent of S9, which lessens the chance of reducing the overall number |
4055 | of permutations. We therefore do not partition SCCs when optimizing |
4056 | for size. |
4057 | |
4058 | To give a concrete example of the difference between optimizing |
4059 | for size and speed, consider: |
4060 | |
4061 | a[0] = (b[1] << c[3]) - d[1]; |
4062 | a[1] = (b[0] << c[2]) - d[0]; |
4063 | a[2] = (b[3] << c[1]) - d[3]; |
4064 | a[3] = (b[2] << c[0]) - d[2]; |
4065 | |
4066 | There are three different layouts here: one for a, one for b and d, |
4067 | and one for c. When optimizing for speed it is better to permute each |
4068 | of b, c and d into the order required by a, since those permutations |
4069 | happen in parallel. But when optimizing for size, it is better to: |
4070 | |
4071 | - permute c into the same order as b |
4072 | - do the arithmetic |
4073 | - permute the result into the order required by a |
4074 | |
4075 | This gives 2 permutations rather than 3. */ |
4076 | |
4077 | class vect_optimize_slp_pass |
4078 | { |
4079 | public: |
4080 | vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {} |
4081 | void run (); |
4082 | |
4083 | private: |
4084 | /* Graph building. */ |
4085 | struct loop *containing_loop (slp_tree); |
4086 | bool is_cfg_latch_edge (graph_edge *); |
4087 | void build_vertices (hash_set<slp_tree> &, slp_tree); |
4088 | void build_vertices (); |
4089 | void build_graph (); |
4090 | |
4091 | /* Partitioning. */ |
4092 | void create_partitions (); |
4093 | template<typename T> void for_each_partition_edge (unsigned int, T); |
4094 | |
4095 | /* Layout selection. */ |
4096 | bool is_compatible_layout (slp_tree, unsigned int); |
4097 | int change_layout_cost (slp_tree, unsigned int, unsigned int); |
4098 | slpg_partition_layout_costs &partition_layout_costs (unsigned int, |
4099 | unsigned int); |
4100 | void change_vec_perm_layout (slp_tree, lane_permutation_t &, |
4101 | int, unsigned int); |
4102 | int internal_node_cost (slp_tree, int, unsigned int); |
4103 | void start_choosing_layouts (); |
4104 | |
4105 | /* Cost propagation. */ |
4106 | slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int, |
4107 | unsigned int, unsigned int); |
4108 | slpg_layout_cost total_in_cost (unsigned int); |
4109 | slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int); |
4110 | slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int); |
4111 | void forward_pass (); |
4112 | void backward_pass (); |
4113 | |
4114 | /* Rematerialization. */ |
4115 | slp_tree get_result_with_layout (slp_tree, unsigned int); |
4116 | void materialize (); |
4117 | |
4118 | /* Clean-up. */ |
4119 | void remove_redundant_permutations (); |
4120 | |
4121 | void dump (); |
4122 | |
4123 | vec_info *m_vinfo; |
4124 | |
4125 | /* True if we should optimize the graph for size, false if we should |
4126 | optimize it for speed. (It wouldn't be easy to make this decision |
4127 | more locally.) */ |
4128 | bool m_optimize_size; |
4129 | |
4130 | /* A graph of all SLP nodes, with edges leading from uses to definitions. |
4131 | In other words, a node's predecessors are its slp_tree parents and |
4132 | a node's successors are its slp_tree children. */ |
4133 | graph *m_slpg = nullptr; |
4134 | |
4135 | /* The vertices of M_SLPG, indexed by slp_tree::vertex. */ |
4136 | auto_vec<slpg_vertex> m_vertices; |
4137 | |
  /* The list of all leaves of M_SLPG, such as external definitions,
     constants, and loads.  */
4140 | auto_vec<int> m_leafs; |
4141 | |
4142 | /* This array has one entry for every vector layout that we're considering. |
4143 | Element 0 is null and indicates "no change". Other entries describe |
4144 | permutations that are inherent in the current graph and that we would |
4145 | like to reverse if possible. |
4146 | |
4147 | For example, a permutation { 1, 2, 3, 0 } means that something has |
4148 | effectively been permuted in that way, such as a load group |
4149 | { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]). |
4150 | We'd then like to apply the reverse permutation { 3, 0, 1, 2 } |
4151 | in order to put things "back" in order. */ |
4152 | auto_vec<vec<unsigned> > m_perms; |
4153 | |
4154 | /* A partitioning of the nodes for which a layout must be chosen. |
4155 | Each partition represents an <SCC, cfg loop> pair; that is, |
4156 | nodes in different SCCs belong to different partitions, and nodes |
4157 | within an SCC can be further partitioned according to a containing |
4158 | cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if: |
4159 | |
4160 | - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk |
4161 | from leaves (such as loads) to roots (such as stores). |
4162 | |
4163 | - SCC1 == SCC2 and L1's header strictly dominates L2's header. */ |
4164 | auto_vec<slpg_partition_info> m_partitions; |
4165 | |
4166 | /* The list of all nodes for which a layout must be chosen. Nodes for |
4167 | partition P come before the nodes for partition P+1. Nodes within a |
4168 | partition are in reverse postorder. */ |
4169 | auto_vec<unsigned int> m_partitioned_nodes; |
4170 | |
4171 | /* Index P * num-layouts + L contains the cost of using layout L |
4172 | for partition P. */ |
4173 | auto_vec<slpg_partition_layout_costs> m_partition_layout_costs; |
4174 | |
4175 | /* Index N * num-layouts + L, if nonnull, is a node that provides the |
4176 | original output of node N adjusted to have layout L. */ |
4177 | auto_vec<slp_tree> m_node_layouts; |
4178 | }; |
4179 | |
4180 | /* Fill the vertices and leafs vector with all nodes in the SLP graph. |
4181 | Also record whether we should optimize anything for speed rather |
4182 | than size. */ |
4183 | |
4184 | void |
4185 | vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited, |
4186 | slp_tree node) |
4187 | { |
4188 | unsigned i; |
4189 | slp_tree child; |
4190 | |
  if (visited.add (node))
4192 | return; |
4193 | |
4194 | if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node)) |
4195 | { |
      basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4197 | if (optimize_bb_for_speed_p (bb)) |
4198 | m_optimize_size = false; |
4199 | } |
4200 | |
4201 | node->vertex = m_vertices.length (); |
  m_vertices.safe_push (slpg_vertex (node));
4203 | |
4204 | bool leaf = true; |
4205 | bool force_leaf = false; |
4206 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
4207 | if (child) |
4208 | { |
4209 | leaf = false; |
	build_vertices (visited, child);
4211 | } |
4212 | else |
4213 | force_leaf = true; |
  /* Since SLP discovery works along use-def edges all cycles have an
     entry - but there's the exception of cycles where we do not handle
     the entry explicitly (but with a NULL SLP node), like some reductions
     and inductions.  Force those SLP PHIs to act as leaves to make them
     backwards reachable.  */
4219 | if (leaf || force_leaf) |
    m_leafs.safe_push (node->vertex);
4221 | } |
4222 | |
4223 | /* Fill the vertices and leafs vector with all nodes in the SLP graph. */ |
4224 | |
4225 | void |
4226 | vect_optimize_slp_pass::build_vertices () |
4227 | { |
4228 | hash_set<slp_tree> visited; |
4229 | unsigned i; |
4230 | slp_instance instance; |
4231 | FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance) |
4232 | build_vertices (visited, SLP_INSTANCE_TREE (instance)); |
4233 | } |
4234 | |
/* Apply (reverse) bijective PERM to VEC.  */
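/* For example, applying PERM = { 1, 2, 3, 0 } to { a, b, c, d } yields
   { b, c, d, a }, while the REVERSE application yields { d, a, b, c }.  */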
4236 | |
4237 | template <class T> |
4238 | static void |
4239 | vect_slp_permute (vec<unsigned> perm, |
4240 | vec<T> &vec, bool reverse) |
4241 | { |
4242 | auto_vec<T, 64> saved; |
4243 | saved.create (vec.length ()); |
4244 | for (unsigned i = 0; i < vec.length (); ++i) |
4245 | saved.quick_push (vec[i]); |
4246 | |
4247 | if (reverse) |
4248 | { |
4249 | for (unsigned i = 0; i < vec.length (); ++i) |
4250 | vec[perm[i]] = saved[i]; |
4251 | for (unsigned i = 0; i < vec.length (); ++i) |
4252 | gcc_assert (vec[perm[i]] == saved[i]); |
4253 | } |
4254 | else |
4255 | { |
4256 | for (unsigned i = 0; i < vec.length (); ++i) |
4257 | vec[i] = saved[perm[i]]; |
4258 | for (unsigned i = 0; i < vec.length (); ++i) |
4259 | gcc_assert (vec[i] == saved[perm[i]]); |
4260 | } |
4261 | } |
4262 | |
4263 | /* Return the cfg loop that contains NODE. */ |
4264 | |
4265 | struct loop * |
4266 | vect_optimize_slp_pass::containing_loop (slp_tree node) |
4267 | { |
4268 | stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node); |
4269 | if (!rep) |
4270 | return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father; |
  return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4272 | } |
4273 | |
4274 | /* Return true if UD (an edge from a use to a definition) is associated |
4275 | with a loop latch edge in the cfg. */ |
4276 | |
4277 | bool |
4278 | vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud) |
4279 | { |
4280 | slp_tree use = m_vertices[ud->src].node; |
4281 | slp_tree def = m_vertices[ud->dest].node; |
4282 | if (SLP_TREE_DEF_TYPE (use) != vect_internal_def |
4283 | || SLP_TREE_DEF_TYPE (def) != vect_internal_def) |
4284 | return false; |
4285 | |
4286 | stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use)); |
  return (is_a<gphi *> (use_rep->stmt)
	  && bb_loop_header_p (gimple_bb (use_rep->stmt))
	  && containing_loop (def) == containing_loop (use));
4290 | } |
4291 | |
4292 | /* Build the graph. Mark edges that correspond to cfg loop latch edges with |
4293 | a nonnull data field. */ |
4294 | |
4295 | void |
4296 | vect_optimize_slp_pass::build_graph () |
4297 | { |
4298 | m_optimize_size = true; |
4299 | build_vertices (); |
4300 | |
4301 | m_slpg = new_graph (m_vertices.length ()); |
4302 | for (slpg_vertex &v : m_vertices) |
4303 | for (slp_tree child : SLP_TREE_CHILDREN (v.node)) |
4304 | if (child) |
4305 | { |
4306 | graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex); |
4307 | if (is_cfg_latch_edge (ud)) |
4308 | ud->data = this; |
4309 | } |
4310 | } |
4311 | |
4312 | /* Return true if E corresponds to a loop latch edge in the cfg. */ |
4313 | |
4314 | static bool |
4315 | skip_cfg_latch_edges (graph_edge *e) |
4316 | { |
4317 | return e->data; |
4318 | } |
4319 | |
4320 | /* Create the node partitions. */ |
4321 | |
4322 | void |
4323 | vect_optimize_slp_pass::create_partitions () |
4324 | { |
4325 | /* Calculate a postorder of the graph, ignoring edges that correspond |
4326 | to natural latch edges in the cfg. Reading the vector from the end |
4327 | to the beginning gives the reverse postorder. */ |
4328 | auto_vec<int> initial_rpo; |
4329 | graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo, |
4330 | false, NULL, skip_cfg_latch_edges); |
4331 | gcc_assert (initial_rpo.length () == m_vertices.length ()); |
4332 | |
4333 | /* Calculate the strongly connected components of the graph. */ |
4334 | auto_vec<int> scc_grouping; |
4335 | unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping); |
4336 | |
4337 | /* Create a new index order in which all nodes from the same SCC are |
4338 | consecutive. Use scc_pos to record the index of the first node in |
4339 | each SCC. */ |
4340 | auto_vec<unsigned int> scc_pos (num_sccs); |
4341 | int last_component = -1; |
4342 | unsigned int node_count = 0; |
4343 | for (unsigned int node_i : scc_grouping) |
4344 | { |
4345 | if (last_component != m_slpg->vertices[node_i].component) |
4346 | { |
4347 | last_component = m_slpg->vertices[node_i].component; |
4348 | gcc_assert (last_component == int (scc_pos.length ())); |
	  scc_pos.quick_push (node_count);
4350 | } |
4351 | node_count += 1; |
4352 | } |
4353 | gcc_assert (node_count == initial_rpo.length () |
4354 | && last_component + 1 == int (num_sccs)); |
4355 | |
4356 | /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes |
4357 | inside each SCC following the RPO we calculated above. The fact that |
4358 | we ignored natural latch edges when calculating the RPO should ensure |
4359 | that, for natural loop nests: |
4360 | |
4361 | - the first node that we encounter in a cfg loop is the loop header phi |
4362 | - the loop header phis are in dominance order |
4363 | |
4364 | Arranging for this is an optimization (see below) rather than a |
4365 | correctness issue. Unnatural loops with a tangled mess of backedges |
4366 | will still work correctly, but might give poorer results. |
4367 | |
4368 | Also update scc_pos so that it gives 1 + the index of the last node |
4369 | in the SCC. */ |
  m_partitioned_nodes.safe_grow (node_count);
4371 | for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;) |
4372 | { |
4373 | unsigned int node_i = initial_rpo[old_i]; |
4374 | unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++; |
4375 | m_partitioned_nodes[new_i] = node_i; |
4376 | } |
4377 | |
4378 | /* When optimizing for speed, partition each SCC based on the containing |
4379 | cfg loop. The order we constructed above should ensure that, for natural |
4380 | cfg loops, we'll create sub-SCC partitions for outer loops before |
4381 | the corresponding sub-SCC partitions for inner loops. Similarly, |
4382 | when one sibling loop A dominates another sibling loop B, we should |
4383 | create a sub-SCC partition for A before a sub-SCC partition for B. |
4384 | |
4385 | As above, nothing depends for correctness on whether this achieves |
4386 | a natural nesting, but we should get better results when it does. */ |
  m_partitions.reserve (m_vertices.length ());
4388 | unsigned int next_partition_i = 0; |
4389 | hash_map<struct loop *, int> loop_partitions; |
4390 | unsigned int rpo_begin = 0; |
4391 | unsigned int num_partitioned_nodes = 0; |
4392 | for (unsigned int rpo_end : scc_pos) |
4393 | { |
4394 | loop_partitions.empty (); |
4395 | unsigned int partition_i = next_partition_i; |
4396 | for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i) |
4397 | { |
4398 | /* Handle externals and constants optimistically throughout. |
4399 | But treat existing vectors as fixed since we do not handle |
4400 | permuting them. */ |
4401 | unsigned int node_i = m_partitioned_nodes[rpo_i]; |
4402 | auto &vertex = m_vertices[node_i]; |
4403 | if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def |
4404 | && !SLP_TREE_VEC_DEFS (vertex.node).exists ()) |
4405 | || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def) |
4406 | vertex.partition = -1; |
4407 | else |
4408 | { |
4409 | bool existed; |
4410 | if (m_optimize_size) |
4411 | existed = next_partition_i > partition_i; |
4412 | else |
4413 | { |
		  struct loop *loop = containing_loop (vertex.node);
		  auto &entry = loop_partitions.get_or_insert (loop, &existed);
4416 | if (!existed) |
4417 | entry = next_partition_i; |
4418 | partition_i = entry; |
4419 | } |
4420 | if (!existed) |
4421 | { |
		  m_partitions.quick_push (slpg_partition_info ());
4423 | next_partition_i += 1; |
4424 | } |
4425 | vertex.partition = partition_i; |
4426 | num_partitioned_nodes += 1; |
4427 | m_partitions[partition_i].node_end += 1; |
4428 | } |
4429 | } |
4430 | rpo_begin = rpo_end; |
4431 | } |
4432 | |
4433 | /* Assign ranges of consecutive node indices to each partition, |
4434 | in partition order. Start with node_end being the same as |
4435 | node_begin so that the next loop can use it as a counter. */ |
4436 | unsigned int node_begin = 0; |
4437 | for (auto &partition : m_partitions) |
4438 | { |
4439 | partition.node_begin = node_begin; |
4440 | node_begin += partition.node_end; |
4441 | partition.node_end = partition.node_begin; |
4442 | } |
4443 | gcc_assert (node_begin == num_partitioned_nodes); |
4444 | |
4445 | /* Finally build the list of nodes in partition order. */ |
  m_partitioned_nodes.truncate (num_partitioned_nodes);
4447 | for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i) |
4448 | { |
4449 | int partition_i = m_vertices[node_i].partition; |
4450 | if (partition_i >= 0) |
4451 | { |
4452 | unsigned int order_i = m_partitions[partition_i].node_end++; |
4453 | m_partitioned_nodes[order_i] = node_i; |
4454 | } |
4455 | } |
4456 | } |
4457 | |
4458 | /* Look for edges from earlier partitions into node NODE_I and edges from |
4459 | node NODE_I into later partitions. Call: |
4460 | |
4461 | FN (ud, other_node_i) |
4462 | |
4463 | for each such use-to-def edge ud, where other_node_i is the node at the |
4464 | other end of the edge. */ |
4465 | |
4466 | template<typename T> |
4467 | void |
4468 | vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn) |
4469 | { |
4470 | int partition_i = m_vertices[node_i].partition; |
4471 | for (graph_edge *pred = m_slpg->vertices[node_i].pred; |
4472 | pred; pred = pred->pred_next) |
4473 | { |
4474 | int src_partition_i = m_vertices[pred->src].partition; |
4475 | if (src_partition_i >= 0 && src_partition_i != partition_i) |
4476 | fn (pred, pred->src); |
4477 | } |
4478 | for (graph_edge *succ = m_slpg->vertices[node_i].succ; |
4479 | succ; succ = succ->succ_next) |
4480 | { |
4481 | int dest_partition_i = m_vertices[succ->dest].partition; |
4482 | if (dest_partition_i >= 0 && dest_partition_i != partition_i) |
4483 | fn (succ, succ->dest); |
4484 | } |
4485 | } |
4486 | |
4487 | /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes |
4488 | that NODE would operate on. This test is independent of NODE's actual |
4489 | operation. */ |
4490 | |
4491 | bool |
4492 | vect_optimize_slp_pass::is_compatible_layout (slp_tree node, |
4493 | unsigned int layout_i) |
4494 | { |
4495 | if (layout_i == 0) |
4496 | return true; |
4497 | |
4498 | if (SLP_TREE_LANES (node) != m_perms[layout_i].length ()) |
4499 | return false; |
4500 | |
4501 | return true; |
4502 | } |
4503 | |
/* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4505 | to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the |
4506 | layouts is incompatible with NODE or if the change is not possible for |
4507 | some other reason. |
4508 | |
4509 | The properties taken from NODE include the number of lanes and the |
4510 | vector type. The actual operation doesn't matter. */ |
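/* The unit of the returned cost is the number of vector permutations
that vectorizable_slp_permutation_1 reports for converting a vector
from FROM_LAYOUT_I to TO_LAYOUT_I, clamped below to 1 so that a real
layout change is never treated as free; e.g. a change that folds to a
single whole-vector permute costs 1.  */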
4511 | |
4512 | int |
4513 | vect_optimize_slp_pass::change_layout_cost (slp_tree node, |
4514 | unsigned int from_layout_i, |
4515 | unsigned int to_layout_i) |
4516 | { |
if (!is_compatible_layout (node, from_layout_i)
|| !is_compatible_layout (node, to_layout_i))
4519 | return -1; |
4520 | |
4521 | if (from_layout_i == to_layout_i) |
4522 | return 0; |
4523 | |
4524 | auto_vec<slp_tree, 1> children (1); |
children.quick_push (node);
auto_lane_permutation_t perm (SLP_TREE_LANES (node));
if (from_layout_i > 0)
for (unsigned int i : m_perms[from_layout_i])
perm.quick_push ({ 0, i });
else
for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
perm.quick_push ({ 0, i });
if (to_layout_i > 0)
vect_slp_permute (m_perms[to_layout_i], perm, true);
4535 | auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm, |
4536 | children, false); |
4537 | if (count >= 0) |
4538 | return MAX (count, 1); |
4539 | |
4540 | /* ??? In principle we could try changing via layout 0, giving two |
4541 | layout changes rather than 1. Doing that would require |
4542 | corresponding support in get_result_with_layout. */ |
4543 | return -1; |
4544 | } |
4545 | |
4546 | /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */ |
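/* The costs are stored row-major: with L layouts, the entry for
(PARTITION_I, LAYOUT_I) lives at index PARTITION_I * L + LAYOUT_I,
e.g. index 7 for partition 2, layout 1 when L == 3.  */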
4547 | |
4548 | inline slpg_partition_layout_costs & |
4549 | vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i, |
4550 | unsigned int layout_i) |
4551 | { |
4552 | return m_partition_layout_costs[partition_i * m_perms.length () + layout_i]; |
4553 | } |
4554 | |
4555 | /* Change PERM in one of two ways: |
4556 | |
4557 | - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been |
4558 | chosen for child I of NODE. |
4559 | |
- if IN_LAYOUT_I >= 0, accept all input operands with that layout.

In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
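/* The rewrite itself is mechanical.  For example, with a made-up
layout m_perms[1] == { 2, 3, 0, 1 } chosen for a child, an input
permutation entry (0, 1) that selects lane 1 of that child becomes
(0, m_perms[1][1]) == (0, 3).  */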
4563 | |
4564 | void |
4565 | vect_optimize_slp_pass:: |
4566 | change_vec_perm_layout (slp_tree node, lane_permutation_t &perm, |
4567 | int in_layout_i, unsigned int out_layout_i) |
4568 | { |
4569 | for (auto &entry : perm) |
4570 | { |
4571 | int this_in_layout_i = in_layout_i; |
4572 | if (this_in_layout_i < 0) |
4573 | { |
4574 | slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first]; |
4575 | unsigned int in_partition_i = m_vertices[in_node->vertex].partition; |
4576 | this_in_layout_i = m_partitions[in_partition_i].layout; |
4577 | } |
4578 | if (this_in_layout_i > 0) |
4579 | entry.second = m_perms[this_in_layout_i][entry.second]; |
4580 | } |
4581 | if (out_layout_i > 0) |
vect_slp_permute (m_perms[out_layout_i], perm, true);
4583 | } |
4584 | |
4585 | /* Check whether the target allows NODE to be rearranged so that the node's |
4586 | output has layout OUT_LAYOUT_I. Return the cost of the change if so, |
4587 | in the same arbitrary units as for change_layout_cost. Return -1 otherwise. |
4588 | |
4589 | If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether |
4590 | NODE can adapt to the layout changes that have (perhaps provisionally) |
4591 | been chosen for NODE's children, so that no extra permutations are |
4592 | needed on either the input or the output of NODE. |
4593 | |
4594 | If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume |
4595 | that all inputs will be forced into layout IN_LAYOUT_I beforehand. |
4596 | |
4597 | IN_LAYOUT_I has no meaning for other types of node. |
4598 | |
4599 | Keeping the node as-is is always valid. If the target doesn't appear |
4600 | to support the node as-is, but might realistically support other layouts, |
4601 | then layout 0 instead has the cost of a worst-case permutation. On the |
4602 | one hand, this ensures that every node has at least one valid layout, |
4603 | avoiding what would otherwise be an awkward special case. On the other, |
4604 | it still encourages the pass to change an invalid pre-existing layout |
4605 | choice into a valid one. */ |
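/* For example, a pre-existing VEC_PERM_EXPR that the target cannot
implement as-is still gets layout 0 at the (nonzero) fallback cost,
so the pass can prefer a nonzero layout that makes the permutation
implementable over keeping the unsupported original.  */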
4606 | |
4607 | int |
4608 | vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i, |
4609 | unsigned int out_layout_i) |
4610 | { |
4611 | const int fallback_cost = 1; |
4612 | |
4613 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
4614 | { |
4615 | auto_lane_permutation_t tmp_perm; |
4616 | tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node)); |
4617 | |
4618 | /* Check that the child nodes support the chosen layout. Checking |
4619 | the first child is enough, since any second child would have the |
4620 | same shape. */ |
4621 | auto first_child = SLP_TREE_CHILDREN (node)[0]; |
4622 | if (in_layout_i > 0 |
&& !is_compatible_layout (first_child, in_layout_i))
4624 | return -1; |
4625 | |
change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4627 | int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, |
4628 | node, tmp_perm, |
4629 | SLP_TREE_CHILDREN (node), |
4630 | false); |
4631 | if (count < 0) |
4632 | { |
4633 | if (in_layout_i == 0 && out_layout_i == 0) |
4634 | { |
4635 | /* Use the fallback cost if the node could in principle support |
4636 | some nonzero layout for both the inputs and the outputs. |
4637 | Otherwise assume that the node will be rejected later |
4638 | and rebuilt from scalars. */ |
4639 | if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child)) |
4640 | return fallback_cost; |
4641 | return 0; |
4642 | } |
4643 | return -1; |
4644 | } |
4645 | |
4646 | /* We currently have no way of telling whether the new layout is cheaper |
4647 | or more expensive than the old one. But at least in principle, |
4648 | it should be worth making zero permutations (whole-vector shuffles) |
4649 | cheaper than real permutations, in case the pass is able to remove |
4650 | the latter. */ |
4651 | return count == 0 ? 0 : 1; |
4652 | } |
4653 | |
4654 | stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node); |
4655 | if (rep |
4656 | && STMT_VINFO_DATA_REF (rep) |
4657 | && DR_IS_READ (STMT_VINFO_DATA_REF (rep)) |
4658 | && SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
4659 | { |
4660 | auto_load_permutation_t tmp_perm; |
4661 | tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node)); |
4662 | if (out_layout_i > 0) |
vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4664 | |
4665 | poly_uint64 vf = 1; |
if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4667 | vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
4668 | unsigned int n_perms; |
4669 | if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL, |
4670 | nullptr, vf, true, false, &n_perms)) |
4671 | { |
4672 | auto rep = SLP_TREE_REPRESENTATIVE (node); |
4673 | if (out_layout_i == 0) |
4674 | { |
4675 | /* Use the fallback cost if the load is an N-to-N permutation. |
4676 | Otherwise assume that the node will be rejected later |
4677 | and rebuilt from scalars. */ |
4678 | if (STMT_VINFO_GROUPED_ACCESS (rep) |
4679 | && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep)) |
4680 | == SLP_TREE_LANES (node))) |
4681 | return fallback_cost; |
4682 | return 0; |
4683 | } |
4684 | return -1; |
4685 | } |
4686 | |
4687 | /* See the comment above the corresponding VEC_PERM_EXPR handling. */ |
4688 | return n_perms == 0 ? 0 : 1; |
4689 | } |
4690 | |
4691 | return 0; |
4692 | } |
4693 | |
4694 | /* Decide which element layouts we should consider using. Calculate the |
4695 | weights associated with inserting layout changes on partition edges. |
4696 | Also mark partitions that cannot change layout, by setting their |
4697 | layout to zero. */ |
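/* An illustrative example: a load node whose load permutation is
{ 5, 6, 4, 7 } covers a bijective span of 4 lanes starting at
element 4, so the code below records the candidate layout
{ 1, 2, 0, 3 } (each lane's offset within the span).  */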
4698 | |
4699 | void |
4700 | vect_optimize_slp_pass::start_choosing_layouts () |
4701 | { |
4702 | /* Used to assign unique permutation indices. */ |
4703 | using perm_hash = unbounded_hashmap_traits< |
4704 | vec_free_hash_base<int_hash_base<unsigned>>, |
4705 | int_hash<int, -1, -2> |
4706 | >; |
4707 | hash_map<vec<unsigned>, int, perm_hash> layout_ids; |
4708 | |
4709 | /* Layout 0 is "no change". */ |
m_perms.safe_push (vNULL);
4711 | |
4712 | /* Create layouts from existing permutations. */ |
4713 | auto_load_permutation_t tmp_perm; |
4714 | for (unsigned int node_i : m_partitioned_nodes) |
4715 | { |
/* Leaves also double as entries to the reverse graph.  Allow the
4717 | layout of those to be changed. */ |
4718 | auto &vertex = m_vertices[node_i]; |
4719 | auto &partition = m_partitions[vertex.partition]; |
4720 | if (!m_slpg->vertices[node_i].succ) |
4721 | partition.layout = 0; |
4722 | |
4723 | /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */ |
4724 | slp_tree node = vertex.node; |
4725 | stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node); |
4726 | slp_tree child; |
4727 | unsigned HOST_WIDE_INT imin, imax = 0; |
4728 | bool any_permute = false; |
tmp_perm.truncate (0);
4730 | if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
4731 | { |
4732 | /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node |
4733 | unpermuted, record a layout that reverses this permutation. |
4734 | |
4735 | We would need more work to cope with loads that are internally |
4736 | permuted and also have inputs (such as masks for |
4737 | IFN_MASK_LOADs). */ |
4738 | gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ); |
4739 | if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt)) |
4740 | { |
4741 | partition.layout = -1; |
4742 | continue; |
4743 | } |
4744 | dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt); |
4745 | imin = DR_GROUP_SIZE (dr_stmt) + 1; |
4746 | tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node)); |
4747 | } |
4748 | else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR |
4749 | && SLP_TREE_CHILDREN (node).length () == 1 |
4750 | && (child = SLP_TREE_CHILDREN (node)[0]) |
4751 | && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child)) |
.is_constant (&imin)))
4753 | { |
4754 | /* If the child has the same vector size as this node, |
4755 | reversing the permutation can make the permutation a no-op. |
4756 | In other cases it can change a true permutation into a |
4757 | full-vector extract. */ |
4758 | tmp_perm.reserve (SLP_TREE_LANES (node)); |
4759 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4760 | tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second); |
4761 | } |
4762 | else |
4763 | continue; |
4764 | |
4765 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4766 | { |
4767 | unsigned idx = tmp_perm[j]; |
4768 | imin = MIN (imin, idx); |
4769 | imax = MAX (imax, idx); |
4770 | if (idx - tmp_perm[0] != j) |
4771 | any_permute = true; |
4772 | } |
/* If the span doesn't match the number of lanes we'd disrupt VF
computation; avoid that for now. */
4775 | if (imax - imin + 1 != SLP_TREE_LANES (node)) |
4776 | continue; |
/* If there's no permute there's no need to split one out.  In that
case we can consider turning a load into a permuted load, if that
turns out to be cheaper than the alternatives. */
4780 | if (!any_permute) |
4781 | { |
4782 | partition.layout = -1; |
4783 | continue; |
4784 | } |
4785 | |
/* For now only handle true permutes, like
vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
when permuting constants and invariants, keeping the permute
bijective. */
4790 | auto_sbitmap load_index (SLP_TREE_LANES (node)); |
4791 | bitmap_clear (load_index); |
4792 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
bitmap_set_bit (load_index, tmp_perm[j] - imin);
4794 | unsigned j; |
4795 | for (j = 0; j < SLP_TREE_LANES (node); ++j) |
if (!bitmap_bit_p (load_index, j))
4797 | break; |
4798 | if (j != SLP_TREE_LANES (node)) |
4799 | continue; |
4800 | |
4801 | vec<unsigned> perm = vNULL; |
perm.safe_grow (SLP_TREE_LANES (node), true);
4803 | for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) |
4804 | perm[j] = tmp_perm[j] - imin; |
4805 | |
4806 | if (int (m_perms.length ()) >= param_vect_max_layout_candidates) |
4807 | { |
4808 | /* Continue to use existing layouts, but don't add any more. */ |
int *entry = layout_ids.get (perm);
4810 | partition.layout = entry ? *entry : 0; |
4811 | perm.release (); |
4812 | } |
4813 | else |
4814 | { |
4815 | bool existed; |
int &layout_i = layout_ids.get_or_insert (perm, &existed);
4817 | if (existed) |
4818 | perm.release (); |
4819 | else |
4820 | { |
4821 | layout_i = m_perms.length (); |
m_perms.safe_push (perm);
4823 | } |
4824 | partition.layout = layout_i; |
4825 | } |
4826 | } |
4827 | |
4828 | /* Initially assume that every layout is possible and has zero cost |
4829 | in every partition. */ |
m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
* m_perms.length ());
4832 | |
/* Outgoing permutations that face graph entries for non-associating
reductions have to be materialized, since those entries are not
themselves represented in the graph; force such partitions to
layout 0.  slp_inst_kind_bb_reduc currently only covers
associatable reductions. */
4836 | for (slp_instance instance : m_vinfo->slp_instances) |
4837 | if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor) |
4838 | { |
4839 | unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex; |
4840 | m_partitions[m_vertices[node_i].partition].layout = 0; |
4841 | } |
4842 | else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain) |
4843 | { |
4844 | stmt_vec_info stmt_info |
4845 | = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance)); |
4846 | stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info); |
4847 | if (needs_fold_left_reduction_p (TREE_TYPE |
4848 | (gimple_get_lhs (stmt_info->stmt)), |
4849 | STMT_VINFO_REDUC_CODE (reduc_info))) |
4850 | { |
4851 | unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex; |
4852 | m_partitions[m_vertices[node_i].partition].layout = 0; |
4853 | } |
4854 | } |
4855 | |
4856 | /* Check which layouts each node and partition can handle. Calculate the |
4857 | weights associated with inserting layout changes on edges. */ |
4858 | for (unsigned int node_i : m_partitioned_nodes) |
4859 | { |
4860 | auto &vertex = m_vertices[node_i]; |
4861 | auto &partition = m_partitions[vertex.partition]; |
4862 | slp_tree node = vertex.node; |
4863 | |
4864 | if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node)) |
4865 | { |
4866 | vertex.weight = vect_slp_node_weight (node); |
4867 | |
4868 | /* We do not handle stores with a permutation, so all |
4869 | incoming permutations must have been materialized. |
4870 | |
4871 | We also don't handle masked grouped loads, which lack a |
4872 | permutation vector. In this case the memory locations |
4873 | form an implicit second input to the loads, on top of the |
4874 | explicit mask input, and the memory input's layout cannot |
4875 | be changed. |
4876 | |
4877 | On the other hand, we do support permuting gather loads and |
4878 | masked gather loads, where each scalar load is independent |
4879 | of the others. This can be useful if the address/index input |
4880 | benefits from permutation. */ |
4881 | if (STMT_VINFO_DATA_REF (rep) |
4882 | && STMT_VINFO_GROUPED_ACCESS (rep) |
4883 | && !SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
4884 | partition.layout = 0; |
4885 | |
/* We cannot change the layout of an operation that does not
operate on each lane independently.  Note this is an explicit
negative list since that's much shorter than the corresponding
positive one, but it's critical to keep maintaining it. */
4890 | if (is_gimple_call (STMT_VINFO_STMT (rep))) |
4891 | switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep))) |
4892 | { |
4893 | case CFN_COMPLEX_ADD_ROT90: |
4894 | case CFN_COMPLEX_ADD_ROT270: |
4895 | case CFN_COMPLEX_MUL: |
4896 | case CFN_COMPLEX_MUL_CONJ: |
4897 | case CFN_VEC_ADDSUB: |
4898 | case CFN_VEC_FMADDSUB: |
4899 | case CFN_VEC_FMSUBADD: |
4900 | partition.layout = 0; |
4901 | default:; |
4902 | } |
4903 | } |
4904 | |
4905 | auto process_edge = [&](graph_edge *ud, unsigned int other_node_i) |
4906 | { |
4907 | auto &other_vertex = m_vertices[other_node_i]; |
4908 | |
4909 | /* Count the number of edges from earlier partitions and the number |
4910 | of edges to later partitions. */ |
4911 | if (other_vertex.partition < vertex.partition) |
4912 | partition.in_degree += 1; |
4913 | else |
4914 | partition.out_degree += 1; |
4915 | |
4916 | /* If the current node uses the result of OTHER_NODE_I, accumulate |
4917 | the effects of that. */ |
4918 | if (ud->src == int (node_i)) |
4919 | { |
4920 | other_vertex.out_weight += vertex.weight; |
4921 | other_vertex.out_degree += 1; |
4922 | } |
4923 | }; |
for_each_partition_edge (node_i, process_edge);
4925 | } |
4926 | } |
4927 | |
4928 | /* Return the incoming costs for node NODE_I, assuming that each input keeps |
4929 | its current (provisional) choice of layout. The inputs do not necessarily |
4930 | have the same layout as each other. */ |
4931 | |
4932 | slpg_layout_cost |
4933 | vect_optimize_slp_pass::total_in_cost (unsigned int node_i) |
4934 | { |
4935 | auto &vertex = m_vertices[node_i]; |
4936 | slpg_layout_cost cost; |
4937 | auto add_cost = [&](graph_edge *, unsigned int other_node_i) |
4938 | { |
4939 | auto &other_vertex = m_vertices[other_node_i]; |
4940 | if (other_vertex.partition < vertex.partition) |
4941 | { |
4942 | auto &other_partition = m_partitions[other_vertex.partition]; |
auto &other_costs = partition_layout_costs (other_vertex.partition,
other_partition.layout);
slpg_layout_cost this_cost = other_costs.in_cost;
this_cost.add_serial_cost (other_costs.internal_cost);
this_cost.split (other_partition.out_degree);
cost.add_parallel_cost (this_cost);
4949 | } |
4950 | }; |
for_each_partition_edge (node_i, add_cost);
4952 | return cost; |
4953 | } |
4954 | |
4955 | /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I) |
4956 | and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return |
4957 | slpg_layout_cost::impossible () if the change isn't possible. */ |
4958 | |
4959 | slpg_layout_cost |
4960 | vect_optimize_slp_pass:: |
4961 | edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i, |
4962 | unsigned int layout2_i) |
4963 | { |
4964 | auto &def_vertex = m_vertices[ud->dest]; |
4965 | auto &use_vertex = m_vertices[ud->src]; |
4966 | auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i; |
4967 | auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i; |
auto factor = change_layout_cost (def_vertex.node, def_layout_i,
use_layout_i);
4970 | if (factor < 0) |
4971 | return slpg_layout_cost::impossible (); |
4972 | |
4973 | /* We have a choice of putting the layout change at the site of the |
4974 | definition or at the site of the use. Prefer the former when |
4975 | optimizing for size or when the execution frequency of the |
4976 | definition is no greater than the combined execution frequencies of |
4977 | the uses. When putting the layout change at the site of the definition, |
4978 | divvy up the cost among all consumers. */ |
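/* Illustrative numbers: if the definition has weight 2 and its three
consumers have a combined out_weight of 6, the change is placed at
the definition and each consumer is charged 2 * FACTOR / 3.  If the
weights were reversed (definition weight 6, out_weight 2), the use
on this edge would instead pay its own weight times FACTOR.  */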
4979 | if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight) |
4980 | { |
4981 | slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size }; |
cost.split (def_vertex.out_degree);
4983 | return cost; |
4984 | } |
4985 | return { use_vertex.weight * factor, m_optimize_size }; |
4986 | } |
4987 | |
4988 | /* UD represents a use-def link between FROM_NODE_I and a node in a later |
4989 | partition; FROM_NODE_I could be the definition node or the use node. |
4990 | The node at the other end of the link wants to use layout TO_LAYOUT_I. |
4991 | Return the cost of any necessary fix-ups on edge UD, or return |
4992 | slpg_layout_cost::impossible () if the change isn't possible. |
4993 | |
4994 | At this point, FROM_NODE_I's partition has chosen the cheapest |
4995 | layout based on the information available so far, but this choice |
4996 | is only provisional. */ |
4997 | |
4998 | slpg_layout_cost |
4999 | vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i, |
5000 | unsigned int to_layout_i) |
5001 | { |
5002 | auto &from_vertex = m_vertices[from_node_i]; |
5003 | unsigned int from_partition_i = from_vertex.partition; |
5004 | slpg_partition_info &from_partition = m_partitions[from_partition_i]; |
5005 | gcc_assert (from_partition.layout >= 0); |
5006 | |
5007 | /* First calculate the cost on the assumption that FROM_PARTITION sticks |
5008 | with its current layout preference. */ |
5009 | slpg_layout_cost cost = slpg_layout_cost::impossible (); |
auto edge_cost = edge_layout_cost (ud, from_node_i,
from_partition.layout, to_layout_i);
5012 | if (edge_cost.is_possible ()) |
5013 | { |
auto &from_costs = partition_layout_costs (from_partition_i,
from_partition.layout);
cost = from_costs.in_cost;
cost.add_serial_cost (from_costs.internal_cost);
cost.split (from_partition.out_degree);
cost.add_serial_cost (edge_cost);
5020 | } |
5021 | |
5022 | /* Take the minimum of that cost and the cost that applies if |
5023 | FROM_PARTITION instead switches to TO_LAYOUT_I. */ |
auto &direct_layout_costs = partition_layout_costs (from_partition_i,
to_layout_i);
5026 | if (direct_layout_costs.is_possible ()) |
5027 | { |
5028 | slpg_layout_cost direct_cost = direct_layout_costs.in_cost; |
direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
direct_cost.split (from_partition.out_degree);
if (!cost.is_possible ()
|| direct_cost.is_better_than (cost, m_optimize_size))
5033 | cost = direct_cost; |
5034 | } |
5035 | |
5036 | return cost; |
5037 | } |
5038 | |
5039 | /* UD represents a use-def link between TO_NODE_I and a node in an earlier |
5040 | partition; TO_NODE_I could be the definition node or the use node. |
5041 | The node at the other end of the link wants to use layout FROM_LAYOUT_I; |
5042 | return the cost of any necessary fix-ups on edge UD, or |
5043 | slpg_layout_cost::impossible () if the choice cannot be made. |
5044 | |
5045 | At this point, TO_NODE_I's partition has a fixed choice of layout. */ |
5046 | |
5047 | slpg_layout_cost |
5048 | vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i, |
5049 | unsigned int from_layout_i) |
5050 | { |
5051 | auto &to_vertex = m_vertices[to_node_i]; |
5052 | unsigned int to_partition_i = to_vertex.partition; |
5053 | slpg_partition_info &to_partition = m_partitions[to_partition_i]; |
5054 | gcc_assert (to_partition.layout >= 0); |
5055 | |
5056 | /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be |
5057 | adjusted for this input having layout FROM_LAYOUT_I. Assume that |
5058 | any other inputs keep their current choice of layout. */ |
auto &to_costs = partition_layout_costs (to_partition_i,
to_partition.layout);
5061 | if (ud->src == int (to_node_i) |
5062 | && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR) |
5063 | { |
5064 | auto &from_partition = m_partitions[m_vertices[ud->dest].partition]; |
5065 | auto old_layout = from_partition.layout; |
5066 | from_partition.layout = from_layout_i; |
int factor = internal_node_cost (to_vertex.node, -1,
to_partition.layout);
5069 | from_partition.layout = old_layout; |
5070 | if (factor >= 0) |
5071 | { |
5072 | slpg_layout_cost cost = to_costs.out_cost; |
cost.add_serial_cost ({ to_vertex.weight * factor,
m_optimize_size });
cost.split (to_partition.in_degree);
5076 | return cost; |
5077 | } |
5078 | } |
5079 | |
5080 | /* Compute the cost if we insert any necessary layout change on edge UD. */ |
auto edge_cost = edge_layout_cost (ud, to_node_i,
to_partition.layout, from_layout_i);
5083 | if (edge_cost.is_possible ()) |
5084 | { |
5085 | slpg_layout_cost cost = to_costs.out_cost; |
cost.add_serial_cost (to_costs.internal_cost);
cost.split (to_partition.in_degree);
cost.add_serial_cost (edge_cost);
5089 | return cost; |
5090 | } |
5091 | |
5092 | return slpg_layout_cost::impossible (); |
5093 | } |
5094 | |
5095 | /* Make a forward pass through the partitions, accumulating input costs. |
5096 | Make a tentative (provisional) choice of layout for each partition, |
5097 | ensuring that this choice still allows later partitions to keep |
5098 | their original layout. */ |
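/* A sketch of the overall flow on a linear graph P0 -> P1 -> P2
(hypothetical): this pass fills in_cost for P1 from P0's provisional
choice and for P2 from P1's, picking the provisionally cheapest
layout for each; backward_pass below then revisits P2, P1, P0 in
that order with out_cost information to make the final choices.  */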
5099 | |
5100 | void |
5101 | vect_optimize_slp_pass::forward_pass () |
5102 | { |
5103 | for (unsigned int partition_i = 0; partition_i < m_partitions.length (); |
5104 | ++partition_i) |
5105 | { |
5106 | auto &partition = m_partitions[partition_i]; |
5107 | |
5108 | /* If the partition consists of a single VEC_PERM_EXPR, precompute |
5109 | the incoming cost that would apply if every predecessor partition |
5110 | keeps its current layout. This is used within the loop below. */ |
5111 | slpg_layout_cost in_cost; |
5112 | slp_tree single_node = nullptr; |
5113 | if (partition.node_end == partition.node_begin + 1) |
5114 | { |
5115 | unsigned int node_i = m_partitioned_nodes[partition.node_begin]; |
5116 | single_node = m_vertices[node_i].node; |
5117 | if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR) |
5118 | in_cost = total_in_cost (node_i); |
5119 | } |
5120 | |
5121 | /* Go through the possible layouts. Decide which ones are valid |
5122 | for this partition and record which of the valid layouts has |
5123 | the lowest cost. */ |
5124 | unsigned int min_layout_i = 0; |
5125 | slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible (); |
5126 | for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i) |
5127 | { |
5128 | auto &layout_costs = partition_layout_costs (partition_i, layout_i); |
5129 | if (!layout_costs.is_possible ()) |
5130 | continue; |
5131 | |
5132 | /* If the recorded layout is already 0 then the layout cannot |
5133 | change. */ |
5134 | if (partition.layout == 0 && layout_i != 0) |
5135 | { |
5136 | layout_costs.mark_impossible (); |
5137 | continue; |
5138 | } |
5139 | |
5140 | bool is_possible = true; |
5141 | for (unsigned int order_i = partition.node_begin; |
5142 | order_i < partition.node_end; ++order_i) |
5143 | { |
5144 | unsigned int node_i = m_partitioned_nodes[order_i]; |
5145 | auto &vertex = m_vertices[node_i]; |
5146 | |
5147 | /* Reject the layout if it is individually incompatible |
5148 | with any node in the partition. */ |
if (!is_compatible_layout (vertex.node, layout_i))
5150 | { |
5151 | is_possible = false; |
5152 | break; |
5153 | } |
5154 | |
5155 | auto add_cost = [&](graph_edge *ud, unsigned int other_node_i) |
5156 | { |
5157 | auto &other_vertex = m_vertices[other_node_i]; |
5158 | if (other_vertex.partition < vertex.partition) |
5159 | { |
5160 | /* Accumulate the incoming costs from earlier |
5161 | partitions, plus the cost of any layout changes |
5162 | on UD itself. */ |
auto cost = forward_cost (ud, other_node_i, layout_i);
5164 | if (!cost.is_possible ()) |
5165 | is_possible = false; |
5166 | else |
layout_costs.in_cost.add_parallel_cost (cost);
5168 | } |
5169 | else |
5170 | /* Reject the layout if it would make layout 0 impossible |
5171 | for later partitions. This amounts to testing that the |
5172 | target supports reversing the layout change on edges |
5173 | to later partitions. |
5174 | |
5175 | In principle, it might be possible to push a layout |
5176 | change all the way down a graph, so that it never |
5177 | needs to be reversed and so that the target doesn't |
5178 | need to support the reverse operation. But it would |
5179 | be awkward to bail out if we hit a partition that |
5180 | does not support the new layout, especially since |
5181 | we are not dealing with a lattice. */ |
is_possible &= edge_layout_cost (ud, other_node_i, 0,
layout_i).is_possible ();
5184 | }; |
for_each_partition_edge (node_i, add_cost);
5186 | |
5187 | /* Accumulate the cost of using LAYOUT_I within NODE, |
5188 | both for the inputs and the outputs. */ |
int factor = internal_node_cost (vertex.node, layout_i,
layout_i);
5191 | if (factor < 0) |
5192 | { |
5193 | is_possible = false; |
5194 | break; |
5195 | } |
5196 | else if (factor) |
layout_costs.internal_cost.add_serial_cost
({ vertex.weight * factor, m_optimize_size });
5199 | } |
5200 | if (!is_possible) |
5201 | { |
5202 | layout_costs.mark_impossible (); |
5203 | continue; |
5204 | } |
5205 | |
5206 | /* Combine the incoming and partition-internal costs. */ |
5207 | slpg_layout_cost combined_cost = layout_costs.in_cost; |
combined_cost.add_serial_cost (layout_costs.internal_cost);
5209 | |
5210 | /* If this partition consists of a single VEC_PERM_EXPR, see |
5211 | if the VEC_PERM_EXPR can be changed to support output layout |
5212 | LAYOUT_I while keeping all the provisional choices of input |
5213 | layout. */ |
5214 | if (single_node |
5215 | && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR) |
5216 | { |
int factor = internal_node_cost (single_node, -1, layout_i);
5218 | if (factor >= 0) |
5219 | { |
5220 | auto weight = m_vertices[single_node->vertex].weight; |
5221 | slpg_layout_cost internal_cost |
5222 | = { weight * factor, m_optimize_size }; |
5223 | |
5224 | slpg_layout_cost alt_cost = in_cost; |
alt_cost.add_serial_cost (internal_cost);
if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5227 | { |
5228 | combined_cost = alt_cost; |
5229 | layout_costs.in_cost = in_cost; |
5230 | layout_costs.internal_cost = internal_cost; |
5231 | } |
5232 | } |
5233 | } |
5234 | |
5235 | /* Record the layout with the lowest cost. Prefer layout 0 in |
5236 | the event of a tie between it and another layout. */ |
5237 | if (!min_layout_cost.is_possible () |
|| combined_cost.is_better_than (min_layout_cost,
m_optimize_size))
5240 | { |
5241 | min_layout_i = layout_i; |
5242 | min_layout_cost = combined_cost; |
5243 | } |
5244 | } |
5245 | |
5246 | /* This loop's handling of earlier partitions should ensure that |
5247 | choosing the original layout for the current partition is no |
5248 | less valid than it was in the original graph, even with the |
5249 | provisional layout choices for those earlier partitions. */ |
5250 | gcc_assert (min_layout_cost.is_possible ()); |
5251 | partition.layout = min_layout_i; |
5252 | } |
5253 | } |
5254 | |
5255 | /* Make a backward pass through the partitions, accumulating output costs. |
5256 | Make a final choice of layout for each partition. */ |
5257 | |
5258 | void |
5259 | vect_optimize_slp_pass::backward_pass () |
5260 | { |
5261 | for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;) |
5262 | { |
5263 | auto &partition = m_partitions[partition_i]; |
5264 | |
5265 | unsigned int min_layout_i = 0; |
5266 | slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible (); |
5267 | for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i) |
5268 | { |
5269 | auto &layout_costs = partition_layout_costs (partition_i, layout_i); |
5270 | if (!layout_costs.is_possible ()) |
5271 | continue; |
5272 | |
5273 | /* Accumulate the costs from successor partitions. */ |
5274 | bool is_possible = true; |
5275 | for (unsigned int order_i = partition.node_begin; |
5276 | order_i < partition.node_end; ++order_i) |
5277 | { |
5278 | unsigned int node_i = m_partitioned_nodes[order_i]; |
5279 | auto &vertex = m_vertices[node_i]; |
5280 | auto add_cost = [&](graph_edge *ud, unsigned int other_node_i) |
5281 | { |
5282 | auto &other_vertex = m_vertices[other_node_i]; |
5283 | auto &other_partition = m_partitions[other_vertex.partition]; |
5284 | if (other_vertex.partition > vertex.partition) |
5285 | { |
5286 | /* Accumulate the incoming costs from later |
5287 | partitions, plus the cost of any layout changes |
5288 | on UD itself. */ |
auto cost = backward_cost (ud, other_node_i, layout_i);
5290 | if (!cost.is_possible ()) |
5291 | is_possible = false; |
5292 | else |
layout_costs.out_cost.add_parallel_cost (cost);
5294 | } |
5295 | else |
5296 | /* Make sure that earlier partitions can (if necessary |
5297 | or beneficial) keep the layout that they chose in |
5298 | the forward pass. This ensures that there is at |
5299 | least one valid choice of layout. */ |
is_possible &= edge_layout_cost (ud, other_node_i,
other_partition.layout,
layout_i).is_possible ();
5303 | }; |
for_each_partition_edge (node_i, add_cost);
5305 | } |
5306 | if (!is_possible) |
5307 | { |
5308 | layout_costs.mark_impossible (); |
5309 | continue; |
5310 | } |
5311 | |
5312 | /* Locally combine the costs from the forward and backward passes. |
5313 | (This combined cost is not passed on, since that would lead |
5314 | to double counting.) */ |
5315 | slpg_layout_cost combined_cost = layout_costs.in_cost; |
combined_cost.add_serial_cost (layout_costs.internal_cost);
combined_cost.add_serial_cost (layout_costs.out_cost);
5318 | |
5319 | /* Record the layout with the lowest cost. Prefer layout 0 in |
5320 | the event of a tie between it and another layout. */ |
5321 | if (!min_layout_cost.is_possible () |
|| combined_cost.is_better_than (min_layout_cost,
m_optimize_size))
5324 | { |
5325 | min_layout_i = layout_i; |
5326 | min_layout_cost = combined_cost; |
5327 | } |
5328 | } |
5329 | |
5330 | gcc_assert (min_layout_cost.is_possible ()); |
5331 | partition.layout = min_layout_i; |
5332 | } |
5333 | } |
5334 | |
5335 | /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE. |
5336 | NODE already has the layout that was selected for its partition. */ |
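/* For example, if NODE's partition chose layout 1 but one of NODE's
users needs layout 2, this function returns (and caches in
m_node_layouts) a VEC_PERM_EXPR node converting layout 1 to
layout 2; constant and external defs are instead recreated with
permuted scalar operands.  */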
5337 | |
5338 | slp_tree |
5339 | vect_optimize_slp_pass::get_result_with_layout (slp_tree node, |
5340 | unsigned int to_layout_i) |
5341 | { |
5342 | unsigned int result_i = node->vertex * m_perms.length () + to_layout_i; |
5343 | slp_tree result = m_node_layouts[result_i]; |
5344 | if (result) |
5345 | return result; |
5346 | |
5347 | if (SLP_TREE_DEF_TYPE (node) == vect_constant_def |
5348 | || (SLP_TREE_DEF_TYPE (node) == vect_external_def |
5349 | /* We can't permute vector defs in place. */ |
5350 | && SLP_TREE_VEC_DEFS (node).is_empty ())) |
5351 | { |
5352 | /* If the vector is uniform or unchanged, there's nothing to do. */ |
5353 | if (to_layout_i == 0 || vect_slp_tree_uniform_p (node)) |
5354 | result = node; |
5355 | else |
5356 | { |
5357 | auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy (); |
result = vect_create_new_slp_node (scalar_ops);
vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5360 | } |
5361 | } |
5362 | else |
5363 | { |
5364 | unsigned int partition_i = m_vertices[node->vertex].partition; |
5365 | unsigned int from_layout_i = m_partitions[partition_i].layout; |
5366 | if (from_layout_i == to_layout_i) |
5367 | return node; |
5368 | |
5369 | /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel |
5370 | permutation instead of a serial one. Leave the new permutation |
5371 | in TMP_PERM on success. */ |
5372 | auto_lane_permutation_t tmp_perm; |
5373 | unsigned int num_inputs = 1; |
5374 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
5375 | { |
5376 | tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node)); |
5377 | if (from_layout_i != 0) |
vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
if (to_layout_i != 0)
vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5381 | if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, |
5382 | tmp_perm, |
5383 | SLP_TREE_CHILDREN (node), |
5384 | false) >= 0) |
5385 | num_inputs = SLP_TREE_CHILDREN (node).length (); |
5386 | else |
tmp_perm.truncate (0);
5388 | } |
5389 | |
5390 | if (dump_enabled_p ()) |
5391 | { |
5392 | if (tmp_perm.length () > 0) |
5393 | dump_printf_loc (MSG_NOTE, vect_location, |
5394 | "duplicating permutation node %p with" |
5395 | " layout %d\n" , |
5396 | (void *) node, to_layout_i); |
5397 | else |
5398 | dump_printf_loc (MSG_NOTE, vect_location, |
5399 | "inserting permutation node in place of %p\n" , |
5400 | (void *) node); |
5401 | } |
5402 | |
5403 | unsigned int num_lanes = SLP_TREE_LANES (node); |
result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5405 | if (SLP_TREE_SCALAR_STMTS (node).length ()) |
5406 | { |
5407 | auto &stmts = SLP_TREE_SCALAR_STMTS (result); |
5408 | stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node)); |
5409 | if (from_layout_i != 0) |
vect_slp_permute (m_perms[from_layout_i], stmts, false);
if (to_layout_i != 0)
vect_slp_permute (m_perms[to_layout_i], stmts, true);
5413 | } |
5414 | SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node); |
5415 | SLP_TREE_LANES (result) = num_lanes; |
5416 | SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node); |
5417 | result->vertex = -1; |
5418 | |
5419 | auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result); |
5420 | if (tmp_perm.length ()) |
5421 | { |
lane_perm.safe_splice (tmp_perm);
5423 | SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node)); |
5424 | } |
5425 | else |
5426 | { |
lane_perm.create (num_lanes);
for (unsigned j = 0; j < num_lanes; ++j)
lane_perm.quick_push ({ 0, j });
if (from_layout_i != 0)
vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
if (to_layout_i != 0)
vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
SLP_TREE_CHILDREN (result).safe_push (node);
5435 | } |
5436 | for (slp_tree child : SLP_TREE_CHILDREN (result)) |
5437 | child->refcnt++; |
5438 | } |
5439 | m_node_layouts[result_i] = result; |
5440 | return result; |
5441 | } |
5442 | |
5443 | /* Apply the chosen vector layouts to the SLP graph. */ |
5444 | |
5445 | void |
5446 | vect_optimize_slp_pass::materialize () |
5447 | { |
5448 | /* We no longer need the costs, so avoid having two O(N * P) arrays |
5449 | live at the same time. */ |
5450 | m_partition_layout_costs.release (); |
m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5452 | |
5453 | auto_sbitmap fully_folded (m_vertices.length ()); |
5454 | bitmap_clear (fully_folded); |
5455 | for (unsigned int node_i : m_partitioned_nodes) |
5456 | { |
5457 | auto &vertex = m_vertices[node_i]; |
5458 | slp_tree node = vertex.node; |
5459 | int layout_i = m_partitions[vertex.partition].layout; |
5460 | gcc_assert (layout_i >= 0); |
5461 | |
5462 | /* Rearrange the scalar statements to match the chosen layout. */ |
5463 | if (layout_i > 0) |
vect_slp_permute (m_perms[layout_i],
SLP_TREE_SCALAR_STMTS (node), true);
5466 | |
5467 | /* Update load and lane permutations. */ |
5468 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
5469 | { |
5470 | /* First try to absorb the input vector layouts. If that fails, |
5471 | force the inputs to have layout LAYOUT_I too. We checked that |
5472 | that was possible before deciding to use nonzero output layouts. |
5473 | (Note that at this stage we don't really have any guarantee that |
5474 | the target supports the original VEC_PERM_EXPR.) */ |
5475 | auto &perm = SLP_TREE_LANE_PERMUTATION (node); |
5476 | auto_lane_permutation_t tmp_perm; |
tmp_perm.safe_splice (perm);
change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5479 | if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, |
5480 | tmp_perm, |
5481 | SLP_TREE_CHILDREN (node), |
5482 | false) >= 0) |
5483 | { |
5484 | if (dump_enabled_p () |
&& !std::equal (tmp_perm.begin (), tmp_perm.end (),
perm.begin ()))
dump_printf_loc (MSG_NOTE, vect_location,
"absorbing input layouts into %p\n",
(void *) node);
std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
bitmap_set_bit (fully_folded, node_i);
5492 | } |
5493 | else |
5494 | { |
5495 | /* Not MSG_MISSED because it would make no sense to users. */ |
5496 | if (dump_enabled_p ()) |
5497 | dump_printf_loc (MSG_NOTE, vect_location, |
5498 | "failed to absorb input layouts into %p\n" , |
5499 | (void *) node); |
change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5501 | } |
5502 | } |
5503 | else |
5504 | { |
5505 | gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ()); |
5506 | auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node); |
5507 | if (layout_i > 0) |
5508 | /* ??? When we handle non-bijective permutes the idea |
5509 | is that we can force the load-permutation to be |
5510 | { min, min + 1, min + 2, ... max }. But then the |
5511 | scalar defs might no longer match the lane content |
5512 | which means wrong-code with live lane vectorization. |
5513 | So we possibly have to have NULL entries for those. */ |
vect_slp_permute (m_perms[layout_i], load_perm, true);
5515 | } |
5516 | } |
5517 | |
5518 | /* Do this before any nodes disappear, since it involves a walk |
5519 | over the leaves. */ |
5520 | remove_redundant_permutations (); |
5521 | |
5522 | /* Replace each child with a correctly laid-out version. */ |
5523 | for (unsigned int node_i : m_partitioned_nodes) |
5524 | { |
5525 | /* Skip nodes that have already been handled above. */ |
if (bitmap_bit_p (fully_folded, node_i))
5527 | continue; |
5528 | |
5529 | auto &vertex = m_vertices[node_i]; |
5530 | int in_layout_i = m_partitions[vertex.partition].layout; |
5531 | gcc_assert (in_layout_i >= 0); |
5532 | |
5533 | unsigned j; |
5534 | slp_tree child; |
5535 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child) |
5536 | { |
5537 | if (!child) |
5538 | continue; |
5539 | |
slp_tree new_child = get_result_with_layout (child, in_layout_i);
if (new_child != child)
{
vect_free_slp_tree (child);
5544 | SLP_TREE_CHILDREN (vertex.node)[j] = new_child; |
5545 | new_child->refcnt += 1; |
5546 | } |
5547 | } |
5548 | } |
5549 | } |
5550 | |
5551 | /* Elide load permutations that are not necessary. Such permutations might |
5552 | be pre-existing, rather than created by the layout optimizations. */ |
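/* For example, an identity load permutation like { 0, 1, 2, 3 } over
a gap-free group adds nothing and can simply be released; the checks
below establish the cases in which dropping the permutation is
safe.  */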
5553 | |
5554 | void |
5555 | vect_optimize_slp_pass::remove_redundant_permutations () |
5556 | { |
5557 | for (unsigned int node_i : m_leafs) |
5558 | { |
5559 | slp_tree node = m_vertices[node_i].node; |
5560 | if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
5561 | continue; |
5562 | |
5563 | /* In basic block vectorization we allow any subchain of an interleaving |
5564 | chain. |
5565 | FORNOW: not in loop SLP because of realignment complications. */ |
if (is_a <bb_vec_info> (m_vinfo))
5567 | { |
5568 | bool subchain_p = true; |
5569 | stmt_vec_info next_load_info = NULL; |
5570 | stmt_vec_info load_info; |
5571 | unsigned j; |
5572 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
5573 | { |
5574 | if (j != 0 |
5575 | && (next_load_info != load_info |
5576 | || DR_GROUP_GAP (load_info) != 1)) |
5577 | { |
5578 | subchain_p = false; |
5579 | break; |
5580 | } |
5581 | next_load_info = DR_GROUP_NEXT_ELEMENT (load_info); |
5582 | } |
5583 | if (subchain_p) |
5584 | { |
5585 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
5586 | continue; |
5587 | } |
5588 | } |
5589 | else |
5590 | { |
loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5592 | stmt_vec_info load_info; |
5593 | bool this_load_permuted = false; |
5594 | unsigned j; |
5595 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
5596 | if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j) |
5597 | { |
5598 | this_load_permuted = true; |
5599 | break; |
5600 | } |
/* When this isn't a grouped access we know it's a single element
and contiguous. */
5603 | if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0])) |
5604 | { |
5605 | if (!this_load_permuted |
5606 | && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U) |
5607 | || SLP_TREE_LANES (node) == 1)) |
5608 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
5609 | continue; |
5610 | } |
5611 | stmt_vec_info first_stmt_info |
5612 | = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); |
5613 | if (!this_load_permuted |
5614 | /* The load requires permutation when unrolling exposes |
5615 | a gap either because the group is larger than the SLP |
5616 | group-size or because there is a gap between the groups. */ |
5617 | && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U) |
5618 | || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info)) |
5619 | && DR_GROUP_GAP (first_stmt_info) == 0))) |
5620 | { |
5621 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
5622 | continue; |
5623 | } |
5624 | } |
5625 | } |
5626 | } |
5627 | |
5628 | /* Print the partition graph and layout information to the dump file. */ |
5629 | |
5630 | void |
5631 | vect_optimize_slp_pass::dump () |
5632 | { |
5633 | dump_printf_loc (MSG_NOTE, vect_location, |
5634 | "SLP optimize permutations:\n" ); |
5635 | for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i) |
5636 | { |
5637 | dump_printf_loc (MSG_NOTE, vect_location, " %d: { " , layout_i); |
5638 | const char *sep = "" ; |
5639 | for (unsigned int idx : m_perms[layout_i]) |
5640 | { |
5641 | dump_printf (MSG_NOTE, "%s%d" , sep, idx); |
5642 | sep = ", " ; |
5643 | } |
5644 | dump_printf (MSG_NOTE, " }\n" ); |
5645 | } |
5646 | dump_printf_loc (MSG_NOTE, vect_location, |
5647 | "SLP optimize partitions:\n" ); |
5648 | for (unsigned int partition_i = 0; partition_i < m_partitions.length (); |
5649 | ++partition_i) |
5650 | { |
5651 | auto &partition = m_partitions[partition_i]; |
5652 | dump_printf_loc (MSG_NOTE, vect_location, " -------------\n" ); |
5653 | dump_printf_loc (MSG_NOTE, vect_location, |
5654 | " partition %d (layout %d):\n" , |
5655 | partition_i, partition.layout); |
5656 | dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n" ); |
5657 | for (unsigned int order_i = partition.node_begin; |
5658 | order_i < partition.node_end; ++order_i) |
5659 | { |
5660 | auto &vertex = m_vertices[m_partitioned_nodes[order_i]]; |
5661 | dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n" , |
5662 | (void *) vertex.node); |
5663 | dump_printf_loc (MSG_NOTE, vect_location, |
5664 | " weight: %f\n" , |
5665 | vertex.weight.to_double ()); |
5666 | if (vertex.out_degree) |
5667 | dump_printf_loc (MSG_NOTE, vect_location, |
5668 | " out weight: %f (degree %d)\n" , |
5669 | vertex.out_weight.to_double (), |
5670 | vertex.out_degree); |
5671 | if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR) |
5672 | dump_printf_loc (MSG_NOTE, vect_location, |
5673 | " op: VEC_PERM_EXPR\n" ); |
5674 | else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node)) |
5675 | dump_printf_loc (MSG_NOTE, vect_location, |
5676 | " op template: %G" , rep->stmt); |
5677 | } |
5678 | dump_printf_loc (MSG_NOTE, vect_location, " edges:\n" ); |
5679 | for (unsigned int order_i = partition.node_begin; |
5680 | order_i < partition.node_end; ++order_i) |
5681 | { |
5682 | unsigned int node_i = m_partitioned_nodes[order_i]; |
5683 | auto &vertex = m_vertices[node_i]; |
5684 | auto print_edge = [&](graph_edge *, unsigned int other_node_i) |
5685 | { |
5686 | auto &other_vertex = m_vertices[other_node_i]; |
5687 | if (other_vertex.partition < vertex.partition) |
5688 | dump_printf_loc (MSG_NOTE, vect_location, |
5689 | " - %p [%d] --> %p\n" , |
5690 | (void *) other_vertex.node, |
5691 | other_vertex.partition, |
5692 | (void *) vertex.node); |
5693 | else |
5694 | dump_printf_loc (MSG_NOTE, vect_location, |
5695 | " - %p --> [%d] %p\n" , |
5696 | (void *) vertex.node, |
5697 | other_vertex.partition, |
5698 | (void *) other_vertex.node); |
5699 | }; |
5700 | for_each_partition_edge (node_i, fn: print_edge); |
5701 | } |
5702 | |
5703 | for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i) |
5704 | { |
5705 | auto &layout_costs = partition_layout_costs (partition_i, layout_i); |
5706 | if (layout_costs.is_possible ()) |
5707 | { |
5708 | dump_printf_loc (MSG_NOTE, vect_location, |
5709 | " layout %d:%s\n" , layout_i, |
5710 | partition.layout == int (layout_i) |
5711 | ? " (*)" : "" ); |
5712 | slpg_layout_cost combined_cost = layout_costs.in_cost; |
combined_cost.add_serial_cost (layout_costs.internal_cost);
combined_cost.add_serial_cost (layout_costs.out_cost);
5715 | #define TEMPLATE "{depth: %f, total: %f}" |
dump_printf_loc (MSG_NOTE, vect_location,
" " TEMPLATE "\n",
layout_costs.in_cost.depth.to_double (),
layout_costs.in_cost.total.to_double ());
dump_printf_loc (MSG_NOTE, vect_location,
" + " TEMPLATE "\n",
layout_costs.internal_cost.depth.to_double (),
layout_costs.internal_cost.total.to_double ());
dump_printf_loc (MSG_NOTE, vect_location,
" + " TEMPLATE "\n",
layout_costs.out_cost.depth.to_double (),
layout_costs.out_cost.total.to_double ());
dump_printf_loc (MSG_NOTE, vect_location,
" = " TEMPLATE "\n",
combined_cost.depth.to_double (),
combined_cost.total.to_double ());
5732 | #undef TEMPLATE |
5733 | } |
5734 | else |
5735 | dump_printf_loc (MSG_NOTE, vect_location, |
5736 | " layout %d: rejected\n" , layout_i); |
5737 | } |
5738 | } |
5739 | } |
5740 | |
5741 | /* Main entry point for the SLP graph optimization pass. */ |
5742 | |
5743 | void |
5744 | vect_optimize_slp_pass::run () |
5745 | { |
5746 | build_graph (); |
5747 | create_partitions (); |
5748 | start_choosing_layouts (); |
5749 | if (m_perms.length () > 1) |
5750 | { |
5751 | forward_pass (); |
5752 | backward_pass (); |
5753 | if (dump_enabled_p ()) |
5754 | dump (); |
5755 | materialize (); |
5756 | while (!m_perms.is_empty ()) |
5757 | m_perms.pop ().release (); |
5758 | } |
5759 | else |
5760 | remove_redundant_permutations (); |
free_graph (m_slpg);
5762 | } |
5763 | |
5764 | /* Optimize the SLP graph of VINFO. */ |
5765 | |
5766 | void |
5767 | vect_optimize_slp (vec_info *vinfo) |
5768 | { |
5769 | if (vinfo->slp_instances.is_empty ()) |
5770 | return; |
5771 | vect_optimize_slp_pass (vinfo).run (); |
5772 | } |
5773 | |
5774 | /* Gather loads reachable from the individual SLP graph entries. */ |
5775 | |
5776 | void |
5777 | vect_gather_slp_loads (vec_info *vinfo) |
5778 | { |
5779 | unsigned i; |
5780 | slp_instance instance; |
5781 | FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) |
5782 | { |
5783 | hash_set<slp_tree> visited; |
5784 | vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance), |
5785 | SLP_INSTANCE_TREE (instance), visited); |
5786 | } |
5787 | } |
5788 | |
5789 | |
5790 | /* For each possible SLP instance decide whether to SLP it and calculate overall |
5791 | unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at |
5792 | least one instance. */ |
5793 | |
5794 | bool |
5795 | vect_make_slp_decision (loop_vec_info loop_vinfo) |
5796 | { |
5797 | unsigned int i; |
5798 | poly_uint64 unrolling_factor = 1; |
5799 | const vec<slp_instance> &slp_instances |
5800 | = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); |
5801 | slp_instance instance; |
5802 | int decided_to_slp = 0; |
5803 | |
5804 | DUMP_VECT_SCOPE ("vect_make_slp_decision" ); |
5805 | |
5806 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
5807 | { |
5808 | /* FORNOW: SLP if you can. */ |
5809 | /* All unroll factors have the form: |
5810 | |
5811 | GET_MODE_SIZE (vinfo->vector_mode) * X |
5812 | |
5813 | for some rational X, so they must have a common multiple. */ |
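/* E.g. (hypothetical) unroll factors of 2 and 3 from two instances
combine to a common multiple of 6 for the loop as a whole.  */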
unrolling_factor
= force_common_multiple (unrolling_factor,
SLP_INSTANCE_UNROLLING_FACTOR (instance));
5817 | |
5818 | /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we |
5819 | call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and |
5820 | loop-based vectorization. Such stmts will be marked as HYBRID. */ |
5821 | vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); |
5822 | decided_to_slp++; |
5823 | } |
5824 | |
5825 | LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor; |
5826 | |
5827 | if (decided_to_slp && dump_enabled_p ()) |
5828 | { |
5829 | dump_printf_loc (MSG_NOTE, vect_location, |
5830 | "Decided to SLP %d instances. Unrolling factor " , |
5831 | decided_to_slp); |
5832 | dump_dec (MSG_NOTE, unrolling_factor); |
5833 | dump_printf (MSG_NOTE, "\n" ); |
5834 | } |
5835 | |
5836 | return (decided_to_slp > 0); |
5837 | } |
5838 | |
5839 | /* Private data for vect_detect_hybrid_slp. */ |
5840 | struct vdhs_data |
5841 | { |
5842 | loop_vec_info loop_vinfo; |
5843 | vec<stmt_vec_info> *worklist; |
5844 | }; |
5845 | |
5846 | /* Walker for walk_gimple_op. */ |
5847 | |
5848 | static tree |
5849 | vect_detect_hybrid_slp (tree *tp, int *, void *data) |
5850 | { |
5851 | walk_stmt_info *wi = (walk_stmt_info *)data; |
5852 | vdhs_data *dat = (vdhs_data *)wi->info; |
5853 | |
5854 | if (wi->is_lhs) |
5855 | return NULL_TREE; |
5856 | |
5857 | stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp); |
5858 | if (!def_stmt_info) |
5859 | return NULL_TREE; |
def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5861 | if (PURE_SLP_STMT (def_stmt_info)) |
5862 | { |
5863 | if (dump_enabled_p ()) |
5864 | dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G" , |
5865 | def_stmt_info->stmt); |
5866 | STMT_SLP_TYPE (def_stmt_info) = hybrid; |
dat->worklist->safe_push (def_stmt_info);
5868 | } |
5869 | |
5870 | return NULL_TREE; |
5871 | } |
5872 | |
/* Check whether STMT_INFO is only consumed by SLP statements; if so,
mark it pure_slp, otherwise push it to WORKLIST. */
5875 | |
5876 | static void |
5877 | maybe_push_to_hybrid_worklist (vec_info *vinfo, |
5878 | vec<stmt_vec_info> &worklist, |
5879 | stmt_vec_info stmt_info) |
5880 | { |
5881 | if (dump_enabled_p ()) |
5882 | dump_printf_loc (MSG_NOTE, vect_location, |
5883 | "Processing hybrid candidate : %G" , stmt_info->stmt); |
5884 | stmt_vec_info orig_info = vect_orig_stmt (stmt_info); |
5885 | imm_use_iterator iter2; |
5886 | ssa_op_iter iter1; |
5887 | use_operand_p use_p; |
5888 | def_operand_p def_p; |
5889 | bool any_def = false; |
5890 | FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF) |
5891 | { |
5892 | any_def = true; |
5893 | FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p)) |
5894 | { |
5895 | if (is_gimple_debug (USE_STMT (use_p))) |
5896 | continue; |
5897 | stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p)); |
	  /* An out-of-loop use means this is a loop_vect sink.  */
	  if (!use_info)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect sink: %G", stmt_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	  else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect use: %G", use_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	}
    }
  /* No def means this is a loop_vect sink.  */
  if (!any_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Found loop_vect sink: %G", stmt_info->stmt);
      worklist.safe_push (stmt_info);
      return;
    }
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5929 | STMT_SLP_TYPE (stmt_info) = pure_slp; |
5930 | } |
5931 | |
5932 | /* Find stmts that must be both vectorized and SLPed. */ |
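
/* For example (illustrative), in

     for (i = 0; i < n; i++)
       {
	 t0 = b[2*i] + 1;     <-- part of an SLP group
	 t1 = b[2*i+1] + 1;
	 a[2*i] = t0;
	 a[2*i+1] = t1;
	 s += t0;             <-- loop-based reduction consumes t0
       }

   the def of t0 participates in SLP but is also used by the non-SLP
   reduction, so it has to be marked hybrid.  */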
5933 | |
5934 | void |
5935 | vect_detect_hybrid_slp (loop_vec_info loop_vinfo) |
5936 | { |
  DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5938 | |
5939 | /* All stmts participating in SLP are marked pure_slp, all other |
5940 | stmts are loop_vect. |
5941 | First collect all loop_vect stmts into a worklist. |
5942 | SLP patterns cause not all original scalar stmts to appear in |
5943 | SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp. |
5944 | Rectify this here and do a backward walk over the IL only considering |
5945 | stmts as loop_vect when they are used by a loop_vect stmt and otherwise |
5946 | mark them as pure_slp. */ |
5947 | auto_vec<stmt_vec_info> worklist; |
5948 | for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i) |
5949 | { |
5950 | basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; |
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
      for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
	   gsi_prev (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	    {
	      for (gimple_stmt_iterator gsi2
		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
		{
		  stmt_vec_info patt_info
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
		  if (!STMT_SLP_TYPE (patt_info)
		      && STMT_VINFO_RELEVANT (patt_info))
		    maybe_push_to_hybrid_worklist (loop_vinfo,
						   worklist, patt_info);
		}
	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
	    }
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
5986 | } |
5987 | |
5988 | /* Now we have a worklist of non-SLP stmts, follow use->def chains and |
5989 | mark any SLP vectorized stmt as hybrid. |
5990 | ??? We're visiting def stmts N times (once for each non-SLP and |
5991 | once for each hybrid-SLP use). */ |
5992 | walk_stmt_info wi; |
5993 | vdhs_data dat; |
5994 | dat.worklist = &worklist; |
5995 | dat.loop_vinfo = loop_vinfo; |
  memset (&wi, 0, sizeof (wi));
5997 | wi.info = (void *)&dat; |
5998 | while (!worklist.is_empty ()) |
5999 | { |
6000 | stmt_vec_info stmt_info = worklist.pop (); |
6001 | /* Since SSA operands are not set up for pattern stmts we need |
6002 | to use walk_gimple_op. */ |
6003 | wi.is_lhs = 0; |
6004 | walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi); |
      /* For gather/scatter make sure to walk the offset operand, which
	 can be a scaling and a conversion away.  */
6007 | gather_scatter_info gs_info; |
6008 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) |
6009 | && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info)) |
6010 | { |
6011 | int dummy; |
	  vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6013 | } |
6014 | } |
6015 | } |
6016 | |
6017 | |
6018 | /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */ |
6019 | |
6020 | _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared) |
6021 | : vec_info (vec_info::bb, shared), |
6022 | bbs (_bbs), |
6023 | roots (vNULL) |
6024 | { |
6025 | for (unsigned i = 0; i < bbs.length (); ++i) |
6026 | { |
6027 | if (i != 0) |
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, 0);
	    add_stmt (phi);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
6041 | continue; |
6042 | add_stmt (stmt); |
6043 | } |
6044 | } |
6045 | } |
6046 | |
6047 | |
6048 | /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the |
6049 | stmts in the basic block. */ |
6050 | |
6051 | _bb_vec_info::~_bb_vec_info () |
6052 | { |
6053 | /* Reset region marker. */ |
6054 | for (unsigned i = 0; i < bbs.length (); ++i) |
6055 | { |
6056 | if (i != 0) |
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, -1);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
6068 | } |
6069 | } |
6070 | |
6071 | for (unsigned i = 0; i < roots.length (); ++i) |
6072 | { |
6073 | roots[i].stmts.release (); |
6074 | roots[i].roots.release (); |
6075 | roots[i].remain.release (); |
6076 | } |
6077 | roots.release (); |
6078 | } |
6079 | |
6080 | /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE, |
   given that its child nodes have already been processed and that
   their def types currently match their SLP node's def type.  */
6083 | |
6084 | static bool |
6085 | vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, |
6086 | slp_instance node_instance, |
6087 | stmt_vector_for_cost *cost_vec) |
6088 | { |
6089 | stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); |
6090 | |
6091 | /* Calculate the number of vector statements to be created for the |
6092 | scalar stmts in this node. For SLP reductions it is equal to the |
6093 | number of vector statements in the children (which has already been |
6094 | calculated by the recursive call). Otherwise it is the number of |
6095 | scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by |
6096 | VF divided by the number of elements in a vector. */ |
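  /* For example (illustrative numbers): group size 4 with VF 2 and
     V8SI vectors gives (4 * 2) / 8 = 1 vector stmt, while with V4SI
     it gives (4 * 2) / 4 = 2.  */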
6097 | if (SLP_TREE_CODE (node) != VEC_PERM_EXPR |
6098 | && !STMT_VINFO_DATA_REF (stmt_info) |
6099 | && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
6100 | { |
6101 | for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i) |
6102 | if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def) |
6103 | { |
6104 | SLP_TREE_NUMBER_OF_VEC_STMTS (node) |
6105 | = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]); |
6106 | break; |
6107 | } |
6108 | } |
6109 | else |
6110 | { |
6111 | poly_uint64 vf; |
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      else
	vf = 1;
      unsigned int group_size = SLP_TREE_LANES (node);
      tree vectype = SLP_TREE_VECTYPE (node);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= vect_get_num_vectors (vf * group_size, vectype);
6120 | } |
6121 | |
6122 | /* Handle purely internal nodes. */ |
6123 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
6124 | { |
6125 | if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec)) |
6126 | return false; |
6127 | |
6128 | stmt_vec_info slp_stmt_info; |
6129 | unsigned int i; |
6130 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info) |
6131 | { |
6132 | if (STMT_VINFO_LIVE_P (slp_stmt_info) |
6133 | && !vectorizable_live_operation (vinfo, slp_stmt_info, node, |
6134 | node_instance, i, |
6135 | false, cost_vec)) |
6136 | return false; |
6137 | } |
6138 | return true; |
6139 | } |
6140 | |
6141 | bool dummy; |
6142 | return vect_analyze_stmt (vinfo, stmt_info, &dummy, |
6143 | node, node_instance, cost_vec); |
6144 | } |
6145 | |
6146 | /* Try to build NODE from scalars, returning true on success. |
6147 | NODE_INSTANCE is the SLP instance that contains NODE. */ |
6148 | |
6149 | static bool |
6150 | vect_slp_convert_to_external (vec_info *vinfo, slp_tree node, |
6151 | slp_instance node_instance) |
6152 | { |
6153 | stmt_vec_info stmt_info; |
6154 | unsigned int i; |
6155 | |
  if (!is_a <bb_vec_info> (vinfo)
6157 | || node == SLP_INSTANCE_TREE (node_instance) |
6158 | || !SLP_TREE_SCALAR_STMTS (node).exists () |
6159 | || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)) |
6160 | /* Force the mask use to be built from scalars instead. */ |
6161 | || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))) |
6162 | return false; |
6163 | |
6164 | if (dump_enabled_p ()) |
6165 | dump_printf_loc (MSG_NOTE, vect_location, |
		     "Building vector operands of %p from scalars instead\n",
6167 | (void *) node); |
6168 | |
6169 | /* Don't remove and free the child nodes here, since they could be |
6170 | referenced by other structures. The analysis and scheduling phases |
6171 | (need to) ignore child nodes of anything that isn't vect_internal_def. */ |
6172 | unsigned int group_size = SLP_TREE_LANES (node); |
6173 | SLP_TREE_DEF_TYPE (node) = vect_external_def; |
6174 | /* Invariants get their vector type from the uses. */ |
6175 | SLP_TREE_VECTYPE (node) = NULL_TREE; |
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6177 | SLP_TREE_LOAD_PERMUTATION (node).release (); |
6178 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
6179 | { |
6180 | tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt); |
6181 | SLP_TREE_SCALAR_OPS (node)[i] = lhs; |
6182 | } |
6183 | return true; |
6184 | } |
6185 | |
6186 | /* Return true if all elements of the slice are the same. */ |
6187 | bool |
6188 | vect_scalar_ops_slice::all_same_p () const |
6189 | { |
6190 | for (unsigned int i = 1; i < length; ++i) |
    if (!operand_equal_p (op (0), op (i)))
6192 | return false; |
6193 | return true; |
6194 | } |
6195 | |
6196 | hashval_t |
6197 | vect_scalar_ops_slice_hash::hash (const value_type &s) |
6198 | { |
6199 | hashval_t hash = 0; |
6200 | for (unsigned i = 0; i < s.length; ++i) |
    hash = iterative_hash_expr (s.op (i), hash);
6202 | return hash; |
6203 | } |
6204 | |
6205 | bool |
6206 | vect_scalar_ops_slice_hash::equal (const value_type &s1, |
6207 | const compare_type &s2) |
6208 | { |
6209 | if (s1.length != s2.length) |
6210 | return false; |
6211 | for (unsigned i = 0; i < s1.length; ++i) |
6212 | if (!operand_equal_p (s1.op (i), s2.op (i))) |
6213 | return false; |
6214 | return true; |
6215 | } |
6216 | |
6217 | /* Compute the prologue cost for invariant or constant operands represented |
6218 | by NODE. */ |
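
/* For example (illustrative), with V4SI and a group of four operands:

     { 1, 2, 3, 4 }   all constants        -> vector_load (constant pool)
     { x, x, x, x }   one repeated value   -> scalar_to_vec (splat)
     { a, b, c, d }   distinct variables   -> vec_construct  */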
6219 | |
6220 | static void |
6221 | vect_prologue_cost_for_slp (slp_tree node, |
6222 | stmt_vector_for_cost *cost_vec) |
6223 | { |
6224 | /* There's a special case of an existing vector, that costs nothing. */ |
6225 | if (SLP_TREE_SCALAR_OPS (node).length () == 0 |
6226 | && !SLP_TREE_VEC_DEFS (node).is_empty ()) |
6227 | return; |
6228 | /* Without looking at the actual initializer a vector of |
6229 | constants can be implemented as load from the constant pool. |
6230 | When all elements are the same we can use a splat. */ |
6231 | tree vectype = SLP_TREE_VECTYPE (node); |
6232 | unsigned group_size = SLP_TREE_SCALAR_OPS (node).length (); |
6233 | unsigned HOST_WIDE_INT const_nunits; |
6234 | unsigned nelt_limit; |
6235 | auto ops = &SLP_TREE_SCALAR_OPS (node); |
6236 | auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); |
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      nelt_limit = const_nunits;
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
	if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
	  starts.quick_push (i * const_nunits);
6245 | } |
6246 | else |
6247 | { |
6248 | /* If either the vector has variable length or the vectors |
6249 | are composed of repeated whole groups we only need to |
6250 | cost construction once. All vectors will be the same. */ |
6251 | nelt_limit = group_size; |
      starts.quick_push (0);
6253 | } |
6254 | /* ??? We're just tracking whether vectors in a single node are the same. |
6255 | Ideally we'd do something more global. */ |
6256 | bool passed = false; |
6257 | for (unsigned int start : starts) |
6258 | { |
6259 | vect_cost_for_stmt kind; |
6260 | if (SLP_TREE_DEF_TYPE (node) == vect_constant_def) |
6261 | kind = vector_load; |
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6263 | kind = scalar_to_vec; |
6264 | else |
6265 | kind = vec_construct; |
6266 | /* The target cost hook has no idea which part of the SLP node |
6267 | we are costing so avoid passing it down more than once. Pass |
6268 | it to the first vec_construct or scalar_to_vec part since for those |
6269 | the x86 backend tries to account for GPR to XMM register moves. */ |
6270 | record_stmt_cost (cost_vec, 1, kind, |
6271 | (kind != vector_load && !passed) ? node : nullptr, |
6272 | vectype, 0, vect_prologue); |
6273 | if (kind != vector_load) |
6274 | passed = true; |
6275 | } |
6276 | } |
6277 | |
6278 | /* Analyze statements contained in SLP tree NODE after recursively analyzing |
6279 | the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE. |
6280 | |
6281 | Return true if the operations are supported. */ |
6282 | |
6283 | static bool |
6284 | vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, |
6285 | slp_instance node_instance, |
6286 | hash_set<slp_tree> &visited_set, |
6287 | vec<slp_tree> &visited_vec, |
6288 | stmt_vector_for_cost *cost_vec) |
6289 | { |
6290 | int i, j; |
6291 | slp_tree child; |
6292 | |
6293 | /* Assume we can code-generate all invariants. */ |
6294 | if (!node |
6295 | || SLP_TREE_DEF_TYPE (node) == vect_constant_def |
6296 | || SLP_TREE_DEF_TYPE (node) == vect_external_def) |
6297 | return true; |
6298 | |
6299 | if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def) |
6300 | { |
6301 | if (dump_enabled_p ()) |
6302 | dump_printf_loc (MSG_NOTE, vect_location, |
			 "Failed cyclic SLP reference in %p\n", (void *) node);
6304 | return false; |
6305 | } |
6306 | gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def); |
6307 | |
6308 | /* If we already analyzed the exact same set of scalar stmts we're done. |
6309 | We share the generated vector stmts for those. */ |
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);
6313 | |
6314 | bool res = true; |
6315 | unsigned visited_rec_start = visited_vec.length (); |
6316 | unsigned cost_vec_rec_start = cost_vec->length (); |
6317 | bool seen_non_constant_child = false; |
6318 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6319 | { |
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited_set, visited_vec,
					      cost_vec);
6323 | if (!res) |
6324 | break; |
6325 | if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def) |
6326 | seen_non_constant_child = true; |
6327 | } |
6328 | /* We're having difficulties scheduling nodes with just constant |
6329 | operands and no scalar stmts since we then cannot compute a stmt |
6330 | insertion place. */ |
6331 | if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ()) |
6332 | { |
6333 | if (dump_enabled_p ()) |
6334 | dump_printf_loc (MSG_NOTE, vect_location, |
			 "Cannot vectorize all-constant op node %p\n",
6336 | (void *) node); |
6337 | res = false; |
6338 | } |
6339 | |
6340 | if (res) |
6341 | res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance, |
6342 | cost_vec); |
6343 | /* If analysis failed we have to pop all recursive visited nodes |
6344 | plus ourselves. */ |
6345 | if (!res) |
6346 | { |
6347 | while (visited_vec.length () >= visited_rec_start) |
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
6350 | } |
6351 | |
6352 | /* When the node can be vectorized cost invariant nodes it references. |
     This is not done in DFS order to allow the referring node
6354 | vectorizable_* calls to nail down the invariant nodes vector type |
6355 | and possibly unshare it if it needs a different vector type than |
6356 | other referrers. */ |
6357 | if (res) |
6358 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) |
6359 | if (child |
6360 | && (SLP_TREE_DEF_TYPE (child) == vect_constant_def |
6361 | || SLP_TREE_DEF_TYPE (child) == vect_external_def) |
6362 | /* Perform usual caching, note code-generation still |
6363 | code-gens these nodes multiple times but we expect |
6364 | to CSE them later. */ |
	&& !visited_set.add (child))
      {
	visited_vec.safe_push (child);
6368 | /* ??? After auditing more code paths make a "default" |
6369 | and push the vector type from NODE to all children |
6370 | if it is not already set. */ |
6371 | /* Compute the number of vectors to be generated. */ |
6372 | tree vector_type = SLP_TREE_VECTYPE (child); |
6373 | if (!vector_type) |
6374 | { |
6375 | /* For shifts with a scalar argument we don't need |
6376 | to cost or code-generate anything. |
	       ??? Represent this more explicitly.  */
6378 | gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) |
6379 | == shift_vec_info_type) |
6380 | && j == 1); |
6381 | continue; |
6382 | } |
6383 | unsigned group_size = SLP_TREE_LANES (child); |
6384 | poly_uint64 vf = 1; |
	if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	  vf = loop_vinfo->vectorization_factor;
	SLP_TREE_NUMBER_OF_VEC_STMTS (child)
	  = vect_get_num_vectors (vf * group_size, vector_type);
	/* And cost them.  */
	vect_prologue_cost_for_slp (child, cost_vec);
6391 | } |
6392 | |
6393 | /* If this node or any of its children can't be vectorized, try pruning |
6394 | the tree here rather than felling the whole thing. */ |
6395 | if (!res && vect_slp_convert_to_external (vinfo, node, node_instance)) |
6396 | { |
6397 | /* We'll need to revisit this for invariant costing and number |
6398 | of vectorized stmt setting. */ |
6399 | res = true; |
6400 | } |
6401 | |
6402 | return res; |
6403 | } |
6404 | |
6405 | /* Mark lanes of NODE that are live outside of the basic-block vectorized |
6406 | region and that can be vectorized using vectorizable_live_operation |
   with STMT_VINFO_LIVE_P.  Live operations that are not handled will
   cause the scalar code computing them to be retained.  */
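
/* For example (illustrative): in a vectorized group

     t0 = a[0] + b[0];
     t1 = a[1] + b[1];   <-- t1 also used outside the vectorized region

   lane 1 is live; if vectorizable_live_operation can code-generate the
   lane extract, t1 is marked STMT_VINFO_LIVE_P, otherwise the scalar
   stmt computing t1 is retained.  */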
6409 | |
6410 | static void |
6411 | vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node, |
6412 | slp_instance instance, |
6413 | stmt_vector_for_cost *cost_vec, |
6414 | hash_set<stmt_vec_info> &svisited, |
6415 | hash_set<slp_tree> &visited) |
6416 | { |
  if (visited.add (node))
6418 | return; |
6419 | |
6420 | unsigned i; |
6421 | stmt_vec_info stmt_info; |
6422 | stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node); |
6423 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
6424 | { |
      if (svisited.contains (stmt_info))
6426 | continue; |
6427 | stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
6428 | if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info) |
6429 | && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info) |
6430 | /* Only the pattern root stmt computes the original scalar value. */ |
6431 | continue; |
6432 | bool mark_visited = true; |
6433 | gimple *orig_stmt = orig_stmt_info->stmt; |
6434 | ssa_op_iter op_iter; |
6435 | def_operand_p def_p; |
6436 | FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF) |
6437 | { |
6438 | imm_use_iterator use_iter; |
6439 | gimple *use_stmt; |
6440 | stmt_vec_info use_stmt_info; |
6441 | FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) |
	  if (!is_gimple_debug (use_stmt))
6443 | { |
6444 | use_stmt_info = bb_vinfo->lookup_stmt (use_stmt); |
6445 | if (!use_stmt_info |
6446 | || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))) |
6447 | { |
6448 | STMT_VINFO_LIVE_P (stmt_info) = true; |
6449 | if (vectorizable_live_operation (bb_vinfo, stmt_info, |
6450 | node, instance, i, |
6451 | false, cost_vec)) |
6452 | /* ??? So we know we can vectorize the live stmt |
6453 | from one SLP node. If we cannot do so from all |
6454 | or none consistently we'd have to record which |
6455 | SLP node (and lane) we want to use for the live |
6456 | operation. So make sure we can code-generate |
6457 | from all nodes. */ |
6458 | mark_visited = false; |
6459 | else |
6460 | STMT_VINFO_LIVE_P (stmt_info) = false; |
6461 | break; |
6462 | } |
6463 | } |
6464 | /* We have to verify whether we can insert the lane extract |
6465 | before all uses. The following is a conservative approximation. |
6466 | We cannot put this into vectorizable_live_operation because |
6467 | iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT |
6468 | doesn't work. |
	 Note that while the fact that we emit code for loads at the
	 first load should make this a non-problem, leafs we construct
	 from scalars are vectorized after the last scalar def.
6472 | ??? If we'd actually compute the insert location during |
6473 | analysis we could use sth less conservative than the last |
6474 | scalar stmt in the node for the dominance check. */ |
6475 | /* ??? What remains is "live" uses in vector CTORs in the same |
6476 | SLP graph which is where those uses can end up code-generated |
6477 | right after their definition instead of close to their original |
6478 | use. But that would restrict us to code-generate lane-extracts |
6479 | from the latest stmt in a node. So we compensate for this |
6480 | during code-generation, simply not replacing uses for those |
6481 | hopefully rare cases. */ |
6482 | if (STMT_VINFO_LIVE_P (stmt_info)) |
6483 | FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) |
	  if (!is_gimple_debug (use_stmt)
6485 | && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt)) |
6486 | || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))) |
6487 | && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt)) |
6488 | { |
6489 | if (dump_enabled_p ()) |
6490 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6491 | "Cannot determine insertion place for " |
6492 | "lane extract\n" ); |
6493 | STMT_VINFO_LIVE_P (stmt_info) = false; |
6494 | mark_visited = true; |
6495 | } |
6496 | } |
6497 | if (mark_visited) |
      svisited.add (stmt_info);
6499 | } |
6500 | |
6501 | slp_tree child; |
6502 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6503 | if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6505 | cost_vec, svisited, visited); |
6506 | } |
6507 | |
6508 | /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */ |
6509 | |
6510 | static bool |
6511 | vectorizable_bb_reduc_epilogue (slp_instance instance, |
6512 | stmt_vector_for_cost *cost_vec) |
6513 | { |
  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6516 | if (reduc_code == MINUS_EXPR) |
6517 | reduc_code = PLUS_EXPR; |
6518 | internal_fn reduc_fn; |
6519 | tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance)); |
6520 | if (!vectype |
6521 | || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn) |
6522 | || reduc_fn == IFN_LAST |
6523 | || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH) |
6524 | || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)), |
6525 | TREE_TYPE (vectype))) |
6526 | { |
6527 | if (dump_enabled_p ()) |
6528 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
6529 | "not vectorized: basic block reduction epilogue " |
6530 | "operation unsupported.\n" ); |
6531 | return false; |
6532 | } |
6533 | |
6534 | /* There's no way to cost a horizontal vector reduction via REDUC_FN so |
6535 | cost log2 vector operations plus shuffles and one extraction. */ |
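  /* For example (illustrative): for V8SI, steps = log2 (8) = 3,
     modelling the usual halving reduction sequence of three
     shuffle-and-operate rounds followed by extracting lane 0.  */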
  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6537 | record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0], |
6538 | vectype, 0, vect_body); |
6539 | record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0], |
6540 | vectype, 0, vect_body); |
6541 | record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0], |
6542 | vectype, 0, vect_body); |
6543 | |
  /* Since we replace all stmts of a possibly longer scalar reduction
     chain, account for the extra scalar stmts for that.  */
  record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
		    instance->root_stmts[0], 0, vect_body);
6548 | return true; |
6549 | } |
6550 | |
6551 | /* Prune from ROOTS all stmts that are computed as part of lanes of NODE |
6552 | and recurse to children. */ |
6553 | |
6554 | static void |
6555 | vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots, |
6556 | hash_set<slp_tree> &visited) |
6557 | { |
6558 | if (SLP_TREE_DEF_TYPE (node) != vect_internal_def |
      || visited.add (node))
6560 | return; |
6561 | |
6562 | stmt_vec_info stmt; |
6563 | unsigned i; |
6564 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt) |
    roots.remove (vect_orig_stmt (stmt));
6566 | |
6567 | slp_tree child; |
6568 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6569 | if (child) |
      vect_slp_prune_covered_roots (child, roots, visited);
6571 | } |
6572 | |
6573 | /* Analyze statements in SLP instances of VINFO. Return true if the |
6574 | operations are supported. */ |
6575 | |
6576 | bool |
6577 | vect_slp_analyze_operations (vec_info *vinfo) |
6578 | { |
6579 | slp_instance instance; |
6580 | int i; |
6581 | |
  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6583 | |
6584 | hash_set<slp_tree> visited; |
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
6596 | /* CTOR instances require vectorized defs for the SLP tree root. */ |
6597 | || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor |
6598 | && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) |
6599 | != vect_internal_def |
6600 | /* Make sure we vectorized with the expected type. */ |
6601 | || !useless_type_conversion_p |
6602 | (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1 |
6603 | (instance->root_stmts[0]->stmt))), |
6604 | TREE_TYPE (SLP_TREE_VECTYPE |
6605 | (SLP_INSTANCE_TREE (instance)))))) |
6606 | /* Check we can vectorize the reduction. */ |
6607 | || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc |
	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6609 | { |
6610 | slp_tree node = SLP_INSTANCE_TREE (instance); |
6611 | stmt_vec_info stmt_info; |
6612 | if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) |
6613 | stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0]; |
6614 | else |
6615 | stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
6616 | if (dump_enabled_p ()) |
6617 | dump_printf_loc (MSG_NOTE, vect_location, |
			     "removing SLP instance operations starting from: %G",
6619 | stmt_info->stmt); |
6620 | vect_free_slp_instance (instance); |
	  vinfo->slp_instances.ordered_remove (i);
	  cost_vec.release ();
	  while (!visited_vec.is_empty ())
	    visited.remove (visited_vec.pop ());
6625 | } |
6626 | else |
6627 | { |
6628 | i++; |
	  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
	    {
	      add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6632 | cost_vec.release (); |
6633 | } |
6634 | else |
6635 | /* For BB vectorization remember the SLP graph entry |
6636 | cost for later. */ |
6637 | instance->cost_vec = cost_vec; |
6638 | } |
6639 | } |
6640 | |
6641 | /* Now look for SLP instances with a root that are covered by other |
6642 | instances and remove them. */ |
6643 | hash_set<stmt_vec_info> roots; |
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
  if (!roots.is_empty ())
    {
      visited.empty ();
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
				      visited);
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
	  {
	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "removing SLP instance operations starting "
			       "from: %G", root->stmt);
	    vect_free_slp_instance (instance);
	    vinfo->slp_instances.ordered_remove (i);
6664 | } |
6665 | else |
6666 | ++i; |
6667 | } |
6668 | |
6669 | /* Compute vectorizable live stmts. */ |
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6671 | { |
6672 | hash_set<stmt_vec_info> svisited; |
6673 | hash_set<slp_tree> visited; |
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	{
	  vect_location = instance->location ();
	  vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
				       instance, &instance->cost_vec,
				       svisited, visited);
6680 | } |
6681 | } |
6682 | |
6683 | return !vinfo->slp_instances.is_empty (); |
6684 | } |
6685 | |
/* Get the SLP instance leader from INSTANCE_LEADER, transitively
   following and compressing any chain of leaders.  */
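
/* For example (illustrative): with leader entries A -> B, B -> C and
   C -> C, looking up A follows the chain to C and then rewrites the
   entries for A and B to point at C directly, in the manner of
   union-find path compression.  */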
6688 | |
6689 | static slp_instance |
6690 | get_ultimate_leader (slp_instance instance, |
6691 | hash_map<slp_instance, slp_instance> &instance_leader) |
6692 | { |
6693 | auto_vec<slp_instance *, 8> chain; |
6694 | slp_instance *tem; |
  while (*(tem = instance_leader.get (instance)) != instance)
    {
      chain.safe_push (tem);
6698 | instance = *tem; |
6699 | } |
6700 | while (!chain.is_empty ()) |
6701 | *chain.pop () = instance; |
6702 | return instance; |
6703 | } |
6704 | |
6705 | namespace { |
6706 | /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in |
6707 | KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping |
6708 | for KEY. Return true if KEY was already in KEY_TO_INSTANCE. |
6709 | |
6710 | INSTANCE_LEADER is as for get_ultimate_leader. */ |
6711 | |
6712 | template<typename T> |
6713 | bool |
6714 | vect_map_to_instance (slp_instance instance, T key, |
6715 | hash_map<T, slp_instance> &key_to_instance, |
6716 | hash_map<slp_instance, slp_instance> &instance_leader) |
6717 | { |
6718 | bool existed_p; |
6719 | slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p); |
6720 | if (!existed_p) |
6721 | ; |
6722 | else if (key_instance != instance) |
6723 | { |
6724 | /* If we're running into a previously marked key make us the |
6725 | leader of the current ultimate leader. This keeps the |
6726 | leader chain acyclic and works even when the current instance |
6727 | connects two previously independent graph parts. */ |
6728 | slp_instance key_leader |
	= get_ultimate_leader (key_instance, instance_leader);
      if (key_leader != instance)
	instance_leader.put (key_leader, instance);
6732 | } |
6733 | key_instance = instance; |
6734 | return existed_p; |
6735 | } |
6736 | } |
6737 | |
6738 | /* Worker of vect_bb_partition_graph, recurse on NODE. */ |
6739 | |
6740 | static void |
6741 | vect_bb_partition_graph_r (bb_vec_info bb_vinfo, |
6742 | slp_instance instance, slp_tree node, |
6743 | hash_map<stmt_vec_info, slp_instance> &stmt_to_instance, |
6744 | hash_map<slp_tree, slp_instance> &node_to_instance, |
6745 | hash_map<slp_instance, slp_instance> &instance_leader) |
6746 | { |
6747 | stmt_vec_info stmt_info; |
6748 | unsigned i; |
6749 | |
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    vect_map_to_instance (instance, stmt_info, stmt_to_instance,
			  instance_leader);

  if (vect_map_to_instance (instance, node, node_to_instance,
			    instance_leader))
6756 | return; |
6757 | |
6758 | slp_tree child; |
6759 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6760 | if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6762 | node_to_instance, instance_leader); |
6763 | } |
6764 | |
6765 | /* Partition the SLP graph into pieces that can be costed independently. */ |
6766 | |
6767 | static void |
6768 | vect_bb_partition_graph (bb_vec_info bb_vinfo) |
6769 | { |
6770 | DUMP_VECT_SCOPE ("vect_bb_partition_graph" ); |
6771 | |
6772 | /* First walk the SLP graph assigning each involved scalar stmt a |
6773 | corresponding SLP graph entry and upon visiting a previously |
     marked stmt, make the stmt's leader the current SLP graph entry.  */
6775 | hash_map<stmt_vec_info, slp_instance> stmt_to_instance; |
6776 | hash_map<slp_tree, slp_instance> node_to_instance; |
6777 | hash_map<slp_instance, slp_instance> instance_leader; |
6778 | slp_instance instance; |
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      instance_leader.put (instance, instance);
6782 | vect_bb_partition_graph_r (bb_vinfo, |
6783 | instance, SLP_INSTANCE_TREE (instance), |
6784 | stmt_to_instance, node_to_instance, |
6785 | instance_leader); |
6786 | } |
6787 | |
6788 | /* Then collect entries to each independent subgraph. */ |
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      slp_instance leader = get_ultimate_leader (instance, instance_leader);
      leader->subgraph_entries.safe_push (instance);
      if (dump_enabled_p ()
	  && leader != instance)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "instance %p is leader of %p\n",
6797 | (void *) leader, (void *) instance); |
6798 | } |
6799 | } |
6800 | |
6801 | /* Compute the set of scalar stmts participating in internal and external |
6802 | nodes. */ |
6803 | |
6804 | static void |
6805 | vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node, |
6806 | hash_set<slp_tree> &visited, |
6807 | hash_set<stmt_vec_info> &vstmts, |
6808 | hash_set<stmt_vec_info> &estmts) |
6809 | { |
6810 | int i; |
6811 | stmt_vec_info stmt_info; |
6812 | slp_tree child; |
6813 | |
  if (visited.add (node))
6815 | return; |
6816 | |
6817 | if (SLP_TREE_DEF_TYPE (node) == vect_internal_def) |
6818 | { |
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	vstmts.add (stmt_info);

      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (child)
	  vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
						   vstmts, estmts);
6826 | } |
6827 | else |
6828 | for (tree def : SLP_TREE_SCALAR_OPS (node)) |
6829 | { |
6830 | stmt_vec_info def_stmt = vinfo->lookup_def (def); |
6831 | if (def_stmt) |
	  estmts.add (def_stmt);
6833 | } |
6834 | } |
6835 | |
6836 | |
/* Compute the scalar cost of the SLP node NODE and its children
   and record it in COST_VEC.  Do not account defs that are marked
   in LIFE and update LIFE according to uses of NODE.  */
6840 | |
6841 | static void |
6842 | vect_bb_slp_scalar_cost (vec_info *vinfo, |
6843 | slp_tree node, vec<bool, va_heap> *life, |
6844 | stmt_vector_for_cost *cost_vec, |
6845 | hash_set<stmt_vec_info> &vectorized_scalar_stmts, |
6846 | hash_set<slp_tree> &visited) |
6847 | { |
6848 | unsigned i; |
6849 | stmt_vec_info stmt_info; |
6850 | slp_tree child; |
6851 | |
  if (visited.add (node))
6853 | return; |
6854 | |
6855 | FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
6856 | { |
6857 | ssa_op_iter op_iter; |
6858 | def_operand_p def_p; |
6859 | |
6860 | if ((*life)[i]) |
6861 | continue; |
6862 | |
6863 | stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); |
6864 | gimple *orig_stmt = orig_stmt_info->stmt; |
6865 | |
6866 | /* If there is a non-vectorized use of the defs then the scalar |
6867 | stmt is kept live in which case we do not account it or any |
6868 | required defs in the SLP children in the scalar cost. This |
6869 | way we make the vectorization more costly when compared to |
6870 | the scalar cost. */ |
6871 | if (!STMT_VINFO_LIVE_P (stmt_info)) |
6872 | { |
6873 | auto_vec<gimple *, 8> worklist; |
6874 | hash_set<gimple *> *worklist_visited = NULL; |
	  worklist.quick_push (orig_stmt);
6876 | do |
6877 | { |
6878 | gimple *work_stmt = worklist.pop (); |
6879 | FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF) |
6880 | { |
6881 | imm_use_iterator use_iter; |
6882 | gimple *use_stmt; |
6883 | FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, |
6884 | DEF_FROM_PTR (def_p)) |
		  if (!is_gimple_debug (use_stmt))
6886 | { |
6887 | stmt_vec_info use_stmt_info |
6888 | = vinfo->lookup_stmt (use_stmt); |
6889 | if (!use_stmt_info |
			  || !vectorized_scalar_stmts.contains (use_stmt_info))
6891 | { |
6892 | if (use_stmt_info |
6893 | && STMT_VINFO_IN_PATTERN_P (use_stmt_info)) |
6894 | { |
			      /* For stmts participating in patterns we
				 have to check their uses recursively.  */
			      if (!worklist_visited)
				worklist_visited = new hash_set<gimple *> ();
			      if (!worklist_visited->add (use_stmt))
				worklist.safe_push (use_stmt);
6901 | continue; |
6902 | } |
6903 | (*life)[i] = true; |
6904 | goto next_lane; |
6905 | } |
6906 | } |
6907 | } |
6908 | } |
6909 | while (!worklist.is_empty ()); |
6910 | next_lane: |
6911 | if (worklist_visited) |
6912 | delete worklist_visited; |
6913 | if ((*life)[i]) |
6914 | continue; |
6915 | } |
6916 | |
6917 | /* Count scalar stmts only once. */ |
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);
6921 | |
6922 | vect_cost_for_stmt kind; |
6923 | if (STMT_VINFO_DATA_REF (orig_stmt_info)) |
6924 | { |
6925 | if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info))) |
6926 | kind = scalar_load; |
6927 | else |
6928 | kind = scalar_store; |
6929 | } |
6930 | else if (vect_nop_conversion_p (orig_stmt_info)) |
6931 | continue; |
6932 | /* For single-argument PHIs assume coalescing which means zero cost |
6933 | for the scalar and the vector PHIs. This avoids artificially |
6934 | favoring the vector path (but may pessimize it in some cases). */ |
      else if (is_a <gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6938 | continue; |
6939 | else |
6940 | kind = scalar_stmt; |
6941 | record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, |
6942 | SLP_TREE_VECTYPE (node), 0, vect_body); |
6943 | } |
6944 | |
6945 | auto_vec<bool, 20> subtree_life; |
6946 | FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
6947 | { |
6948 | if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
6949 | { |
6950 | /* Do not directly pass LIFE to the recursive call, copy it to |
6951 | confine changes in the callee to the current child/subtree. */ |
6952 | if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) |
6953 | { |
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6955 | for (unsigned j = 0; |
6956 | j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j) |
6957 | { |
6958 | auto perm = SLP_TREE_LANE_PERMUTATION (node)[j]; |
6959 | if (perm.first == i) |
6960 | subtree_life[perm.second] = (*life)[j]; |
6961 | } |
6962 | } |
6963 | else |
6964 | { |
6965 | gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child)); |
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   vectorized_scalar_stmts, visited);
	  subtree_life.truncate (0);
6971 | } |
6972 | } |
6973 | } |
6974 | |
6975 | /* Comparator for the loop-index sorted cost vectors. */ |
6976 | |
6977 | static int |
6978 | li_cost_vec_cmp (const void *a_, const void *b_) |
6979 | { |
6980 | auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_; |
6981 | auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_; |
6982 | if (a->first < b->first) |
6983 | return -1; |
6984 | else if (a->first == b->first) |
6985 | return 0; |
6986 | return 1; |
6987 | } |
6988 | |
6989 | /* Check if vectorization of the basic block is profitable for the |
6990 | subgraph denoted by SLP_INSTANCES. */ |
6991 | |
6992 | static bool |
6993 | vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, |
6994 | vec<slp_instance> slp_instances, |
6995 | loop_p orig_loop) |
6996 | { |
6997 | slp_instance instance; |
6998 | int i; |
6999 | unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0; |
7000 | unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0; |
7001 | |
7002 | if (dump_enabled_p ()) |
7003 | { |
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
7009 | } |
7010 | |
7011 | /* Compute the set of scalar stmts we know will go away 'locally' when |
7012 | vectorizing. This used to be tracked with just PURE_SLP_STMT but that's |
7013 | not accurate for nodes promoted extern late or for scalar stmts that |
7014 | are used both in extern defs and in vectorized defs. */ |
7015 | hash_set<stmt_vec_info> vectorized_scalar_stmts; |
7016 | hash_set<stmt_vec_info> scalar_stmts_in_externs; |
7017 | hash_set<slp_tree> visited; |
7018 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
7019 | { |
      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
					       SLP_INSTANCE_TREE (instance),
					       visited,
					       vectorized_scalar_stmts,
					       scalar_stmts_in_externs);
      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
	vectorized_scalar_stmts.add (rstmt);
    }
  /* Scalar stmts used as defs in external nodes need to be preserved, so
     remove them from vectorized_scalar_stmts.  */
  for (stmt_vec_info stmt : scalar_stmts_in_externs)
    vectorized_scalar_stmts.remove (stmt);
7032 | |
7033 | /* Calculate scalar cost and sum the cost for the vector stmts |
7034 | previously collected. */ |
7035 | stmt_vector_for_cost scalar_costs = vNULL; |
7036 | stmt_vector_for_cost vector_costs = vNULL; |
7037 | visited.empty (); |
7038 | FOR_EACH_VEC_ELT (slp_instances, i, instance) |
7039 | { |
7040 | auto_vec<bool, 20> life; |
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
			      true);
      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  scalar_stmt,
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, vectorized_scalar_stmts,
			       visited);
      vector_costs.safe_splice (instance->cost_vec);
7053 | instance->cost_vec.release (); |
7054 | } |
7055 | |
7056 | if (dump_enabled_p ()) |
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7058 | |
7059 | /* When costing non-loop vectorization we need to consider each covered |
7060 | loop independently and make sure vectorization is profitable. For |
7061 | now we assume a loop may be not entered or executed an arbitrary |
7062 | number of iterations (??? static information can provide more |
7063 | precise info here) which means we can simply cost each containing |
     loop's stmts separately.  */
7065 | |
7066 | /* First produce cost vectors sorted by loop index. */ |
7067 | auto_vec<std::pair<unsigned, stmt_info_for_cost *> > |
7068 | li_scalar_costs (scalar_costs.length ()); |
7069 | auto_vec<std::pair<unsigned, stmt_info_for_cost *> > |
7070 | li_vector_costs (vector_costs.length ()); |
7071 | stmt_info_for_cost *cost; |
7072 | FOR_EACH_VEC_ELT (scalar_costs, i, cost) |
7073 | { |
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
7076 | } |
  /* Use an arbitrary loop from the scalar costs as fallback in case the
     first vector_costs entry does not have a stmt_info associated with it.  */
7079 | unsigned l = li_scalar_costs[0].first; |
7080 | FOR_EACH_VEC_ELT (vector_costs, i, cost) |
7081 | { |
      /* We inherit L from the previous COST; invariants, externals and
	 extracts immediately follow the cost for the related stmt.  */
7084 | if (cost->stmt_info) |
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
7087 | } |
7088 | li_scalar_costs.qsort (li_cost_vec_cmp); |
7089 | li_vector_costs.qsort (li_cost_vec_cmp); |
7090 | |
7091 | /* Now cost the portions individually. */ |
7092 | unsigned vi = 0; |
7093 | unsigned si = 0; |
7094 | bool profitable = true; |
7095 | while (si < li_scalar_costs.length () |
7096 | && vi < li_vector_costs.length ()) |
7097 | { |
7098 | unsigned sl = li_scalar_costs[si].first; |
7099 | unsigned vl = li_vector_costs[vi].first; |
7100 | if (sl != vl) |
7101 | { |
7102 | if (dump_enabled_p ()) |
7103 | dump_printf_loc (MSG_NOTE, vect_location, |
7104 | "Scalar %d and vector %d loop part do not " |
			     "match up, skipping scalar part\n", sl, vl);
7106 | /* Skip the scalar part, assuming zero cost on the vector side. */ |
7107 | do |
7108 | { |
7109 | si++; |
7110 | } |
7111 | while (si < li_scalar_costs.length () |
7112 | && li_scalar_costs[si].first == sl); |
7113 | continue; |
7114 | } |
7115 | |
      class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
      do
	{
	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
	  si++;
	}
      while (si < li_scalar_costs.length ()
	     && li_scalar_costs[si].first == sl);
      unsigned dummy;
      finish_cost (scalar_target_cost_data, nullptr,
		   &dummy, &scalar_cost, &dummy);

      /* Complete the target-specific vector cost calculation.  */
      class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
      do
	{
	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
	  vi++;
	}
      while (vi < li_vector_costs.length ()
	     && li_vector_costs[vi].first == vl);
      finish_cost (vect_target_cost_data, scalar_target_cost_data,
		   &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7139 | delete scalar_target_cost_data; |
7140 | delete vect_target_cost_data; |
7141 | |
7142 | vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; |
7143 | |
7144 | if (dump_enabled_p ()) |
7145 | { |
7146 | dump_printf_loc (MSG_NOTE, vect_location, |
			   "Cost model analysis for part in loop %d:\n", sl);
	  dump_printf (MSG_NOTE, " Vector cost: %d\n",
		       vec_inside_cost + vec_outside_cost);
	  dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7151 | } |
7152 | |
      /* Vectorization is profitable if its cost is no more than the cost
	 of the scalar version.  Note that we err on the vector side for
	 equal cost because
7155 | the cost estimate is otherwise quite pessimistic (constant uses are |
7156 | free on the scalar side but cost a load on the vector side for |
7157 | example). */ |
7158 | if (vec_outside_cost + vec_inside_cost > scalar_cost) |
7159 | { |
7160 | profitable = false; |
7161 | break; |
7162 | } |
7163 | } |
7164 | if (profitable && vi < li_vector_costs.length ()) |
7165 | { |
7166 | if (dump_enabled_p ()) |
7167 | dump_printf_loc (MSG_NOTE, vect_location, |
			 "Excess vector cost for part in loop %d:\n",
7169 | li_vector_costs[vi].first); |
7170 | profitable = false; |
7171 | } |
7172 | |
7173 | /* Unset visited flag. This is delayed when the subgraph is profitable |
7174 | and we process the loop for remaining unvectorized if-converted code. */ |
7175 | if (!orig_loop || !profitable) |
7176 | FOR_EACH_VEC_ELT (scalar_costs, i, cost) |
      gimple_set_visited (cost->stmt_info->stmt, false);
7178 | |
7179 | scalar_costs.release (); |
7180 | vector_costs.release (); |
7181 | |
7182 | return profitable; |
7183 | } |
7184 | |
7185 | /* qsort comparator for lane defs. */ |
7186 | |
7187 | static int |
7188 | vld_cmp (const void *a_, const void *b_) |
7189 | { |
7190 | auto *a = (const std::pair<unsigned, tree> *)a_; |
7191 | auto *b = (const std::pair<unsigned, tree> *)b_; |
7192 | return a->first - b->first; |
7193 | } |
7194 | |
7195 | /* Return true if USE_STMT is a vector lane insert into VEC and set |
7196 | *THIS_LANE to the lane number that is set. */ |
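
/* For example (hypothetical GIMPLE): with 32-bit vector elements,

     vec2_4 = BIT_INSERT_EXPR <vec1_2, s_3, 64>;

   inserts s_3 at bit position 64, i.e. lane 64 / 32 = 2.  */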
7197 | |
7198 | static bool |
7199 | vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane) |
7200 | { |
  gassign *use_ass = dyn_cast <gassign *> (use_stmt);
  if (!use_ass
      || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
      || (vec
	  ? gimple_assign_rhs1 (use_ass) != vec
	  : ((vec = gimple_assign_rhs1 (use_ass)), false))
      || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
				     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
      || !constant_multiple_p
	   (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
	    tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
	    this_lane))
7213 | return false; |
7214 | return true; |
7215 | } |
7216 | |
/* Find any vectorizable constructors and other vectorization roots
   and record them in BB_VINFO->roots.  */
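
/* For example (hypothetical GIMPLE), a vectorizable constructor root:

     v_5 = {t0_1, t1_2, t2_3, t3_4};

   or an equivalent chain of BIT_INSERT_EXPRs filling v_5 lane by lane;
   both forms are recorded as slp_inst_kind_ctor roots below.  */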
7219 | |
7220 | static void |
7221 | vect_slp_check_for_roots (bb_vec_info bb_vinfo) |
7222 | { |
7223 | for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i) |
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
	 !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7228 | if (!assign) |
7229 | continue; |
7230 | |
	tree rhs = gimple_assign_rhs1 (assign);
	enum tree_code code = gimple_assign_rhs_code (assign);
7233 | use_operand_p use_p; |
7234 | gimple *use_stmt; |
7235 | if (code == CONSTRUCTOR) |
7236 | { |
7237 | if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) |
		|| maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7239 | CONSTRUCTOR_NELTS (rhs)) |
7240 | || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value)) |
7241 | || uniform_vector_p (rhs)) |
7242 | continue; |
7243 | |
7244 | unsigned j; |
7245 | tree val; |
7246 | FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val) |
7247 | if (TREE_CODE (val) != SSA_NAME |
7248 | || !bb_vinfo->lookup_def (val)) |
7249 | break; |
7250 | if (j != CONSTRUCTOR_NELTS (rhs)) |
7251 | continue; |
7252 | |
	    vec<stmt_vec_info> roots = vNULL;
	    roots.safe_push (bb_vinfo->lookup_stmt (assign));
	    vec<stmt_vec_info> stmts;
	    stmts.create (CONSTRUCTOR_NELTS (rhs));
	    FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
	      stmts.quick_push
		(vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
	    bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
						 stmts, roots));
7262 | } |
7263 | else if (code == BIT_INSERT_EXPR |
7264 | && VECTOR_TYPE_P (TREE_TYPE (rhs)) |
7265 | && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant () |
7266 | && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1 |
		 && integer_zerop (gimple_assign_rhs3 (assign))
		 && useless_type_conversion_p
		      (TREE_TYPE (TREE_TYPE (rhs)),
		       TREE_TYPE (gimple_assign_rhs2 (assign)))
		 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7272 | { |
7273 | /* We start to match on insert to lane zero but since the |
7274 | inserts need not be ordered we'd have to search both |
7275 | the def and the use chains. */ |
7276 | tree vectype = TREE_TYPE (rhs); |
	    unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7278 | auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes); |
7279 | auto_sbitmap lanes (nlanes); |
7280 | bitmap_clear (lanes); |
	    bitmap_set_bit (lanes, 0);
	    tree def = gimple_assign_lhs (assign);
	    lane_defs.quick_push
	      (std::make_pair (0, gimple_assign_rhs2 (assign)));
7285 | unsigned lanes_found = 1; |
7286 | /* Start with the use chains, the last stmt will be the root. */ |
7287 | stmt_vec_info last = bb_vinfo->lookup_stmt (assign); |
7288 | vec<stmt_vec_info> roots = vNULL; |
	    roots.safe_push (last);
7290 | do |
7291 | { |
7292 | use_operand_p use_p; |
7293 | gimple *use_stmt; |
		if (!single_imm_use (def, &use_p, &use_stmt))
		  break;
		unsigned this_lane;
		if (!bb_vinfo->lookup_stmt (use_stmt)
		    || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
		    || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
		  break;
		if (bitmap_bit_p (lanes, this_lane))
		  break;
		lanes_found++;
		bitmap_set_bit (lanes, this_lane);
		gassign *use_ass = as_a <gassign *> (use_stmt);
		lane_defs.quick_push (std::make_pair
				       (this_lane, gimple_assign_rhs2 (use_ass)));
		last = bb_vinfo->lookup_stmt (use_ass);
		roots.safe_push (last);
		def = gimple_assign_lhs (use_ass);
7311 | } |
7312 | while (lanes_found < nlanes); |
7313 | if (roots.length () > 1) |
7314 | std::swap(a&: roots[0], b&: roots[roots.length () - 1]); |
7315 | if (lanes_found < nlanes) |
7316 | { |
7317 | /* Now search the def chain. */ |
7318 | def = gimple_assign_rhs1 (gs: assign); |
7319 | do |
7320 | { |
7321 | if (TREE_CODE (def) != SSA_NAME |
7322 | || !has_single_use (var: def)) |
7323 | break; |
7324 | gimple *def_stmt = SSA_NAME_DEF_STMT (def); |
7325 | unsigned this_lane; |
7326 | if (!bb_vinfo->lookup_stmt (def_stmt) |
7327 | || !vect_slp_is_lane_insert (use_stmt: def_stmt, |
7328 | NULL_TREE, this_lane: &this_lane) |
7329 | || !bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: def_stmt))) |
7330 | break; |
7331 | if (bitmap_bit_p (map: lanes, bitno: this_lane)) |
7332 | break; |
7333 | lanes_found++; |
7334 | bitmap_set_bit (map: lanes, bitno: this_lane); |
7335 | lane_defs.quick_push (obj: std::make_pair |
7336 | (x&: this_lane, |
7337 | y: gimple_assign_rhs2 (gs: def_stmt))); |
7338 | roots.safe_push (obj: bb_vinfo->lookup_stmt (def_stmt)); |
7339 | def = gimple_assign_rhs1 (gs: def_stmt); |
7340 | } |
7341 | while (lanes_found < nlanes); |
7342 | } |
7343 | if (lanes_found == nlanes) |
7344 | { |
/* Sort lane_defs by the lane index and register the root.  */
7346 | lane_defs.qsort (vld_cmp); |
7347 | vec<stmt_vec_info> stmts; |
7348 | stmts.create (nelems: nlanes); |
7349 | for (unsigned i = 0; i < nlanes; ++i) |
7350 | stmts.quick_push (obj: bb_vinfo->lookup_def (lane_defs[i].second)); |
7351 | bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_ctor, |
7352 | stmts, roots)); |
7353 | } |
7354 | else |
7355 | roots.release (); |
7356 | } |
7357 | else if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) |
7358 | && (associative_tree_code (code) || code == MINUS_EXPR) |
7359 | /* ??? This pessimizes a two-element reduction. PR54400. |
7360 | ??? In-order reduction could be handled if we only |
7361 | traverse one operand chain in vect_slp_linearize_chain. */ |
7362 | && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code) |
7363 | /* Ops with constants at the tail can be stripped here. */ |
7364 | && TREE_CODE (rhs) == SSA_NAME |
7365 | && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME |
7366 | /* Should be the chain end. */ |
7367 | && (!single_imm_use (var: gimple_assign_lhs (gs: assign), |
7368 | use_p: &use_p, stmt: &use_stmt) |
7369 | || !is_gimple_assign (gs: use_stmt) |
7370 | || (gimple_assign_rhs_code (gs: use_stmt) != code |
7371 | && ((code != PLUS_EXPR && code != MINUS_EXPR) |
7372 | || (gimple_assign_rhs_code (gs: use_stmt) |
7373 | != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR)))))) |
7374 | { |
7375 | /* We start the match at the end of a possible association |
7376 | chain. */ |
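/* For example (illustrative), for
     tem_1 = a_2 + b_3;
     tem_4 = tem_1 + c_5;
     res_6 = tem_4 + d_7;
   the match starts at res_6, the chain end, and linearizes the
   chain to the leaf operands { a_2, b_3, c_5, d_7 } with
   code PLUS_EXPR.  */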
7377 | auto_vec<chain_op_t> chain; |
7378 | auto_vec<std::pair<tree_code, gimple *> > worklist; |
7379 | auto_vec<gimple *> chain_stmts; |
7380 | gimple *code_stmt = NULL, *alt_code_stmt = NULL; |
7381 | if (code == MINUS_EXPR) |
7382 | code = PLUS_EXPR; |
7383 | internal_fn reduc_fn; |
7384 | if (!reduction_fn_for_scalar_code (code, &reduc_fn) |
7385 | || reduc_fn == IFN_LAST) |
7386 | continue; |
7387 | vect_slp_linearize_chain (vinfo: bb_vinfo, worklist, chain, code, start: assign, |
7388 | /* ??? */ |
7389 | code_stmt, alt_code_stmt, chain_stmts: &chain_stmts); |
7390 | if (chain.length () > 1) |
7391 | { |
7392 | /* Sort the chain according to def_type and operation. */ |
7393 | chain.sort (cmp: dt_sort_cmp, data: bb_vinfo); |
7394 | /* ??? Now we'd want to strip externals and constants |
7395 | but record those to be handled in the epilogue. */ |
7396 | /* ??? For now do not allow mixing ops or externs/constants. */ |
7397 | bool invalid = false; |
7398 | unsigned remain_cnt = 0; |
7399 | for (unsigned i = 0; i < chain.length (); ++i) |
7400 | { |
7401 | if (chain[i].code != code) |
7402 | { |
7403 | invalid = true; |
7404 | break; |
7405 | } |
7406 | if (chain[i].dt != vect_internal_def) |
7407 | remain_cnt++; |
7408 | } |
7409 | if (!invalid && chain.length () - remain_cnt > 1) |
7410 | { |
7411 | vec<stmt_vec_info> stmts; |
7412 | vec<tree> remain = vNULL; |
7413 | stmts.create (nelems: chain.length ()); |
7414 | if (remain_cnt > 0) |
7415 | remain.create (nelems: remain_cnt); |
7416 | for (unsigned i = 0; i < chain.length (); ++i) |
7417 | { |
7418 | if (chain[i].dt == vect_internal_def) |
7419 | stmts.quick_push (obj: bb_vinfo->lookup_def (chain[i].op)); |
7420 | else |
7421 | remain.quick_push (obj: chain[i].op); |
7422 | } |
7423 | vec<stmt_vec_info> roots; |
7424 | roots.create (nelems: chain_stmts.length ()); |
7425 | for (unsigned i = 0; i < chain_stmts.length (); ++i) |
7426 | roots.quick_push (obj: bb_vinfo->lookup_stmt (chain_stmts[i])); |
7427 | bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_bb_reduc, |
7428 | stmts, roots, remain)); |
7429 | } |
7430 | } |
7431 | } |
7432 | } |
7433 | } |
7434 | |
7435 | /* Walk the grouped store chains and replace entries with their |
7436 | pattern variant if any. */ |
7437 | |
7438 | static void |
7439 | vect_fixup_store_groups_with_patterns (vec_info *vinfo) |
7440 | { |
7441 | stmt_vec_info first_element; |
7442 | unsigned i; |
7443 | |
7444 | FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) |
7445 | { |
7446 | /* We also have CTORs in this array. */ |
7447 | if (!STMT_VINFO_GROUPED_ACCESS (first_element)) |
7448 | continue; |
7449 | if (STMT_VINFO_IN_PATTERN_P (first_element)) |
7450 | { |
7451 | stmt_vec_info orig = first_element; |
7452 | first_element = STMT_VINFO_RELATED_STMT (first_element); |
7453 | DR_GROUP_FIRST_ELEMENT (first_element) = first_element; |
7454 | DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig); |
7455 | DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig); |
7456 | DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig); |
7457 | vinfo->grouped_stores[i] = first_element; |
7458 | } |
7459 | stmt_vec_info prev = first_element; |
7460 | while (DR_GROUP_NEXT_ELEMENT (prev)) |
7461 | { |
7462 | stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev); |
7463 | if (STMT_VINFO_IN_PATTERN_P (elt)) |
7464 | { |
7465 | stmt_vec_info orig = elt; |
7466 | elt = STMT_VINFO_RELATED_STMT (elt); |
7467 | DR_GROUP_NEXT_ELEMENT (prev) = elt; |
7468 | DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig); |
7469 | DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig); |
7470 | } |
7471 | DR_GROUP_FIRST_ELEMENT (elt) = first_element; |
7472 | prev = elt; |
7473 | } |
7474 | } |
7475 | } |
7476 | |
7477 | /* Check if the region described by BB_VINFO can be vectorized, returning |
7478 | true if so. When returning false, set FATAL to true if the same failure |
7479 | would prevent vectorization at other vector sizes, false if it is still |
7480 | worth trying other sizes. N_STMTS is the number of statements in the |
7481 | region. */ |
7482 | |
7483 | static bool |
7484 | vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, |
7485 | vec<int> *dataref_groups) |
7486 | { |
7487 | DUMP_VECT_SCOPE ("vect_slp_analyze_bb" ); |
7488 | |
7489 | slp_instance instance; |
7490 | int i; |
7491 | poly_uint64 min_vf = 2; |
7492 | |
7493 | /* The first group of checks is independent of the vector size. */ |
7494 | fatal = true; |
7495 | |
7496 | /* Analyze the data references. */ |
7497 | |
7498 | if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL)) |
7499 | { |
7500 | if (dump_enabled_p ()) |
7501 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7502 | "not vectorized: unhandled data-ref in basic " |
7503 | "block.\n" ); |
7504 | return false; |
7505 | } |
7506 | |
7507 | if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups)) |
7508 | { |
7509 | if (dump_enabled_p ()) |
7510 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7511 | "not vectorized: unhandled data access in " |
7512 | "basic block.\n" ); |
7513 | return false; |
7514 | } |
7515 | |
7516 | vect_slp_check_for_roots (bb_vinfo); |
7517 | |
7518 | /* If there are no grouped stores and no constructors in the region |
7519 | there is no need to continue with pattern recog as vect_analyze_slp |
7520 | will fail anyway. */ |
7521 | if (bb_vinfo->grouped_stores.is_empty () |
7522 | && bb_vinfo->roots.is_empty ()) |
7523 | { |
7524 | if (dump_enabled_p ()) |
7525 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7526 | "not vectorized: no grouped stores in " |
7527 | "basic block.\n" ); |
7528 | return false; |
7529 | } |
7530 | |
/* The rest of the analysis below depends on the vector size in
   some way, so failures are no longer known to be fatal for
   other vector sizes.  */
7532 | fatal = false; |
7533 | |
7534 | vect_pattern_recog (bb_vinfo); |
7535 | |
7536 | /* Update store groups from pattern processing. */ |
7537 | vect_fixup_store_groups_with_patterns (vinfo: bb_vinfo); |
7538 | |
7539 | /* Check the SLP opportunities in the basic block, analyze and build SLP |
7540 | trees. */ |
7541 | if (!vect_analyze_slp (vinfo: bb_vinfo, max_tree_size: n_stmts)) |
7542 | { |
7543 | if (dump_enabled_p ()) |
7544 | { |
7545 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7546 | "Failed to SLP the basic block.\n" ); |
7547 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7548 | "not vectorized: failed to find SLP opportunities " |
7549 | "in basic block.\n" ); |
7550 | } |
7551 | return false; |
7552 | } |
7553 | |
7554 | /* Optimize permutations. */ |
7555 | vect_optimize_slp (vinfo: bb_vinfo); |
7556 | |
7557 | /* Gather the loads reachable from the SLP graph entries. */ |
7558 | vect_gather_slp_loads (vinfo: bb_vinfo); |
7559 | |
7560 | vect_record_base_alignments (bb_vinfo); |
7561 | |
/* Analyze and verify the alignment of data references and the
   dependences in the SLP instances.  */
7564 | for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (ix: i, ptr: &instance); ) |
7565 | { |
7566 | vect_location = instance->location (); |
7567 | if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance) |
7568 | || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance)) |
7569 | { |
7570 | slp_tree node = SLP_INSTANCE_TREE (instance); |
7571 | stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
7572 | if (dump_enabled_p ()) |
7573 | dump_printf_loc (MSG_NOTE, vect_location, |
7574 | "removing SLP instance operations starting from: %G" , |
7575 | stmt_info->stmt); |
7576 | vect_free_slp_instance (instance); |
7577 | BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (ix: i); |
7578 | continue; |
7579 | } |
7580 | |
7581 | /* Mark all the statements that we want to vectorize as pure SLP and |
7582 | relevant. */ |
7583 | vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); |
7584 | vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); |
7585 | unsigned j; |
7586 | stmt_vec_info root; |
7587 | /* Likewise consider instance root stmts as vectorized. */ |
7588 | FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root) |
7589 | STMT_SLP_TYPE (root) = pure_slp; |
7590 | |
7591 | i++; |
7592 | } |
7593 | if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) |
7594 | return false; |
7595 | |
7596 | if (!vect_slp_analyze_operations (vinfo: bb_vinfo)) |
7597 | { |
7598 | if (dump_enabled_p ()) |
7599 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7600 | "not vectorized: bad operation in basic block.\n" ); |
7601 | return false; |
7602 | } |
7603 | |
7604 | vect_bb_partition_graph (bb_vinfo); |
7605 | |
7606 | return true; |
7607 | } |
7608 | |
/* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
   basic blocks in BBS, returning true on success.
   The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
7612 | |
7613 | static bool |
7614 | vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, |
7615 | vec<int> *dataref_groups, unsigned int n_stmts, |
7616 | loop_p orig_loop) |
7617 | { |
7618 | bb_vec_info bb_vinfo; |
7619 | auto_vector_modes vector_modes; |
7620 | |
/* Autodetect the first vector mode to try.  */
7622 | machine_mode next_vector_mode = VOIDmode; |
7623 | targetm.vectorize.autovectorize_vector_modes (&vector_modes, false); |
7624 | unsigned int mode_i = 0; |
7625 | |
7626 | vec_info_shared shared; |
7627 | |
7628 | machine_mode autodetected_vector_mode = VOIDmode; |
7629 | while (1) |
7630 | { |
7631 | bool vectorized = false; |
7632 | bool fatal = false; |
7633 | bb_vinfo = new _bb_vec_info (bbs, &shared); |
7634 | |
7635 | bool first_time_p = shared.datarefs.is_empty (); |
7636 | BB_VINFO_DATAREFS (bb_vinfo) = datarefs; |
7637 | if (first_time_p) |
7638 | bb_vinfo->shared->save_datarefs (); |
7639 | else |
7640 | bb_vinfo->shared->check_datarefs (); |
7641 | bb_vinfo->vector_mode = next_vector_mode; |
7642 | |
7643 | if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)) |
7644 | { |
7645 | if (dump_enabled_p ()) |
7646 | { |
7647 | dump_printf_loc (MSG_NOTE, vect_location, |
7648 | "***** Analysis succeeded with vector mode" |
7649 | " %s\n" , GET_MODE_NAME (bb_vinfo->vector_mode)); |
7650 | dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n" ); |
7651 | } |
7652 | |
7653 | bb_vinfo->shared->check_datarefs (); |
7654 | |
7655 | auto_vec<slp_instance> profitable_subgraphs; |
7656 | for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo)) |
7657 | { |
7658 | if (instance->subgraph_entries.is_empty ()) |
7659 | continue; |
7660 | |
7661 | dump_user_location_t saved_vect_location = vect_location; |
7662 | vect_location = instance->location (); |
7663 | if (!unlimited_cost_model (NULL) |
7664 | && !vect_bb_vectorization_profitable_p |
7665 | (bb_vinfo, slp_instances: instance->subgraph_entries, orig_loop)) |
7666 | { |
7667 | if (dump_enabled_p ()) |
7668 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7669 | "not vectorized: vectorization is not " |
7670 | "profitable.\n" ); |
7671 | vect_location = saved_vect_location; |
7672 | continue; |
7673 | } |
7674 | |
7675 | vect_location = saved_vect_location; |
7676 | if (!dbg_cnt (index: vect_slp)) |
7677 | continue; |
7678 | |
7679 | profitable_subgraphs.safe_push (obj: instance); |
7680 | } |
7681 | |
/* When we're vectorizing an if-converted loop body, make sure
   we vectorized all the if-converted code.  */
7684 | if (!profitable_subgraphs.is_empty () |
7685 | && orig_loop) |
7686 | { |
7687 | gcc_assert (bb_vinfo->bbs.length () == 1); |
7688 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bb_vinfo->bbs[0]); |
7689 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
7690 | { |
7691 | /* The costing above left us with DCEable vectorized scalar |
7692 | stmts having the visited flag set on profitable |
7693 | subgraphs. Do the delayed clearing of the flag here. */ |
7694 | if (gimple_visited_p (stmt: gsi_stmt (i: gsi))) |
7695 | { |
7696 | gimple_set_visited (stmt: gsi_stmt (i: gsi), visited_p: false); |
7697 | continue; |
7698 | } |
7699 | if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED) |
7700 | continue; |
7701 | |
7702 | if (gassign *ass = dyn_cast <gassign *> (p: gsi_stmt (i: gsi))) |
7703 | if (gimple_assign_rhs_code (gs: ass) == COND_EXPR) |
7704 | { |
7705 | if (!profitable_subgraphs.is_empty () |
7706 | && dump_enabled_p ()) |
7707 | dump_printf_loc (MSG_NOTE, vect_location, |
7708 | "not profitable because of " |
7709 | "unprofitable if-converted scalar " |
7710 | "code\n" ); |
7711 | profitable_subgraphs.truncate (size: 0); |
7712 | } |
7713 | } |
7714 | } |
7715 | |
7716 | /* Finally schedule the profitable subgraphs. */ |
7717 | for (slp_instance instance : profitable_subgraphs) |
7718 | { |
7719 | if (!vectorized && dump_enabled_p ()) |
7720 | dump_printf_loc (MSG_NOTE, vect_location, |
7721 | "Basic block will be vectorized " |
7722 | "using SLP\n" ); |
7723 | vectorized = true; |
7724 | |
7725 | /* Dump before scheduling as store vectorization will remove |
7726 | the original stores and mess with the instance tree |
7727 | so querying its location will eventually ICE. */ |
7728 | if (flag_checking) |
7729 | for (slp_instance sub : instance->subgraph_entries) |
7730 | gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub))); |
7731 | unsigned HOST_WIDE_INT bytes; |
7732 | if (dump_enabled_p ()) |
7733 | for (slp_instance sub : instance->subgraph_entries) |
7734 | { |
7735 | tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)); |
7736 | if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (const_value: &bytes)) |
7737 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, |
7738 | sub->location (), |
7739 | "basic block part vectorized using %wu " |
7740 | "byte vectors\n" , bytes); |
7741 | else |
7742 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, |
7743 | sub->location (), |
7744 | "basic block part vectorized using " |
7745 | "variable length vectors\n" ); |
7746 | } |
7747 | |
7748 | dump_user_location_t saved_vect_location = vect_location; |
7749 | vect_location = instance->location (); |
7750 | |
7751 | vect_schedule_slp (bb_vinfo, instance->subgraph_entries); |
7752 | |
7753 | vect_location = saved_vect_location; |
7754 | } |
7755 | } |
7756 | else |
7757 | { |
7758 | if (dump_enabled_p ()) |
7759 | dump_printf_loc (MSG_NOTE, vect_location, |
7760 | "***** Analysis failed with vector mode %s\n" , |
7761 | GET_MODE_NAME (bb_vinfo->vector_mode)); |
7762 | } |
7763 | |
7764 | if (mode_i == 0) |
7765 | autodetected_vector_mode = bb_vinfo->vector_mode; |
7766 | |
7767 | if (!fatal) |
7768 | while (mode_i < vector_modes.length () |
7769 | && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i])) |
7770 | { |
7771 | if (dump_enabled_p ()) |
7772 | dump_printf_loc (MSG_NOTE, vect_location, |
7773 | "***** The result for vector mode %s would" |
7774 | " be the same\n" , |
7775 | GET_MODE_NAME (vector_modes[mode_i])); |
7776 | mode_i += 1; |
7777 | } |
7778 | |
7779 | delete bb_vinfo; |
7780 | |
7781 | if (mode_i < vector_modes.length () |
7782 | && VECTOR_MODE_P (autodetected_vector_mode) |
7783 | && (related_vector_mode (vector_modes[mode_i], |
7784 | GET_MODE_INNER (autodetected_vector_mode)) |
7785 | == autodetected_vector_mode) |
7786 | && (related_vector_mode (autodetected_vector_mode, |
7787 | GET_MODE_INNER (vector_modes[mode_i])) |
7788 | == vector_modes[mode_i])) |
7789 | { |
7790 | if (dump_enabled_p ()) |
7791 | dump_printf_loc (MSG_NOTE, vect_location, |
7792 | "***** Skipping vector mode %s, which would" |
7793 | " repeat the analysis for %s\n" , |
7794 | GET_MODE_NAME (vector_modes[mode_i]), |
7795 | GET_MODE_NAME (autodetected_vector_mode)); |
7796 | mode_i += 1; |
7797 | } |
7798 | |
7799 | if (vectorized |
7800 | || mode_i == vector_modes.length () |
7801 | || autodetected_vector_mode == VOIDmode |
7802 | /* If vect_slp_analyze_bb_1 signaled that analysis for all |
7803 | vector sizes will fail do not bother iterating. */ |
7804 | || fatal) |
7805 | return vectorized; |
7806 | |
7807 | /* Try the next biggest vector size. */ |
7808 | next_vector_mode = vector_modes[mode_i++]; |
7809 | if (dump_enabled_p ()) |
7810 | dump_printf_loc (MSG_NOTE, vect_location, |
7811 | "***** Re-trying analysis with vector mode %s\n" , |
7812 | GET_MODE_NAME (next_vector_mode)); |
7813 | } |
7814 | } |
7815 | |
7816 | |
/* Main entry for the BB vectorizer.  Analyze and transform BBS, returning
   true if anything in the region was vectorized.  */
7819 | |
7820 | static bool |
7821 | vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop) |
7822 | { |
7823 | vec<data_reference_p> datarefs = vNULL; |
7824 | auto_vec<int> dataref_groups; |
7825 | int insns = 0; |
7826 | int current_group = 0; |
7827 | |
7828 | for (unsigned i = 0; i < bbs.length (); i++) |
7829 | { |
7830 | basic_block bb = bbs[i]; |
7831 | for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (i: gsi); |
7832 | gsi_next (i: &gsi)) |
7833 | { |
7834 | gimple *stmt = gsi_stmt (i: gsi); |
7835 | if (is_gimple_debug (gs: stmt)) |
7836 | continue; |
7837 | |
7838 | insns++; |
7839 | |
7840 | if (gimple_location (g: stmt) != UNKNOWN_LOCATION) |
7841 | vect_location = stmt; |
7842 | |
7843 | if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs, |
7844 | &dataref_groups, current_group)) |
7845 | ++current_group; |
7846 | } |
7847 | /* New BBs always start a new DR group. */ |
7848 | ++current_group; |
7849 | } |
7850 | |
7851 | return vect_slp_region (bbs, datarefs, dataref_groups: &dataref_groups, n_stmts: insns, orig_loop); |
7852 | } |
7853 | |
/* Special entry for the BB vectorizer.  Analyze and transform a single
   if-converted BB with ORIG_LOOP's body being the not-if-converted
   representation.  Returns true if anything in the basic block was
   vectorized.  */
7858 | |
7859 | bool |
7860 | vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop) |
7861 | { |
7862 | auto_vec<basic_block> bbs; |
7863 | bbs.safe_push (obj: bb); |
7864 | return vect_slp_bbs (bbs, orig_loop); |
7865 | } |
7866 | |
/* Main entry for the BB vectorizer.  Analyze and transform all basic
   blocks of FUN, returning true if anything was vectorized.  */
7869 | |
7870 | bool |
7871 | vect_slp_function (function *fun) |
7872 | { |
7873 | bool r = false; |
7874 | int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun)); |
7875 | auto_bitmap exit_bbs; |
7876 | bitmap_set_bit (exit_bbs, EXIT_BLOCK); |
7877 | edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun)); |
7878 | unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs, |
7879 | true, rpo, NULL); |
7880 | |
/* For the moment split the function into pieces to avoid making
   the iteration on the vector modes moot.  Split at points we know
   we do not handle well, which are CFG merges (SLP discovery doesn't
   handle non-loop-header PHIs) and loop exits.  Since pattern
   recog requires reverse iteration to visit uses before defs we
   simply chop the RPO into pieces.  */
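/* As an illustration, when a region is started at the then-block
   of an if/else, neither the else-block nor the join block is
   dominated by it, so each of them forces a split and starts a
   new region.  */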
7887 | auto_vec<basic_block> bbs; |
7888 | for (unsigned i = 0; i < n; i++) |
7889 | { |
7890 | basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]); |
7891 | bool split = false; |
7892 | |
7893 | /* Split when a BB is not dominated by the first block. */ |
7894 | if (!bbs.is_empty () |
7895 | && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0])) |
7896 | { |
7897 | if (dump_enabled_p ()) |
7898 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7899 | "splitting region at dominance boundary bb%d\n" , |
7900 | bb->index); |
7901 | split = true; |
7902 | } |
/* Split when the loop determined by the first block
   is exited.  This is because we eventually insert
   invariants at the start of the region.  */
7906 | else if (!bbs.is_empty () |
7907 | && bbs[0]->loop_father != bb->loop_father |
7908 | && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father)) |
7909 | { |
7910 | if (dump_enabled_p ()) |
7911 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7912 | "splitting region at loop %d exit at bb%d\n" , |
7913 | bbs[0]->loop_father->num, bb->index); |
7914 | split = true; |
7915 | } |
7916 | else if (!bbs.is_empty () |
7917 | && bb->loop_father->header == bb |
7918 | && bb->loop_father->dont_vectorize) |
7919 | { |
7920 | if (dump_enabled_p ()) |
7921 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7922 | "splitting region at dont-vectorize loop %d " |
7923 | "entry at bb%d\n" , |
7924 | bb->loop_father->num, bb->index); |
7925 | split = true; |
7926 | } |
7927 | |
7928 | if (split && !bbs.is_empty ()) |
7929 | { |
7930 | r |= vect_slp_bbs (bbs, NULL); |
7931 | bbs.truncate (size: 0); |
7932 | } |
7933 | |
7934 | if (bbs.is_empty ()) |
7935 | { |
/* We need to be able to insert at the head of the region, which
   we cannot do for a region starting with a returns-twice call.  */
7938 | if (gcall *first = safe_dyn_cast <gcall *> (p: first_stmt (bb))) |
7939 | if (gimple_call_flags (first) & ECF_RETURNS_TWICE) |
7940 | { |
7941 | if (dump_enabled_p ()) |
7942 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7943 | "skipping bb%d as start of region as it " |
7944 | "starts with returns-twice call\n" , |
7945 | bb->index); |
7946 | continue; |
7947 | } |
/* If the loop this BB belongs to is marked as not to be vectorized,
   honor that for BB vectorization as well.  */
7950 | if (bb->loop_father->dont_vectorize) |
7951 | continue; |
7952 | } |
7953 | |
7954 | bbs.safe_push (obj: bb); |
7955 | |
/* When a stmt ending this block defines a value, inserting a
   vector containing that definition after it would require
   inserting on edges.  Avoid this for now.  */
7959 | if (gimple *last = *gsi_last_bb (bb)) |
7960 | if (gimple_get_lhs (last) |
7961 | && is_ctrl_altering_stmt (last)) |
7962 | { |
7963 | if (dump_enabled_p ()) |
7964 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
7965 | "splitting region at control altering " |
7966 | "definition %G" , last); |
7967 | r |= vect_slp_bbs (bbs, NULL); |
7968 | bbs.truncate (size: 0); |
7969 | } |
7970 | } |
7971 | |
7972 | if (!bbs.is_empty ()) |
7973 | r |= vect_slp_bbs (bbs, NULL); |
7974 | |
7975 | free (ptr: rpo); |
7976 | |
7977 | return r; |
7978 | } |
7979 | |
7980 | /* Build a variable-length vector in which the elements in ELTS are repeated |
to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
7982 | RESULTS and add any new instructions to SEQ. |
7983 | |
7984 | The approach we use is: |
7985 | |
7986 | (1) Find a vector mode VM with integer elements of mode IM. |
7987 | |
7988 | (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of |
7989 | ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs |
7990 | from small vectors to IM. |
7991 | |
7992 | (3) Duplicate each ELTS'[I] into a vector of mode VM. |
7993 | |
7994 | (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the |
7995 | correct byte contents. |
7996 | |
7997 | (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type. |
7998 | |
7999 | We try to find the largest IM for which this sequence works, in order |
8000 | to cut down on the number of interleaves. */ |
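/* A sketch of the process, assuming the target provides suitable
   modes: for ELTS = { a, b, c, d } of 32-bit elements step (1) may
   pick IM = DImode with VM a vector of DImode, step (2) then forms
   the two 64-bit values ab and cd by view-converting { a, b } and
   { c, d }, step (3) duplicates those to { ab, ab, ... } and
   { cd, cd, ... }, a single interleave in step (4) produces
   { ab, cd, ab, cd, ... }, and step (5) reinterprets that as
   { a, b, c, d, a, b, c, d, ... }.  */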
8001 | |
8002 | void |
8003 | duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type, |
8004 | const vec<tree> &elts, unsigned int nresults, |
8005 | vec<tree> &results) |
8006 | { |
8007 | unsigned int nelts = elts.length (); |
8008 | tree element_type = TREE_TYPE (vector_type); |
8009 | |
8010 | /* (1) Find a vector mode VM with integer elements of mode IM. */ |
8011 | unsigned int nvectors = 1; |
8012 | tree new_vector_type; |
8013 | tree permutes[2]; |
8014 | if (!can_duplicate_and_interleave_p (vinfo, count: nelts, elt_type: element_type, |
8015 | nvectors_out: &nvectors, vector_type_out: &new_vector_type, |
8016 | permutes)) |
8017 | gcc_unreachable (); |
8018 | |
8019 | /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */ |
8020 | unsigned int partial_nelts = nelts / nvectors; |
8021 | tree partial_vector_type = build_vector_type (element_type, partial_nelts); |
8022 | |
8023 | tree_vector_builder partial_elts; |
8024 | auto_vec<tree, 32> pieces (nvectors * 2); |
8025 | pieces.quick_grow_cleared (len: nvectors * 2); |
8026 | for (unsigned int i = 0; i < nvectors; ++i) |
8027 | { |
8028 | /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of |
8029 | ELTS' has mode IM. */ |
8030 | partial_elts.new_vector (type: partial_vector_type, npatterns: partial_nelts, nelts_per_pattern: 1); |
8031 | for (unsigned int j = 0; j < partial_nelts; ++j) |
8032 | partial_elts.quick_push (obj: elts[i * partial_nelts + j]); |
8033 | tree t = gimple_build_vector (seq, builder: &partial_elts); |
8034 | t = gimple_build (seq, code: VIEW_CONVERT_EXPR, |
8035 | TREE_TYPE (new_vector_type), ops: t); |
8036 | |
8037 | /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */ |
8038 | pieces[i] = gimple_build_vector_from_val (seq, type: new_vector_type, op: t); |
8039 | } |
8040 | |
8041 | /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the |
8042 | correct byte contents. |
8043 | |
8044 | Conceptually, we need to repeat the following operation log2(nvectors) |
8045 | times, where hi_start = nvectors / 2: |
8046 | |
8047 | out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute); |
8048 | out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute); |
8049 | |
8050 | However, if each input repeats every N elements and the VF is |
8051 | a multiple of N * 2, the HI result is the same as the LO result. |
8052 | This will be true for the first N1 iterations of the outer loop, |
8053 | followed by N2 iterations for which both the LO and HI results |
8054 | are needed. I.e.: |
8055 | |
8056 | N1 + N2 = log2(nvectors) |
8057 | |
8058 | Each "N1 iteration" doubles the number of redundant vectors and the |
8059 | effect of the process as a whole is to have a sequence of nvectors/2**N1 |
8060 | vectors that repeats 2**N1 times. Rather than generate these redundant |
8061 | vectors, we halve the number of vectors for each N1 iteration. */ |
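/* For example (illustrative), with nvectors == 4 each input piece
   is a duplicate and thus repeats every element, so if the vector
   length is a multiple of four both rounds are "N1 iterations":
   the four pieces become two after the first round and one after
   the second, the single result vector repeating 2**2 times.  */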
8062 | unsigned int in_start = 0; |
8063 | unsigned int out_start = nvectors; |
8064 | unsigned int new_nvectors = nvectors; |
8065 | for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2) |
8066 | { |
8067 | unsigned int hi_start = new_nvectors / 2; |
8068 | unsigned int out_i = 0; |
8069 | for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i) |
8070 | { |
8071 | if ((in_i & 1) != 0 |
8072 | && multiple_p (a: TYPE_VECTOR_SUBPARTS (node: new_vector_type), |
8073 | b: 2 * in_repeat)) |
8074 | continue; |
8075 | |
8076 | tree output = make_ssa_name (var: new_vector_type); |
8077 | tree input1 = pieces[in_start + (in_i / 2)]; |
8078 | tree input2 = pieces[in_start + (in_i / 2) + hi_start]; |
8079 | gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR, |
8080 | input1, input2, |
8081 | permutes[in_i & 1]); |
8082 | gimple_seq_add_stmt (seq, stmt); |
8083 | pieces[out_start + out_i] = output; |
8084 | out_i += 1; |
8085 | } |
8086 | std::swap (a&: in_start, b&: out_start); |
8087 | new_nvectors = out_i; |
8088 | } |
8089 | |
8090 | /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */ |
8091 | results.reserve (nelems: nresults); |
8092 | for (unsigned int i = 0; i < nresults; ++i) |
8093 | if (i < new_nvectors) |
8094 | results.quick_push (obj: gimple_build (seq, code: VIEW_CONVERT_EXPR, type: vector_type, |
8095 | ops: pieces[in_start + i])); |
8096 | else |
8097 | results.quick_push (obj: results[i - new_nvectors]); |
8098 | } |
8099 | |
8100 | |
8101 | /* For constant and loop invariant defs in OP_NODE this function creates |
8102 | vector defs that will be used in the vectorized stmts and stores them |
8103 | to SLP_TREE_VEC_DEFS of OP_NODE. */ |
8104 | |
8105 | static void |
8106 | vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node) |
8107 | { |
8108 | unsigned HOST_WIDE_INT nunits; |
8109 | tree vec_cst; |
8110 | unsigned j, number_of_places_left_in_vector; |
8111 | tree vector_type; |
8112 | tree vop; |
8113 | int group_size = op_node->ops.length (); |
8114 | unsigned int vec_num, i; |
8115 | unsigned number_of_copies = 1; |
8116 | bool constant_p; |
8117 | gimple_seq ctor_seq = NULL; |
8118 | auto_vec<tree, 16> permute_results; |
8119 | |
8120 | /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */ |
8121 | vector_type = SLP_TREE_VECTYPE (op_node); |
8122 | |
8123 | unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node); |
8124 | SLP_TREE_VEC_DEFS (op_node).create (nelems: number_of_vectors); |
8125 | auto_vec<tree> voprnds (number_of_vectors); |
8126 | |
8127 | /* NUMBER_OF_COPIES is the number of times we need to use the same values in |
8128 | created vectors. It is greater than 1 if unrolling is performed. |
8129 | |
8130 | For example, we have two scalar operands, s1 and s2 (e.g., group of |
8131 | strided accesses of size two), while NUNITS is four (i.e., four scalars |
8132 | of this type can be packed in a vector). The output vector will contain |
8133 | two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES |
8134 | will be 2). |
8135 | |
8136 | If GROUP_SIZE > NUNITS, the scalars will be split into several vectors |
8137 | containing the operands. |
8138 | |
8139 | For example, NUNITS is four as before, and the group size is 8 |
8140 | (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and |
8141 | {s5, s6, s7, s8}. */ |
8142 | |
8143 | /* When using duplicate_and_interleave, we just need one element for |
8144 | each scalar statement. */ |
8145 | if (!TYPE_VECTOR_SUBPARTS (node: vector_type).is_constant (const_value: &nunits)) |
8146 | nunits = group_size; |
8147 | |
8148 | number_of_copies = nunits * number_of_vectors / group_size; |
8149 | |
8150 | number_of_places_left_in_vector = nunits; |
8151 | constant_p = true; |
8152 | tree_vector_builder elts (vector_type, nunits, 1); |
8153 | elts.quick_grow (len: nunits); |
8154 | stmt_vec_info insert_after = NULL; |
8155 | for (j = 0; j < number_of_copies; j++) |
8156 | { |
8157 | tree op; |
8158 | for (i = group_size - 1; op_node->ops.iterate (ix: i, ptr: &op); i--) |
8159 | { |
8160 | /* Create 'vect_ = {op0,op1,...,opn}'. */ |
8161 | number_of_places_left_in_vector--; |
8162 | tree orig_op = op; |
8163 | if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op))) |
8164 | { |
8165 | if (CONSTANT_CLASS_P (op)) |
8166 | { |
8167 | if (VECTOR_BOOLEAN_TYPE_P (vector_type)) |
8168 | { |
8169 | /* Can't use VIEW_CONVERT_EXPR for booleans because |
8170 | of possibly different sizes of scalar value and |
8171 | vector element. */ |
8172 | if (integer_zerop (op)) |
8173 | op = build_int_cst (TREE_TYPE (vector_type), 0); |
8174 | else if (integer_onep (op)) |
8175 | op = build_all_ones_cst (TREE_TYPE (vector_type)); |
8176 | else |
8177 | gcc_unreachable (); |
8178 | } |
8179 | else |
8180 | op = fold_unary (VIEW_CONVERT_EXPR, |
8181 | TREE_TYPE (vector_type), op); |
8182 | gcc_assert (op && CONSTANT_CLASS_P (op)); |
8183 | } |
8184 | else |
8185 | { |
8186 | tree new_temp = make_ssa_name (TREE_TYPE (vector_type)); |
8187 | gimple *init_stmt; |
8188 | if (VECTOR_BOOLEAN_TYPE_P (vector_type)) |
8189 | { |
8190 | tree true_val |
8191 | = build_all_ones_cst (TREE_TYPE (vector_type)); |
8192 | tree false_val |
8193 | = build_zero_cst (TREE_TYPE (vector_type)); |
8194 | gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op))); |
8195 | init_stmt = gimple_build_assign (new_temp, COND_EXPR, |
8196 | op, true_val, |
8197 | false_val); |
8198 | } |
8199 | else |
8200 | { |
8201 | op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), |
8202 | op); |
8203 | init_stmt |
8204 | = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, |
8205 | op); |
8206 | } |
8207 | gimple_seq_add_stmt (&ctor_seq, init_stmt); |
8208 | op = new_temp; |
8209 | } |
8210 | } |
8211 | elts[number_of_places_left_in_vector] = op; |
8212 | if (!CONSTANT_CLASS_P (op)) |
8213 | constant_p = false; |
8214 | /* For BB vectorization we have to compute an insert location |
8215 | when a def is inside the analyzed region since we cannot |
8216 | simply insert at the BB start in this case. */ |
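/* E.g. (illustrative) when building the constant vector { _1, 3 }
   and _1 is defined by a stmt inside the region, the constructor
   stmts have to be inserted after that definition rather than at
   the region start.  */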
8217 | stmt_vec_info opdef; |
8218 | if (TREE_CODE (orig_op) == SSA_NAME |
8219 | && !SSA_NAME_IS_DEFAULT_DEF (orig_op) |
8220 | && is_a <bb_vec_info> (p: vinfo) |
8221 | && (opdef = vinfo->lookup_def (orig_op))) |
8222 | { |
8223 | if (!insert_after) |
8224 | insert_after = opdef; |
8225 | else |
8226 | insert_after = get_later_stmt (stmt1_info: insert_after, stmt2_info: opdef); |
8227 | } |
8228 | |
8229 | if (number_of_places_left_in_vector == 0) |
8230 | { |
8231 | if (constant_p |
8232 | ? multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vector_type), b: nunits) |
8233 | : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) |
8234 | vec_cst = gimple_build_vector (seq: &ctor_seq, builder: &elts); |
8235 | else |
8236 | { |
8237 | if (permute_results.is_empty ()) |
8238 | duplicate_and_interleave (vinfo, seq: &ctor_seq, vector_type, |
8239 | elts, nresults: number_of_vectors, |
8240 | results&: permute_results); |
8241 | vec_cst = permute_results[number_of_vectors - j - 1]; |
8242 | } |
8243 | if (!gimple_seq_empty_p (s: ctor_seq)) |
8244 | { |
8245 | if (insert_after) |
8246 | { |
8247 | gimple_stmt_iterator gsi; |
8248 | if (gimple_code (g: insert_after->stmt) == GIMPLE_PHI) |
8249 | { |
8250 | gsi = gsi_after_labels (bb: gimple_bb (g: insert_after->stmt)); |
8251 | gsi_insert_seq_before (&gsi, ctor_seq, |
8252 | GSI_CONTINUE_LINKING); |
8253 | } |
8254 | else if (!stmt_ends_bb_p (insert_after->stmt)) |
8255 | { |
8256 | gsi = gsi_for_stmt (insert_after->stmt); |
8257 | gsi_insert_seq_after (&gsi, ctor_seq, |
8258 | GSI_CONTINUE_LINKING); |
8259 | } |
8260 | else |
8261 | { |
/* When we want to insert after a def whose defining stmt
   throws, insert on the fallthru edge instead.  */
8265 | edge e = find_fallthru_edge |
8266 | (edges: gimple_bb (g: insert_after->stmt)->succs); |
8267 | basic_block new_bb |
8268 | = gsi_insert_seq_on_edge_immediate (e, ctor_seq); |
8269 | gcc_assert (!new_bb); |
8270 | } |
8271 | } |
8272 | else |
8273 | vinfo->insert_seq_on_entry (NULL, ctor_seq); |
8274 | ctor_seq = NULL; |
8275 | } |
8276 | voprnds.quick_push (obj: vec_cst); |
8277 | insert_after = NULL; |
8278 | number_of_places_left_in_vector = nunits; |
8279 | constant_p = true; |
8280 | elts.new_vector (type: vector_type, npatterns: nunits, nelts_per_pattern: 1); |
8281 | elts.quick_grow (len: nunits); |
8282 | } |
8283 | } |
8284 | } |
8285 | |
/* Since the vectors are created in the reverse order, we should
   reverse them.  */
8288 | vec_num = voprnds.length (); |
8289 | for (j = vec_num; j != 0; j--) |
8290 | { |
8291 | vop = voprnds[j - 1]; |
8292 | SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop); |
8293 | } |
8294 | |
/* If the VF is greater than the unrolling factor needed for the SLP
   group of stmts, the NUMBER_OF_VECTORS to be created is greater than
   NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we
   have to replicate the vectors.  */
8299 | while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ()) |
8300 | for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (ix: i, ptr: &vop) && i < vec_num; |
8301 | i++) |
8302 | SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop); |
8303 | } |
8304 | |
8305 | /* Get the Ith vectorized definition from SLP_NODE. */ |
8306 | |
8307 | tree |
8308 | vect_get_slp_vect_def (slp_tree slp_node, unsigned i) |
8309 | { |
8310 | return SLP_TREE_VEC_DEFS (slp_node)[i]; |
8311 | } |
8312 | |
8313 | /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */ |
8314 | |
8315 | void |
8316 | vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs) |
8317 | { |
8318 | vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)); |
8319 | vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node)); |
8320 | } |
8321 | |
8322 | /* Get N vectorized definitions for SLP_NODE. */ |
8323 | |
8324 | void |
8325 | vect_get_slp_defs (vec_info *, |
8326 | slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n) |
8327 | { |
8328 | if (n == -1U) |
8329 | n = SLP_TREE_CHILDREN (slp_node).length (); |
8330 | |
8331 | for (unsigned i = 0; i < n; ++i) |
8332 | { |
8333 | slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; |
8334 | vec<tree> vec_defs = vNULL; |
8335 | vect_get_slp_defs (slp_node: child, vec_defs: &vec_defs); |
8336 | vec_oprnds->quick_push (obj: vec_defs); |
8337 | } |
8338 | } |
8339 | |
8340 | /* A subroutine of vect_transform_slp_perm_load with two extra arguments: |
8341 | - PERM gives the permutation that the caller wants to use for NODE, |
8342 | which might be different from SLP_LOAD_PERMUTATION. |
8343 | - DUMP_P controls whether the function dumps information. */ |
8344 | |
8345 | static bool |
8346 | vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node, |
8347 | load_permutation_t &perm, |
8348 | const vec<tree> &dr_chain, |
8349 | gimple_stmt_iterator *gsi, poly_uint64 vf, |
8350 | bool analyze_only, bool dump_p, |
8351 | unsigned *n_perms, unsigned int *n_loads, |
8352 | bool dce_chain) |
8353 | { |
8354 | stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
8355 | int vec_index = 0; |
8356 | tree vectype = SLP_TREE_VECTYPE (node); |
8357 | unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length (); |
8358 | unsigned int mask_element; |
8359 | unsigned dr_group_size; |
8360 | machine_mode mode; |
8361 | |
8362 | if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
8363 | dr_group_size = 1; |
8364 | else |
8365 | { |
8366 | stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); |
8367 | dr_group_size = DR_GROUP_SIZE (stmt_info); |
8368 | } |
8369 | |
8370 | mode = TYPE_MODE (vectype); |
8371 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype); |
8372 | unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
8373 | |
8374 | /* Initialize the vect stmts of NODE to properly insert the generated |
8375 | stmts later. */ |
8376 | if (! analyze_only) |
8377 | for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++) |
8378 | SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE); |
8379 | |
8380 | /* Generate permutation masks for every NODE. Number of masks for each NODE |
8381 | is equal to GROUP_SIZE. |
8382 | E.g., we have a group of three nodes with three loads from the same |
location in each node, and the vector size is 4.  I.e., we have an
a0b0c0a1b1c1... sequence and we need to create the following vectors:
8385 | for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 |
8386 | for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 |
8387 | ... |
8388 | |
8389 | The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}. |
8390 | The last mask is illegal since we assume two operands for permute |
8391 | operation, and the mask element values can't be outside that range. |
8392 | Hence, the last mask must be converted into {2,5,5,5}. |
8393 | For the first two permutations we need the first and the second input |
8394 | vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation |
8395 | we need the second and the third vectors: {b1,c1,a2,b2} and |
8396 | {c2,a3,b3,c3}. */ |
8397 | |
8398 | int vect_stmts_counter = 0; |
8399 | unsigned int index = 0; |
8400 | int first_vec_index = -1; |
8401 | int second_vec_index = -1; |
8402 | bool noop_p = true; |
8403 | *n_perms = 0; |
8404 | |
8405 | vec_perm_builder mask; |
8406 | unsigned int nelts_to_build; |
8407 | unsigned int nvectors_per_build; |
8408 | unsigned int in_nlanes; |
8409 | bool repeating_p = (group_size == dr_group_size |
8410 | && multiple_p (a: nunits, b: group_size)); |
8411 | if (repeating_p) |
8412 | { |
8413 | /* A single vector contains a whole number of copies of the node, so: |
8414 | (a) all permutes can use the same mask; and |
8415 | (b) the permutes only need a single vector input. */ |
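/* As an illustration, with GROUP_SIZE == 2 and PERM == { 1, 0 } the
   encoded mask is { 1, 0, 3, 2, 5, 4 }: two interleaved patterns
   with three elements each, which the encoding extends to the full
   (possibly variable) vector length.  */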
8416 | mask.new_vector (full_nelts: nunits, npatterns: group_size, nelts_per_pattern: 3); |
8417 | nelts_to_build = mask.encoded_nelts (); |
/* It's possible to obtain zero nstmts during analyze_only, so make
   it at least one to ensure the later computation for n_perms
   proceeds.  */
8421 | nvectors_per_build = nstmts > 0 ? nstmts : 1; |
8422 | in_nlanes = dr_group_size * 3; |
8423 | } |
8424 | else |
8425 | { |
8426 | /* We need to construct a separate mask for each vector statement. */ |
8427 | unsigned HOST_WIDE_INT const_nunits, const_vf; |
8428 | if (!nunits.is_constant (const_value: &const_nunits) |
8429 | || !vf.is_constant (const_value: &const_vf)) |
8430 | return false; |
8431 | mask.new_vector (full_nelts: const_nunits, npatterns: const_nunits, nelts_per_pattern: 1); |
8432 | nelts_to_build = const_vf * group_size; |
8433 | nvectors_per_build = 1; |
8434 | in_nlanes = const_vf * dr_group_size; |
8435 | } |
8436 | auto_sbitmap used_in_lanes (in_nlanes); |
8437 | bitmap_clear (used_in_lanes); |
8438 | auto_bitmap used_defs; |
8439 | |
8440 | unsigned int count = mask.encoded_nelts (); |
8441 | mask.quick_grow (len: count); |
8442 | vec_perm_indices indices; |
8443 | |
8444 | for (unsigned int j = 0; j < nelts_to_build; j++) |
8445 | { |
8446 | unsigned int iter_num = j / group_size; |
8447 | unsigned int stmt_num = j % group_size; |
8448 | unsigned int i = (iter_num * dr_group_size + perm[stmt_num]); |
8449 | bitmap_set_bit (map: used_in_lanes, bitno: i); |
8450 | if (repeating_p) |
8451 | { |
8452 | first_vec_index = 0; |
8453 | mask_element = i; |
8454 | } |
8455 | else |
8456 | { |
8457 | /* Enforced before the loop when !repeating_p. */ |
8458 | unsigned int const_nunits = nunits.to_constant (); |
8459 | vec_index = i / const_nunits; |
8460 | mask_element = i % const_nunits; |
8461 | if (vec_index == first_vec_index |
8462 | || first_vec_index == -1) |
8463 | { |
8464 | first_vec_index = vec_index; |
8465 | } |
8466 | else if (vec_index == second_vec_index |
8467 | || second_vec_index == -1) |
8468 | { |
8469 | second_vec_index = vec_index; |
8470 | mask_element += const_nunits; |
8471 | } |
8472 | else |
8473 | { |
8474 | if (dump_p) |
8475 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8476 | "permutation requires at " |
8477 | "least three vectors %G" , |
8478 | stmt_info->stmt); |
8479 | gcc_assert (analyze_only); |
8480 | return false; |
8481 | } |
8482 | |
8483 | gcc_assert (mask_element < 2 * const_nunits); |
8484 | } |
8485 | |
8486 | if (mask_element != index) |
8487 | noop_p = false; |
8488 | mask[index++] = mask_element; |
8489 | |
8490 | if (index == count) |
8491 | { |
8492 | if (!noop_p) |
8493 | { |
8494 | indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits); |
8495 | if (!can_vec_perm_const_p (mode, mode, indices)) |
8496 | { |
8497 | if (dump_p) |
8498 | { |
8499 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8500 | "unsupported vect permute { " ); |
8501 | for (i = 0; i < count; ++i) |
8502 | { |
8503 | dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); |
8504 | dump_printf (MSG_MISSED_OPTIMIZATION, " " ); |
8505 | } |
8506 | dump_printf (MSG_MISSED_OPTIMIZATION, "}\n" ); |
8507 | } |
8508 | gcc_assert (analyze_only); |
8509 | return false; |
8510 | } |
8511 | |
8512 | tree mask_vec = NULL_TREE; |
8513 | if (!analyze_only) |
8514 | mask_vec = vect_gen_perm_mask_checked (vectype, indices); |
8515 | |
8516 | if (second_vec_index == -1) |
8517 | second_vec_index = first_vec_index; |
8518 | |
8519 | for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) |
8520 | { |
8521 | ++*n_perms; |
8522 | if (analyze_only) |
8523 | continue; |
8524 | /* Generate the permute statement if necessary. */ |
8525 | tree first_vec = dr_chain[first_vec_index + ri]; |
8526 | tree second_vec = dr_chain[second_vec_index + ri]; |
8527 | gassign *stmt = as_a<gassign *> (p: stmt_info->stmt); |
8528 | tree perm_dest |
8529 | = vect_create_destination_var (gimple_assign_lhs (gs: stmt), |
8530 | vectype); |
8531 | perm_dest = make_ssa_name (var: perm_dest); |
8532 | gimple *perm_stmt |
8533 | = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec, |
8534 | second_vec, mask_vec); |
8535 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, |
8536 | gsi); |
8537 | if (dce_chain) |
8538 | { |
8539 | bitmap_set_bit (used_defs, first_vec_index + ri); |
8540 | bitmap_set_bit (used_defs, second_vec_index + ri); |
8541 | } |
8542 | |
8543 | /* Store the vector statement in NODE. */ |
8544 | SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest; |
8545 | } |
8546 | } |
8547 | else if (!analyze_only) |
8548 | { |
8549 | for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) |
8550 | { |
8551 | tree first_vec = dr_chain[first_vec_index + ri]; |
/* The permutation is a no-op (the mask would be NULL_TREE), so
   forward the input vector as the requested identity transform.  */
8554 | if (dce_chain) |
8555 | bitmap_set_bit (used_defs, first_vec_index + ri); |
8556 | |
8557 | /* Store the vector statement in NODE. */ |
8558 | SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec; |
8559 | } |
8560 | } |
8561 | |
8562 | index = 0; |
8563 | first_vec_index = -1; |
8564 | second_vec_index = -1; |
8565 | noop_p = true; |
8566 | } |
8567 | } |
8568 | |
8569 | if (n_loads) |
8570 | { |
8571 | if (repeating_p) |
8572 | *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node); |
8573 | else |
8574 | { |
8575 | /* Enforced above when !repeating_p. */ |
8576 | unsigned int const_nunits = nunits.to_constant (); |
8577 | *n_loads = 0; |
8578 | bool load_seen = false; |
8579 | for (unsigned i = 0; i < in_nlanes; ++i) |
8580 | { |
8581 | if (i % const_nunits == 0) |
8582 | { |
8583 | if (load_seen) |
8584 | *n_loads += 1; |
8585 | load_seen = false; |
8586 | } |
8587 | if (bitmap_bit_p (map: used_in_lanes, bitno: i)) |
8588 | load_seen = true; |
8589 | } |
8590 | if (load_seen) |
8591 | *n_loads += 1; |
8592 | } |
8593 | } |
8594 | |
8595 | if (dce_chain) |
8596 | for (unsigned i = 0; i < dr_chain.length (); ++i) |
8597 | if (!bitmap_bit_p (used_defs, i)) |
8598 | { |
8599 | gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]); |
8600 | gimple_stmt_iterator rgsi = gsi_for_stmt (stmt); |
8601 | gsi_remove (&rgsi, true); |
8602 | release_defs (stmt); |
8603 | } |
8604 | |
8605 | return true; |
8606 | } |
8607 | |
8608 | /* Generate vector permute statements from a list of loads in DR_CHAIN. |
8609 | If ANALYZE_ONLY is TRUE, only check that it is possible to create valid |
8610 | permute statements for the SLP node NODE. Store the number of vector |
8611 | permute instructions in *N_PERMS and the number of vector load |
8612 | instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions |
8613 | that were not needed. */ |
8614 | |
8615 | bool |
8616 | vect_transform_slp_perm_load (vec_info *vinfo, |
8617 | slp_tree node, const vec<tree> &dr_chain, |
8618 | gimple_stmt_iterator *gsi, poly_uint64 vf, |
8619 | bool analyze_only, unsigned *n_perms, |
8620 | unsigned int *n_loads, bool dce_chain) |
8621 | { |
8622 | return vect_transform_slp_perm_load_1 (vinfo, node, |
8623 | SLP_TREE_LOAD_PERMUTATION (node), |
8624 | dr_chain, gsi, vf, analyze_only, |
8625 | dump_p: dump_enabled_p (), n_perms, n_loads, |
8626 | dce_chain); |
8627 | } |
8628 | |
8629 | /* Produce the next vector result for SLP permutation NODE by adding a vector |
8630 | statement at GSI. If MASK_VEC is nonnull, add: |
8631 | |
8632 | <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC> |
8633 | |
8634 | otherwise add: |
8635 | |
8636 | <new SSA name> = FIRST_DEF. */ |
8637 | |
8638 | static void |
8639 | vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi, |
8640 | slp_tree node, tree first_def, tree second_def, |
8641 | tree mask_vec, poly_uint64 identity_offset) |
8642 | { |
8643 | tree vectype = SLP_TREE_VECTYPE (node); |
8644 | |
/* ??? We SLP match existing vector element extracts but
   allow punning, which we need to re-instantiate at uses
   but have no good way of representing explicitly.  */
8648 | if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype)) |
8649 | && !types_compatible_p (TREE_TYPE (first_def), type2: vectype)) |
8650 | { |
8651 | gassign *conv_stmt |
8652 | = gimple_build_assign (make_ssa_name (var: vectype), |
8653 | build1 (VIEW_CONVERT_EXPR, vectype, first_def)); |
8654 | vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi); |
8655 | first_def = gimple_assign_lhs (gs: conv_stmt); |
8656 | } |
8657 | gassign *perm_stmt; |
8658 | tree perm_dest = make_ssa_name (var: vectype); |
8659 | if (mask_vec) |
8660 | { |
8661 | if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), |
8662 | TYPE_SIZE (vectype)) |
8663 | && !types_compatible_p (TREE_TYPE (second_def), type2: vectype)) |
8664 | { |
8665 | gassign *conv_stmt |
8666 | = gimple_build_assign (make_ssa_name (var: vectype), |
8667 | build1 (VIEW_CONVERT_EXPR, |
8668 | vectype, second_def)); |
8669 | vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi); |
8670 | second_def = gimple_assign_lhs (gs: conv_stmt); |
8671 | } |
8672 | perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR, |
8673 | first_def, second_def, |
8674 | mask_vec); |
8675 | } |
8676 | else if (!types_compatible_p (TREE_TYPE (first_def), type2: vectype)) |
8677 | { |
8678 | /* For identity permutes we still need to handle the case |
8679 | of offsetted extracts or concats. */ |
8680 | unsigned HOST_WIDE_INT c; |
8681 | auto first_def_nunits |
8682 | = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def)); |
8683 | if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits)) |
8684 | { |
8685 | unsigned HOST_WIDE_INT elsz |
8686 | = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def)))); |
8687 | tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def, |
8688 | TYPE_SIZE (vectype), |
8689 | bitsize_int (identity_offset * elsz)); |
8690 | perm_stmt = gimple_build_assign (perm_dest, lowpart); |
8691 | } |
8692 | else if (constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vectype), |
8693 | b: first_def_nunits, multiple: &c) && c == 2) |
8694 | { |
8695 | tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def, |
8696 | NULL_TREE, second_def); |
8697 | perm_stmt = gimple_build_assign (perm_dest, ctor); |
8698 | } |
8699 | else |
8700 | gcc_unreachable (); |
8701 | } |
8702 | else |
8703 | { |
8704 | /* We need a copy here in case the def was external. */ |
8705 | perm_stmt = gimple_build_assign (perm_dest, first_def); |
8706 | } |
8707 | vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi); |
8708 | /* Store the vector statement in NODE. */ |
8709 | node->push_vec_def (def: perm_stmt); |
8710 | } |
8711 | |
8712 | /* Subroutine of vectorizable_slp_permutation. Check whether the target |
8713 | can perform permutation PERM on the (1 or 2) input nodes in CHILDREN. |
8714 | If GSI is nonnull, emit the permutation there. |
8715 | |
8716 | When GSI is null, the only purpose of NODE is to give properties |
8717 | of the result, such as the vector type and number of SLP lanes. |
8718 | The node does not need to be a VEC_PERM_EXPR. |
8719 | |
8720 | If the target supports the operation, return the number of individual |
8721 | VEC_PERM_EXPRs needed, otherwise return -1. Print information to the |
8722 | dump file if DUMP_P is true. */ |
8723 | |
8724 | static int |
8725 | vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, |
8726 | slp_tree node, lane_permutation_t &perm, |
8727 | vec<slp_tree> &children, bool dump_p) |
8728 | { |
8729 | tree vectype = SLP_TREE_VECTYPE (node); |
8730 | |
/* ??? We currently only support inputs that all have the same
   vector type, while the SLP IL should really do a concat + select
   and thus accept arbitrary mismatches.  */
8734 | slp_tree child; |
8735 | unsigned i; |
8736 | poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype); |
8737 | bool repeating_p = multiple_p (a: nunits, SLP_TREE_LANES (node)); |
8738 | tree op_vectype = NULL_TREE; |
8739 | FOR_EACH_VEC_ELT (children, i, child) |
8740 | if (SLP_TREE_VECTYPE (child)) |
8741 | { |
8742 | op_vectype = SLP_TREE_VECTYPE (child); |
8743 | break; |
8744 | } |
8745 | if (!op_vectype) |
8746 | op_vectype = vectype; |
8747 | FOR_EACH_VEC_ELT (children, i, child) |
8748 | { |
8749 | if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def |
8750 | && !vect_maybe_update_slp_op_vectype (child, op_vectype)) |
8751 | || !types_compatible_p (SLP_TREE_VECTYPE (child), type2: op_vectype) |
8752 | || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype))) |
8753 | { |
8754 | if (dump_p) |
8755 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
8756 | "Unsupported vector types in lane permutation\n" ); |
8757 | return -1; |
8758 | } |
8759 | if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node)) |
8760 | repeating_p = false; |
8761 | } |
8762 | |
8763 | gcc_assert (perm.length () == SLP_TREE_LANES (node)); |
8764 | if (dump_p) |
8765 | { |
8766 | dump_printf_loc (MSG_NOTE, vect_location, |
8767 | "vectorizing permutation" ); |
8768 | for (unsigned i = 0; i < perm.length (); ++i) |
8769 | dump_printf (MSG_NOTE, " op%u[%u]" , perm[i].first, perm[i].second); |
8770 | if (repeating_p) |
8771 | dump_printf (MSG_NOTE, " (repeat %d)\n" , SLP_TREE_LANES (node)); |
8772 | dump_printf (MSG_NOTE, "\n" ); |
8773 | } |

  /* REPEATING_P is true if every output vector is guaranteed to use the
     same permute vector.  We can handle that case for both variable-length
     and constant-length vectors, but we only handle other cases for
     constant-length vectors.

     Set:

     - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
       mask vector that we want to build.

     - NCOPIES to the number of copies of PERM that we need in order
       to build the necessary permute mask vectors.

     - NOUTPUTS_PER_MASK to the number of output vectors we want to create
       for each permute mask vector.  This is only relevant when GSI is
       nonnull.  */
  uint64_t npatterns;
  unsigned nelts_per_pattern;
  uint64_t ncopies;
  unsigned noutputs_per_mask;
  if (repeating_p)
    {
      /* We need a single permute mask vector that has the form:

	   { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }

	 In other words, the original n-element permute in PERM is
	 "unrolled" to fill a full vector.  The stepped vector encoding
	 that we use for permutes requires 3n elements.  */
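      /* For example (editor's illustration, not from the original source):
	 a two-lane permute { X1, X2 } = { 1, 0 } that swaps each pair of
	 lanes gives the mask { 1, 0, 3, 2, 5, 4 }, encoded with
	 npatterns == 2 and nelts_per_pattern == 3, which then scales to
	 any (possibly variable) number of vector elements.  */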
      npatterns = SLP_TREE_LANES (node);
      nelts_per_pattern = ncopies = 3;
      noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
    }
  else
    {
      /* Calculate every element of every permute mask vector explicitly,
	 instead of relying on the pattern described above.  */
      if (!nunits.is_constant (&npatterns))
	return -1;
      nelts_per_pattern = ncopies = 1;
      if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
	if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
	  return -1;
      noutputs_per_mask = 1;
    }
  unsigned olanes = ncopies * SLP_TREE_LANES (node);
  gcc_assert (repeating_p || multiple_p (olanes, nunits));

  /* Compute the { { SLP operand, vector index }, lane } permutation sequence
     from the { SLP operand, scalar lane } permutation as recorded in the
     SLP node as an intermediate step.  This part should already work
     with SLP children with an arbitrary number of lanes.  */
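  /* For example (editor's illustration, not from the original source):
     blending two four-lane children as op0[0] op1[1] op0[2] op1[3] with
     single-vector V4SI children and ncopies == 1 yields the sequence
       { { 0, 0 }, 0 } { { 1, 0 }, 1 } { { 0, 0 }, 2 } { { 1, 0 }, 3 }
     i.e. every lane is taken from vector 0 of one of the operands.  */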
  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
  auto_vec<unsigned> active_lane;
  vperm.create (olanes);
  active_lane.safe_grow_cleared (children.length (), true);
  for (unsigned i = 0; i < ncopies; ++i)
    {
      for (unsigned pi = 0; pi < perm.length (); ++pi)
	{
	  std::pair<unsigned, unsigned> p = perm[pi];
	  tree vtype = SLP_TREE_VECTYPE (children[p.first]);
	  if (repeating_p)
	    vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
	  else
	    {
	      /* We checked above that the vectors are constant-length.  */
	      unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
	      unsigned vi = (active_lane[p.first] + p.second) / vnunits;
	      unsigned vl = (active_lane[p.first] + p.second) % vnunits;
	      vperm.quick_push ({{p.first, vi}, vl});
	    }
	}
      /* Advance to the next group.  */
      for (unsigned j = 0; j < children.length (); ++j)
	active_lane[j] += SLP_TREE_LANES (children[j]);
    }

  if (dump_p)
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "vectorizing permutation");
      for (unsigned i = 0; i < perm.length (); ++i)
	dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
      if (repeating_p)
	dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
      dump_printf (MSG_NOTE, "\n");
      dump_printf_loc (MSG_NOTE, vect_location, "as");
      for (unsigned i = 0; i < vperm.length (); ++i)
	{
	  if (i != 0
	      && (repeating_p
		  ? multiple_p (i, npatterns)
		  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
	    dump_printf (MSG_NOTE, ",");
	  dump_printf (MSG_NOTE, " vops%u[%u][%u]",
		       vperm[i].first.first, vperm[i].first.second,
		       vperm[i].second);
	}
      dump_printf (MSG_NOTE, "\n");
    }

  /* We can only handle two-vector permutes; everything else should
     be lowered on the SLP level.  The following is closely inspired
     by vect_transform_slp_perm_load and is supposed to eventually
     replace it.
     ??? As an intermediate step, do code-gen in the SLP tree
     representation somehow?  */
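  /* When a mask element selects from the second input vector it is offset
     by the element count below, e.g. (editor's illustration) lane 2 of the
     second V4SI input becomes mask element 6 of the VEC_PERM_EXPR.  */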
  std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
  std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
  unsigned int index = 0;
  poly_uint64 mask_element;
  vec_perm_builder mask;
  mask.new_vector (nunits, npatterns, nelts_per_pattern);
  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;
  unsigned nperms = 0;
  for (unsigned i = 0; i < vperm.length (); ++i)
    {
      mask_element = vperm[i].second;
      if (first_vec.first == -1U
	  || first_vec == vperm[i].first)
	first_vec = vperm[i].first;
      else if (second_vec.first == -1U
	       || second_vec == vperm[i].first)
	{
	  second_vec = vperm[i].first;
	  mask_element += nunits;
	}
      else
	{
	  if (dump_p)
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "permutation requires at "
			     "least three vectors\n");
	  gcc_assert (!gsi);
	  return -1;
	}

      mask[index++] = mask_element;

      if (index == count)
	{
	  indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
			      TYPE_VECTOR_SUBPARTS (op_vectype));
	  bool identity_p = (indices.series_p (0, 1, mask[0], 1)
			     && constant_multiple_p (mask[0], nunits));
	  machine_mode vmode = TYPE_MODE (vectype);
	  machine_mode op_vmode = TYPE_MODE (op_vectype);
	  unsigned HOST_WIDE_INT c;
	  if ((!identity_p
	       && !can_vec_perm_const_p (vmode, op_vmode, indices))
	      || (identity_p
		  && !known_le (nunits,
				TYPE_VECTOR_SUBPARTS (op_vectype))
		  && (!constant_multiple_p (nunits,
					    TYPE_VECTOR_SUBPARTS (op_vectype),
					    &c) || c != 2)))
	    {
	      if (dump_p)
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported vect permute { ");
		  for (i = 0; i < count; ++i)
		    {
		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
		    }
		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		}
	      gcc_assert (!gsi);
	      return -1;
	    }

	  if (!identity_p)
	    nperms++;
	  if (gsi)
	    {
	      if (second_vec.first == -1U)
		second_vec = first_vec;

	      slp_tree
		first_node = children[first_vec.first],
		second_node = children[second_vec.first];

	      tree mask_vec = NULL_TREE;
	      if (!identity_p)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
		{
		  tree first_def
		    = vect_get_slp_vect_def (first_node,
					     first_vec.second + vi);
		  tree second_def
		    = vect_get_slp_vect_def (second_node,
					     second_vec.second + vi);
		  vect_add_slp_permutation (vinfo, gsi, node, first_def,
					    second_def, mask_vec, mask[0]);
		}
	    }

	  index = 0;
	  first_vec = std::make_pair (-1U, -1U);
	  second_vec = std::make_pair (-1U, -1U);
	}
    }

  return nperms;
}

/* Vectorize the SLP permutations in NODE as specified
   in SLP_TREE_LANE_PERMUTATION, which is a vector of pairs of SLP
   child number and lane number.
   Interleaving of two two-lane two-child SLP subtrees (not supported):
     [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   A blend of two four-lane two-child SLP subtrees:
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   Highpart of a four-lane one-child SLP subtree (not supported):
     [ { 0, 2 }, { 0, 3 } ]
   where currently only a subset is supported by the code generation
   below.  */
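
/* For the supported blend above, with V4SI children, a single
   VEC_PERM_EXPR covers all lanes; the emitted statement looks like
   (editor's illustration, not from the original source):

     vect_blend_1 = VEC_PERM_EXPR <vops0, vops1, { 0, 5, 2, 7 }>;

   where selector elements 4..7 index the second input vector.  */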

static bool
vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			      slp_tree node, stmt_vector_for_cost *cost_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);
  lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
  int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
					       SLP_TREE_CHILDREN (node),
					       dump_enabled_p ());
  if (nperms < 0)
    return false;

  if (!gsi)
    record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);

  return true;
}

/* Vectorize SLP NODE.  */

static void
vect_schedule_slp_node (vec_info *vinfo,
			slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* For existing vectors there's nothing to do.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_external_def
      && SLP_TREE_VEC_DEFS (node).exists ())
    return;

  gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
	 node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
	return;

      vect_create_constant_vectors (vinfo, node);
      return;
    }

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing SLP node starting from: %G",
		     stmt_info->stmt);

  if (STMT_VINFO_DATA_REF (stmt_info)
      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* Vectorized loads go before the first scalar load to make it
	 ready early, vectorized stores go before the last scalar
	 stmt which is where all uses are ready.  */
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
	    || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
	    || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
	   && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* For PHI node vectorization we do not use the insertion iterator.  */
      si = gsi_none ();
    }
  else
    {
      /* Emit other stmts after the children's vectorized defs, which is
	 the earliest insertion point possible.  */
      gimple *last_stmt = NULL;
      bool seen_vector_def = false;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	  {
	    /* For fold-left reductions we are retaining the scalar
	       reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
	       set so the representation isn't perfect.  Fall back to
	       the last scalar def here.  */
	    if (SLP_TREE_VEC_DEFS (child).is_empty ())
	      {
		gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
			    == cycle_phi_info_type);
		gphi *phi = as_a <gphi *>
		  (vect_find_last_scalar_stmt_in_slp (child)->stmt);
		if (!last_stmt
		    || vect_stmt_dominates_stmt_p (last_stmt, phi))
		  last_stmt = phi;
	      }
	    /* We emit all vectorized stmts of a child at the same place,
	       so the def emitted last is the last one overall.
	       ??? Unless we have a load permutation applied and that
	       figures to re-use an earlier generated load.  */
	    unsigned j;
	    tree vdef;
	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
	      {
		gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		if (!last_stmt
		    || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		  last_stmt = vstmt;
	      }
	  }
	else if (!SLP_TREE_VECTYPE (child))
	  {
	    /* For externals used unvectorized, look at all the scalar
	       defs.  */
	    unsigned j;
	    tree def;
	    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
	      if (TREE_CODE (def) == SSA_NAME
		  && !SSA_NAME_IS_DEFAULT_DEF (def))
		{
		  gimple *stmt = SSA_NAME_DEF_STMT (def);
		  if (!last_stmt
		      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
		    last_stmt = stmt;
		}
	  }
	else
	  {
	    /* For externals we have to look at all defs since their
	       insertion place is decided per vector.  But beware
	       of pre-existing vectors where we need to make sure
	       we do not insert before the region boundary.  */
	    if (SLP_TREE_SCALAR_OPS (child).is_empty ()
		&& !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
	      seen_vector_def = true;
	    else
	      {
		unsigned j;
		tree vdef;
		FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
		  if (TREE_CODE (vdef) == SSA_NAME
		      && !SSA_NAME_IS_DEFAULT_DEF (vdef))
		    {
		      gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		      if (!last_stmt
			  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
			last_stmt = vstmt;
		    }
	      }
	  }
      /* This can happen when all children are pre-existing vectors or
	 constants.  */
      if (!last_stmt)
	last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
      if (!last_stmt)
	{
	  gcc_assert (seen_vector_def);
	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
	}
      else if (is_ctrl_altering_stmt (last_stmt))
	{
	  /* We split regions to vectorize at control altering stmts
	     with a definition so this must be an external which
	     we can insert at the start of the region.  */
	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
	}
      else if (is_a <bb_vec_info> (vinfo)
	       && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
	       && gimple_could_trap_p (stmt_info->stmt))
	{
	  /* We've constrained possibly trapping operations to all come
	     from the same basic-block, if vectorized defs would allow earlier
	     scheduling still force vectorized stmts to the original block.
	     This is only necessary for BB vectorization since for loop vect
	     all operations are in a single BB and scalar stmt based
	     placement doesn't play well with epilogue vectorization.  */
	  gcc_assert (dominated_by_p (CDI_DOMINATORS,
				      gimple_bb (stmt_info->stmt),
				      gimple_bb (last_stmt)));
	  si = gsi_after_labels (gimple_bb (stmt_info->stmt));
	}
      else if (is_a <gphi *> (last_stmt))
	si = gsi_after_labels (gimple_bb (last_stmt));
      else
	{
	  si = gsi_for_stmt (last_stmt);
	  gsi_next (&si);
	}
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case where the stmt is only there for
	 denoting the actual scalar lane defs ...).  So do not call
	 vect_transform_stmt but open-code it here (partly).  */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      stmt_vec_info slp_stmt_info;
      unsigned int i;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
	if (STMT_VINFO_LIVE_P (slp_stmt_info))
	  {
	    done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
						instance, i, true, NULL);
	    gcc_assert (done);
	  }
    }
  else
    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}

/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until the end of vect_schedule_slp, because
   multiple SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (vec_info *vinfo,
			      slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (vinfo, child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
	continue;
      if (is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      if (lhs)
	new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      else
	{
	  new_stmt = gimple_build_nop ();
	  unlink_stmt_vdef (stmt_info->stmt);
	}
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      if (lhs)
	SSA_NAME_DEF_STMT (lhs) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}

/* Vectorize the instance root.  */

void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (instance->kind == slp_inst_kind_ctor)
    {
      if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
	{
	  tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
	  tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					  TREE_TYPE (vect_lhs)))
	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
			       vect_lhs);
	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
	}
      else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
	{
	  int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
	  tree child_def;
	  int j;
	  vec<constructor_elt, va_gc> *v;
	  vec_alloc (v, nelts);

	  /* A CTOR can handle V16HI composition from VNx8HI so we
	     do not need to convert vector elements if the types
	     do not match.  */
	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
	  tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  tree rtype
	    = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
	  tree r_constructor = build_constructor (rtype, v);
	  rstmt = gimple_build_assign (lhs, r_constructor);
	}
    }
  else if (instance->kind == slp_inst_kind_bb_reduc)
    {
      /* Largely inspired by reduction chain epilogue handling in
	 vect_create_epilog_for_reduction.  */
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (node, &vec_defs);
      enum tree_code reduc_code
	= gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
      /* ??? We actually have to reflect signs somewhere.  */
      if (reduc_code == MINUS_EXPR)
	reduc_code = PLUS_EXPR;
      gimple_seq epilogue = NULL;
      /* We may end up with more than one vector result; reduce them
	 to one vector.  */
      tree vec_def = vec_defs[0];
      tree vectype = TREE_TYPE (vec_def);
      tree compute_vectype = vectype;
      bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
				 && TYPE_OVERFLOW_UNDEFINED (vectype)
				 && operation_can_overflow (reduc_code));
      if (pun_for_overflow_p)
	{
	  compute_vectype = unsigned_type_for (vectype);
	  vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
				  compute_vectype, vec_def);
	}
      for (unsigned i = 1; i < vec_defs.length (); ++i)
	{
	  tree def = vec_defs[i];
	  if (pun_for_overflow_p)
	    def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
				compute_vectype, def);
	  vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
				  vec_def, def);
	}
      vec_defs.release ();
      /* ??? Support other schemes than direct internal fn.  */
      internal_fn reduc_fn;
      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
	  || reduc_fn == IFN_LAST)
	gcc_unreachable ();
      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
				      TREE_TYPE (compute_vectype), vec_def);
      if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
	{
	  tree rem_def = NULL_TREE;
	  for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
	    {
	      def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
	      if (!rem_def)
		rem_def = def;
	      else
		rem_def = gimple_build (&epilogue, reduc_code,
					TREE_TYPE (scalar_def),
					rem_def, def);
	    }
	  scalar_def = gimple_build (&epilogue, reduc_code,
				     TREE_TYPE (scalar_def),
				     scalar_def, rem_def);
	}
      scalar_def = gimple_convert (&epilogue,
				   TREE_TYPE (vectype), scalar_def);
      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
      update_stmt (gsi_stmt (rgsi));
      return;
    }
  else
    gcc_unreachable ();

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
  gsi_replace (&rgsi, rstmt, true);
}

struct slp_scc_info
{
  bool on_stack;
  int dfs;
  int lowlink;
};

/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */
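
/* The DFS/LOWLINK/ON_STACK fields implement Tarjan's SCC algorithm; as a
   rough sketch (editor's note, not from the original source), a node whose
   lowlink still equals its own DFS number after all children were visited
   is the root of an SCC consisting of the stack entries above and
   including it.  */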

static void
vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
		   hash_map<slp_tree, slp_scc_info> &scc_info,
		   int &maxdfs, vec<slp_tree> &stack)
{
  bool existed_p;
  slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
  gcc_assert (!existed_p);
  info->dfs = maxdfs;
  info->lowlink = maxdfs;
  maxdfs++;

  /* Leaf.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    {
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      return;
    }

  info->on_stack = true;
  stack.safe_push (node);

  unsigned i;
  slp_tree child;
  /* DFS recurse.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (!child)
	continue;
      slp_scc_info *child_info = scc_info.get (child);
      if (!child_info)
	{
	  vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
	  /* Recursion might have re-allocated the node.  */
	  info = scc_info.get (node);
	  child_info = scc_info.get (child);
	  info->lowlink = MIN (info->lowlink, child_info->lowlink);
	}
      else if (child_info->on_stack)
	info->lowlink = MIN (info->lowlink, child_info->dfs);
    }
  if (info->lowlink != info->dfs)
    return;

  auto_vec<slp_tree, 4> phis_to_fixup;

  /* Singleton.  */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
	  && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
	phis_to_fixup.quick_push (node);
    }
  else
    {
      /* SCC.  */
      int last_idx = stack.length () - 1;
      while (stack[last_idx] != node)
	last_idx--;
      /* We can break the cycle at PHIs which have at least one child
	 code generated.  Then we could re-start the DFS walk until
	 all nodes in the SCC are covered (we might have new entries
	 for only back-reachable nodes).  But it's simpler to just
	 iterate and schedule those that are ready.  */
      unsigned todo = stack.length () - last_idx;
      do
	{
	  for (int idx = stack.length () - 1; idx >= last_idx; --idx)
	    {
	      slp_tree entry = stack[idx];
	      if (!entry)
		continue;
	      bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
			  && is_a <gphi *>
			       (SLP_TREE_REPRESENTATIVE (entry)->stmt));
	      bool ready = !phi;
	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
		if (!child)
		  {
		    gcc_assert (phi);
		    ready = true;
		    break;
		  }
		else if (scc_info.get (child)->on_stack)
		  {
		    if (!phi)
		      {
			ready = false;
			break;
		      }
		  }
		else
		  {
		    if (phi)
		      {
			ready = true;
			break;
		      }
		  }
	      if (ready)
		{
		  vect_schedule_slp_node (vinfo, entry, instance);
		  scc_info.get (entry)->on_stack = false;
		  stack[idx] = NULL;
		  todo--;
		  if (phi)
		    phis_to_fixup.safe_push (entry);
		}
	    }
	}
      while (todo != 0);

      /* Pop the SCC.  */
      stack.truncate (last_idx);
    }

  /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
	{
	  unsigned dest_idx = e->dest_idx;
	  child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
	  if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
	    continue;
	  unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
	  /* Simply fill all args.  */
	  if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
	      != vect_first_order_recurrence)
	    for (unsigned i = 0; i < n; ++i)
	      {
		tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
		gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
		add_phi_arg (phi, vect_get_slp_vect_def (child, i),
			     e, gimple_phi_arg_location (phi, dest_idx));
	      }
	  else
	    {
	      /* Unless it is a first order recurrence which needs
		 args filled in for both the PHI node and the permutes.  */
	      gimple *perm
		= SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
	      gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
	      add_phi_arg (as_a <gphi *> (rphi),
			   vect_get_slp_vect_def (child, n - 1),
			   e, gimple_phi_arg_location (phi, dest_idx));
	      for (unsigned i = 0; i < n; ++i)
		{
		  gimple *perm
		    = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
		  if (i > 0)
		    gimple_assign_set_rhs1 (perm,
					    vect_get_slp_vect_def (child,
								   i - 1));
		  gimple_assign_set_rhs2 (perm,
					  vect_get_slp_vect_def (child, i));
		  update_stmt (perm);
		}
	    }
	}
    }
}

/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
{
  slp_instance instance;
  unsigned int i;

  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Vectorizing SLP tree:\n");
	  /* ??? Dump all?  */
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
			     SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
	  vect_print_slp_graph (MSG_NOTE, vect_location,
				SLP_INSTANCE_TREE (instance));
	}
      /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
	 have a PHI be the node breaking the cycle.  */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
	vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ??? Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ??? For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove the original scalar stmts of vectorized stores.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
	{
	  if (!STMT_VINFO_DATA_REF (store_info)
	      || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
	    break;

	  store_info = vect_orig_stmt (store_info);
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  vinfo->remove_stmt (store_info);

	  /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
	     to not crash in vect_free_slp_tree later.  */
	  if (SLP_TREE_REPRESENTATIVE (root) == store_info)
	    SLP_TREE_REPRESENTATIVE (root) = NULL;
	}
    }
}