/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2023 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#define INCLUDE_ALGORITHM
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"
#include "sreal.h"
#include "predict.h"

static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
					     load_permutation_t &,
					     const vec<tree> &,
					     gimple_stmt_iterator *,
					     poly_uint64, bool, bool,
					     unsigned *,
					     unsigned * = nullptr,
					     bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
					    slp_tree, lane_permutation_t &,
					    vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);

static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
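
/* Note: SLP nodes are allocated from the pool above and are additionally
   chained via slp_first_node/prev_node/next_node (see the constructor and
   destructor below), so vect_slp_fini can reclaim any nodes still live
   when the pool itself is torn down.  */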


/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  SLP_TREE_SIMD_CLONE_INFO (this).release ();
  if (this->failed)
    free (failed);
}

/* Push the single SSA definition in DEF to the vector of vector defs.  */

void
_slp_tree::push_vec_def (gimple *def)
{
  if (gphi *phi = dyn_cast <gphi *> (def))
    vec_defs.quick_push (gimple_phi_result (phi));
  else
    {
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
    }
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP-only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}

/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement: its vector def-type, its type,
     the operand itself in case it's constant, and an indication whether
     it's a pattern stmt, plus gather/scatter info.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
  bool first_gs_p;
  gather_scatter_info first_gs_info;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnd_info->first_gs_p = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Return the execution frequency of NODE (so that a higher value indicates
   a "more important" node when optimizing for speed).  */

static sreal
vect_slp_node_weight (slp_tree node)
{
  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
  basic_block bb = gimple_bb (stmt_info->stmt);
  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
}

/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
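
/* For example (assuming DR_GROUP_GAP of a group element is its distance
   from the previous element of the group): for a group accessing a[0],
   a[2] and a[3] the gaps are 0, 2 and 1, so the places returned for the
   three statements are 0, 2 and 3 respectively.  */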

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  poly_int64 half_nelts;
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode))
	      && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
			     2, &half_nelts))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);

	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      machine_mode vmode = TYPE_MODE (vector_type);
	      if (can_vec_perm_const_p (vmode, vmode, indices1)
		  && can_vec_perm_const_p (vmode, vmode, indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
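
/* A note on the permutations checked by can_duplicate_and_interleave_p
   above: SEL1 is the usual low interleave { 0, N, 1, N+1, 2, N+2, ... }
   and SEL2 the high interleave { N/2, N+N/2, N/2+1, N+N/2+1, ... } of two
   N-element vectors, each encoded with two patterns of three elements so
   that the masks also work for variable-length vectors.  */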

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}

static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
static const int off_map[] = { 1, -3 };
static const int off_op0_map[] = { 2, -3, 0 };
static const int off_arg2_map[] = { 2, -3, 2 };
static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
static const int mask_call_maps[6][7] = {
  { 1, 1, },
  { 2, 1, 2, },
  { 3, 1, 2, 3, },
  { 4, 1, 2, 3, 4, },
  { 5, 1, 2, 3, 4, 5, },
  { 6, 1, 2, 3, 4, 5, 6 },
};
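
/* As an example of how to read these maps (their encoding is described in
   the comment before vect_get_operand_map below): arg2_map says there is
   one child node and that it corresponds to call argument 2, which for
   IFN_MASK_LOAD is the mask operand.  */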

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.
     The special index -3 is the offset of a gather as analyzed by
     vect_check_gather_scatter.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
		      unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
	return cond_expr_maps[swap];
      if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
	  && swap)
	return op1_op0_map;
      if (gather_scatter_p)
	return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
		? off_op0_map : off_map);
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_MASK_LOAD:
	    return gather_scatter_p ? off_arg2_map : arg2_map;

	  case IFN_GATHER_LOAD:
	    return arg1_map;

	  case IFN_MASK_GATHER_LOAD:
	  case IFN_MASK_LEN_GATHER_LOAD:
	    return arg1_arg4_map;

	  case IFN_MASK_STORE:
	    return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

	  case IFN_MASK_CALL:
	    {
	      unsigned nargs = gimple_call_num_args (call);
	      if (nargs >= 2 && nargs <= 7)
		return mask_call_maps[nargs-2];
	      else
		return nullptr;
	    }

	  default:
	    break;
	  }
    }
  return nullptr;
}

/* Return the SLP node child index for operand OP of STMT.  */

int
vect_slp_child_index_for_operand (const gimple *stmt, int op,
				  bool gather_scatter_p)
{
  const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
  if (!opmap)
    return op;
  for (int i = 1; i < 1 + opmap[0]; ++i)
    if (opmap[i] == op)
      return i - 1;
  gcc_unreachable ();
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the father node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  gather_scatter_info gs_info;
  unsigned int gs_op = -1u;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map
    = vect_get_operand_map (stmt_info->stmt,
			    STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
	commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      oprnd_info = (*oprnds_info)[i];
      int opno = map ? map[i] : int (i);
      if (opno == -3)
	{
	  gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
	  if (!is_a <loop_vec_info> (vinfo)
	      || !vect_check_gather_scatter (stmt_info,
					     as_a <loop_vec_info> (vinfo),
					     first ? &oprnd_info->first_gs_info
					     : &gs_info))
	    return -1;

	  if (first)
	    {
	      oprnd_info->first_gs_p = true;
	      oprnd = oprnd_info->first_gs_info.offset;
	    }
	  else
	    {
	      gs_op = i;
	      oprnd = gs_info.offset;
	    }
	}
      else if (opno < 0)
	oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
	{
	  oprnd = gimple_arg (stmt_info->stmt, opno);
	  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	    {
	      edge e = gimple_phi_arg_edge (stmt, opno);
	      backedge = (is_a <bb_vec_info> (vinfo)
			  ? e->flags & EDGE_DFS_BACK
			  : dominated_by_p (CDI_DOMINATORS, e->src,
					    gimple_bb (stmt_info->stmt)));
	    }
	}
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
	  && is_pattern_stmt_p (def_stmt_info))
	{
	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
	      != def_stmt_info)
	    oprnd_info->any_pattern = true;
	  else
	    /* If we promote this to external use the original stmt def.  */
	    oprnd_info->ops.last ()
	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
	}

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && TREE_CODE (oprnd) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS,
			      as_a <bb_vec_info> (vinfo)->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && TREE_CODE (type) != BOOLEAN_TYPE
	      && !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						  type))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	    case vect_first_order_recurrence:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      if ((gs_op == i) != oprnd_info->first_gs_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: mixed gather and non-gather\n");
	  return 1;
	}
      else if (gs_op == i)
	{
	  if (!operand_equal_p (oprnd_info->first_gs_info.base,
				gs_info.base))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different gather base\n");
	      return 1;
	    }
	  if (oprnd_info->first_gs_info.scale != gs_info.scale)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different gather scale\n");
	      return 1;
	    }
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt,
						dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand match.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
	if (mapi < nkept && map[mapi] == int (i))
	  mapi += 1;
	else if (!operand_equal_p (gimple_call_arg (call1, i),
				   gimple_call_arg (call2, i)))
	  return false;
    }

  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation and are of supported types of operation.  Return true if so,
   otherwise return false and indicate in *MATCHES which stmts are not
   isomorphic to the first one.  If MATCHES[0] is false then this indicates
   the comparison could not be carried out or the stmts will never be
   vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper rhs_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, ldst_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, that can
	 throw, or that have volatile operands.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commute operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE
	  && (!call_stmt
	      || !gimple_call_internal_p (stmt)
	      || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}
      /* Record nunits required but continue analysis, producing matches[]
	 as if nunits was not an issue.  This allows splitting of groups
	 to happen.  */
      if (nunits_vectype
	  && !vect_record_max_nunits (vinfo, stmt_info, group_size,
				      nunits_vectype, max_nunits))
	{
	  gcc_assert (is_a <bb_vec_info> (vinfo));
	  maybe_soft_fail = true;
	  soft_fail_nunits_vectype = nunits_vectype;
	}

      gcc_assert (vectype);

      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_MASK_LOAD
	      || cfn == CFN_GATHER_LOAD
	      || cfn == CFN_MASK_GATHER_LOAD
	      || cfn == CFN_MASK_LEN_GATHER_LOAD)
	    ldst_p = true;
	  else if (cfn == CFN_MASK_STORE)
	    {
	      ldst_p = true;
	      rhs_code = CFN_MASK_STORE;
	    }
	  else if ((cfn != CFN_LAST
		    && cfn != CFN_MASK_CALL
		    && internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 (gimple *) call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_ldst_p = ldst_p;
	  first_stmt_phi_p = phi_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar
		     shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!ldst_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  /* When the element types are not compatible we pun the
		     source to the target vectype which requires equal
		     size.  */
		  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
		       || !types_compatible_p (TREE_TYPE (vectype),
					       TREE_TYPE (TREE_TYPE (vec))))
		      && !operand_equal_p (TYPE_SIZE (vectype),
					   TYPE_SIZE (TREE_TYPE (vec)))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(first_stmt_code.is_tree_code ()
		    && rhs_code.is_tree_code ()
		    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
			== tcc_comparison)
		    && (swap_tree_comparison (tree_code (first_stmt_code))
			== tree_code (rhs_code)))
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)
		    && (rhs_code == ARRAY_REF
			|| rhs_code == BIT_FIELD_REF
			|| rhs_code == INDIRECT_REF
			|| rhs_code == COMPONENT_REF
			|| rhs_code == MEM_REF)))
	      || (ldst_p
		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
	      || (ldst_p
		  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
		      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
	      || first_stmt_ldst_p != ldst_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!ldst_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt
	      && first_stmt_code != CFN_MASK_LOAD
	      && first_stmt_code != CFN_MASK_STORE)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       call_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  gcc_assert (ldst_p);
	  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	    {
	      /* Store.  */
	      gcc_assert (rhs_code == CFN_MASK_STORE
			  || REFERENCE_CLASS_P (lhs)
			  || DECL_P (lhs));
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}
      /* Non-grouped store or load.  */
      else if (ldst_p)
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD
	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
	      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      /* Non-grouped loads are handled as externals for BB
		 vectorization.  For loop vectorization we can handle
		 splats the same way we handle single element
		 interleaving.  */
	      && (is_a <bb_vec_info> (vinfo)
		  || stmt_info != first_stmt_info))
	    {
	      /* Not a grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G",
				 stmt);

	      if (i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      /* Not a memory operation.  */
      else
	{
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code,
							honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS ((tree_code) rhs_code) == tcc_comparison
	      && (swap_tree_comparison ((tree_code) first_stmt_code)
		  == (tree_code) rhs_code))
	    swap[i] = 1;
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
	      && (TREE_CODE_CLASS (tree_code (alt_stmt_code))
		  != tcc_comparison))))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	     (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};

/* Comparator for sorting associatable chains.  */

static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int) op1->dt - (int) op2->dt;
  return (int) op1->code - (int) op2->code;
}

/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
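
/* As an example of the linearization above: with CODE equal to PLUS_EXPR,
   the expression a - (b - c) + d is flattened into the chain
   +a, -b, +c, +d (modulo the order in which the operands are visited).  */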

typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ",
			 (void *) *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  stmts.release ();
	  return *leader;
	}
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (*limit == 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery limit exceeded\n");
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      memset (res->failed, 0, sizeof (bool) * group_size);
      memset (matches, 0, sizeof (bool) * group_size);
      return NULL;
    }
  --*limit;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", (void *) res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", (void *) res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n",
			 (void *) res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
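
/* Note that both successful and failed discovery results end up in BST_MAP:
   a failed node keeps the MATCHES[] vector in its ->failed array, so a
   later attempt on the same stmt set fails fast with the same per-lane
   information.  */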

/* Helper for building an associated SLP node chain.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
				   slp_tree op0, slp_tree op1,
				   stmt_vec_info oper1, stmt_vec_info oper2,
				   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}
1828
1829/* Recursively build an SLP tree starting from NODE, filling it in from
1830 the scalar stmts STMTS of group size GROUP_SIZE. Return NODE on
1831 success. Return NULL if the def-stmts are not isomorphic, require a
1832 data permutation that cannot be represented or use unsupported kinds
1833 of operation; in that case MATCHES records, per lane, whether the
1834 stmt matched up to the point of failure. */
1835
1836static slp_tree
1837vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1838 vec<stmt_vec_info> stmts, unsigned int group_size,
1839 poly_uint64 *max_nunits,
1840 bool *matches, unsigned *limit, unsigned *tree_size,
1841 scalar_stmts_to_slp_tree_map_t *bst_map)
1842{
1843 unsigned nops, i, this_tree_size = 0;
1844 poly_uint64 this_max_nunits = *max_nunits;
1845
1846 matches[0] = false;
1847
1848 stmt_vec_info stmt_info = stmts[0];
1849 if (!is_a<gcall *> (p: stmt_info->stmt)
1850 && !is_a<gassign *> (p: stmt_info->stmt)
1851 && !is_a<gphi *> (p: stmt_info->stmt))
1852 return NULL;
1853
1854 nops = gimple_num_args (gs: stmt_info->stmt);
1855 if (const int *map = vect_get_operand_map (stmt: stmt_info->stmt,
1856 STMT_VINFO_GATHER_SCATTER_P
1857 (stmt_info)))
1858 nops = map[0];
1859
1860 /* If the SLP node is a PHI (induction or reduction), terminate
1861 the recursion. */
1862 bool *skip_args = XALLOCAVEC (bool, nops);
1863 memset (s: skip_args, c: 0, n: sizeof (bool) * nops);
1864 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
1865 if (gphi *stmt = dyn_cast <gphi *> (p: stmt_info->stmt))
1866 {
1867 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1868 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1869 group_size);
1870 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1871 max_nunits))
1872 return NULL;
1873
1874 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1875 if (def_type == vect_induction_def)
1876 {
1877 /* Induction PHIs are not cycles but walk the initial
1878 value. Only for inner loops though; for outer loops
1879 we need to pick up the value from the actual PHIs
1880 to more easily support peeling and epilogue vectorization. */
1881 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1882 if (!nested_in_vect_loop_p (loop, stmt_info))
1883 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1884 else
1885 loop = loop->inner;
1886 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1887 }
1888 else if (def_type == vect_reduction_def
1889 || def_type == vect_double_reduction_def
1890 || def_type == vect_nested_cycle
1891 || def_type == vect_first_order_recurrence)
1892 {
1893 /* Else def types have to match. */
1894 stmt_vec_info other_info;
1895 bool all_same = true;
1896 FOR_EACH_VEC_ELT (stmts, i, other_info)
1897 {
1898 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1899 return NULL;
1900 if (other_info != stmt_info)
1901 all_same = false;
1902 }
1903 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1904 /* Reduction initial values are not explicitly represented. */
1905 if (def_type != vect_first_order_recurrence
1906 && !nested_in_vect_loop_p (loop, stmt_info))
1907 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1908 /* Reduction chain backedge defs are filled manually.
1909 ??? Need a better way to identify a SLP reduction chain PHI.
1910 Or a better overall way to SLP match those. */
1911 if (all_same && def_type == vect_reduction_def)
1912 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1913 }
1914 else if (def_type != vect_internal_def)
1915 return NULL;
1916 }
1917
1918
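  /* Check the stmts are isomorphic (same operation and compatible types),
     detect the mixed plus/minus "two operators" case, compute the node
     vector type and record per-lane operand-swap hints in SWAP and
     per-lane mismatches in MATCHES.  */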
1919 bool two_operators = false;
1920 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1921 tree vectype = NULL_TREE;
1922 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1923 max_nunits: &this_max_nunits, matches, two_operators: &two_operators,
1924 node_vectype: &vectype))
1925 return NULL;
1926
1927 /* If the SLP node is a load, terminate the recursion unless masked. */
1928 if (STMT_VINFO_DATA_REF (stmt_info)
1929 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1930 {
1931 if (gcall *stmt = dyn_cast <gcall *> (p: stmt_info->stmt))
1932 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1933 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1934 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1935 || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
1936 else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1937 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1938 else
1939 {
1940 *max_nunits = this_max_nunits;
1941 (*tree_size)++;
1942 node = vect_create_new_slp_node (node, scalar_stmts: stmts, nops: 0);
1943 SLP_TREE_VECTYPE (node) = vectype;
1944 /* And compute the load permutation. Whether it is actually
1945 a permutation depends on the unrolling factor which is
1946 decided later. */
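	  /* For example, if the group's first element loads a[i] and the
	     SLP lanes load a[i+1], a[i] and a[i+3], the recorded
	     permutation is { 1, 0, 3 }.  */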
1947 vec<unsigned> load_permutation;
1948 int j;
1949 stmt_vec_info load_info;
1950 load_permutation.create (nelems: group_size);
1951 stmt_vec_info first_stmt_info
1952 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1953 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1954 {
1955 int load_place;
1956 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1957 load_place = vect_get_place_in_interleaving_chain
1958 (stmt_info: load_info, first_stmt_info);
1959 else
1960 load_place = 0;
1961 gcc_assert (load_place != -1);
1962 load_permutation.safe_push (obj: load_place);
1963 }
1964 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1965 return node;
1966 }
1967 }
1968 else if (gimple_assign_single_p (gs: stmt_info->stmt)
1969 && !gimple_vuse (g: stmt_info->stmt)
1970 && gimple_assign_rhs_code (gs: stmt_info->stmt) == BIT_FIELD_REF)
1971 {
1972 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1973 the same SSA name vector of a type compatible with VECTYPE. */
1974 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1975 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1976 stmt_vec_info estmt_info;
1977 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1978 {
1979 gassign *estmt = as_a <gassign *> (p: estmt_info->stmt);
1980 tree bfref = gimple_assign_rhs1 (gs: estmt);
1981 HOST_WIDE_INT lane;
1982 if (!known_eq (bit_field_size (bfref),
1983 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1984 || !constant_multiple_p (a: bit_field_offset (t: bfref),
1985 b: bit_field_size (t: bfref), multiple: &lane))
1986 {
1987 lperm.release ();
1988 matches[0] = false;
1989 return NULL;
1990 }
1991 lperm.safe_push (obj: std::make_pair (x: 0, y: (unsigned)lane));
1992 }
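      /* Each lane permutation entry is (0, lane), i.e. element LANE of
	 the single vector operand VEC wrapped in VNODE below.  */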
1993 slp_tree vnode = vect_create_new_slp_node (ops: vNULL);
1994 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1995 /* ??? We record vectype here but hide any punning that may
1996 eventually be necessary, relying on code generation to
1997 materialize VIEW_CONVERT_EXPRs as needed. We should instead
1998 make this explicit somehow. */
1999 SLP_TREE_VECTYPE (vnode) = vectype;
2000 else
2001 {
2002 /* For different size but compatible elements we can still
2003 use VEC_PERM_EXPR without punning. */
2004 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2005 && types_compatible_p (TREE_TYPE (vectype),
2006 TREE_TYPE (TREE_TYPE (vec))));
2007 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2008 }
2009 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2010 unsigned HOST_WIDE_INT const_nunits;
2011 if (nunits.is_constant (const_value: &const_nunits))
2012 SLP_TREE_LANES (vnode) = const_nunits;
2013 SLP_TREE_VEC_DEFS (vnode).safe_push (obj: vec);
2014 /* We always build a permutation node, even for an identity
2015 permute, to shield the rest of the vectorizer from the odd node
2016 representing an actual vector without any scalar ops.
2017 ??? We could hide it completely by making the permute node
2018 external? */
2019 node = vect_create_new_slp_node (node, scalar_stmts: stmts, nops: 1);
2020 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2021 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2022 SLP_TREE_VECTYPE (node) = vectype;
2023 SLP_TREE_CHILDREN (node).quick_push (obj: vnode);
2024 return node;
2025 }
2026 /* When discovery reaches an associatable operation see whether we can
2027 improve that to match up lanes in a way superior to the operand
2028 swapping code which at most looks at two defs.
2029 ??? For BB vectorization we cannot do the brute-force search
2030 for matching as we can succeed by means of builds from scalars
2031 and have no good way to "cost" one build against another. */
2032 else if (is_a <loop_vec_info> (p: vinfo)
2033 /* ??? We don't handle !vect_internal_def defs below. */
2034 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2035 && is_gimple_assign (gs: stmt_info->stmt)
2036 && (associative_tree_code (gimple_assign_rhs_code (gs: stmt_info->stmt))
2037 || gimple_assign_rhs_code (gs: stmt_info->stmt) == MINUS_EXPR)
2038 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2039 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2040 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2041 {
2042 /* See if we have a chain of (mixed) adds or subtracts or other
2043 associatable ops. */
2044 enum tree_code code = gimple_assign_rhs_code (gs: stmt_info->stmt);
2045 if (code == MINUS_EXPR)
2046 code = PLUS_EXPR;
2047 stmt_vec_info other_op_stmt_info = NULL;
2048 stmt_vec_info op_stmt_info = NULL;
2049 unsigned chain_len = 0;
2050 auto_vec<chain_op_t> chain;
2051 auto_vec<std::pair<tree_code, gimple *> > worklist;
2052 auto_vec<vec<chain_op_t> > chains (group_size);
2053 auto_vec<slp_tree, 4> children;
2054 bool hard_fail = true;
2055 for (unsigned lane = 0; lane < group_size; ++lane)
2056 {
2057 /* For each lane linearize the addition/subtraction (or other
2058 uniform associatable operation) expression tree. */
2059 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2060 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2061 start: stmts[lane]->stmt, code_stmt&: op_stmt, alt_code_stmt&: other_op_stmt,
2062 NULL);
2063 if (!op_stmt_info && op_stmt)
2064 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2065 if (!other_op_stmt_info && other_op_stmt)
2066 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2067 if (chain.length () == 2)
2068 {
2069 /* In a chain of just two elements resort to the regular
2070 operand swapping scheme. If we run into a length
2071 mismatch still hard-FAIL. */
2072 if (chain_len == 0)
2073 hard_fail = false;
2074 else
2075 {
2076 matches[lane] = false;
2077 /* ??? We might want to process the other lanes, but
2078 make sure to not give false matching hints to the
2079 caller for lanes we did not process. */
2080 if (lane != group_size - 1)
2081 matches[0] = false;
2082 }
2083 break;
2084 }
2085 else if (chain_len == 0)
2086 chain_len = chain.length ();
2087 else if (chain.length () != chain_len)
2088 {
2089 /* ??? Here we could slip in magic to compensate with
2090 neutral operands. */
2091 matches[lane] = false;
2092 if (lane != group_size - 1)
2093 matches[0] = false;
2094 break;
2095 }
2096 chains.quick_push (obj: chain.copy ());
2097 chain.truncate (size: 0);
2098 }
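      /* If linearization succeeded for every lane, each element of CHAINS
	 now holds CHAIN_LEN (code, def-type, operand) triples for its lane;
	 otherwise the early breaks above left MATCHES flagging the failing
	 lane.  */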
2099 if (chains.length () == group_size)
2100 {
2101 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2102 if (!op_stmt_info)
2103 {
2104 hard_fail = false;
2105 goto out;
2106 }
2107 /* Now we have a set of chains with the same length. */
2108 /* 1. pre-sort according to def_type and operation. */
2109 for (unsigned lane = 0; lane < group_size; ++lane)
2110 chains[lane].stablesort (cmp: dt_sort_cmp, data: vinfo);
2111 if (dump_enabled_p ())
2112 {
2113 dump_printf_loc (MSG_NOTE, vect_location,
2114 "pre-sorted chains of %s\n",
2115 get_tree_code_name (code));
2116 for (unsigned lane = 0; lane < group_size; ++lane)
2117 {
2118 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2119 dump_printf (MSG_NOTE, "%s %T ",
2120 get_tree_code_name (chains[lane][opnum].code),
2121 chains[lane][opnum].op);
2122 dump_printf (MSG_NOTE, "\n");
2123 }
2124 }
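	  /* For example, assuming all operands are internal defs, for the
	     two lanes
	       x0 = a0 + b0 - c0
	       x1 = a1 - c1 + b1
	     the pre-sorted chains line up as { +a, +b, -c } in both lanes,
	     so { a0, a1 }, { b0, b1 } and { c0, c1 } can be matched up as
	     children even though the scalar operand order differs.  */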
2125 /* 2. try to build children nodes, associating as necessary. */
2126 for (unsigned n = 0; n < chain_len; ++n)
2127 {
2128 vect_def_type dt = chains[0][n].dt;
2129 unsigned lane;
2130 for (lane = 0; lane < group_size; ++lane)
2131 if (chains[lane][n].dt != dt)
2132 {
2133 if (dt == vect_constant_def
2134 && chains[lane][n].dt == vect_external_def)
2135 dt = vect_external_def;
2136 else if (dt == vect_external_def
2137 && chains[lane][n].dt == vect_constant_def)
2138 ;
2139 else
2140 break;
2141 }
2142 if (lane != group_size)
2143 {
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "giving up on chain due to mismatched "
2147 "def types\n");
2148 matches[lane] = false;
2149 if (lane != group_size - 1)
2150 matches[0] = false;
2151 goto out;
2152 }
2153 if (dt == vect_constant_def
2154 || dt == vect_external_def)
2155 {
2156 /* Check whether we can build the invariant. If we can't
2157 we never will be able to. */
2158 tree type = TREE_TYPE (chains[0][n].op);
2159 if (!GET_MODE_SIZE (mode: vinfo->vector_mode).is_constant ()
2160 && (TREE_CODE (type) == BOOLEAN_TYPE
2161 || !can_duplicate_and_interleave_p (vinfo, count: group_size,
2162 elt_type: type)))
2163 {
2164 matches[0] = false;
2165 goto out;
2166 }
2167 vec<tree> ops;
2168 ops.create (nelems: group_size);
2169 for (lane = 0; lane < group_size; ++lane)
2170 ops.quick_push (obj: chains[lane][n].op);
2171 slp_tree child = vect_create_new_slp_node (ops);
2172 SLP_TREE_DEF_TYPE (child) = dt;
2173 children.safe_push (obj: child);
2174 }
2175 else if (dt != vect_internal_def)
2176 {
2177 /* Not sure, we might need something special.
2178 gcc.dg/vect/pr96854.c,
2179 gfortran.dg/vect/fast-math-pr37021.f90
2180 and gfortran.dg/vect/pr61171.f trigger. */
2181 /* Soft-fail for now. */
2182 hard_fail = false;
2183 goto out;
2184 }
2185 else
2186 {
2187 vec<stmt_vec_info> op_stmts;
2188 op_stmts.create (nelems: group_size);
2189 slp_tree child = NULL;
2190 /* Brute-force our way. We have to consider a lane
2191 failing after fixing an earlier fail up in the
2192 SLP discovery recursion. So track the current
2193 permute per lane. */
2194 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2195 memset (s: perms, c: 0, n: sizeof (unsigned) * group_size);
2196 do
2197 {
2198 op_stmts.truncate (size: 0);
2199 for (lane = 0; lane < group_size; ++lane)
2200 op_stmts.quick_push
2201 (obj: vinfo->lookup_def (chains[lane][n].op));
2202 child = vect_build_slp_tree (vinfo, stmts: op_stmts,
2203 group_size, max_nunits: &this_max_nunits,
2204 matches, limit,
2205 tree_size: &this_tree_size, bst_map);
2206 /* ??? We're likely getting too many fatal mismatches
2207 here so maybe we want to ignore them (but then we
2208 have no idea which lanes fatally mismatched). */
2209 if (child || !matches[0])
2210 break;
2211 /* Swap another lane we have not yet matched up into
2212 lanes that did not match. If we run out of
2213 permute possibilities for a lane terminate the
2214 search. */
2215 bool term = false;
2216 for (lane = 1; lane < group_size; ++lane)
2217 if (!matches[lane])
2218 {
2219 if (n + perms[lane] + 1 == chain_len)
2220 {
2221 term = true;
2222 break;
2223 }
2224 std::swap (a&: chains[lane][n],
2225 b&: chains[lane][n + perms[lane] + 1]);
2226 perms[lane]++;
2227 }
2228 if (term)
2229 break;
2230 }
2231 while (1);
2232 if (!child)
2233 {
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_NOTE, vect_location,
2236 "failed to match up op %d\n", n);
2237 op_stmts.release ();
2238 if (lane != group_size - 1)
2239 matches[0] = false;
2240 else
2241 matches[lane] = false;
2242 goto out;
2243 }
2244 if (dump_enabled_p ())
2245 {
2246 dump_printf_loc (MSG_NOTE, vect_location,
2247 "matched up op %d to\n", n);
2248 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2249 }
2250 children.safe_push (obj: child);
2251 }
2252 }
2253 /* 3. build SLP nodes to combine the chain. */
2254 for (unsigned lane = 0; lane < group_size; ++lane)
2255 if (chains[lane][0].code != code)
2256 {
2257 /* See if there's any alternate all-PLUS entry. */
2258 unsigned n;
2259 for (n = 1; n < chain_len; ++n)
2260 {
2261 for (lane = 0; lane < group_size; ++lane)
2262 if (chains[lane][n].code != code)
2263 break;
2264 if (lane == group_size)
2265 break;
2266 }
2267 if (n != chain_len)
2268 {
2269 /* Swap that in at first position. */
2270 std::swap (a&: children[0], b&: children[n]);
2271 for (lane = 0; lane < group_size; ++lane)
2272 std::swap (a&: chains[lane][0], b&: chains[lane][n]);
2273 }
2274 else
2275 {
2276 /* ??? When this triggers and we end up with two
2277 vect_constant/external_def operands up front, things break
2278 (ICE) spectacularly when finding an insertion place for
2279 the all-constant op. We should have a fully
2280 vect_internal_def operand though(?) so we can swap
2281 that into first place and then prepend the all-zero
2282 constant. */
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_NOTE, vect_location,
2285 "inserting constant zero to compensate "
2286 "for (partially) negated first "
2287 "operand\n");
2288 chain_len++;
2289 for (lane = 0; lane < group_size; ++lane)
2290 chains[lane].safe_insert
2291 (ix: 0, obj: chain_op_t (code, vect_constant_def, NULL_TREE));
2292 vec<tree> zero_ops;
2293 zero_ops.create (nelems: group_size);
2294 zero_ops.quick_push (obj: build_zero_cst (TREE_TYPE (vectype)));
2295 for (lane = 1; lane < group_size; ++lane)
2296 zero_ops.quick_push (obj: zero_ops[0]);
2297 slp_tree zero = vect_create_new_slp_node (ops: zero_ops);
2298 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2299 children.safe_insert (ix: 0, obj: zero);
2300 }
2301 break;
2302 }
2303 for (unsigned i = 1; i < children.length (); ++i)
2304 {
2305 slp_tree op0 = children[i - 1];
2306 slp_tree op1 = children[i];
2307 bool this_two_op = false;
2308 for (unsigned lane = 0; lane < group_size; ++lane)
2309 if (chains[lane][i].code != chains[0][i].code)
2310 {
2311 this_two_op = true;
2312 break;
2313 }
2314 slp_tree child;
2315 if (i == children.length () - 1)
2316 child = vect_create_new_slp_node (node, scalar_stmts: stmts, nops: 2);
2317 else
2318 child = vect_create_new_slp_node (nops: 2, code: ERROR_MARK);
2319 if (this_two_op)
2320 {
2321 vec<std::pair<unsigned, unsigned> > lperm;
2322 lperm.create (nelems: group_size);
2323 for (unsigned lane = 0; lane < group_size; ++lane)
2324 lperm.quick_push (obj: std::make_pair
2325 (x: chains[lane][i].code != chains[0][i].code, y&: lane));
2326 vect_slp_build_two_operator_nodes (perm: child, vectype, op0, op1,
2327 oper1: (chains[0][i].code == code
2328 ? op_stmt_info
2329 : other_op_stmt_info),
2330 oper2: (chains[0][i].code == code
2331 ? other_op_stmt_info
2332 : op_stmt_info),
2333 lperm);
2334 }
2335 else
2336 {
2337 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2338 SLP_TREE_VECTYPE (child) = vectype;
2339 SLP_TREE_LANES (child) = group_size;
2340 SLP_TREE_CHILDREN (child).quick_push (obj: op0);
2341 SLP_TREE_CHILDREN (child).quick_push (obj: op1);
2342 SLP_TREE_REPRESENTATIVE (child)
2343 = (chains[0][i].code == code
2344 ? op_stmt_info : other_op_stmt_info);
2345 }
2346 children[i] = child;
2347 }
2348 *tree_size += this_tree_size + 1;
2349 *max_nunits = this_max_nunits;
2350 while (!chains.is_empty ())
2351 chains.pop ().release ();
2352 return node;
2353 }
2354out:
2355 while (!children.is_empty ())
2356 vect_free_slp_tree (node: children.pop ());
2357 while (!chains.is_empty ())
2358 chains.pop ().release ();
2359 /* Hard-fail, otherwise we might run into quadratic processing of the
2360 chains by starting discovery again one stmt into the chain. */
2361 if (hard_fail)
2362 return NULL;
2363 /* Fall thru to normal processing. */
2364 }
2365
2366 /* Get at the operands, verifying they are compatible. */
2367 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2368 slp_oprnd_info oprnd_info;
2369 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2370 {
2371 int res = vect_get_and_check_slp_defs (vinfo, swap: swap[i], skip_args,
2372 stmts, stmt_num: i, oprnds_info: &oprnds_info);
2373 if (res != 0)
2374 matches[(res == -1) ? 0 : i] = false;
2375 if (!matches[0])
2376 break;
2377 }
2378 for (i = 0; i < group_size; ++i)
2379 if (!matches[i])
2380 {
2381 vect_free_oprnd_info (oprnds_info);
2382 return NULL;
2383 }
2384 swap = NULL;
2385
2386 auto_vec<slp_tree, 4> children;
2387
2388 stmt_info = stmts[0];
2389
2390 /* Create SLP_TREE nodes for the definition node/s. */
2391 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2392 {
2393 slp_tree child;
2394 unsigned int j;
2395
2396 /* We're skipping certain operands from processing, for example
2397 outer loop reduction initial defs. */
2398 if (skip_args[i])
2399 {
2400 children.safe_push (NULL);
2401 continue;
2402 }
2403
2404 if (oprnd_info->first_dt == vect_uninitialized_def)
2405 {
2406 /* COND_EXPRs have one operand too many here when the condition
2407 is an SSA name. */
2408 gcc_assert (i == 3 && nops == 4);
2409 continue;
2410 }
2411
2412 if (is_a <bb_vec_info> (p: vinfo)
2413 && oprnd_info->first_dt == vect_internal_def
2414 && !oprnd_info->any_pattern)
2415 {
2416 /* For BB vectorization, if all defs are the same do not
2417 bother to continue the build along the single-lane
2418 graph but use a splat of the scalar value. */
2419 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2420 for (j = 1; j < group_size; ++j)
2421 if (oprnd_info->def_stmts[j] != first_def)
2422 break;
2423 if (j == group_size
2424 /* But avoid doing this for loads where we may be
2425 able to CSE things, unless the stmt is not
2426 vectorizable. */
2427 && (!STMT_VINFO_VECTORIZABLE (first_def)
2428 || !gimple_vuse (g: first_def->stmt)))
2429 {
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_NOTE, vect_location,
2432 "Using a splat of the uniform operand %G",
2433 first_def->stmt);
2434 oprnd_info->first_dt = vect_external_def;
2435 }
2436 }
2437
2438 if (oprnd_info->first_dt == vect_external_def
2439 || oprnd_info->first_dt == vect_constant_def)
2440 {
2441 slp_tree invnode = vect_create_new_slp_node (ops: oprnd_info->ops);
2442 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2443 oprnd_info->ops = vNULL;
2444 children.safe_push (obj: invnode);
2445 continue;
2446 }
2447
2448 if ((child = vect_build_slp_tree (vinfo, stmts: oprnd_info->def_stmts,
2449 group_size, max_nunits: &this_max_nunits,
2450 matches, limit,
2451 tree_size: &this_tree_size, bst_map)) != NULL)
2452 {
2453 oprnd_info->def_stmts = vNULL;
2454 children.safe_push (obj: child);
2455 continue;
2456 }
2457
2458 /* If the SLP build for operand zero failed and operand zero
2459 and one can be commuted, try that for the scalar stmts
2460 that failed the match. */
2461 if (i == 0
2462 /* A first scalar stmt mismatch signals a fatal mismatch. */
2463 && matches[0]
2464 /* ??? For COND_EXPRs we can swap the comparison operands
2465 as well as the arms under some constraints. */
2466 && nops == 2
2467 && oprnds_info[1]->first_dt == vect_internal_def
2468 && is_gimple_assign (gs: stmt_info->stmt)
2469 /* Swapping operands for reductions breaks assumptions later on. */
2470 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2471 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2472 {
2473 /* See whether we can swap the matching or the non-matching
2474 stmt operands. */
2475 bool swap_not_matching = true;
2476 do
2477 {
2478 for (j = 0; j < group_size; ++j)
2479 {
2480 if (matches[j] != !swap_not_matching)
2481 continue;
2482 stmt_vec_info stmt_info = stmts[j];
2483 /* Verify if we can swap operands of this stmt. */
2484 gassign *stmt = dyn_cast <gassign *> (p: stmt_info->stmt);
2485 if (!stmt
2486 || !commutative_tree_code (gimple_assign_rhs_code (gs: stmt)))
2487 {
2488 if (!swap_not_matching)
2489 goto fail;
2490 swap_not_matching = false;
2491 break;
2492 }
2493 }
2494 }
2495 while (j != group_size);
2496
2497 /* Swap mismatched definition stmts. */
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE, vect_location,
2500 "Re-trying with swapped operands of stmts ");
2501 for (j = 0; j < group_size; ++j)
2502 if (matches[j] == !swap_not_matching)
2503 {
2504 std::swap (a&: oprnds_info[0]->def_stmts[j],
2505 b&: oprnds_info[1]->def_stmts[j]);
2506 std::swap (a&: oprnds_info[0]->ops[j],
2507 b&: oprnds_info[1]->ops[j]);
2508 if (dump_enabled_p ())
2509 dump_printf (MSG_NOTE, "%d ", j);
2510 }
2511 if (dump_enabled_p ())
2512 dump_printf (MSG_NOTE, "\n");
2513 /* After swapping some operands we lost track whether an
2514 operand has any pattern defs so be conservative here. */
2515 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2516 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2517 /* And try again with scratch 'matches' ... */
2518 bool *tem = XALLOCAVEC (bool, group_size);
2519 if ((child = vect_build_slp_tree (vinfo, stmts: oprnd_info->def_stmts,
2520 group_size, max_nunits: &this_max_nunits,
2521 matches: tem, limit,
2522 tree_size: &this_tree_size, bst_map)) != NULL)
2523 {
2524 oprnd_info->def_stmts = vNULL;
2525 children.safe_push (obj: child);
2526 continue;
2527 }
2528 }
2529fail:
2530
2531 /* If the SLP build failed and we analyze a basic-block
2532 simply treat nodes we fail to build as externally defined
2533 (and thus build vectors from the scalar defs).
2534 The cost model will reject outright expensive cases.
2535 ??? This doesn't treat cases where permutation ultimately
2536 fails (or we don't try permutation below). Ideally we'd
2537 even compute a permutation that will end up with the maximum
2538 SLP tree size... */
2539 if (is_a <bb_vec_info> (p: vinfo)
2540 /* ??? Rejecting patterns this way doesn't work. We'd have to
2541 do extra work to cancel the pattern so the uses see the
2542 scalar version. */
2543 && !is_pattern_stmt_p (stmt_info)
2544 && !oprnd_info->any_pattern)
2545 {
2546 /* But if there's a leading vector sized set of matching stmts
2547 fail here so we can split the group. This matches the condition
2548 vect_analyze_slp_instance uses. */
2549 /* ??? We might want to split here and combine the results to support
2550 multiple vector sizes better. */
2551 for (j = 0; j < group_size; ++j)
2552 if (!matches[j])
2553 break;
2554 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2555 {
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "Building vector operands from scalars\n");
2559 this_tree_size++;
2560 child = vect_create_new_slp_node (ops: oprnd_info->ops);
2561 children.safe_push (obj: child);
2562 oprnd_info->ops = vNULL;
2563 continue;
2564 }
2565 }
2566
2567 gcc_assert (child == NULL);
2568 FOR_EACH_VEC_ELT (children, j, child)
2569 if (child)
2570 vect_free_slp_tree (node: child);
2571 vect_free_oprnd_info (oprnds_info);
2572 return NULL;
2573 }
2574
2575 vect_free_oprnd_info (oprnds_info);
2576
2577 /* If all children of this node are built up from uniform scalars, or
2578 if building them requires more than one possibly expensive vector
2579 construction, just throw the node away, causing it to be built up
2580 from scalars instead. The exception is the SLP node for the vector store. */
2581 if (is_a <bb_vec_info> (p: vinfo)
2582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2583 /* ??? Rejecting patterns this way doesn't work. We'd have to
2584 do extra work to cancel the pattern so the uses see the
2585 scalar version. */
2586 && !is_pattern_stmt_p (stmt_info))
2587 {
2588 slp_tree child;
2589 unsigned j;
2590 bool all_uniform_p = true;
2591 unsigned n_vector_builds = 0;
2592 FOR_EACH_VEC_ELT (children, j, child)
2593 {
2594 if (!child)
2595 ;
2596 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2597 all_uniform_p = false;
2598 else if (!vect_slp_tree_uniform_p (node: child))
2599 {
2600 all_uniform_p = false;
2601 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2602 n_vector_builds++;
2603 }
2604 }
2605 if (all_uniform_p
2606 || n_vector_builds > 1
2607 || (n_vector_builds == children.length ()
2608 && is_a <gphi *> (p: stmt_info->stmt)))
2609 {
2610 /* Roll back. */
2611 matches[0] = false;
2612 FOR_EACH_VEC_ELT (children, j, child)
2613 if (child)
2614 vect_free_slp_tree (node: child);
2615
2616 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_NOTE, vect_location,
2618 "Building parent vector operands from "
2619 "scalars instead\n");
2620 return NULL;
2621 }
2622 }
2623
2624 *tree_size += this_tree_size + 1;
2625 *max_nunits = this_max_nunits;
2626
2627 if (two_operators)
2628 {
2629 /* ??? We'd likely want to either cache in bst_map something like
2630 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2631 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2632 explicit stmts to put in so the keying on 'stmts' doesn't
2633 work (but we have the same issue with nodes that use 'ops'). */
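      /* For example, for the group { a0+b0, a1-b1, a2+b2, a3-b3 } node ONE
	 computes the addition on all lanes, node TWO the subtraction, and
	 the lane permutation { (0,0), (1,1), (0,2), (1,3) } blends the
	 final lanes.  */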
2634 slp_tree one = new _slp_tree;
2635 slp_tree two = new _slp_tree;
2636 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2637 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2638 SLP_TREE_VECTYPE (one) = vectype;
2639 SLP_TREE_VECTYPE (two) = vectype;
2640 SLP_TREE_CHILDREN (one).safe_splice (src: children);
2641 SLP_TREE_CHILDREN (two).safe_splice (src: children);
2642 slp_tree child;
2643 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2644 SLP_TREE_REF_COUNT (child)++;
2645
2646 /* Here we record the original defs since this
2647 node represents the final lane configuration. */
2648 node = vect_create_new_slp_node (node, scalar_stmts: stmts, nops: 2);
2649 SLP_TREE_VECTYPE (node) = vectype;
2650 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2651 SLP_TREE_CHILDREN (node).quick_push (obj: one);
2652 SLP_TREE_CHILDREN (node).quick_push (obj: two);
2653 gassign *stmt = as_a <gassign *> (p: stmts[0]->stmt);
2654 enum tree_code code0 = gimple_assign_rhs_code (gs: stmt);
2655 enum tree_code ocode = ERROR_MARK;
2656 stmt_vec_info ostmt_info;
2657 unsigned j = 0;
2658 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2659 {
2660 gassign *ostmt = as_a <gassign *> (p: ostmt_info->stmt);
2661 if (gimple_assign_rhs_code (gs: ostmt) != code0)
2662 {
2663 SLP_TREE_LANE_PERMUTATION (node).safe_push (obj: std::make_pair (x: 1, y&: i));
2664 ocode = gimple_assign_rhs_code (gs: ostmt);
2665 j = i;
2666 }
2667 else
2668 SLP_TREE_LANE_PERMUTATION (node).safe_push (obj: std::make_pair (x: 0, y&: i));
2669 }
2670 SLP_TREE_CODE (one) = code0;
2671 SLP_TREE_CODE (two) = ocode;
2672 SLP_TREE_LANES (one) = stmts.length ();
2673 SLP_TREE_LANES (two) = stmts.length ();
2674 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2675 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2676 return node;
2677 }
2678
2679 node = vect_create_new_slp_node (node, scalar_stmts: stmts, nops);
2680 SLP_TREE_VECTYPE (node) = vectype;
2681 SLP_TREE_CHILDREN (node).splice (src: children);
2682 return node;
2683}
2684
2685/* Dump a single SLP tree NODE. */
2686
2687static void
2688vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2689 slp_tree node)
2690{
2691 unsigned i, j;
2692 slp_tree child;
2693 stmt_vec_info stmt_info;
2694 tree op;
2695
2696 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2697 dump_user_location_t user_loc = loc.get_user_location ();
2698 dump_printf_loc (metadata, user_loc,
2699 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2700 ", refcnt=%u)",
2701 SLP_TREE_DEF_TYPE (node) == vect_external_def
2702 ? " (external)"
2703 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2704 ? " (constant)"
2705 : ""), (void *) node,
2706 estimated_poly_value (x: node->max_nunits),
2707 SLP_TREE_REF_COUNT (node));
2708 if (SLP_TREE_VECTYPE (node))
2709 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2710 dump_printf (metadata, "\n");
2711 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2712 {
2713 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2714 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2715 else
2716 dump_printf_loc (metadata, user_loc, "op template: %G",
2717 SLP_TREE_REPRESENTATIVE (node)->stmt);
2718 }
2719 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2720 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2721 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2722 else
2723 {
2724 dump_printf_loc (metadata, user_loc, "\t{ ");
2725 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2726 dump_printf (metadata, "%T%s ", op,
2727 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2728 dump_printf (metadata, "}\n");
2729 }
2730 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2731 {
2732 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2733 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2734 dump_printf (dump_kind, " %u", j);
2735 dump_printf (dump_kind, " }\n");
2736 }
2737 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2738 {
2739 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2740 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2741 dump_printf (dump_kind, " %u[%u]",
2742 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2743 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2744 dump_printf (dump_kind, " }\n");
2745 }
2746 if (SLP_TREE_CHILDREN (node).is_empty ())
2747 return;
2748 dump_printf_loc (metadata, user_loc, "\tchildren");
2749 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2750 dump_printf (dump_kind, " %p", (void *)child);
2751 dump_printf (dump_kind, "\n");
2752}
2753
2754DEBUG_FUNCTION void
2755debug (slp_tree node)
2756{
2757 debug_dump_context ctx;
2758 vect_print_slp_tree (dump_kind: MSG_NOTE,
2759 loc: dump_location_t::from_location_t (UNKNOWN_LOCATION),
2760 node);
2761}
2762
2763/* Recursive helper for the dot producer below. */
2764
2765static void
2766dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2767{
2768 if (visited.add (k: node))
2769 return;
2770
2771 fprintf (stream: f, format: "\"%p\" [label=\"", (void *)node);
2772 vect_print_slp_tree (dump_kind: MSG_NOTE,
2773 loc: dump_location_t::from_location_t (UNKNOWN_LOCATION),
2774 node);
2775 fprintf (stream: f, format: "\"];\n");
2776
2777
2778 for (slp_tree child : SLP_TREE_CHILDREN (node))
2779 fprintf (stream: f, format: "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2780
2781 for (slp_tree child : SLP_TREE_CHILDREN (node))
2782 if (child)
2783 dot_slp_tree (f, node: child, visited);
2784}
2785
2786DEBUG_FUNCTION void
2787dot_slp_tree (const char *fname, slp_tree node)
2788{
2789 FILE *f = fopen (filename: fname, modes: "w");
2790 fprintf (stream: f, format: "digraph {\n");
2791 fflush (stream: f);
2792 {
2793 debug_dump_context ctx (f);
2794 hash_set<slp_tree> visited;
2795 dot_slp_tree (f, node, visited);
2796 }
2797 fflush (stream: f);
2798 fprintf (stream: f, format: "}\n");
2799 fclose (stream: f);
2800}
2801
2802/* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
2803
2804static void
2805vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2806 slp_tree node, hash_set<slp_tree> &visited)
2807{
2808 unsigned i;
2809 slp_tree child;
2810
2811 if (visited.add (k: node))
2812 return;
2813
2814 vect_print_slp_tree (dump_kind, loc, node);
2815
2816 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2817 if (child)
2818 vect_print_slp_graph (dump_kind, loc, node: child, visited);
2819}
2820
2821static void
2822vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2823 slp_tree entry)
2824{
2825 hash_set<slp_tree> visited;
2826 vect_print_slp_graph (dump_kind, loc, node: entry, visited);
2827}
2828
2829/* Mark the tree rooted at NODE with PURE_SLP. */
2830
2831static void
2832vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2833{
2834 int i;
2835 stmt_vec_info stmt_info;
2836 slp_tree child;
2837
2838 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2839 return;
2840
2841 if (visited.add (k: node))
2842 return;
2843
2844 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2845 STMT_SLP_TYPE (stmt_info) = pure_slp;
2846
2847 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2848 if (child)
2849 vect_mark_slp_stmts (node: child, visited);
2850}
2851
2852static void
2853vect_mark_slp_stmts (slp_tree node)
2854{
2855 hash_set<slp_tree> visited;
2856 vect_mark_slp_stmts (node, visited);
2857}
2858
2859/* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2860
2861static void
2862vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2863{
2864 int i;
2865 stmt_vec_info stmt_info;
2866 slp_tree child;
2867
2868 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2869 return;
2870
2871 if (visited.add (k: node))
2872 return;
2873
2874 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2875 {
2876 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2877 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2878 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2879 }
2880
2881 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2882 if (child)
2883 vect_mark_slp_stmts_relevant (node: child, visited);
2884}
2885
2886static void
2887vect_mark_slp_stmts_relevant (slp_tree node)
2888{
2889 hash_set<slp_tree> visited;
2890 vect_mark_slp_stmts_relevant (node, visited);
2891}
2892
2893
2894/* Gather loads in the SLP graph NODE and populate the LOADS array. */
2895
2896static void
2897vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2898 hash_set<slp_tree> &visited)
2899{
2900 if (!node || visited.add (k: node))
2901 return;
2902
2903 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2904 return;
2905
2906 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2907 {
2908 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2909 if (STMT_VINFO_DATA_REF (stmt_info)
2910 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2911 loads.safe_push (obj: node);
2912 }
2913
2914 unsigned i;
2915 slp_tree child;
2916 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2917 vect_gather_slp_loads (loads, node: child, visited);
2918}
2919
2920
2921/* Find the last scalar stmt in NODE. */
2922
2923stmt_vec_info
2924vect_find_last_scalar_stmt_in_slp (slp_tree node)
2925{
2926 stmt_vec_info last = NULL;
2927 stmt_vec_info stmt_vinfo;
2928
2929 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (ix: i, ptr: &stmt_vinfo); i++)
2930 {
2931 stmt_vinfo = vect_orig_stmt (stmt_info: stmt_vinfo);
2932 last = last ? get_later_stmt (stmt1_info: stmt_vinfo, stmt2_info: last) : stmt_vinfo;
2933 }
2934
2935 return last;
2936}
2937
2938/* Find the first scalar stmt in NODE. */
2939
2940stmt_vec_info
2941vect_find_first_scalar_stmt_in_slp (slp_tree node)
2942{
2943 stmt_vec_info first = NULL;
2944 stmt_vec_info stmt_vinfo;
2945
2946 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (ix: i, ptr: &stmt_vinfo); i++)
2947 {
2948 stmt_vinfo = vect_orig_stmt (stmt_info: stmt_vinfo);
2949 if (!first
2950 || get_later_stmt (stmt1_info: stmt_vinfo, stmt2_info: first) == first)
2951 first = stmt_vinfo;
2952 }
2953
2954 return first;
2955}
2956
2957/* Splits a group of stores, currently beginning at FIRST_VINFO, into
2958 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2959 (also containing the first GROUP1_SIZE stmts, since stores are
2960 consecutive), the second containing the remainder.
2961 Return the first stmt in the second group. */
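/* For example, splitting a group of four consecutive stores
   { s0, s1, s2, s3 } with GROUP1_SIZE 2 yields groups { s0, s1 } and
   { s2, s3 }, with the gaps of both groups adjusted so that each skips
   over the other.  */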
2962
2963static stmt_vec_info
2964vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2965{
2966 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2967 gcc_assert (group1_size > 0);
2968 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2969 gcc_assert (group2_size > 0);
2970 DR_GROUP_SIZE (first_vinfo) = group1_size;
2971
2972 stmt_vec_info stmt_info = first_vinfo;
2973 for (unsigned i = group1_size; i > 1; i--)
2974 {
2975 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2976 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2977 }
2978 /* STMT is now the last element of the first group. */
2979 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2980 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2981
2982 DR_GROUP_SIZE (group2) = group2_size;
2983 for (stmt_info = group2; stmt_info;
2984 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2985 {
2986 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2987 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2988 }
2989
2990 /* For the second group, the DR_GROUP_GAP is that before the original group,
2991 plus skipping over the first vector. */
2992 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2993
2994 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2995 DR_GROUP_GAP (first_vinfo) += group2_size;
2996
2997 if (dump_enabled_p ())
2998 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2999 group1_size, group2_size);
3000
3001 return group2;
3002}
3003
3004/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3005 statements and a vector of NUNITS elements. */
3006
3007static poly_uint64
3008calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3009{
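  /* E.g. for NUNITS 4 and GROUP_SIZE 3 this yields lcm (4, 3) / 3 == 4.  */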
3010 return exact_div (a: common_multiple (a: nunits, b: group_size), b: group_size);
3011}
3012
3013/* Helper that checks to see if a node is a load node. */
3014
3015static inline bool
3016vect_is_slp_load_node (slp_tree root)
3017{
3018 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3019 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3020 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3021}
3022
3023
3024/* Helper function of optimize_load_redistribution that performs the operation
3025 recursively. */
3026
3027static slp_tree
3028optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3029 vec_info *vinfo, unsigned int group_size,
3030 hash_map<slp_tree, slp_tree> *load_map,
3031 slp_tree root)
3032{
3033 if (slp_tree *leader = load_map->get (k: root))
3034 return *leader;
3035
3036 slp_tree node;
3037 unsigned i;
3038
3039 /* For now, we don't know anything about externals so do not do anything. */
3040 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3041 return NULL;
3042 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3043 {
3044 /* First convert this node into a load node, flattening the permute
3045 from a lane permutation into a load permutation; the result is
3046 recorded in the load map. If it's unneeded it will be elided later. */
3047 vec<stmt_vec_info> stmts;
3048 stmts.create (SLP_TREE_LANES (root));
3049 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3050 for (unsigned j = 0; j < lane_perm.length (); j++)
3051 {
3052 std::pair<unsigned, unsigned> perm = lane_perm[j];
3053 node = SLP_TREE_CHILDREN (root)[perm.first];
3054
3055 if (!vect_is_slp_load_node (root: node)
3056 || SLP_TREE_CHILDREN (node).exists ())
3057 {
3058 stmts.release ();
3059 goto next;
3060 }
3061
3062 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3063 }
3064
3065 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "converting stmts on permute node %p\n",
3068 (void *) root);
3069
3070 bool *matches = XALLOCAVEC (bool, group_size);
3071 poly_uint64 max_nunits = 1;
3072 unsigned tree_size = 0, limit = 1;
3073 node = vect_build_slp_tree (vinfo, stmts, group_size, max_nunits: &max_nunits,
3074 matches, limit: &limit, tree_size: &tree_size, bst_map);
3075 if (!node)
3076 stmts.release ();
3077
3078 load_map->put (k: root, v: node);
3079 return node;
3080 }
3081
3082next:
3083 load_map->put (k: root, NULL);
3084
3085 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3086 {
3087 slp_tree value
3088 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3089 root: node);
3090 if (value)
3091 {
3092 SLP_TREE_REF_COUNT (value)++;
3093 SLP_TREE_CHILDREN (root)[i] = value;
3094 /* ??? We know the original leaves of the replaced nodes will
3095 be referenced by bst_map, only the permutes created by
3096 pattern matching are not. */
3097 if (SLP_TREE_REF_COUNT (node) == 1)
3098 load_map->remove (k: node);
3099 vect_free_slp_tree (node);
3100 }
3101 }
3102
3103 return NULL;
3104}
3105
3106/* Temporary workaround for loads not being CSEd during SLP build. This
3107 function will traverse the SLP tree rooted in ROOT and find VEC_PERM
3108 nodes that blend vectors from multiple nodes that all read from the
3109 same DR such that the final operation is equal to a permuted load. Such
3110 nodes are then directly converted into loads themselves. The nodes are
3111 CSEd using BST_MAP. */
3112
3113static void
3114optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3115 vec_info *vinfo, unsigned int group_size,
3116 hash_map<slp_tree, slp_tree> *load_map,
3117 slp_tree root)
3118{
3119 slp_tree node;
3120 unsigned i;
3121
3122 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3123 {
3124 slp_tree value
3125 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3126 root: node);
3127 if (value)
3128 {
3129 SLP_TREE_REF_COUNT (value)++;
3130 SLP_TREE_CHILDREN (root)[i] = value;
3131 /* ??? We know the original leaves of the replaced nodes will
3132 be referenced by bst_map, only the permutes created by
3133 pattern matching are not. */
3134 if (SLP_TREE_REF_COUNT (node) == 1)
3135 load_map->remove (k: node);
3136 vect_free_slp_tree (node);
3137 }
3138 }
3139}
3140
3141/* Helper function of vect_match_slp_patterns.
3142
3143 Attempts to match patterns against the SLP tree rooted in *REF_NODE using
3144 VINFO. Patterns are matched in post-order traversal.
3145
3146 Returns true if any pattern matched; on a match the node pointed to by
3147 REF_NODE is replaced in place. */
3148
3149static bool
3150vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3151 slp_tree_to_load_perm_map_t *perm_cache,
3152 slp_compat_nodes_map_t *compat_cache,
3153 hash_set<slp_tree> *visited)
3154{
3155 unsigned i;
3156 slp_tree node = *ref_node;
3157 bool found_p = false;
3158 if (!node || visited->add (k: node))
3159 return false;
3160
3161 slp_tree child;
3162 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3163 found_p |= vect_match_slp_patterns_2 (ref_node: &SLP_TREE_CHILDREN (node)[i],
3164 vinfo, perm_cache, compat_cache,
3165 visited);
3166
3167 for (unsigned x = 0; x < num__slp_patterns; x++)
3168 {
3169 vect_pattern *pattern
3170 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3171 if (pattern)
3172 {
3173 pattern->build (vinfo);
3174 delete pattern;
3175 found_p = true;
3176 }
3177 }
3178
3179 return found_p;
3180}
3181
3182/* Applies pattern matching to the SLP tree of INSTANCE using
3183 vec_info VINFO.
3184
3185 The tree is modified in place; returns true if any pattern matched.
3186 Patterns are tried in order and multiple patterns may match. */
3187
3188static bool
3189vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3190 hash_set<slp_tree> *visited,
3191 slp_tree_to_load_perm_map_t *perm_cache,
3192 slp_compat_nodes_map_t *compat_cache)
3193{
3194 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3195 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3196
3197 if (dump_enabled_p ())
3198 dump_printf_loc (MSG_NOTE, vect_location,
3199 "Analyzing SLP tree %p for patterns\n",
3200 (void *) SLP_INSTANCE_TREE (instance));
3201
3202 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3203 visited);
3204}
3205
3206/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3207 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3208 Return true if we could use IFN_STORE_LANES instead and if that appears
3209 to be the better approach. */
3210
3211static bool
3212vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3213 unsigned int group_size,
3214 unsigned int new_group_size)
3215{
3216 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3217 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3218 if (!vectype)
3219 return false;
3220 /* Allow the split if one of the two new groups would operate on full
3221 vectors *within* rather than across one scalar loop iteration.
3222 This is purely a heuristic, but it should work well for group
3223 sizes of 3 and 4, where the possible splits are:
3224
3225 3->2+1: OK if the vector has exactly two elements
3226 4->2+2: Likewise
3227 4->3+1: Less clear-cut. */
3228 if (multiple_p (a: group_size - new_group_size, b: TYPE_VECTOR_SUBPARTS (node: vectype))
3229 || multiple_p (a: new_group_size, b: TYPE_VECTOR_SUBPARTS (node: vectype)))
3230 return false;
3231 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3232}
3233
3234/* Analyze an SLP instance starting from a group of grouped stores. Call
3235 vect_build_slp_tree to build a tree of packed stmts if possible.
3236 Return FALSE if it's impossible to SLP any stmt in the loop. */
3237
3238static bool
3239vect_analyze_slp_instance (vec_info *vinfo,
3240 scalar_stmts_to_slp_tree_map_t *bst_map,
3241 stmt_vec_info stmt_info, slp_instance_kind kind,
3242 unsigned max_tree_size, unsigned *limit);
3243
3244/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3245 of KIND. Return true if successful. */
3246
3247static bool
3248vect_build_slp_instance (vec_info *vinfo,
3249 slp_instance_kind kind,
3250 vec<stmt_vec_info> &scalar_stmts,
3251 vec<stmt_vec_info> &root_stmt_infos,
3252 vec<tree> &remain,
3253 unsigned max_tree_size, unsigned *limit,
3254 scalar_stmts_to_slp_tree_map_t *bst_map,
3255 /* ??? We need stmt_info for group splitting. */
3256 stmt_vec_info stmt_info_)
3257{
3258 if (kind == slp_inst_kind_ctor)
3259 {
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "Analyzing vectorizable constructor: %G\n",
3263 root_stmt_infos[0]->stmt);
3264 }
3265
3266 if (dump_enabled_p ())
3267 {
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "Starting SLP discovery for\n");
3270 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3271 dump_printf_loc (MSG_NOTE, vect_location,
3272 " %G", scalar_stmts[i]->stmt);
3273 }
3274
3275 /* When a BB reduction doesn't have an even number of lanes
3276 strip it down, treating the remaining lane as scalar.
3277 ??? Selecting the optimal set of lanes to vectorize would be nice
3278 but SLP build for all lanes will fail quickly because we think
3279 we're going to need unrolling. */
3280 if (kind == slp_inst_kind_bb_reduc
3281 && (scalar_stmts.length () & 1))
3282 remain.safe_insert (ix: 0, obj: gimple_get_lhs (scalar_stmts.pop ()->stmt));
3283
3284 /* Build the tree for the SLP instance. */
3285 unsigned int group_size = scalar_stmts.length ();
3286 bool *matches = XALLOCAVEC (bool, group_size);
3287 poly_uint64 max_nunits = 1;
3288 unsigned tree_size = 0;
3289 unsigned i;
3290 slp_tree node = vect_build_slp_tree (vinfo, stmts: scalar_stmts, group_size,
3291 max_nunits: &max_nunits, matches, limit,
3292 tree_size: &tree_size, bst_map);
3293 if (node != NULL)
3294 {
3295 /* Calculate the unrolling factor based on the smallest type. */
3296 poly_uint64 unrolling_factor
3297 = calculate_unrolling_factor (nunits: max_nunits, group_size);
3298
3299 if (maybe_ne (a: unrolling_factor, b: 1U)
3300 && is_a <bb_vec_info> (p: vinfo))
3301 {
3302 unsigned HOST_WIDE_INT const_max_nunits;
3303 if (!max_nunits.is_constant (const_value: &const_max_nunits)
3304 || const_max_nunits > group_size)
3305 {
3306 if (dump_enabled_p ())
3307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3308 "Build SLP failed: store group "
3309 "size not a multiple of the vector size "
3310 "in basic block SLP\n");
3311 vect_free_slp_tree (node);
3312 return false;
3313 }
3314 /* Fatal mismatch. */
3315 if (dump_enabled_p ())
3316 dump_printf_loc (MSG_NOTE, vect_location,
3317 "SLP discovery succeeded but node needs "
3318 "splitting\n");
3319 memset (s: matches, c: true, n: group_size);
3320 matches[group_size / const_max_nunits * const_max_nunits] = false;
3321 vect_free_slp_tree (node);
3322 }
3323 else
3324 {
3325 /* Create a new SLP instance. */
3326 slp_instance new_instance = XNEW (class _slp_instance);
3327 SLP_INSTANCE_TREE (new_instance) = node;
3328 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3329 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3330 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3331 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3332 SLP_INSTANCE_KIND (new_instance) = kind;
3333 new_instance->reduc_phis = NULL;
3334 new_instance->cost_vec = vNULL;
3335 new_instance->subgraph_entries = vNULL;
3336
3337 if (dump_enabled_p ())
3338 dump_printf_loc (MSG_NOTE, vect_location,
3339 "SLP size %u vs. limit %u.\n",
3340 tree_size, max_tree_size);
3341
3342 /* Fixup SLP reduction chains. */
3343 if (kind == slp_inst_kind_reduc_chain)
3344 {
3345 /* If this is a reduction chain with a conversion in front
3346 amend the SLP tree with a node for that. */
3347 gimple *scalar_def
3348 = vect_orig_stmt (stmt_info: scalar_stmts[group_size - 1])->stmt;
3349 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3350 {
3351 /* Get at the conversion stmt - we know it's the single use
3352 of the last stmt of the reduction chain. */
3353 use_operand_p use_p;
3354 bool r = single_imm_use (var: gimple_assign_lhs (gs: scalar_def),
3355 use_p: &use_p, stmt: &scalar_def);
3356 gcc_assert (r);
3357 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3358 next_info = vect_stmt_to_vectorize (stmt_info: next_info);
3359 scalar_stmts = vNULL;
3360 scalar_stmts.create (nelems: group_size);
3361 for (unsigned i = 0; i < group_size; ++i)
3362 scalar_stmts.quick_push (obj: next_info);
3363 slp_tree conv = vect_create_new_slp_node (scalar_stmts, nops: 1);
3364 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3365 SLP_TREE_CHILDREN (conv).quick_push (obj: node);
3366 SLP_INSTANCE_TREE (new_instance) = conv;
3367 /* We also have to fake this conversion stmt as SLP reduction
3368 group so we don't have to mess with too much code
3369 elsewhere. */
3370 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3371 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3372 }
3373 /* Fill the backedge child of the PHI SLP node. The
3374 general matching code cannot find it because the
3375 scalar code does not reflect how we vectorize the
3376 reduction. */
3377 use_operand_p use_p;
3378 imm_use_iterator imm_iter;
3379 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3380 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3381 gimple_get_lhs (scalar_def))
3382 /* There are exactly two non-debug uses, the reduction
3383 PHI and the loop-closed PHI node. */
3384 if (!is_gimple_debug (USE_STMT (use_p))
3385 && gimple_bb (USE_STMT (use_p)) == loop->header)
3386 {
3387 auto_vec<stmt_vec_info, 64> phis (group_size);
3388 stmt_vec_info phi_info
3389 = vinfo->lookup_stmt (USE_STMT (use_p));
3390 for (unsigned i = 0; i < group_size; ++i)
3391 phis.quick_push (obj: phi_info);
3392 slp_tree *phi_node = bst_map->get (k: phis);
3393 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3394 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3395 = SLP_INSTANCE_TREE (new_instance);
3396 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3397 }
3398 }
3399
3400 vinfo->slp_instances.safe_push (obj: new_instance);
3401
3402 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3403 the number of scalar stmts in the root in a few places.
3404 Verify that assumption holds. */
3405 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3406 .length () == group_size);
3407
3408 if (dump_enabled_p ())
3409 {
3410 dump_printf_loc (MSG_NOTE, vect_location,
3411 "Final SLP tree for instance %p:\n",
3412 (void *) new_instance);
3413 vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location,
3414 SLP_INSTANCE_TREE (new_instance));
3415 }
3416
3417 return true;
3418 }
3419 }
3420 else
3421 {
3422 /* Failed to SLP. */
3423 /* Free the allocated memory. */
3424 scalar_stmts.release ();
3425 }
3426
3427 stmt_vec_info stmt_info = stmt_info_;
3428 /* Try to break the group up into pieces. */
3429 if (kind == slp_inst_kind_store)
3430 {
3431 /* ??? We could delay all the actual splitting of store-groups
3432 until after SLP discovery of the original group completed.
3433 Then we can recurse to vect_build_slp_instance directly. */
3434 for (i = 0; i < group_size; i++)
3435 if (!matches[i])
3436 break;
3437
3438 /* For basic block SLP, try to break the group up into multiples of
3439 a vector size. */
3440 if (is_a <bb_vec_info> (p: vinfo)
3441 && (i > 1 && i < group_size))
3442 {
3443 tree scalar_type
3444 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3445 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3446 1 << floor_log2 (x: i));
3447 unsigned HOST_WIDE_INT const_nunits;
3448 if (vectype
3449 && TYPE_VECTOR_SUBPARTS (node: vectype).is_constant (const_value: &const_nunits))
3450 {
3451 /* Split into two groups at the first vector boundary. */
3452 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3453 unsigned group1_size = i & ~(const_nunits - 1);
3454
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_NOTE, vect_location,
3457 "Splitting SLP group at stmt %u\n", i);
3458 stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info,
3459 group1_size);
3460 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3461 kind, max_tree_size,
3462 limit);
3463 /* Split the rest at the failure point and possibly
3464 re-analyze the remaining matching part if it has
3465 at least two lanes. */
3466 if (group1_size < i
3467 && (i + 1 < group_size
3468 || i - group1_size > 1))
3469 {
3470 stmt_vec_info rest2 = rest;
3471 rest = vect_split_slp_store_group (first_vinfo: rest, group1_size: i - group1_size);
3472 if (i - group1_size > 1)
3473 res |= vect_analyze_slp_instance (vinfo, bst_map, stmt_info: rest2,
3474 kind, max_tree_size,
3475 limit);
3476 }
3477 /* Re-analyze the non-matching tail if it has at least
3478 two lanes. */
3479 if (i + 1 < group_size)
3480 res |= vect_analyze_slp_instance (vinfo, bst_map,
3481 stmt_info: rest, kind, max_tree_size,
3482 limit);
3483 return res;
3484 }
3485 }
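/* An illustrative walk-through of the basic-block splitting above, with
   made-up numbers: for group_size == 10, a match failure at i == 6 and
   const_nunits == 4, group1_size is 6 & ~3 == 4.  The group is split into
   a leading part of four stmts, which is re-analyzed.  Since group1_size < i
   and i - group1_size == 2 > 1, the remainder is split again into the two
   remaining matching stmts, which are also re-analyzed, and the non-matching
   tail of four stmts, which is re-analyzed as well because
   i + 1 < group_size.  */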
3486
3487 /* For loop vectorization split into arbitrary pieces of size > 1. */
3488 if (is_a <loop_vec_info> (p: vinfo)
3489 && (i > 1 && i < group_size)
3490 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, new_group_size: i))
3491 {
3492 unsigned group1_size = i;
3493
3494 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_NOTE, vect_location,
3496 "Splitting SLP group at stmt %u\n", i);
3497
3498 stmt_vec_info rest = vect_split_slp_store_group (first_vinfo: stmt_info,
3499 group1_size);
3500 /* Loop vectorization cannot handle gaps in stores, so make sure
3501 the split group appears as strided. */
3502 STMT_VINFO_STRIDED_P (rest) = 1;
3503 DR_GROUP_GAP (rest) = 0;
3504 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3505 DR_GROUP_GAP (stmt_info) = 0;
3506
3507 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3508 kind, max_tree_size, limit);
3509 if (i + 1 < group_size)
3510 res |= vect_analyze_slp_instance (vinfo, bst_map,
3511 stmt_info: rest, kind, max_tree_size, limit);
3512
3513 return res;
3514 }
3515
3516 /* Even though not all stmts of the first vector-sized piece matched, we
3517 might still be able to SLP (some of) the remainder. FORNOW ignore this possibility. */
3518 }
3519
3520 /* Failed to SLP. */
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3523 return false;
3524}
3525
3526
3527/* Analyze an SLP instance starting from a group of grouped stores. Call
3528 vect_build_slp_tree to build a tree of packed stmts if possible.
3529 Return FALSE if it's impossible to SLP any stmt in the loop. */
3530
3531static bool
3532vect_analyze_slp_instance (vec_info *vinfo,
3533 scalar_stmts_to_slp_tree_map_t *bst_map,
3534 stmt_vec_info stmt_info,
3535 slp_instance_kind kind,
3536 unsigned max_tree_size, unsigned *limit)
3537{
3538 unsigned int i;
3539 vec<stmt_vec_info> scalar_stmts;
3540
3541 if (is_a <bb_vec_info> (p: vinfo))
3542 vect_location = stmt_info->stmt;
3543
3544 stmt_vec_info next_info = stmt_info;
3545 if (kind == slp_inst_kind_store)
3546 {
3547 /* Collect the stores and store them in scalar_stmts. */
3548 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3549 while (next_info)
3550 {
3551 scalar_stmts.quick_push (obj: vect_stmt_to_vectorize (stmt_info: next_info));
3552 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3553 }
3554 }
3555 else if (kind == slp_inst_kind_reduc_chain)
3556 {
3557 /* Collect the reduction stmts and store them in scalar_stmts. */
3558 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3559 while (next_info)
3560 {
3561 scalar_stmts.quick_push (obj: vect_stmt_to_vectorize (stmt_info: next_info));
3562 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3563 }
3564 /* Mark the first element of the reduction chain as a reduction to properly
3565 transform the node. In the reduction analysis phase only the last
3566 element of the chain is marked as a reduction. */
3567 STMT_VINFO_DEF_TYPE (stmt_info)
3568 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3569 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3570 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3571 }
3572 else if (kind == slp_inst_kind_reduc_group)
3573 {
3574 /* Collect reduction statements. */
3575 const vec<stmt_vec_info> &reductions
3576 = as_a <loop_vec_info> (p: vinfo)->reductions;
3577 scalar_stmts.create (nelems: reductions.length ());
3578 for (i = 0; reductions.iterate (ix: i, ptr: &next_info); i++)
3579 if ((STMT_VINFO_RELEVANT_P (next_info)
3580 || STMT_VINFO_LIVE_P (next_info))
3581 /* ??? Make sure we didn't skip a conversion around a reduction
3582 path. In that case we'd have to reverse engineer that conversion
3583 stmt following the chain using reduc_idx and from the PHI
3584 using reduc_def. */
3585 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3586 scalar_stmts.quick_push (obj: next_info);
3587 /* If fewer than two were relevant/live there's nothing to SLP. */
3588 if (scalar_stmts.length () < 2)
3589 return false;
3590 }
3591 else
3592 gcc_unreachable ();
3593
3594 vec<stmt_vec_info> roots = vNULL;
3595 vec<tree> remain = vNULL;
3596 /* Build the tree for the SLP instance. */
3597 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3598 root_stmt_infos&: roots, remain,
3599 max_tree_size, limit, bst_map,
3600 stmt_info_: kind == slp_inst_kind_store
3601 ? stmt_info : NULL);
3602
3603 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3604 where we should do store group splitting. */
3605
3606 return res;
3607}
3608
3609/* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3610 trees of packed scalar stmts if SLP is possible. */
3611
3612opt_result
3613vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3614{
3615 unsigned int i;
3616 stmt_vec_info first_element;
3617 slp_instance instance;
3618
3619 DUMP_VECT_SCOPE ("vect_analyze_slp");
3620
3621 unsigned limit = max_tree_size;
3622
3623 scalar_stmts_to_slp_tree_map_t *bst_map
3624 = new scalar_stmts_to_slp_tree_map_t ();
3625
3626 /* Find SLP sequences starting from groups of grouped stores. */
3627 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3628 vect_analyze_slp_instance (vinfo, bst_map, stmt_info: first_element,
3629 kind: slp_inst_kind_store, max_tree_size, limit: &limit);
3630
3631 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo))
3632 {
3633 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3634 {
3635 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3636 if (vect_build_slp_instance (vinfo: bb_vinfo, kind: bb_vinfo->roots[i].kind,
3637 scalar_stmts&: bb_vinfo->roots[i].stmts,
3638 root_stmt_infos&: bb_vinfo->roots[i].roots,
3639 remain&: bb_vinfo->roots[i].remain,
3640 max_tree_size, limit: &limit, bst_map, NULL))
3641 {
3642 bb_vinfo->roots[i].stmts = vNULL;
3643 bb_vinfo->roots[i].roots = vNULL;
3644 bb_vinfo->roots[i].remain = vNULL;
3645 }
3646 }
3647 }
3648
3649 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
3650 {
3651 /* Find SLP sequences starting from reduction chains. */
3652 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3653 if (! STMT_VINFO_RELEVANT_P (first_element)
3654 && ! STMT_VINFO_LIVE_P (first_element))
3655 ;
3656 else if (! vect_analyze_slp_instance (vinfo, bst_map, stmt_info: first_element,
3657 kind: slp_inst_kind_reduc_chain,
3658 max_tree_size, limit: &limit))
3659 {
3660 /* Dissolve reduction chain group. */
3661 stmt_vec_info vinfo = first_element;
3662 stmt_vec_info last = NULL;
3663 while (vinfo)
3664 {
3665 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3666 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3667 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3668 last = vinfo;
3669 vinfo = next;
3670 }
3671 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3672 /* It can be still vectorized as part of an SLP reduction. */
3673 loop_vinfo->reductions.safe_push (obj: last);
3674 }
3675
3676 /* Find SLP sequences starting from groups of reductions. */
3677 if (loop_vinfo->reductions.length () > 1)
3678 vect_analyze_slp_instance (vinfo, bst_map, stmt_info: loop_vinfo->reductions[0],
3679 kind: slp_inst_kind_reduc_group, max_tree_size,
3680 limit: &limit);
3681 }
3682
3683 hash_set<slp_tree> visited_patterns;
3684 slp_tree_to_load_perm_map_t perm_cache;
3685 slp_compat_nodes_map_t compat_cache;
3686
3687 /* See if any patterns can be found in the SLP tree. */
3688 bool pattern_found = false;
3689 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3690 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3691 visited: &visited_patterns, perm_cache: &perm_cache,
3692 compat_cache: &compat_cache);
3693
3694 /* If any were found optimize permutations of loads. */
3695 if (pattern_found)
3696 {
3697 hash_map<slp_tree, slp_tree> load_map;
3698 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3699 {
3700 slp_tree root = SLP_INSTANCE_TREE (instance);
3701 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3702 load_map: &load_map, root);
3703 }
3704 }
3705
3706
3707
3708 /* The map keeps a reference on SLP nodes built, release that. */
3709 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3710 it != bst_map->end (); ++it)
3711 if ((*it).second)
3712 vect_free_slp_tree (node: (*it).second);
3713 delete bst_map;
3714
3715 if (pattern_found && dump_enabled_p ())
3716 {
3717 dump_printf_loc (MSG_NOTE, vect_location,
3718 "Pattern matched SLP tree\n");
3719 hash_set<slp_tree> visited;
3720 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3721 vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location,
3722 SLP_INSTANCE_TREE (instance), visited);
3723 }
3724
3725 return opt_result::success ();
3726}
3727
3728/* Estimates the cost of inserting layout changes into the SLP graph.
3729 It can also say that the insertion is impossible. */
3730
3731struct slpg_layout_cost
3732{
3733 slpg_layout_cost () = default;
3734 slpg_layout_cost (sreal, bool);
3735
3736 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3737 bool is_possible () const { return depth != sreal::max (); }
3738
3739 bool operator== (const slpg_layout_cost &) const;
3740 bool operator!= (const slpg_layout_cost &) const;
3741
3742 bool is_better_than (const slpg_layout_cost &, bool) const;
3743
3744 void add_parallel_cost (const slpg_layout_cost &);
3745 void add_serial_cost (const slpg_layout_cost &);
3746 void split (unsigned int);
3747
3748 /* The longest sequence of layout changes needed during any traversal
3749 of the partition dag, weighted by execution frequency.
3750
3751 This is the most important metric when optimizing for speed, since
3752 it helps to ensure that we keep the number of operations on
3753 critical paths to a minimum. */
3754 sreal depth = 0;
3755
3756 /* An estimate of the total number of operations needed. It is weighted by
3757 execution frequency when optimizing for speed but not when optimizing for
3758 size. In order to avoid double-counting, a node with a fanout of N will
3759 distribute 1/N of its total cost to each successor.
3760
3761 This is the most important metric when optimizing for size, since
3762 it helps to keep the total number of operations to a minimum. */
3763 sreal total = 0;
3764};
3765
3766/* Construct costs for a node with weight WEIGHT. A higher weight
3767 indicates more frequent execution. IS_FOR_SIZE is true if we are
3768 optimizing for size rather than speed. */
3769
3770slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3771 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3772{
3773}
3774
3775bool
3776slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3777{
3778 return depth == other.depth && total == other.total;
3779}
3780
3781bool
3782slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3783{
3784 return !operator== (other);
3785}
3786
3787/* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3788 true if we are optimizing for size rather than speed. */
3789
3790bool
3791slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3792 bool is_for_size) const
3793{
3794 if (is_for_size)
3795 {
3796 if (total != other.total)
3797 return total < other.total;
3798 return depth < other.depth;
3799 }
3800 else
3801 {
3802 if (depth != other.depth)
3803 return depth < other.depth;
3804 return total < other.total;
3805 }
3806}
3807
3808/* Increase the costs to account for something with cost INPUT_COST
3809 happening in parallel with the current costs. */
3810
3811void
3812slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3813{
3814 depth = std::max (a: depth, b: input_cost.depth);
3815 total += input_cost.total;
3816}
3817
3818/* Increase the costs to account for something with cost INPUT_COST
3819 happening in series with the current costs. */
3820
3821void
3822slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3823{
3824 depth += other.depth;
3825 total += other.total;
3826}
3827
3828/* Split the total cost among TIMES successors or predecessors. */
3829
3830void
3831slpg_layout_cost::split (unsigned int times)
3832{
3833 if (times > 1)
3834 total /= times;
3835}
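/* A minimal made-up example of how the two metrics combine (the numbers
   are purely illustrative, not derived from any target cost model):

     slpg_layout_cost a (sreal (2), false);	   depth 2, total 2
     slpg_layout_cost b (sreal (3), false);	   depth 3, total 3
     a.add_parallel_cost (b);			   depth 3, total 5
     a.add_serial_cost ({ sreal (1), false });	   depth 4, total 6
     a.split (2);				   depth 4, total 3

   Parallel accumulation takes the maximum depth but sums the totals,
   serial accumulation sums both, and splitting distributes the total
   (but not the depth) among the successors.  */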
3836
3837/* Information about one node in the SLP graph, for use during
3838 vect_optimize_slp_pass. */
3839
3840struct slpg_vertex
3841{
3842 slpg_vertex (slp_tree node_) : node (node_) {}
3843
3844 /* The node itself. */
3845 slp_tree node;
3846
3847 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3848 partitions are flexible; they can have whichever layout consumers
3849 want them to have. */
3850 int partition = -1;
3851
3852 /* The number of nodes that directly use the result of this one
3853 (i.e. the number of nodes that count this one as a child). */
3854 unsigned int out_degree = 0;
3855
3856 /* The execution frequency of the node. */
3857 sreal weight = 0;
3858
3859 /* The total execution frequency of all nodes that directly use the
3860 result of this one. */
3861 sreal out_weight = 0;
3862};
3863
3864/* Information about one partition of the SLP graph, for use during
3865 vect_optimize_slp_pass. */
3866
3867struct slpg_partition_info
3868{
3869 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3870 of m_partitioned_nodes. */
3871 unsigned int node_begin = 0;
3872 unsigned int node_end = 0;
3873
3874 /* Which layout we've chosen to use for this partition, or -1 if
3875 we haven't picked one yet. */
3876 int layout = -1;
3877
3878 /* The number of predecessors and successors in the partition dag.
3879 The predecessors always have lower partition numbers and the
3880 successors always have higher partition numbers.
3881
3882 Note that the directions of these edges are not necessarily the
3883 same as in the data flow graph. For example, if an SCC has separate
3884 partitions for an inner loop and an outer loop, the inner loop's
3885 partition will have at least two incoming edges from the outer loop's
3886 partition: one for a live-in value and one for a live-out value.
3887 In data flow terms, one of these edges would also be from the outer loop
3888 to the inner loop, but the other would be in the opposite direction. */
3889 unsigned int in_degree = 0;
3890 unsigned int out_degree = 0;
3891};
3892
3893/* Information about the costs of using a particular layout for a
3894 particular partition. It can also say that the combination is
3895 impossible. */
3896
3897struct slpg_partition_layout_costs
3898{
3899 bool is_possible () const { return internal_cost.is_possible (); }
3900 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3901
3902 /* The costs inherited from predecessor partitions. */
3903 slpg_layout_cost in_cost;
3904
3905 /* The inherent cost of the layout within the node itself. For example,
3906 this is nonzero for a load if choosing a particular layout would require
3907 the load to permute the loaded elements. It is nonzero for a
3908 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3909 to full-vector moves. */
3910 slpg_layout_cost internal_cost;
3911
3912 /* The costs inherited from successor partitions. */
3913 slpg_layout_cost out_cost;
3914};
3915
3916/* This class tries to optimize the layout of vectors in order to avoid
3917 unnecessary shuffling. At the moment, the set of possible layouts is
3918 restricted to bijective permutations.
3919
3920 The goal of the pass depends on whether we're optimizing for size or
3921 for speed. When optimizing for size, the goal is to reduce the overall
3922 number of layout changes (including layout changes implied by things
3923 like load permutations). When optimizing for speed, the goal is to
3924 reduce the maximum latency attributable to layout changes on any
3925 non-cyclical path through the data flow graph.
3926
3927 For example, when optimizing a loop nest for speed, we will prefer
3928 to make layout changes outside of a loop rather than inside of a loop,
3929 and will prefer to make layout changes in parallel rather than serially,
3930 even if that increases the overall number of layout changes.
3931
3932 The high-level procedure is:
3933
3934 (1) Build a graph in which edges go from uses (parents) to definitions
3935 (children).
3936
3937 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3938
3939 (3) When optimizing for speed, partition the nodes in each SCC based
3940 on their containing cfg loop. When optimizing for size, treat
3941 each SCC as a single partition.
3942
3943 This gives us a dag of partitions. The goal is now to assign a
3944 layout to each partition.
3945
3946 (4) Construct a set of vector layouts that are worth considering.
3947 Record which nodes must keep their current layout.
3948
3949 (5) Perform a forward walk over the partition dag (from loads to stores)
3950 accumulating the "forward" cost of using each layout. When visiting
3951 each partition, assign a tentative choice of layout to the partition
3952 and use that choice when calculating the cost of using a different
3953 layout in successor partitions.
3954
3955 (6) Perform a backward walk over the partition dag (from stores to loads),
3956 accumulating the "backward" cost of using each layout. When visiting
3957 each partition, make a final choice of layout for that partition based
3958 on the accumulated forward costs (from (5)) and backward costs
3959 (from (6)).
3960
3961 (7) Apply the chosen layouts to the SLP graph.
3962
3963 For example, consider the SLP statements:
3964
3965 S1: a_1 = load
3966 loop:
3967 S2: a_2 = PHI<a_1, a_3>
3968 S3: b_1 = load
3969 S4: a_3 = a_2 + b_1
3970 exit:
3971 S5: a_4 = PHI<a_3>
3972 S6: store a_4
3973
3974 S2 and S4 form an SCC and are part of the same loop. Every other
3975 statement is in a singleton SCC. In this example there is a one-to-one
3976 mapping between SCCs and partitions, and the partition dag looks like this:
3977
3978 S1 S3
3979 \ /
3980 S2+S4
3981 |
3982 S5
3983 |
3984 S6
3985
3986 S2, S3 and S4 will have a higher execution frequency than the other
3987 statements, so when optimizing for speed, the goal is to avoid any
3988 layout changes:
3989
3990 - within S3
3991 - within S2+S4
3992 - on the S3->S2+S4 edge
3993
3994 For example, if S3 was originally a reversing load, the goal of the
3995 pass is to make it an unreversed load and change the layout on the
3996 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3997 on S1->S2+S4 and S5->S6 would also be acceptable.)
3998
3999 The difference between SCCs and partitions becomes important if we
4000 add an outer loop:
4001
4002 S1: a_1 = ...
4003 loop1:
4004 S2: a_2 = PHI<a_1, a_6>
4005 S3: b_1 = load
4006 S4: a_3 = a_2 + b_1
4007 loop2:
4008 S5: a_4 = PHI<a_3, a_5>
4009 S6: c_1 = load
4010 S7: a_5 = a_4 + c_1
4011 exit2:
4012 S8: a_6 = PHI<a_5>
4013 S9: store a_6
4014 exit1:
4015
4016 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4017 for speed, we usually do not want restrictions in the outer loop to "infect"
4018 the decision for the inner loop. For example, if an outer-loop node
4019 in the SCC contains a statement with a fixed layout, that should not
4020 prevent the inner loop from using a different layout. Conversely,
4021 the inner loop should not dictate a layout to the outer loop: if the
4022 outer loop does a lot of computation, then it may not be efficient to
4023 do all of that computation in the inner loop's preferred layout.
4024
4025 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4026 and S5+S7 (inner). We also try to arrange partitions so that:
4027
4028 - the partition for an outer loop comes before the partition for
4029 an inner loop
4030
4031 - if a sibling loop A dominates a sibling loop B, A's partition
4032 comes before B's
4033
4034 This gives the following partition dag for the example above:
4035
4036 S1 S3
4037 \ /
4038 S2+S4+S8 S6
4039 | \\ /
4040 | S5+S7
4041 |
4042 S9
4043
4044 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4045 one for a reversal of the edge S7->S8.
4046
4047 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4048 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4049 preferred layout against the cost of changing the layout on entry to the
4050 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4051
4052 Although this works well when optimizing for speed, it has the downside
4053 when optimizing for size that the choice of layout for S5+S7 is completely
4054 independent of S9, which lessens the chance of reducing the overall number
4055 of permutations. We therefore do not partition SCCs when optimizing
4056 for size.
4057
4058 To give a concrete example of the difference between optimizing
4059 for size and speed, consider:
4060
4061 a[0] = (b[1] << c[3]) - d[1];
4062 a[1] = (b[0] << c[2]) - d[0];
4063 a[2] = (b[3] << c[1]) - d[3];
4064 a[3] = (b[2] << c[0]) - d[2];
4065
4066 There are three different layouts here: one for a, one for b and d,
4067 and one for c. When optimizing for speed it is better to permute each
4068 of b, c and d into the order required by a, since those permutations
4069 happen in parallel. But when optimizing for size, it is better to:
4070
4071 - permute c into the same order as b
4072 - do the arithmetic
4073 - permute the result into the order required by a
4074
4075 This gives 2 permutations rather than 3. */
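/* Expressed in the depth/total metrics used by slpg_layout_cost, and
   assuming for illustration that each layout change has unit cost and
   equal weight, the speed-optimized choice above performs three
   permutations in parallel (depth 1, total 3) while the size-optimized
   choice performs two permutations in series (depth 2, total 2).  */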
4076
4077class vect_optimize_slp_pass
4078{
4079public:
4080 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4081 void run ();
4082
4083private:
4084 /* Graph building. */
4085 struct loop *containing_loop (slp_tree);
4086 bool is_cfg_latch_edge (graph_edge *);
4087 void build_vertices (hash_set<slp_tree> &, slp_tree);
4088 void build_vertices ();
4089 void build_graph ();
4090
4091 /* Partitioning. */
4092 void create_partitions ();
4093 template<typename T> void for_each_partition_edge (unsigned int, T);
4094
4095 /* Layout selection. */
4096 bool is_compatible_layout (slp_tree, unsigned int);
4097 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4098 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4099 unsigned int);
4100 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4101 int, unsigned int);
4102 int internal_node_cost (slp_tree, int, unsigned int);
4103 void start_choosing_layouts ();
4104
4105 /* Cost propagation. */
4106 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4107 unsigned int, unsigned int);
4108 slpg_layout_cost total_in_cost (unsigned int);
4109 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4110 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4111 void forward_pass ();
4112 void backward_pass ();
4113
4114 /* Rematerialization. */
4115 slp_tree get_result_with_layout (slp_tree, unsigned int);
4116 void materialize ();
4117
4118 /* Clean-up. */
4119 void remove_redundant_permutations ();
4120
4121 void dump ();
4122
4123 vec_info *m_vinfo;
4124
4125 /* True if we should optimize the graph for size, false if we should
4126 optimize it for speed. (It wouldn't be easy to make this decision
4127 more locally.) */
4128 bool m_optimize_size;
4129
4130 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4131 In other words, a node's predecessors are its slp_tree parents and
4132 a node's successors are its slp_tree children. */
4133 graph *m_slpg = nullptr;
4134
4135 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4136 auto_vec<slpg_vertex> m_vertices;
4137
4138 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4139 and loads. */
4140 auto_vec<int> m_leafs;
4141
4142 /* This array has one entry for every vector layout that we're considering.
4143 Element 0 is null and indicates "no change". Other entries describe
4144 permutations that are inherent in the current graph and that we would
4145 like to reverse if possible.
4146
4147 For example, a permutation { 1, 2, 3, 0 } means that something has
4148 effectively been permuted in that way, such as a load group
4149 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4150 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4151 in order to put things "back" in order. */
4152 auto_vec<vec<unsigned> > m_perms;
4153
4154 /* A partitioning of the nodes for which a layout must be chosen.
4155 Each partition represents an <SCC, cfg loop> pair; that is,
4156 nodes in different SCCs belong to different partitions, and nodes
4157 within an SCC can be further partitioned according to a containing
4158 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4159
4160 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4161 from leaves (such as loads) to roots (such as stores).
4162
4163 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4164 auto_vec<slpg_partition_info> m_partitions;
4165
4166 /* The list of all nodes for which a layout must be chosen. Nodes for
4167 partition P come before the nodes for partition P+1. Nodes within a
4168 partition are in reverse postorder. */
4169 auto_vec<unsigned int> m_partitioned_nodes;
4170
4171 /* Index P * num-layouts + L contains the cost of using layout L
4172 for partition P. */
4173 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4174
4175 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4176 original output of node N adjusted to have layout L. */
4177 auto_vec<slp_tree> m_node_layouts;
4178};
4179
4180/* Fill the vertices and leafs vector with all nodes in the SLP graph.
4181 Also record whether we should optimize anything for speed rather
4182 than size. */
4183
4184void
4185vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4186 slp_tree node)
4187{
4188 unsigned i;
4189 slp_tree child;
4190
4191 if (visited.add (k: node))
4192 return;
4193
4194 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4195 {
4196 basic_block bb = gimple_bb (g: vect_orig_stmt (stmt_info: rep)->stmt);
4197 if (optimize_bb_for_speed_p (bb))
4198 m_optimize_size = false;
4199 }
4200
4201 node->vertex = m_vertices.length ();
4202 m_vertices.safe_push (obj: slpg_vertex (node));
4203
4204 bool leaf = true;
4205 bool force_leaf = false;
4206 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4207 if (child)
4208 {
4209 leaf = false;
4210 build_vertices (visited, node: child);
4211 }
4212 else
4213 force_leaf = true;
4214 /* Since SLP discovery works along use-def edges, all cycles have an
4215 entry - but there's the exception of cycles where we do not handle
4216 the entry explicitly (but with a NULL SLP node), like some reductions
4217 and inductions. Force those SLP PHIs to act as leaves to make them
4218 backwards reachable. */
4219 if (leaf || force_leaf)
4220 m_leafs.safe_push (obj: node->vertex);
4221}
4222
4223/* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4224
4225void
4226vect_optimize_slp_pass::build_vertices ()
4227{
4228 hash_set<slp_tree> visited;
4229 unsigned i;
4230 slp_instance instance;
4231 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4232 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4233}
4234
4235/* Apply the bijective permutation PERM to VEC, or its reverse if REVERSE is true.  */
4236
4237template <class T>
4238static void
4239vect_slp_permute (vec<unsigned> perm,
4240 vec<T> &vec, bool reverse)
4241{
4242 auto_vec<T, 64> saved;
4243 saved.create (vec.length ());
4244 for (unsigned i = 0; i < vec.length (); ++i)
4245 saved.quick_push (vec[i]);
4246
4247 if (reverse)
4248 {
4249 for (unsigned i = 0; i < vec.length (); ++i)
4250 vec[perm[i]] = saved[i];
4251 for (unsigned i = 0; i < vec.length (); ++i)
4252 gcc_assert (vec[perm[i]] == saved[i]);
4253 }
4254 else
4255 {
4256 for (unsigned i = 0; i < vec.length (); ++i)
4257 vec[i] = saved[perm[i]];
4258 for (unsigned i = 0; i < vec.length (); ++i)
4259 gcc_assert (vec[i] == saved[perm[i]]);
4260 }
4261}
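/* As an example of vect_slp_permute: with PERM == { 1, 2, 3, 0 } and
   VEC == { a, b, c, d }, a forward application (REVERSE == false)
   yields { b, c, d, a }, i.e. element I of the result is the old
   element PERM[I], while a reverse application (REVERSE == true)
   yields { d, a, b, c }, placing old element I at position PERM[I].  */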
4262
4263/* Return the cfg loop that contains NODE. */
4264
4265struct loop *
4266vect_optimize_slp_pass::containing_loop (slp_tree node)
4267{
4268 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4269 if (!rep)
4270 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4271 return gimple_bb (g: vect_orig_stmt (stmt_info: rep)->stmt)->loop_father;
4272}
4273
4274/* Return true if UD (an edge from a use to a definition) is associated
4275 with a loop latch edge in the cfg. */
4276
4277bool
4278vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4279{
4280 slp_tree use = m_vertices[ud->src].node;
4281 slp_tree def = m_vertices[ud->dest].node;
4282 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4283 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4284 return false;
4285
4286 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4287 return (is_a<gphi *> (p: use_rep->stmt)
4288 && bb_loop_header_p (gimple_bb (g: use_rep->stmt))
4289 && containing_loop (node: def) == containing_loop (node: use));
4290}
4291
4292/* Build the graph. Mark edges that correspond to cfg loop latch edges with
4293 a nonnull data field. */
4294
4295void
4296vect_optimize_slp_pass::build_graph ()
4297{
4298 m_optimize_size = true;
4299 build_vertices ();
4300
4301 m_slpg = new_graph (m_vertices.length ());
4302 for (slpg_vertex &v : m_vertices)
4303 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4304 if (child)
4305 {
4306 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4307 if (is_cfg_latch_edge (ud))
4308 ud->data = this;
4309 }
4310}
4311
4312/* Return true if E corresponds to a loop latch edge in the cfg. */
4313
4314static bool
4315skip_cfg_latch_edges (graph_edge *e)
4316{
4317 return e->data;
4318}
4319
4320/* Create the node partitions. */
4321
4322void
4323vect_optimize_slp_pass::create_partitions ()
4324{
4325 /* Calculate a postorder of the graph, ignoring edges that correspond
4326 to natural latch edges in the cfg. Reading the vector from the end
4327 to the beginning gives the reverse postorder. */
4328 auto_vec<int> initial_rpo;
4329 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4330 false, NULL, skip_cfg_latch_edges);
4331 gcc_assert (initial_rpo.length () == m_vertices.length ());
4332
4333 /* Calculate the strongly connected components of the graph. */
4334 auto_vec<int> scc_grouping;
4335 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4336
4337 /* Create a new index order in which all nodes from the same SCC are
4338 consecutive. Use scc_pos to record the index of the first node in
4339 each SCC. */
4340 auto_vec<unsigned int> scc_pos (num_sccs);
4341 int last_component = -1;
4342 unsigned int node_count = 0;
4343 for (unsigned int node_i : scc_grouping)
4344 {
4345 if (last_component != m_slpg->vertices[node_i].component)
4346 {
4347 last_component = m_slpg->vertices[node_i].component;
4348 gcc_assert (last_component == int (scc_pos.length ()));
4349 scc_pos.quick_push (obj: node_count);
4350 }
4351 node_count += 1;
4352 }
4353 gcc_assert (node_count == initial_rpo.length ()
4354 && last_component + 1 == int (num_sccs));
4355
4356 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4357 inside each SCC following the RPO we calculated above. The fact that
4358 we ignored natural latch edges when calculating the RPO should ensure
4359 that, for natural loop nests:
4360
4361 - the first node that we encounter in a cfg loop is the loop header phi
4362 - the loop header phis are in dominance order
4363
4364 Arranging for this is an optimization (see below) rather than a
4365 correctness issue. Unnatural loops with a tangled mess of backedges
4366 will still work correctly, but might give poorer results.
4367
4368 Also update scc_pos so that it gives 1 + the index of the last node
4369 in the SCC. */
4370 m_partitioned_nodes.safe_grow (len: node_count);
4371 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4372 {
4373 unsigned int node_i = initial_rpo[old_i];
4374 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4375 m_partitioned_nodes[new_i] = node_i;
4376 }
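/* For example (illustrative numbers): with two SCCs of sizes 2 and 3,
   scc_pos starts out as { 0, 2 } and the loop above advances it to
   { 2, 5 }, i.e. 1 + the index of the last node in each SCC, as
   described above.  */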
4377
4378 /* When optimizing for speed, partition each SCC based on the containing
4379 cfg loop. The order we constructed above should ensure that, for natural
4380 cfg loops, we'll create sub-SCC partitions for outer loops before
4381 the corresponding sub-SCC partitions for inner loops. Similarly,
4382 when one sibling loop A dominates another sibling loop B, we should
4383 create a sub-SCC partition for A before a sub-SCC partition for B.
4384
4385 As above, nothing depends for correctness on whether this achieves
4386 a natural nesting, but we should get better results when it does. */
4387 m_partitions.reserve (nelems: m_vertices.length ());
4388 unsigned int next_partition_i = 0;
4389 hash_map<struct loop *, int> loop_partitions;
4390 unsigned int rpo_begin = 0;
4391 unsigned int num_partitioned_nodes = 0;
4392 for (unsigned int rpo_end : scc_pos)
4393 {
4394 loop_partitions.empty ();
4395 unsigned int partition_i = next_partition_i;
4396 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4397 {
4398 /* Handle externals and constants optimistically throughout.
4399 But treat existing vectors as fixed since we do not handle
4400 permuting them. */
4401 unsigned int node_i = m_partitioned_nodes[rpo_i];
4402 auto &vertex = m_vertices[node_i];
4403 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4404 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4405 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4406 vertex.partition = -1;
4407 else
4408 {
4409 bool existed;
4410 if (m_optimize_size)
4411 existed = next_partition_i > partition_i;
4412 else
4413 {
4414 struct loop *loop = containing_loop (node: vertex.node);
4415 auto &entry = loop_partitions.get_or_insert (k: loop, existed: &existed);
4416 if (!existed)
4417 entry = next_partition_i;
4418 partition_i = entry;
4419 }
4420 if (!existed)
4421 {
4422 m_partitions.quick_push (obj: slpg_partition_info ());
4423 next_partition_i += 1;
4424 }
4425 vertex.partition = partition_i;
4426 num_partitioned_nodes += 1;
4427 m_partitions[partition_i].node_end += 1;
4428 }
4429 }
4430 rpo_begin = rpo_end;
4431 }
4432
4433 /* Assign ranges of consecutive node indices to each partition,
4434 in partition order. Start with node_end being the same as
4435 node_begin so that the next loop can use it as a counter. */
4436 unsigned int node_begin = 0;
4437 for (auto &partition : m_partitions)
4438 {
4439 partition.node_begin = node_begin;
4440 node_begin += partition.node_end;
4441 partition.node_end = partition.node_begin;
4442 }
4443 gcc_assert (node_begin == num_partitioned_nodes);
4444
4445 /* Finally build the list of nodes in partition order. */
4446 m_partitioned_nodes.truncate (size: num_partitioned_nodes);
4447 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4448 {
4449 int partition_i = m_vertices[node_i].partition;
4450 if (partition_i >= 0)
4451 {
4452 unsigned int order_i = m_partitions[partition_i].node_end++;
4453 m_partitioned_nodes[order_i] = node_i;
4454 }
4455 }
4456}
4457
4458/* Look for edges from earlier partitions into node NODE_I and edges from
4459 node NODE_I into later partitions. Call:
4460
4461 FN (ud, other_node_i)
4462
4463 for each such use-to-def edge ud, where other_node_i is the node at the
4464 other end of the edge. */
4465
4466template<typename T>
4467void
4468vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4469{
4470 int partition_i = m_vertices[node_i].partition;
4471 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4472 pred; pred = pred->pred_next)
4473 {
4474 int src_partition_i = m_vertices[pred->src].partition;
4475 if (src_partition_i >= 0 && src_partition_i != partition_i)
4476 fn (pred, pred->src);
4477 }
4478 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4479 succ; succ = succ->succ_next)
4480 {
4481 int dest_partition_i = m_vertices[succ->dest].partition;
4482 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4483 fn (succ, succ->dest);
4484 }
4485}
4486
4487/* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4488 that NODE would operate on. This test is independent of NODE's actual
4489 operation. */
4490
4491bool
4492vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4493 unsigned int layout_i)
4494{
4495 if (layout_i == 0)
4496 return true;
4497
4498 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4499 return false;
4500
4501 return true;
4502}
4503
4504/* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4505 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4506 layouts is incompatible with NODE or if the change is not possible for
4507 some other reason.
4508
4509 The properties taken from NODE include the number of lanes and the
4510 vector type. The actual operation doesn't matter. */
4511
4512int
4513vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4514 unsigned int from_layout_i,
4515 unsigned int to_layout_i)
4516{
4517 if (!is_compatible_layout (node, layout_i: from_layout_i)
4518 || !is_compatible_layout (node, layout_i: to_layout_i))
4519 return -1;
4520
4521 if (from_layout_i == to_layout_i)
4522 return 0;
4523
4524 auto_vec<slp_tree, 1> children (1);
4525 children.quick_push (obj: node);
4526 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4527 if (from_layout_i > 0)
4528 for (unsigned int i : m_perms[from_layout_i])
4529 perm.quick_push (obj: { 0, i });
4530 else
4531 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4532 perm.quick_push (obj: { 0, i });
4533 if (to_layout_i > 0)
4534 vect_slp_permute (perm: m_perms[to_layout_i], vec&: perm, reverse: true);
4535 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4536 children, false);
4537 if (count >= 0)
4538 return MAX (count, 1);
4539
4540 /* ??? In principle we could try changing via layout 0, giving two
4541 layout changes rather than 1. Doing that would require
4542 corresponding support in get_result_with_layout. */
4543 return -1;
4544}
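/* A concrete (hypothetical) example: changing from a layout whose
   permutation is { 1, 2, 3, 0 } to layout 0 builds the single-input
   lane permutation { (0,1), (0,2), (0,3), (0,0) } and asks
   vectorizable_slp_permutation_1 whether the target can perform it;
   if so, the returned cost is the number of permute operations
   needed, but at least 1.  */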
4545
4546/* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4547
4548inline slpg_partition_layout_costs &
4549vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4550 unsigned int layout_i)
4551{
4552 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4553}
4554
4555/* Change PERM in one of two ways:
4556
4557 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4558 chosen for child I of NODE.
4559
4560 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4561
4562 In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
4563
4564void
4565vect_optimize_slp_pass::
4566change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4567 int in_layout_i, unsigned int out_layout_i)
4568{
4569 for (auto &entry : perm)
4570 {
4571 int this_in_layout_i = in_layout_i;
4572 if (this_in_layout_i < 0)
4573 {
4574 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4575 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4576 this_in_layout_i = m_partitions[in_partition_i].layout;
4577 }
4578 if (this_in_layout_i > 0)
4579 entry.second = m_perms[this_in_layout_i][entry.second];
4580 }
4581 if (out_layout_i > 0)
4582 vect_slp_permute (perm: m_perms[out_layout_i], vec&: perm, reverse: true);
4583}
4584
4585/* Check whether the target allows NODE to be rearranged so that the node's
4586 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4587 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4588
4589 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4590 NODE can adapt to the layout changes that have (perhaps provisionally)
4591 been chosen for NODE's children, so that no extra permutations are
4592 needed on either the input or the output of NODE.
4593
4594 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4595 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4596
4597 IN_LAYOUT_I has no meaning for other types of node.
4598
4599 Keeping the node as-is is always valid. If the target doesn't appear
4600 to support the node as-is, but might realistically support other layouts,
4601 then layout 0 instead has the cost of a worst-case permutation. On the
4602 one hand, this ensures that every node has at least one valid layout,
4603 avoiding what would otherwise be an awkward special case. On the other,
4604 it still encourages the pass to change an invalid pre-existing layout
4605 choice into a valid one. */
4606
4607int
4608vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4609 unsigned int out_layout_i)
4610{
4611 const int fallback_cost = 1;
4612
4613 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4614 {
4615 auto_lane_permutation_t tmp_perm;
4616 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4617
4618 /* Check that the child nodes support the chosen layout. Checking
4619 the first child is enough, since any second child would have the
4620 same shape. */
4621 auto first_child = SLP_TREE_CHILDREN (node)[0];
4622 if (in_layout_i > 0
4623 && !is_compatible_layout (node: first_child, layout_i: in_layout_i))
4624 return -1;
4625
4626 change_vec_perm_layout (node, perm&: tmp_perm, in_layout_i, out_layout_i);
4627 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4628 node, tmp_perm,
4629 SLP_TREE_CHILDREN (node),
4630 false);
4631 if (count < 0)
4632 {
4633 if (in_layout_i == 0 && out_layout_i == 0)
4634 {
4635 /* Use the fallback cost if the node could in principle support
4636 some nonzero layout for both the inputs and the outputs.
4637 Otherwise assume that the node will be rejected later
4638 and rebuilt from scalars. */
4639 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4640 return fallback_cost;
4641 return 0;
4642 }
4643 return -1;
4644 }
4645
4646 /* We currently have no way of telling whether the new layout is cheaper
4647 or more expensive than the old one. But at least in principle,
4648 it should be worth making zero permutations (whole-vector shuffles)
4649 cheaper than real permutations, in case the pass is able to remove
4650 the latter. */
4651 return count == 0 ? 0 : 1;
4652 }
4653
4654 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4655 if (rep
4656 && STMT_VINFO_DATA_REF (rep)
4657 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4658 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4659 {
4660 auto_load_permutation_t tmp_perm;
4661 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4662 if (out_layout_i > 0)
4663 vect_slp_permute (perm: m_perms[out_layout_i], vec&: tmp_perm, reverse: true);
4664
4665 poly_uint64 vf = 1;
4666 if (auto loop_vinfo = dyn_cast<loop_vec_info> (p: m_vinfo))
4667 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4668 unsigned int n_perms;
4669 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4670 nullptr, vf, true, false, &n_perms))
4671 {
4672 auto rep = SLP_TREE_REPRESENTATIVE (node);
4673 if (out_layout_i == 0)
4674 {
4675 /* Use the fallback cost if the load is an N-to-N permutation.
4676 Otherwise assume that the node will be rejected later
4677 and rebuilt from scalars. */
4678 if (STMT_VINFO_GROUPED_ACCESS (rep)
4679 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4680 == SLP_TREE_LANES (node)))
4681 return fallback_cost;
4682 return 0;
4683 }
4684 return -1;
4685 }
4686
4687 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4688 return n_perms == 0 ? 0 : 1;
4689 }
4690
4691 return 0;
4692}
4693
4694/* Decide which element layouts we should consider using. Calculate the
4695 weights associated with inserting layout changes on partition edges.
4696 Also mark partitions that cannot change layout, by setting their
4697 layout to zero. */
4698
4699void
4700vect_optimize_slp_pass::start_choosing_layouts ()
4701{
4702 /* Used to assign unique permutation indices. */
4703 using perm_hash = unbounded_hashmap_traits<
4704 vec_free_hash_base<int_hash_base<unsigned>>,
4705 int_hash<int, -1, -2>
4706 >;
4707 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4708
4709 /* Layout 0 is "no change". */
4710 m_perms.safe_push (obj: vNULL);
4711
4712 /* Create layouts from existing permutations. */
4713 auto_load_permutation_t tmp_perm;
4714 for (unsigned int node_i : m_partitioned_nodes)
4715 {
4716 /* Leaves also double as entries to the reverse graph. Allow the
4717 layout of those to be changed. */
4718 auto &vertex = m_vertices[node_i];
4719 auto &partition = m_partitions[vertex.partition];
4720 if (!m_slpg->vertices[node_i].succ)
4721 partition.layout = 0;
4722
4723 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4724 slp_tree node = vertex.node;
4725 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4726 slp_tree child;
4727 unsigned HOST_WIDE_INT imin, imax = 0;
4728 bool any_permute = false;
4729 tmp_perm.truncate (size: 0);
4730 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4731 {
4732 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4733 unpermuted, record a layout that reverses this permutation.
4734
4735 We would need more work to cope with loads that are internally
4736 permuted and also have inputs (such as masks for
4737 IFN_MASK_LOADs). */
4738 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4739 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4740 {
4741 partition.layout = -1;
4742 continue;
4743 }
4744 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4745 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4746 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4747 }
4748 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4749 && SLP_TREE_CHILDREN (node).length () == 1
4750 && (child = SLP_TREE_CHILDREN (node)[0])
4751 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4752 .is_constant (const_value: &imin)))
4753 {
4754 /* If the child has the same vector size as this node,
4755 reversing the permutation can make the permutation a no-op.
4756 In other cases it can change a true permutation into a
4757 full-vector extract. */
4758 tmp_perm.reserve (SLP_TREE_LANES (node));
4759 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4760 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4761 }
4762 else
4763 continue;
4764
4765 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4766 {
4767 unsigned idx = tmp_perm[j];
4768 imin = MIN (imin, idx);
4769 imax = MAX (imax, idx);
4770 if (idx - tmp_perm[0] != j)
4771 any_permute = true;
4772 }
4773 /* If the span doesn't match we'd disrupt the VF computation; avoid
4774 that for now. */
4775 if (imax - imin + 1 != SLP_TREE_LANES (node))
4776 continue;
4777 /* If there's no permute there's no need to split one out. In this case
4778 we can consider turning a load into a permuted load, if that
4779 turns out to be cheaper than alternatives. */
4780 if (!any_permute)
4781 {
4782 partition.layout = -1;
4783 continue;
4784 }
4785
4786 /* For now only handle true permutes, like
4787 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4788 when permuting constants and invariants, keeping the permute
4789 bijective. */
4790 auto_sbitmap load_index (SLP_TREE_LANES (node));
4791 bitmap_clear (load_index);
4792 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4793 bitmap_set_bit (map: load_index, bitno: tmp_perm[j] - imin);
4794 unsigned j;
4795 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4796 if (!bitmap_bit_p (map: load_index, bitno: j))
4797 break;
4798 if (j != SLP_TREE_LANES (node))
4799 continue;
4800
4801 vec<unsigned> perm = vNULL;
4802 perm.safe_grow (SLP_TREE_LANES (node), exact: true);
4803 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4804 perm[j] = tmp_perm[j] - imin;
4805
4806 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4807 {
4808 /* Continue to use existing layouts, but don't add any more. */
4809 int *entry = layout_ids.get (k: perm);
4810 partition.layout = entry ? *entry : 0;
4811 perm.release ();
4812 }
4813 else
4814 {
4815 bool existed;
4816 int &layout_i = layout_ids.get_or_insert (k: perm, existed: &existed);
4817 if (existed)
4818 perm.release ();
4819 else
4820 {
4821 layout_i = m_perms.length ();
4822 m_perms.safe_push (obj: perm);
4823 }
4824 partition.layout = layout_i;
4825 }
4826 }
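/* As an illustration (made-up indices): a load node whose load
   permutation is { 5, 4, 7, 6 } gives imin == 4 and imax == 7; the
   span of 4 matches SLP_TREE_LANES (node), the access is a true and
   bijective permute, and so { 1, 0, 3, 2 } is recorded as a candidate
   layout for the node's partition.  */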
4827
4828 /* Initially assume that every layout is possible and has zero cost
4829 in every partition. */
4830 m_partition_layout_costs.safe_grow_cleared (len: m_partitions.length ()
4831 * m_perms.length ());
4832
4833 /* We have to mark as to-be-materialized the outgoing permutations facing
4834 non-associating-reduction graph entries that are not themselves represented.
4835 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4836 for (slp_instance instance : m_vinfo->slp_instances)
4837 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4838 {
4839 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4840 m_partitions[m_vertices[node_i].partition].layout = 0;
4841 }
4842 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4843 {
4844 stmt_vec_info stmt_info
4845 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4846 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4847 if (needs_fold_left_reduction_p (TREE_TYPE
4848 (gimple_get_lhs (stmt_info->stmt)),
4849 STMT_VINFO_REDUC_CODE (reduc_info)))
4850 {
4851 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4852 m_partitions[m_vertices[node_i].partition].layout = 0;
4853 }
4854 }
4855
4856 /* Check which layouts each node and partition can handle. Calculate the
4857 weights associated with inserting layout changes on edges. */
4858 for (unsigned int node_i : m_partitioned_nodes)
4859 {
4860 auto &vertex = m_vertices[node_i];
4861 auto &partition = m_partitions[vertex.partition];
4862 slp_tree node = vertex.node;
4863
4864 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4865 {
4866 vertex.weight = vect_slp_node_weight (node);
4867
4868 /* We do not handle stores with a permutation, so all
4869 incoming permutations must have been materialized.
4870
4871 We also don't handle masked grouped loads, which lack a
4872 permutation vector. In this case the memory locations
4873 form an implicit second input to the loads, on top of the
4874 explicit mask input, and the memory input's layout cannot
4875 be changed.
4876
4877 On the other hand, we do support permuting gather loads and
4878 masked gather loads, where each scalar load is independent
4879 of the others. This can be useful if the address/index input
4880 benefits from permutation. */
4881 if (STMT_VINFO_DATA_REF (rep)
4882 && STMT_VINFO_GROUPED_ACCESS (rep)
4883 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4884 partition.layout = 0;
4885
4886 /* We cannot change the layout of an operation that does
4887 not operate independently on each lane. Note this is an explicit
4888 negative list since it is much shorter than the corresponding
4889 positive one, but it is critical to keep it maintained. */
4890 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4891 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4892 {
4893 case CFN_COMPLEX_ADD_ROT90:
4894 case CFN_COMPLEX_ADD_ROT270:
4895 case CFN_COMPLEX_MUL:
4896 case CFN_COMPLEX_MUL_CONJ:
4897 case CFN_VEC_ADDSUB:
4898 case CFN_VEC_FMADDSUB:
4899 case CFN_VEC_FMSUBADD:
4900 partition.layout = 0;
4901 default:;
4902 }
4903 }
4904
4905 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4906 {
4907 auto &other_vertex = m_vertices[other_node_i];
4908
4909 /* Count the number of edges from earlier partitions and the number
4910 of edges to later partitions. */
4911 if (other_vertex.partition < vertex.partition)
4912 partition.in_degree += 1;
4913 else
4914 partition.out_degree += 1;
4915
4916 /* If the current node uses the result of OTHER_NODE_I, accumulate
4917 the effects of that. */
4918 if (ud->src == int (node_i))
4919 {
4920 other_vertex.out_weight += vertex.weight;
4921 other_vertex.out_degree += 1;
4922 }
4923 };
4924 for_each_partition_edge (node_i, fn: process_edge);
4925 }
4926}
4927
4928/* Return the incoming costs for node NODE_I, assuming that each input keeps
4929 its current (provisional) choice of layout. The inputs do not necessarily
4930 have the same layout as each other. */
4931
4932slpg_layout_cost
4933vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4934{
4935 auto &vertex = m_vertices[node_i];
4936 slpg_layout_cost cost;
4937 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4938 {
4939 auto &other_vertex = m_vertices[other_node_i];
4940 if (other_vertex.partition < vertex.partition)
4941 {
4942 auto &other_partition = m_partitions[other_vertex.partition];
4943 auto &other_costs = partition_layout_costs (partition_i: other_vertex.partition,
4944 layout_i: other_partition.layout);
4945 slpg_layout_cost this_cost = other_costs.in_cost;
4946 this_cost.add_serial_cost (other: other_costs.internal_cost);
4947 this_cost.split (times: other_partition.out_degree);
4948 cost.add_parallel_cost (input_cost: this_cost);
4949 }
4950 };
4951 for_each_partition_edge (node_i, fn: add_cost);
4952 return cost;
4953}
4954
4955/* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4956 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4957 slpg_layout_cost::impossible () if the change isn't possible. */
4958
4959slpg_layout_cost
4960vect_optimize_slp_pass::
4961edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4962 unsigned int layout2_i)
4963{
4964 auto &def_vertex = m_vertices[ud->dest];
4965 auto &use_vertex = m_vertices[ud->src];
4966 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4967 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4968 auto factor = change_layout_cost (node: def_vertex.node, from_layout_i: def_layout_i,
4969 to_layout_i: use_layout_i);
4970 if (factor < 0)
4971 return slpg_layout_cost::impossible ();
4972
4973 /* We have a choice of putting the layout change at the site of the
4974 definition or at the site of the use. Prefer the former when
4975 optimizing for size or when the execution frequency of the
4976 definition is no greater than the combined execution frequencies of
4977 the uses. When putting the layout change at the site of the definition,
4978 divvy up the cost among all consumers. */
4979 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4980 {
4981 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4982 cost.split (times: def_vertex.out_degree);
4983 return cost;
4984 }
4985 return { use_vertex.weight * factor, m_optimize_size };
4986}
4987
4988/* UD represents a use-def link between FROM_NODE_I and a node in a later
4989 partition; FROM_NODE_I could be the definition node or the use node.
4990 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4991 Return the cost of any necessary fix-ups on edge UD, or return
4992 slpg_layout_cost::impossible () if the change isn't possible.
4993
4994 At this point, FROM_NODE_I's partition has chosen the cheapest
4995 layout based on the information available so far, but this choice
4996 is only provisional. */
4997
4998slpg_layout_cost
4999vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5000 unsigned int to_layout_i)
5001{
5002 auto &from_vertex = m_vertices[from_node_i];
5003 unsigned int from_partition_i = from_vertex.partition;
5004 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5005 gcc_assert (from_partition.layout >= 0);
5006
5007 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5008 with its current layout preference. */
5009 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5010 auto edge_cost = edge_layout_cost (ud, node1_i: from_node_i,
5011 layout1_i: from_partition.layout, layout2_i: to_layout_i);
5012 if (edge_cost.is_possible ())
5013 {
5014 auto &from_costs = partition_layout_costs (partition_i: from_partition_i,
5015 layout_i: from_partition.layout);
5016 cost = from_costs.in_cost;
5017 cost.add_serial_cost (other: from_costs.internal_cost);
5018 cost.split (times: from_partition.out_degree);
5019 cost.add_serial_cost (other: edge_cost);
5020 }
5021
5022 /* Take the minimum of that cost and the cost that applies if
5023 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5024 auto &direct_layout_costs = partition_layout_costs (partition_i: from_partition_i,
5025 layout_i: to_layout_i);
5026 if (direct_layout_costs.is_possible ())
5027 {
5028 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5029 direct_cost.add_serial_cost (other: direct_layout_costs.internal_cost);
5030 direct_cost.split (times: from_partition.out_degree);
5031 if (!cost.is_possible ()
5032 || direct_cost.is_better_than (other: cost, is_for_size: m_optimize_size))
5033 cost = direct_cost;
5034 }
5035
5036 return cost;
5037}
5038
5039/* UD represents a use-def link between TO_NODE_I and a node in an earlier
5040 partition; TO_NODE_I could be the definition node or the use node.
5041 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5042 return the cost of any necessary fix-ups on edge UD, or
5043 slpg_layout_cost::impossible () if the choice cannot be made.
5044
5045 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5046
5047slpg_layout_cost
5048vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5049 unsigned int from_layout_i)
5050{
5051 auto &to_vertex = m_vertices[to_node_i];
5052 unsigned int to_partition_i = to_vertex.partition;
5053 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5054 gcc_assert (to_partition.layout >= 0);
5055
5056 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5057 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5058 any other inputs keep their current choice of layout. */
5059 auto &to_costs = partition_layout_costs (partition_i: to_partition_i,
5060 layout_i: to_partition.layout);
5061 if (ud->src == int (to_node_i)
5062 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5063 {
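 /* Temporarily record FROM_LAYOUT_I as the layout of the input's
 partition so that internal_node_cost evaluates the VEC_PERM_EXPR
 with that input layout. */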
5064 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5065 auto old_layout = from_partition.layout;
5066 from_partition.layout = from_layout_i;
5067 int factor = internal_node_cost (node: to_vertex.node, in_layout_i: -1,
5068 out_layout_i: to_partition.layout);
5069 from_partition.layout = old_layout;
5070 if (factor >= 0)
5071 {
5072 slpg_layout_cost cost = to_costs.out_cost;
5073 cost.add_serial_cost (other: { to_vertex.weight * factor,
5074 m_optimize_size });
5075 cost.split (times: to_partition.in_degree);
5076 return cost;
5077 }
5078 }
5079
5080 /* Compute the cost if we insert any necessary layout change on edge UD. */
5081 auto edge_cost = edge_layout_cost (ud, node1_i: to_node_i,
5082 layout1_i: to_partition.layout, layout2_i: from_layout_i);
5083 if (edge_cost.is_possible ())
5084 {
5085 slpg_layout_cost cost = to_costs.out_cost;
5086 cost.add_serial_cost (other: to_costs.internal_cost);
5087 cost.split (times: to_partition.in_degree);
5088 cost.add_serial_cost (other: edge_cost);
5089 return cost;
5090 }
5091
5092 return slpg_layout_cost::impossible ();
5093}
5094
5095/* Make a forward pass through the partitions, accumulating input costs.
5096 Make a tentative (provisional) choice of layout for each partition,
5097 ensuring that this choice still allows later partitions to keep
5098 their original layout. */
5099
5100void
5101vect_optimize_slp_pass::forward_pass ()
5102{
5103 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5104 ++partition_i)
5105 {
5106 auto &partition = m_partitions[partition_i];
5107
5108 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5109 the incoming cost that would apply if every predecessor partition
5110 keeps its current layout. This is used within the loop below. */
5111 slpg_layout_cost in_cost;
5112 slp_tree single_node = nullptr;
5113 if (partition.node_end == partition.node_begin + 1)
5114 {
5115 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5116 single_node = m_vertices[node_i].node;
5117 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5118 in_cost = total_in_cost (node_i);
5119 }
5120
5121 /* Go through the possible layouts. Decide which ones are valid
5122 for this partition and record which of the valid layouts has
5123 the lowest cost. */
5124 unsigned int min_layout_i = 0;
5125 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5126 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5127 {
5128 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5129 if (!layout_costs.is_possible ())
5130 continue;
5131
5132 /* If the recorded layout is already 0 then the layout cannot
5133 change. */
5134 if (partition.layout == 0 && layout_i != 0)
5135 {
5136 layout_costs.mark_impossible ();
5137 continue;
5138 }
5139
5140 bool is_possible = true;
5141 for (unsigned int order_i = partition.node_begin;
5142 order_i < partition.node_end; ++order_i)
5143 {
5144 unsigned int node_i = m_partitioned_nodes[order_i];
5145 auto &vertex = m_vertices[node_i];
5146
5147 /* Reject the layout if it is individually incompatible
5148 with any node in the partition. */
5149 if (!is_compatible_layout (node: vertex.node, layout_i))
5150 {
5151 is_possible = false;
5152 break;
5153 }
5154
5155 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5156 {
5157 auto &other_vertex = m_vertices[other_node_i];
5158 if (other_vertex.partition < vertex.partition)
5159 {
5160 /* Accumulate the incoming costs from earlier
5161 partitions, plus the cost of any layout changes
5162 on UD itself. */
5163 auto cost = forward_cost (ud, from_node_i: other_node_i, to_layout_i: layout_i);
5164 if (!cost.is_possible ())
5165 is_possible = false;
5166 else
5167 layout_costs.in_cost.add_parallel_cost (input_cost: cost);
5168 }
5169 else
5170 /* Reject the layout if it would make layout 0 impossible
5171 for later partitions. This amounts to testing that the
5172 target supports reversing the layout change on edges
5173 to later partitions.
5174
5175 In principle, it might be possible to push a layout
5176 change all the way down a graph, so that it never
5177 needs to be reversed and so that the target doesn't
5178 need to support the reverse operation. But it would
5179 be awkward to bail out if we hit a partition that
5180 does not support the new layout, especially since
5181 we are not dealing with a lattice. */
5182 is_possible &= edge_layout_cost (ud, node1_i: other_node_i, layout1_i: 0,
5183 layout2_i: layout_i).is_possible ();
5184 };
5185 for_each_partition_edge (node_i, fn: add_cost);
5186
5187 /* Accumulate the cost of using LAYOUT_I within NODE,
5188 both for the inputs and the outputs. */
5189 int factor = internal_node_cost (node: vertex.node, in_layout_i: layout_i,
5190 out_layout_i: layout_i);
5191 if (factor < 0)
5192 {
5193 is_possible = false;
5194 break;
5195 }
5196 else if (factor)
5197 layout_costs.internal_cost.add_serial_cost
5198 (other: { vertex.weight * factor, m_optimize_size });
5199 }
5200 if (!is_possible)
5201 {
5202 layout_costs.mark_impossible ();
5203 continue;
5204 }
5205
5206 /* Combine the incoming and partition-internal costs. */
5207 slpg_layout_cost combined_cost = layout_costs.in_cost;
5208 combined_cost.add_serial_cost (other: layout_costs.internal_cost);
5209
5210 /* If this partition consists of a single VEC_PERM_EXPR, see
5211 if the VEC_PERM_EXPR can be changed to support output layout
5212 LAYOUT_I while keeping all the provisional choices of input
5213 layout. */
5214 if (single_node
5215 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5216 {
5217 int factor = internal_node_cost (node: single_node, in_layout_i: -1, out_layout_i: layout_i);
5218 if (factor >= 0)
5219 {
5220 auto weight = m_vertices[single_node->vertex].weight;
5221 slpg_layout_cost internal_cost
5222 = { weight * factor, m_optimize_size };
5223
5224 slpg_layout_cost alt_cost = in_cost;
5225 alt_cost.add_serial_cost (other: internal_cost);
5226 if (alt_cost.is_better_than (other: combined_cost, is_for_size: m_optimize_size))
5227 {
5228 combined_cost = alt_cost;
5229 layout_costs.in_cost = in_cost;
5230 layout_costs.internal_cost = internal_cost;
5231 }
5232 }
5233 }
5234
5235 /* Record the layout with the lowest cost. Prefer layout 0 in
5236 the event of a tie between it and another layout. */
5237 if (!min_layout_cost.is_possible ()
5238 || combined_cost.is_better_than (other: min_layout_cost,
5239 is_for_size: m_optimize_size))
5240 {
5241 min_layout_i = layout_i;
5242 min_layout_cost = combined_cost;
5243 }
5244 }
5245
5246 /* This loop's handling of earlier partitions should ensure that
5247 choosing the original layout for the current partition is no
5248 less valid than it was in the original graph, even with the
5249 provisional layout choices for those earlier partitions. */
5250 gcc_assert (min_layout_cost.is_possible ());
5251 partition.layout = min_layout_i;
5252 }
5253}
5254
5255/* Make a backward pass through the partitions, accumulating output costs.
5256 Make a final choice of layout for each partition. */
5257
5258void
5259vect_optimize_slp_pass::backward_pass ()
5260{
5261 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5262 {
5263 auto &partition = m_partitions[partition_i];
5264
5265 unsigned int min_layout_i = 0;
5266 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5267 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5268 {
5269 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5270 if (!layout_costs.is_possible ())
5271 continue;
5272
5273 /* Accumulate the costs from successor partitions. */
5274 bool is_possible = true;
5275 for (unsigned int order_i = partition.node_begin;
5276 order_i < partition.node_end; ++order_i)
5277 {
5278 unsigned int node_i = m_partitioned_nodes[order_i];
5279 auto &vertex = m_vertices[node_i];
5280 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5281 {
5282 auto &other_vertex = m_vertices[other_node_i];
5283 auto &other_partition = m_partitions[other_vertex.partition];
5284 if (other_vertex.partition > vertex.partition)
5285 {
5286 /* Accumulate the incoming costs from later
5287 partitions, plus the cost of any layout changes
5288 on UD itself. */
5289 auto cost = backward_cost (ud, to_node_i: other_node_i, from_layout_i: layout_i);
5290 if (!cost.is_possible ())
5291 is_possible = false;
5292 else
5293 layout_costs.out_cost.add_parallel_cost (input_cost: cost);
5294 }
5295 else
5296 /* Make sure that earlier partitions can (if necessary
5297 or beneficial) keep the layout that they chose in
5298 the forward pass. This ensures that there is at
5299 least one valid choice of layout. */
5300 is_possible &= edge_layout_cost (ud, node1_i: other_node_i,
5301 layout1_i: other_partition.layout,
5302 layout2_i: layout_i).is_possible ();
5303 };
5304 for_each_partition_edge (node_i, fn: add_cost);
5305 }
5306 if (!is_possible)
5307 {
5308 layout_costs.mark_impossible ();
5309 continue;
5310 }
5311
5312 /* Locally combine the costs from the forward and backward passes.
5313 (This combined cost is not passed on, since that would lead
5314 to double counting.) */
5315 slpg_layout_cost combined_cost = layout_costs.in_cost;
5316 combined_cost.add_serial_cost (other: layout_costs.internal_cost);
5317 combined_cost.add_serial_cost (other: layout_costs.out_cost);
5318
5319 /* Record the layout with the lowest cost. Prefer layout 0 in
5320 the event of a tie between it and another layout. */
5321 if (!min_layout_cost.is_possible ()
5322 || combined_cost.is_better_than (other: min_layout_cost,
5323 is_for_size: m_optimize_size))
5324 {
5325 min_layout_i = layout_i;
5326 min_layout_cost = combined_cost;
5327 }
5328 }
5329
5330 gcc_assert (min_layout_cost.is_possible ());
5331 partition.layout = min_layout_i;
5332 }
5333}
5334
5335/* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5336 NODE already has the layout that was selected for its partition. */
5337
5338slp_tree
5339vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5340 unsigned int to_layout_i)
5341{
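 /* Cache the result so that each (node, layout) pair is materialized
 at most once. */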
5342 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5343 slp_tree result = m_node_layouts[result_i];
5344 if (result)
5345 return result;
5346
5347 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5348 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5349 /* We can't permute vector defs in place. */
5350 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5351 {
5352 /* If the vector is uniform or unchanged, there's nothing to do. */
5353 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5354 result = node;
5355 else
5356 {
5357 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5358 result = vect_create_new_slp_node (ops: scalar_ops);
5359 vect_slp_permute (perm: m_perms[to_layout_i], vec&: scalar_ops, reverse: true);
5360 }
5361 }
5362 else
5363 {
5364 unsigned int partition_i = m_vertices[node->vertex].partition;
5365 unsigned int from_layout_i = m_partitions[partition_i].layout;
5366 if (from_layout_i == to_layout_i)
5367 return node;
5368
5369 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5370 permutation instead of a serial one. Leave the new permutation
5371 in TMP_PERM on success. */
5372 auto_lane_permutation_t tmp_perm;
5373 unsigned int num_inputs = 1;
5374 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5375 {
5376 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5377 if (from_layout_i != 0)
5378 vect_slp_permute (perm: m_perms[from_layout_i], vec&: tmp_perm, reverse: false);
5379 if (to_layout_i != 0)
5380 vect_slp_permute (perm: m_perms[to_layout_i], vec&: tmp_perm, reverse: true);
5381 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5382 tmp_perm,
5383 SLP_TREE_CHILDREN (node),
5384 false) >= 0)
5385 num_inputs = SLP_TREE_CHILDREN (node).length ();
5386 else
5387 tmp_perm.truncate (size: 0);
5388 }
5389
5390 if (dump_enabled_p ())
5391 {
5392 if (tmp_perm.length () > 0)
5393 dump_printf_loc (MSG_NOTE, vect_location,
5394 "duplicating permutation node %p with"
5395 " layout %d\n",
5396 (void *) node, to_layout_i);
5397 else
5398 dump_printf_loc (MSG_NOTE, vect_location,
5399 "inserting permutation node in place of %p\n",
5400 (void *) node);
5401 }
5402
5403 unsigned int num_lanes = SLP_TREE_LANES (node);
5404 result = vect_create_new_slp_node (nops: num_inputs, code: VEC_PERM_EXPR);
5405 if (SLP_TREE_SCALAR_STMTS (node).length ())
5406 {
5407 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5408 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5409 if (from_layout_i != 0)
5410 vect_slp_permute (perm: m_perms[from_layout_i], vec&: stmts, reverse: false);
5411 if (to_layout_i != 0)
5412 vect_slp_permute (perm: m_perms[to_layout_i], vec&: stmts, reverse: true);
5413 }
5414 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5415 SLP_TREE_LANES (result) = num_lanes;
5416 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5417 result->vertex = -1;
5418
5419 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5420 if (tmp_perm.length ())
5421 {
5422 lane_perm.safe_splice (src: tmp_perm);
5423 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5424 }
5425 else
5426 {
5427 lane_perm.create (nelems: num_lanes);
5428 for (unsigned j = 0; j < num_lanes; ++j)
5429 lane_perm.quick_push (obj: { 0, j });
5430 if (from_layout_i != 0)
5431 vect_slp_permute (perm: m_perms[from_layout_i], vec&: lane_perm, reverse: false);
5432 if (to_layout_i != 0)
5433 vect_slp_permute (perm: m_perms[to_layout_i], vec&: lane_perm, reverse: true);
5434 SLP_TREE_CHILDREN (result).safe_push (obj: node);
5435 }
5436 for (slp_tree child : SLP_TREE_CHILDREN (result))
5437 child->refcnt++;
5438 }
5439 m_node_layouts[result_i] = result;
5440 return result;
5441}
5442
5443/* Apply the chosen vector layouts to the SLP graph. */
5444
5445void
5446vect_optimize_slp_pass::materialize ()
5447{
5448 /* We no longer need the costs, so avoid having two O(N * P) arrays
5449 live at the same time. */
5450 m_partition_layout_costs.release ();
5451 m_node_layouts.safe_grow_cleared (len: m_vertices.length () * m_perms.length ());
5452
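 /* Nodes whose lane permutation absorbs the input layouts below do not
 need their children replaced in the second walk. */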
5453 auto_sbitmap fully_folded (m_vertices.length ());
5454 bitmap_clear (fully_folded);
5455 for (unsigned int node_i : m_partitioned_nodes)
5456 {
5457 auto &vertex = m_vertices[node_i];
5458 slp_tree node = vertex.node;
5459 int layout_i = m_partitions[vertex.partition].layout;
5460 gcc_assert (layout_i >= 0);
5461
5462 /* Rearrange the scalar statements to match the chosen layout. */
5463 if (layout_i > 0)
5464 vect_slp_permute (perm: m_perms[layout_i],
5465 SLP_TREE_SCALAR_STMTS (node), reverse: true);
5466
5467 /* Update load and lane permutations. */
5468 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5469 {
5470 /* First try to absorb the input vector layouts. If that fails,
5471 force the inputs to have layout LAYOUT_I too. We checked that
5472 that was possible before deciding to use nonzero output layouts.
5473 (Note that at this stage we don't really have any guarantee that
5474 the target supports the original VEC_PERM_EXPR.) */
5475 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5476 auto_lane_permutation_t tmp_perm;
5477 tmp_perm.safe_splice (src: perm);
5478 change_vec_perm_layout (node, perm&: tmp_perm, in_layout_i: -1, out_layout_i: layout_i);
5479 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5480 tmp_perm,
5481 SLP_TREE_CHILDREN (node),
5482 false) >= 0)
5483 {
5484 if (dump_enabled_p ()
5485 && !std::equal (first1: tmp_perm.begin (), last1: tmp_perm.end (),
5486 first2: perm.begin ()))
5487 dump_printf_loc (MSG_NOTE, vect_location,
5488 "absorbing input layouts into %p\n",
5489 (void *) node);
5490 std::copy (first: tmp_perm.begin (), last: tmp_perm.end (), result: perm.begin ());
5491 bitmap_set_bit (map: fully_folded, bitno: node_i);
5492 }
5493 else
5494 {
5495 /* Not MSG_MISSED because it would make no sense to users. */
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_NOTE, vect_location,
5498 "failed to absorb input layouts into %p\n",
5499 (void *) node);
5500 change_vec_perm_layout (node: nullptr, perm, in_layout_i: layout_i, out_layout_i: layout_i);
5501 }
5502 }
5503 else
5504 {
5505 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5506 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5507 if (layout_i > 0)
5508 /* ??? When we handle non-bijective permutes the idea
5509 is that we can force the load-permutation to be
5510 { min, min + 1, min + 2, ... max }. But then the
5511 scalar defs might no longer match the lane content
5512 which means wrong-code with live lane vectorization.
5513 So we possibly have to have NULL entries for those. */
5514 vect_slp_permute (perm: m_perms[layout_i], vec&: load_perm, reverse: true);
5515 }
5516 }
5517
5518 /* Do this before any nodes disappear, since it involves a walk
5519 over the leaves. */
5520 remove_redundant_permutations ();
5521
5522 /* Replace each child with a correctly laid-out version. */
5523 for (unsigned int node_i : m_partitioned_nodes)
5524 {
5525 /* Skip nodes that have already been handled above. */
5526 if (bitmap_bit_p (map: fully_folded, bitno: node_i))
5527 continue;
5528
5529 auto &vertex = m_vertices[node_i];
5530 int in_layout_i = m_partitions[vertex.partition].layout;
5531 gcc_assert (in_layout_i >= 0);
5532
5533 unsigned j;
5534 slp_tree child;
5535 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5536 {
5537 if (!child)
5538 continue;
5539
5540 slp_tree new_child = get_result_with_layout (node: child, to_layout_i: in_layout_i);
5541 if (new_child != child)
5542 {
5543 vect_free_slp_tree (node: child);
5544 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5545 new_child->refcnt += 1;
5546 }
5547 }
5548 }
5549}
5550
5551/* Elide load permutations that are not necessary. Such permutations might
5552 be pre-existing, rather than created by the layout optimizations. */
5553
5554void
5555vect_optimize_slp_pass::remove_redundant_permutations ()
5556{
5557 for (unsigned int node_i : m_leafs)
5558 {
5559 slp_tree node = m_vertices[node_i].node;
5560 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5561 continue;
5562
5563 /* In basic block vectorization we allow any subchain of an interleaving
5564 chain.
5565 FORNOW: not in loop SLP because of realignment complications. */
5566 if (is_a <bb_vec_info> (p: m_vinfo))
5567 {
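 /* Check whether the loads access consecutive members of the group
 without gaps; if so the load permutation is redundant. */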
5568 bool subchain_p = true;
5569 stmt_vec_info next_load_info = NULL;
5570 stmt_vec_info load_info;
5571 unsigned j;
5572 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5573 {
5574 if (j != 0
5575 && (next_load_info != load_info
5576 || DR_GROUP_GAP (load_info) != 1))
5577 {
5578 subchain_p = false;
5579 break;
5580 }
5581 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5582 }
5583 if (subchain_p)
5584 {
5585 SLP_TREE_LOAD_PERMUTATION (node).release ();
5586 continue;
5587 }
5588 }
5589 else
5590 {
5591 loop_vec_info loop_vinfo = as_a<loop_vec_info> (p: m_vinfo);
5592 stmt_vec_info load_info;
5593 bool this_load_permuted = false;
5594 unsigned j;
5595 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5596 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5597 {
5598 this_load_permuted = true;
5599 break;
5600 }
5601 /* When this isn't a grouped access we know it's a single element
5602 and contiguous. */
5603 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5604 {
5605 if (!this_load_permuted
5606 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5607 || SLP_TREE_LANES (node) == 1))
5608 SLP_TREE_LOAD_PERMUTATION (node).release ();
5609 continue;
5610 }
5611 stmt_vec_info first_stmt_info
5612 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5613 if (!this_load_permuted
5614 /* The load requires permutation when unrolling exposes
5615 a gap either because the group is larger than the SLP
5616 group-size or because there is a gap between the groups. */
5617 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5618 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5619 && DR_GROUP_GAP (first_stmt_info) == 0)))
5620 {
5621 SLP_TREE_LOAD_PERMUTATION (node).release ();
5622 continue;
5623 }
5624 }
5625 }
5626}
5627
5628/* Print the partition graph and layout information to the dump file. */
5629
5630void
5631vect_optimize_slp_pass::dump ()
5632{
5633 dump_printf_loc (MSG_NOTE, vect_location,
5634 "SLP optimize permutations:\n");
5635 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5636 {
5637 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5638 const char *sep = "";
5639 for (unsigned int idx : m_perms[layout_i])
5640 {
5641 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5642 sep = ", ";
5643 }
5644 dump_printf (MSG_NOTE, " }\n");
5645 }
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "SLP optimize partitions:\n");
5648 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5649 ++partition_i)
5650 {
5651 auto &partition = m_partitions[partition_i];
5652 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5653 dump_printf_loc (MSG_NOTE, vect_location,
5654 " partition %d (layout %d):\n",
5655 partition_i, partition.layout);
5656 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5657 for (unsigned int order_i = partition.node_begin;
5658 order_i < partition.node_end; ++order_i)
5659 {
5660 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5661 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5662 (void *) vertex.node);
5663 dump_printf_loc (MSG_NOTE, vect_location,
5664 " weight: %f\n",
5665 vertex.weight.to_double ());
5666 if (vertex.out_degree)
5667 dump_printf_loc (MSG_NOTE, vect_location,
5668 " out weight: %f (degree %d)\n",
5669 vertex.out_weight.to_double (),
5670 vertex.out_degree);
5671 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5672 dump_printf_loc (MSG_NOTE, vect_location,
5673 " op: VEC_PERM_EXPR\n");
5674 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5675 dump_printf_loc (MSG_NOTE, vect_location,
5676 " op template: %G", rep->stmt);
5677 }
5678 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5679 for (unsigned int order_i = partition.node_begin;
5680 order_i < partition.node_end; ++order_i)
5681 {
5682 unsigned int node_i = m_partitioned_nodes[order_i];
5683 auto &vertex = m_vertices[node_i];
5684 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5685 {
5686 auto &other_vertex = m_vertices[other_node_i];
5687 if (other_vertex.partition < vertex.partition)
5688 dump_printf_loc (MSG_NOTE, vect_location,
5689 " - %p [%d] --> %p\n",
5690 (void *) other_vertex.node,
5691 other_vertex.partition,
5692 (void *) vertex.node);
5693 else
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 " - %p --> [%d] %p\n",
5696 (void *) vertex.node,
5697 other_vertex.partition,
5698 (void *) other_vertex.node);
5699 };
5700 for_each_partition_edge (node_i, fn: print_edge);
5701 }
5702
5703 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5704 {
5705 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5706 if (layout_costs.is_possible ())
5707 {
5708 dump_printf_loc (MSG_NOTE, vect_location,
5709 " layout %d:%s\n", layout_i,
5710 partition.layout == int (layout_i)
5711 ? " (*)" : "");
5712 slpg_layout_cost combined_cost = layout_costs.in_cost;
5713 combined_cost.add_serial_cost (other: layout_costs.internal_cost);
5714 combined_cost.add_serial_cost (other: layout_costs.out_cost);
5715#define TEMPLATE "{depth: %f, total: %f}"
5716 dump_printf_loc (MSG_NOTE, vect_location,
5717 " " TEMPLATE "\n",
5718 layout_costs.in_cost.depth.to_double (),
5719 layout_costs.in_cost.total.to_double ());
5720 dump_printf_loc (MSG_NOTE, vect_location,
5721 " + " TEMPLATE "\n",
5722 layout_costs.internal_cost.depth.to_double (),
5723 layout_costs.internal_cost.total.to_double ());
5724 dump_printf_loc (MSG_NOTE, vect_location,
5725 " + " TEMPLATE "\n",
5726 layout_costs.out_cost.depth.to_double (),
5727 layout_costs.out_cost.total.to_double ());
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 " = " TEMPLATE "\n",
5730 combined_cost.depth.to_double (),
5731 combined_cost.total.to_double ());
5732#undef TEMPLATE
5733 }
5734 else
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 " layout %d: rejected\n", layout_i);
5737 }
5738 }
5739}
5740
5741/* Main entry point for the SLP graph optimization pass. */
5742
5743void
5744vect_optimize_slp_pass::run ()
5745{
5746 build_graph ();
5747 create_partitions ();
5748 start_choosing_layouts ();
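 /* m_perms[0] is the identity (unchanged) layout; only run the layout
 optimization when at least one alternative layout was found. */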
5749 if (m_perms.length () > 1)
5750 {
5751 forward_pass ();
5752 backward_pass ();
5753 if (dump_enabled_p ())
5754 dump ();
5755 materialize ();
5756 while (!m_perms.is_empty ())
5757 m_perms.pop ().release ();
5758 }
5759 else
5760 remove_redundant_permutations ();
5761 free_graph (g: m_slpg);
5762}
5763
5764/* Optimize the SLP graph of VINFO. */
5765
5766void
5767vect_optimize_slp (vec_info *vinfo)
5768{
5769 if (vinfo->slp_instances.is_empty ())
5770 return;
5771 vect_optimize_slp_pass (vinfo).run ();
5772}
5773
5774/* Gather loads reachable from the individual SLP graph entries. */
5775
5776void
5777vect_gather_slp_loads (vec_info *vinfo)
5778{
5779 unsigned i;
5780 slp_instance instance;
5781 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5782 {
5783 hash_set<slp_tree> visited;
5784 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5785 SLP_INSTANCE_TREE (instance), visited);
5786 }
5787}
5788
5789
5790/* For each possible SLP instance decide whether to SLP it and calculate the
5791 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
5792 to SLP at least one instance. */
5793
5794bool
5795vect_make_slp_decision (loop_vec_info loop_vinfo)
5796{
5797 unsigned int i;
5798 poly_uint64 unrolling_factor = 1;
5799 const vec<slp_instance> &slp_instances
5800 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5801 slp_instance instance;
5802 int decided_to_slp = 0;
5803
5804 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5805
5806 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5807 {
5808 /* FORNOW: SLP if you can. */
5809 /* All unroll factors have the form:
5810
5811 GET_MODE_SIZE (vinfo->vector_mode) * X
5812
5813 for some rational X, so they must have a common multiple. */
5814 unrolling_factor
5815 = force_common_multiple (a: unrolling_factor,
5816 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5817
5818 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5819 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5820 loop-based vectorization. Such stmts will be marked as HYBRID. */
5821 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5822 decided_to_slp++;
5823 }
5824
5825 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5826
5827 if (decided_to_slp && dump_enabled_p ())
5828 {
5829 dump_printf_loc (MSG_NOTE, vect_location,
5830 "Decided to SLP %d instances. Unrolling factor ",
5831 decided_to_slp);
5832 dump_dec (MSG_NOTE, unrolling_factor);
5833 dump_printf (MSG_NOTE, "\n");
5834 }
5835
5836 return (decided_to_slp > 0);
5837}
5838
5839/* Private data for vect_detect_hybrid_slp. */
5840struct vdhs_data
5841{
5842 loop_vec_info loop_vinfo;
5843 vec<stmt_vec_info> *worklist;
5844};
5845
5846/* Walker for walk_gimple_op. */
5847
5848static tree
5849vect_detect_hybrid_slp (tree *tp, int *, void *data)
5850{
5851 walk_stmt_info *wi = (walk_stmt_info *)data;
5852 vdhs_data *dat = (vdhs_data *)wi->info;
5853
5854 if (wi->is_lhs)
5855 return NULL_TREE;
5856
5857 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5858 if (!def_stmt_info)
5859 return NULL_TREE;
5860 def_stmt_info = vect_stmt_to_vectorize (stmt_info: def_stmt_info);
5861 if (PURE_SLP_STMT (def_stmt_info))
5862 {
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5865 def_stmt_info->stmt);
5866 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5867 dat->worklist->safe_push (obj: def_stmt_info);
5868 }
5869
5870 return NULL_TREE;
5871}
5872
5873/* Check whether STMT_INFO is only consumed by SLP (possibly indirectly);
5874 if so, mark it pure_slp, otherwise push it to WORKLIST. */
5875
5876static void
5877maybe_push_to_hybrid_worklist (vec_info *vinfo,
5878 vec<stmt_vec_info> &worklist,
5879 stmt_vec_info stmt_info)
5880{
5881 if (dump_enabled_p ())
5882 dump_printf_loc (MSG_NOTE, vect_location,
5883 "Processing hybrid candidate : %G", stmt_info->stmt);
5884 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5885 imm_use_iterator iter2;
5886 ssa_op_iter iter1;
5887 use_operand_p use_p;
5888 def_operand_p def_p;
5889 bool any_def = false;
5890 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5891 {
5892 any_def = true;
5893 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5894 {
5895 if (is_gimple_debug (USE_STMT (use_p)))
5896 continue;
5897 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5898 /* An out-of-loop use means this is a loop_vect sink. */
5899 if (!use_info)
5900 {
5901 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_NOTE, vect_location,
5903 "Found loop_vect sink: %G", stmt_info->stmt);
5904 worklist.safe_push (obj: stmt_info);
5905 return;
5906 }
5907 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5908 {
5909 if (dump_enabled_p ())
5910 dump_printf_loc (MSG_NOTE, vect_location,
5911 "Found loop_vect use: %G", use_info->stmt);
5912 worklist.safe_push (obj: stmt_info);
5913 return;
5914 }
5915 }
5916 }
5917 /* No def means this is a loop_vect sink. */
5918 if (!any_def)
5919 {
5920 if (dump_enabled_p ())
5921 dump_printf_loc (MSG_NOTE, vect_location,
5922 "Found loop_vect sink: %G", stmt_info->stmt);
5923 worklist.safe_push (obj: stmt_info);
5924 return;
5925 }
5926 if (dump_enabled_p ())
5927 dump_printf_loc (MSG_NOTE, vect_location,
5928 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5929 STMT_SLP_TYPE (stmt_info) = pure_slp;
5930}
5931
5932/* Find stmts that must be both vectorized and SLPed. */
5933
5934void
5935vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5936{
5937 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5938
5939 /* All stmts participating in SLP are marked pure_slp, all other
5940 stmts are loop_vect.
5941 First collect all loop_vect stmts into a worklist.
5942 SLP patterns cause not all original scalar stmts to appear in
5943 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5944 Rectify this here and do a backward walk over the IL, only considering
5945 stmts as loop_vect when they are used by a loop_vect stmt; otherwise
5946 mark them as pure_slp. */
5947 auto_vec<stmt_vec_info> worklist;
5948 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5949 {
5950 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5951 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi);
5952 gsi_next (i: &gsi))
5953 {
5954 gphi *phi = gsi.phi ();
5955 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5956 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5957 maybe_push_to_hybrid_worklist (vinfo: loop_vinfo,
5958 worklist, stmt_info);
5959 }
5960 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (i: gsi);
5961 gsi_prev (i: &gsi))
5962 {
5963 gimple *stmt = gsi_stmt (i: gsi);
5964 if (is_gimple_debug (gs: stmt))
5965 continue;
5966 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
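 /* For a statement that was replaced by a pattern, consider the relevant
 statements of the pattern definition sequence and then continue with
 the pattern statement itself. */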
5967 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5968 {
5969 for (gimple_stmt_iterator gsi2
5970 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5971 !gsi_end_p (i: gsi2); gsi_next (i: &gsi2))
5972 {
5973 stmt_vec_info patt_info
5974 = loop_vinfo->lookup_stmt (gsi_stmt (i: gsi2));
5975 if (!STMT_SLP_TYPE (patt_info)
5976 && STMT_VINFO_RELEVANT (patt_info))
5977 maybe_push_to_hybrid_worklist (vinfo: loop_vinfo,
5978 worklist, stmt_info: patt_info);
5979 }
5980 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5981 }
5982 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5983 maybe_push_to_hybrid_worklist (vinfo: loop_vinfo,
5984 worklist, stmt_info);
5985 }
5986 }
5987
5988 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5989 mark any SLP vectorized stmt as hybrid.
5990 ??? We're visiting def stmts N times (once for each non-SLP and
5991 once for each hybrid-SLP use). */
5992 walk_stmt_info wi;
5993 vdhs_data dat;
5994 dat.worklist = &worklist;
5995 dat.loop_vinfo = loop_vinfo;
5996 memset (s: &wi, c: 0, n: sizeof (wi));
5997 wi.info = (void *)&dat;
5998 while (!worklist.is_empty ())
5999 {
6000 stmt_vec_info stmt_info = worklist.pop ();
6001 /* Since SSA operands are not set up for pattern stmts we need
6002 to use walk_gimple_op. */
6003 wi.is_lhs = 0;
6004 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6005 /* For gather/scatter make sure to walk the offset operand, that
6006 can be a scaling and conversion away. */
6007 gather_scatter_info gs_info;
6008 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6009 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6010 {
6011 int dummy;
6012 vect_detect_hybrid_slp (tp: &gs_info.offset, &dummy, data: &wi);
6013 }
6014 }
6015}
6016
6017
6018/* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6019
6020_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6021 : vec_info (vec_info::bb, shared),
6022 bbs (_bbs),
6023 roots (vNULL)
6024{
6025 for (unsigned i = 0; i < bbs.length (); ++i)
6026 {
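 /* PHIs in the first block of the region receive their arguments from
 outside the region, so do not treat them as region statements. */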
6027 if (i != 0)
6028 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (i: si);
6029 gsi_next (i: &si))
6030 {
6031 gphi *phi = si.phi ();
6032 gimple_set_uid (g: phi, uid: 0);
6033 add_stmt (phi);
6034 }
6035 for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bbs[i]);
6036 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
6037 {
6038 gimple *stmt = gsi_stmt (i: gsi);
6039 gimple_set_uid (g: stmt, uid: 0);
6040 if (is_gimple_debug (gs: stmt))
6041 continue;
6042 add_stmt (stmt);
6043 }
6044 }
6045}
6046
6047
6048/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6049 stmts in the basic block. */
6050
6051_bb_vec_info::~_bb_vec_info ()
6052{
6053 /* Reset region marker. */
6054 for (unsigned i = 0; i < bbs.length (); ++i)
6055 {
6056 if (i != 0)
6057 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (i: si);
6058 gsi_next (i: &si))
6059 {
6060 gphi *phi = si.phi ();
6061 gimple_set_uid (g: phi, uid: -1);
6062 }
6063 for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bbs[i]);
6064 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
6065 {
6066 gimple *stmt = gsi_stmt (i: gsi);
6067 gimple_set_uid (g: stmt, uid: -1);
6068 }
6069 }
6070
6071 for (unsigned i = 0; i < roots.length (); ++i)
6072 {
6073 roots[i].stmts.release ();
6074 roots[i].roots.release ();
6075 roots[i].remain.release ();
6076 }
6077 roots.release ();
6078}
6079
6080/* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6081 given that its child nodes have already been processed and that
6082 their def types currently match their SLP node's def type. */
6083
6084static bool
6085vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6086 slp_instance node_instance,
6087 stmt_vector_for_cost *cost_vec)
6088{
6089 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6090
6091 /* Calculate the number of vector statements to be created for the
6092 scalar stmts in this node. For SLP reductions it is equal to the
6093 number of vector statements in the children (which has already been
6094 calculated by the recursive call). Otherwise it is the number of
6095 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6096 VF divided by the number of elements in a vector. */
6097 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6098 && !STMT_VINFO_DATA_REF (stmt_info)
6099 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6100 {
6101 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6102 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6103 {
6104 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6105 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6106 break;
6107 }
6108 }
6109 else
6110 {
6111 poly_uint64 vf;
6112 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
6113 vf = loop_vinfo->vectorization_factor;
6114 else
6115 vf = 1;
6116 unsigned int group_size = SLP_TREE_LANES (node);
6117 tree vectype = SLP_TREE_VECTYPE (node);
6118 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6119 = vect_get_num_vectors (nunits: vf * group_size, vectype);
6120 }
6121
6122 /* Handle purely internal nodes. */
6123 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6124 {
6125 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6126 return false;
6127
6128 stmt_vec_info slp_stmt_info;
6129 unsigned int i;
6130 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6131 {
6132 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6133 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6134 node_instance, i,
6135 false, cost_vec))
6136 return false;
6137 }
6138 return true;
6139 }
6140
6141 bool dummy;
6142 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6143 node, node_instance, cost_vec);
6144}
6145
6146/* Try to build NODE from scalars, returning true on success.
6147 NODE_INSTANCE is the SLP instance that contains NODE. */
6148
6149static bool
6150vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6151 slp_instance node_instance)
6152{
6153 stmt_vec_info stmt_info;
6154 unsigned int i;
6155
6156 if (!is_a <bb_vec_info> (p: vinfo)
6157 || node == SLP_INSTANCE_TREE (node_instance)
6158 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6159 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6160 /* Force the mask use to be built from scalars instead. */
6161 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6162 return false;
6163
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_NOTE, vect_location,
6166 "Building vector operands of %p from scalars instead\n",
6167 (void *) node);
6168
6169 /* Don't remove and free the child nodes here, since they could be
6170 referenced by other structures. The analysis and scheduling phases
6171 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6172 unsigned int group_size = SLP_TREE_LANES (node);
6173 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6174 /* Invariants get their vector type from the uses. */
6175 SLP_TREE_VECTYPE (node) = NULL_TREE;
6176 SLP_TREE_SCALAR_OPS (node).safe_grow (len: group_size, exact: true);
6177 SLP_TREE_LOAD_PERMUTATION (node).release ();
6178 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6179 {
6180 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6181 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6182 }
6183 return true;
6184}
6185
6186/* Return true if all elements of the slice are the same. */
6187bool
6188vect_scalar_ops_slice::all_same_p () const
6189{
6190 for (unsigned int i = 1; i < length; ++i)
6191 if (!operand_equal_p (op (i: 0), op (i)))
6192 return false;
6193 return true;
6194}
6195
6196hashval_t
6197vect_scalar_ops_slice_hash::hash (const value_type &s)
6198{
6199 hashval_t hash = 0;
6200 for (unsigned i = 0; i < s.length; ++i)
6201 hash = iterative_hash_expr (tree: s.op (i), seed: hash);
6202 return hash;
6203}
6204
6205bool
6206vect_scalar_ops_slice_hash::equal (const value_type &s1,
6207 const compare_type &s2)
6208{
6209 if (s1.length != s2.length)
6210 return false;
6211 for (unsigned i = 0; i < s1.length; ++i)
6212 if (!operand_equal_p (s1.op (i), s2.op (i)))
6213 return false;
6214 return true;
6215}
6216
6217/* Compute the prologue cost for invariant or constant operands represented
6218 by NODE. */
6219
6220static void
6221vect_prologue_cost_for_slp (slp_tree node,
6222 stmt_vector_for_cost *cost_vec)
6223{
6224 /* There's a special case of an existing vector, which costs nothing. */
6225 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6226 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6227 return;
6228 /* Without looking at the actual initializer a vector of
6229 constants can be implemented as a load from the constant pool.
6230 When all elements are the same we can use a splat. */
6231 tree vectype = SLP_TREE_VECTYPE (node);
6232 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6233 unsigned HOST_WIDE_INT const_nunits;
6234 unsigned nelt_limit;
6235 auto ops = &SLP_TREE_SCALAR_OPS (node);
6236 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6237 if (TYPE_VECTOR_SUBPARTS (node: vectype).is_constant (const_value: &const_nunits)
6238 && ! multiple_p (a: const_nunits, b: group_size))
6239 {
6240 nelt_limit = const_nunits;
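 /* Cost each distinct vector only once; equal slices of the scalar
 operands yield equal vectors. */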
6241 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6242 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6243 if (!vector_ops.add (k: { .ops: ops, .start: i * const_nunits, .length: const_nunits }))
6244 starts.quick_push (obj: i * const_nunits);
6245 }
6246 else
6247 {
6248 /* If either the vector has variable length or the vectors
6249 are composed of repeated whole groups we only need to
6250 cost construction once. All vectors will be the same. */
6251 nelt_limit = group_size;
6252 starts.quick_push (obj: 0);
6253 }
6254 /* ??? We're just tracking whether vectors in a single node are the same.
6255 Ideally we'd do something more global. */
6256 bool passed = false;
6257 for (unsigned int start : starts)
6258 {
6259 vect_cost_for_stmt kind;
6260 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6261 kind = vector_load;
6262 else if (vect_scalar_ops_slice { .ops: ops, .start: start, .length: nelt_limit }.all_same_p ())
6263 kind = scalar_to_vec;
6264 else
6265 kind = vec_construct;
6266 /* The target cost hook has no idea which part of the SLP node
6267 we are costing so avoid passing it down more than once. Pass
6268 it to the first vec_construct or scalar_to_vec part since for those
6269 the x86 backend tries to account for GPR to XMM register moves. */
6270 record_stmt_cost (cost_vec, 1, kind,
6271 (kind != vector_load && !passed) ? node : nullptr,
6272 vectype, 0, vect_prologue);
6273 if (kind != vector_load)
6274 passed = true;
6275 }
6276}
6277
6278/* Analyze statements contained in SLP tree NODE after recursively analyzing
6279 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6280
6281 Return true if the operations are supported. */
6282
6283static bool
6284vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6285 slp_instance node_instance,
6286 hash_set<slp_tree> &visited_set,
6287 vec<slp_tree> &visited_vec,
6288 stmt_vector_for_cost *cost_vec)
6289{
6290 int i, j;
6291 slp_tree child;
6292
6293 /* Assume we can code-generate all invariants. */
6294 if (!node
6295 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6296 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6297 return true;
6298
6299 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6300 {
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_NOTE, vect_location,
6303 "Failed cyclic SLP reference in %p\n", (void *) node);
6304 return false;
6305 }
6306 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6307
6308 /* If we already analyzed the exact same set of scalar stmts we're done.
6309 We share the generated vector stmts for those. */
6310 if (visited_set.add (k: node))
6311 return true;
6312 visited_vec.safe_push (obj: node);
6313
6314 bool res = true;
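 /* Record where the recursively visited nodes and recorded costs start
 so that a failed analysis can be undone. */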
6315 unsigned visited_rec_start = visited_vec.length ();
6316 unsigned cost_vec_rec_start = cost_vec->length ();
6317 bool seen_non_constant_child = false;
6318 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6319 {
6320 res = vect_slp_analyze_node_operations (vinfo, node: child, node_instance,
6321 visited_set, visited_vec,
6322 cost_vec);
6323 if (!res)
6324 break;
6325 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6326 seen_non_constant_child = true;
6327 }
6328 /* We have difficulties scheduling nodes with just constant
6329 operands and no scalar stmts since we then cannot compute a stmt
6330 insertion place. */
6331 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6332 {
6333 if (dump_enabled_p ())
6334 dump_printf_loc (MSG_NOTE, vect_location,
6335 "Cannot vectorize all-constant op node %p\n",
6336 (void *) node);
6337 res = false;
6338 }
6339
6340 if (res)
6341 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6342 cost_vec);
6343 /* If analysis failed we have to pop all recursive visited nodes
6344 plus ourselves. */
6345 if (!res)
6346 {
6347 while (visited_vec.length () >= visited_rec_start)
6348 visited_set.remove (k: visited_vec.pop ());
6349 cost_vec->truncate (size: cost_vec_rec_start);
6350 }
6351
6352 /* When the node can be vectorized, cost the invariant nodes it references.
6353 This is not done in DFS order to allow the referring node's
6354 vectorizable_* calls to nail down the invariant nodes' vector type
6355 and possibly unshare it if it needs a different vector type than
6356 other referrers. */
6357 if (res)
6358 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6359 if (child
6360 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6361 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6362 /* Perform usual caching, note code-generation still
6363 code-gens these nodes multiple times but we expect
6364 to CSE them later. */
6365 && !visited_set.add (k: child))
6366 {
6367 visited_vec.safe_push (obj: child);
6368 /* ??? After auditing more code paths make a "default"
6369 and push the vector type from NODE to all children
6370 if it is not already set. */
6371 /* Compute the number of vectors to be generated. */
6372 tree vector_type = SLP_TREE_VECTYPE (child);
6373 if (!vector_type)
6374 {
6375 /* For shifts with a scalar argument we don't need
6376 to cost or code-generate anything.
6377 ??? Represent this more explicitly. */
6378 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6379 == shift_vec_info_type)
6380 && j == 1);
6381 continue;
6382 }
6383 unsigned group_size = SLP_TREE_LANES (child);
6384 poly_uint64 vf = 1;
6385 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (p: vinfo))
6386 vf = loop_vinfo->vectorization_factor;
6387 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6388 = vect_get_num_vectors (nunits: vf * group_size, vectype: vector_type);
6389 /* And cost them. */
6390 vect_prologue_cost_for_slp (node: child, cost_vec);
6391 }
6392
6393 /* If this node or any of its children can't be vectorized, try pruning
6394 the tree here rather than felling the whole thing. */
6395 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6396 {
6397 /* We'll need to revisit this for invariant costing and number
6398 of vectorized stmt setting. */
6399 res = true;
6400 }
6401
6402 return res;
6403}
6404
6405/* Mark lanes of NODE that are live outside of the basic-block vectorized
6406 region and that can be vectorized using vectorizable_live_operation
6407 with STMT_VINFO_LIVE_P. Live operations that are not handled cause the
6408 scalar code computing them to be retained. */
6409
6410static void
6411vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6412 slp_instance instance,
6413 stmt_vector_for_cost *cost_vec,
6414 hash_set<stmt_vec_info> &svisited,
6415 hash_set<slp_tree> &visited)
6416{
6417 if (visited.add (k: node))
6418 return;
6419
6420 unsigned i;
6421 stmt_vec_info stmt_info;
6422 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6423 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6424 {
6425 if (svisited.contains (k: stmt_info))
6426 continue;
6427 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6428 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6429 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6430 /* Only the pattern root stmt computes the original scalar value. */
6431 continue;
6432 bool mark_visited = true;
6433 gimple *orig_stmt = orig_stmt_info->stmt;
6434 ssa_op_iter op_iter;
6435 def_operand_p def_p;
6436 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6437 {
6438 imm_use_iterator use_iter;
6439 gimple *use_stmt;
6440 stmt_vec_info use_stmt_info;
6441 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6442 if (!is_gimple_debug (gs: use_stmt))
6443 {
6444 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6445 if (!use_stmt_info
6446 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6447 {
6448 STMT_VINFO_LIVE_P (stmt_info) = true;
6449 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6450 node, instance, i,
6451 false, cost_vec))
6452 /* ??? So we know we can vectorize the live stmt
6453 from one SLP node. If we cannot do so from all
6454 or none consistently we'd have to record which
6455 SLP node (and lane) we want to use for the live
6456 operation. So make sure we can code-generate
6457 from all nodes. */
6458 mark_visited = false;
6459 else
6460 STMT_VINFO_LIVE_P (stmt_info) = false;
6461 break;
6462 }
6463 }
6464 /* We have to verify whether we can insert the lane extract
6465 before all uses. The following is a conservative approximation.
6466 We cannot put this into vectorizable_live_operation because
6467 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6468 doesn't work.
6469 Note that while the fact that we emit code for loads at the
6470 first load should make this a non-problem, leaves we construct
6471 from scalars are vectorized after the last scalar def.
6472 ??? If we'd actually compute the insert location during
6473 analysis we could use sth less conservative than the last
6474 scalar stmt in the node for the dominance check. */
6475 /* ??? What remains is "live" uses in vector CTORs in the same
6476 SLP graph which is where those uses can end up code-generated
6477 right after their definition instead of close to their original
6478 use. But that would restrict us to code-generate lane-extracts
6479 from the latest stmt in a node. So we compensate for this
6480 during code-generation, simply not replacing uses for those
6481 hopefully rare cases. */
6482 if (STMT_VINFO_LIVE_P (stmt_info))
6483 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6484 if (!is_gimple_debug (gs: use_stmt)
6485 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6486 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6487 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6488 {
6489 if (dump_enabled_p ())
6490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6491 "Cannot determine insertion place for "
6492 "lane extract\n");
6493 STMT_VINFO_LIVE_P (stmt_info) = false;
6494 mark_visited = true;
6495 }
6496 }
6497 if (mark_visited)
6498 svisited.add (k: stmt_info);
6499 }
6500
6501 slp_tree child;
6502 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6503 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6504 vect_bb_slp_mark_live_stmts (bb_vinfo, node: child, instance,
6505 cost_vec, svisited, visited);
6506}
6507
6508/* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6509
6510static bool
6511vectorizable_bb_reduc_epilogue (slp_instance instance,
6512 stmt_vector_for_cost *cost_vec)
6513{
6514 gassign *stmt = as_a <gassign *> (p: instance->root_stmts[0]->stmt);
6515 enum tree_code reduc_code = gimple_assign_rhs_code (gs: stmt);
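 /* For a MINUS_EXPR reduction the epilogue still sums the elements,
 so check for and cost a PLUS_EXPR reduction. */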
6516 if (reduc_code == MINUS_EXPR)
6517 reduc_code = PLUS_EXPR;
6518 internal_fn reduc_fn;
6519 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6520 if (!vectype
6521 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6522 || reduc_fn == IFN_LAST
6523 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6524 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6525 TREE_TYPE (vectype)))
6526 {
6527 if (dump_enabled_p ())
6528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6529 "not vectorized: basic block reduction epilogue "
6530 "operation unsupported.\n");
6531 return false;
6532 }
6533
6534 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6535 cost log2 vector operations plus shuffles and one extraction. */
6536 unsigned steps = floor_log2 (x: vect_nunits_for_cost (vec_type: vectype));
6537 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6538 vectype, 0, vect_body);
6539 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6540 vectype, 0, vect_body);
6541 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6542 vectype, 0, vect_body);
6543
6544 /* Since we replace all stmts of a possibly longer scalar reduction
6545 chain, account for the extra scalar stmts for that. */
6546 record_stmt_cost (body_cost_vec: cost_vec, count: instance->remain_defs.length (), kind: scalar_stmt,
6547 stmt_info: instance->root_stmts[0], misalign: 0, where: vect_body);
6548 return true;
6549}
6550
6551/* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6552 and recurse to children. */
6553
6554static void
6555vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6556 hash_set<slp_tree> &visited)
6557{
6558 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6559 || visited.add (k: node))
6560 return;
6561
6562 stmt_vec_info stmt;
6563 unsigned i;
6564 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6565 roots.remove (k: vect_orig_stmt (stmt_info: stmt));
6566
6567 slp_tree child;
6568 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6569 if (child)
6570 vect_slp_prune_covered_roots (node: child, roots, visited);
6571}
6572
6573/* Analyze statements in SLP instances of VINFO. Return true if the
6574 operations are supported. */
6575
6576bool
6577vect_slp_analyze_operations (vec_info *vinfo)
6578{
6579 slp_instance instance;
6580 int i;
6581
6582 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6583
6584 hash_set<slp_tree> visited;
6585 for (i = 0; vinfo->slp_instances.iterate (ix: i, ptr: &instance); )
6586 {
6587 auto_vec<slp_tree> visited_vec;
6588 stmt_vector_for_cost cost_vec;
6589 cost_vec.create (nelems: 2);
6590 if (is_a <bb_vec_info> (p: vinfo))
6591 vect_location = instance->location ();
6592 if (!vect_slp_analyze_node_operations (vinfo,
6593 SLP_INSTANCE_TREE (instance),
6594 node_instance: instance, visited_set&: visited, visited_vec,
6595 cost_vec: &cost_vec)
6596 /* CTOR instances require vectorized defs for the SLP tree root. */
6597 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6598 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6599 != vect_internal_def
6600 /* Make sure we vectorized with the expected type. */
6601 || !useless_type_conversion_p
6602 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6603 (instance->root_stmts[0]->stmt))),
6604 TREE_TYPE (SLP_TREE_VECTYPE
6605 (SLP_INSTANCE_TREE (instance))))))
6606 /* Check we can vectorize the reduction. */
6607 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6608 && !vectorizable_bb_reduc_epilogue (instance, cost_vec: &cost_vec)))
6609 {
6610 slp_tree node = SLP_INSTANCE_TREE (instance);
6611 stmt_vec_info stmt_info;
6612 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6613 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6614 else
6615 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6616 if (dump_enabled_p ())
6617 dump_printf_loc (MSG_NOTE, vect_location,
6618 "removing SLP instance operations starting from: %G",
6619 stmt_info->stmt);
6620 vect_free_slp_instance (instance);
6621 vinfo->slp_instances.ordered_remove (ix: i);
6622 cost_vec.release ();
6623 while (!visited_vec.is_empty ())
6624 visited.remove (k: visited_vec.pop ());
6625 }
6626 else
6627 {
6628 i++;
6629 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (p: vinfo))
6630 {
6631 add_stmt_costs (costs: loop_vinfo->vector_costs, cost_vec: &cost_vec);
6632 cost_vec.release ();
6633 }
6634 else
6635 /* For BB vectorization remember the SLP graph entry
6636 cost for later. */
6637 instance->cost_vec = cost_vec;
6638 }
6639 }
6640
6641 /* Now look for SLP instances with a root that are covered by other
6642 instances and remove them. */
6643 hash_set<stmt_vec_info> roots;
6644 for (i = 0; vinfo->slp_instances.iterate (ix: i, ptr: &instance); ++i)
6645 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6646 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6647 if (!roots.is_empty ())
6648 {
6649 visited.empty ();
6650 for (i = 0; vinfo->slp_instances.iterate (ix: i, ptr: &instance); ++i)
6651 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6652 visited);
6653 for (i = 0; vinfo->slp_instances.iterate (ix: i, ptr: &instance); )
6654 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6655 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6656 {
6657 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6658 if (dump_enabled_p ())
6659 dump_printf_loc (MSG_NOTE, vect_location,
6660 "removing SLP instance operations starting "
6661 "from: %G", root->stmt);
6662 vect_free_slp_instance (instance);
6663 vinfo->slp_instances.ordered_remove (ix: i);
6664 }
6665 else
6666 ++i;
6667 }
6668
6669 /* Compute vectorizable live stmts. */
6670 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (p: vinfo))
6671 {
6672 hash_set<stmt_vec_info> svisited;
6673 hash_set<slp_tree> visited;
6674 for (i = 0; vinfo->slp_instances.iterate (ix: i, ptr: &instance); ++i)
6675 {
6676 vect_location = instance->location ();
6677 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6678 instance, cost_vec: &instance->cost_vec, svisited,
6679 visited);
6680 }
6681 }
6682
6683 return !vinfo->slp_instances.is_empty ();
6684}
6685
/* Get the SLP instance leader from INSTANCE_LEADER, transitively
compressing any chain of leaders on the way. */
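/* Illustrative example (instance names invented): if INSTANCE_LEADER
currently contains A -> B, B -> C and C -> C, then a call with
INSTANCE == A returns C and rewrites the visited entries so that
afterwards A -> C and B -> C, much like a union-find "find" with
path compression. */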
6688
6689static slp_instance
6690get_ultimate_leader (slp_instance instance,
6691 hash_map<slp_instance, slp_instance> &instance_leader)
6692{
6693 auto_vec<slp_instance *, 8> chain;
6694 slp_instance *tem;
6695 while (*(tem = instance_leader.get (k: instance)) != instance)
6696 {
6697 chain.safe_push (obj: tem);
6698 instance = *tem;
6699 }
6700 while (!chain.is_empty ())
6701 *chain.pop () = instance;
6702 return instance;
6703}
6704
6705namespace {
6706/* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6707 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6708 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6709
6710 INSTANCE_LEADER is as for get_ultimate_leader. */
6711
6712template<typename T>
6713bool
6714vect_map_to_instance (slp_instance instance, T key,
6715 hash_map<T, slp_instance> &key_to_instance,
6716 hash_map<slp_instance, slp_instance> &instance_leader)
6717{
6718 bool existed_p;
6719 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6720 if (!existed_p)
6721 ;
6722 else if (key_instance != instance)
6723 {
6724 /* If we're running into a previously marked key make us the
6725 leader of the current ultimate leader. This keeps the
6726 leader chain acyclic and works even when the current instance
6727 connects two previously independent graph parts. */
6728 slp_instance key_leader
6729 = get_ultimate_leader (instance: key_instance, instance_leader);
6730 if (key_leader != instance)
6731 instance_leader.put (k: key_leader, v: instance);
6732 }
6733 key_instance = instance;
6734 return existed_p;
6735}
6736}
6737
6738/* Worker of vect_bb_partition_graph, recurse on NODE. */
6739
6740static void
6741vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6742 slp_instance instance, slp_tree node,
6743 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6744 hash_map<slp_tree, slp_instance> &node_to_instance,
6745 hash_map<slp_instance, slp_instance> &instance_leader)
6746{
6747 stmt_vec_info stmt_info;
6748 unsigned i;
6749
6750 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6751 vect_map_to_instance (instance, key: stmt_info, key_to_instance&: stmt_to_instance,
6752 instance_leader);
6753
6754 if (vect_map_to_instance (instance, key: node, key_to_instance&: node_to_instance,
6755 instance_leader))
6756 return;
6757
6758 slp_tree child;
6759 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6760 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6761 vect_bb_partition_graph_r (bb_vinfo, instance, node: child, stmt_to_instance,
6762 node_to_instance, instance_leader);
6763}
6764
6765/* Partition the SLP graph into pieces that can be costed independently. */
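/* A sketch of the outcome (instance names invented): if instances I1
and I2 share a scalar stmt while I3 is disjoint from both, the walk
below gives I1 and I2 the same ultimate leader, so they end up in one
subgraph_entries vector and are costed together, whereas I3 forms its
own independently costed subgraph. */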
6766
6767static void
6768vect_bb_partition_graph (bb_vec_info bb_vinfo)
6769{
6770 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6771
/* First walk the SLP graph assigning each involved scalar stmt a
corresponding SLP graph entry and upon visiting a previously
marked stmt, make the stmt's leader the current SLP graph entry. */
6775 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6776 hash_map<slp_tree, slp_instance> node_to_instance;
6777 hash_map<slp_instance, slp_instance> instance_leader;
6778 slp_instance instance;
6779 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (ix: i, ptr: &instance); ++i)
6780 {
6781 instance_leader.put (k: instance, v: instance);
6782 vect_bb_partition_graph_r (bb_vinfo,
6783 instance, SLP_INSTANCE_TREE (instance),
6784 stmt_to_instance, node_to_instance,
6785 instance_leader);
6786 }
6787
6788 /* Then collect entries to each independent subgraph. */
6789 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (ix: i, ptr: &instance); ++i)
6790 {
6791 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6792 leader->subgraph_entries.safe_push (obj: instance);
6793 if (dump_enabled_p ()
6794 && leader != instance)
6795 dump_printf_loc (MSG_NOTE, vect_location,
6796 "instance %p is leader of %p\n",
6797 (void *) leader, (void *) instance);
6798 }
6799}
6800
6801/* Compute the set of scalar stmts participating in internal and external
6802 nodes. */
6803
6804static void
6805vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6806 hash_set<slp_tree> &visited,
6807 hash_set<stmt_vec_info> &vstmts,
6808 hash_set<stmt_vec_info> &estmts)
6809{
6810 int i;
6811 stmt_vec_info stmt_info;
6812 slp_tree child;
6813
6814 if (visited.add (k: node))
6815 return;
6816
6817 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6818 {
6819 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6820 vstmts.add (k: stmt_info);
6821
6822 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6823 if (child)
6824 vect_slp_gather_vectorized_scalar_stmts (vinfo, node: child, visited,
6825 vstmts, estmts);
6826 }
6827 else
6828 for (tree def : SLP_TREE_SCALAR_OPS (node))
6829 {
6830 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6831 if (def_stmt)
6832 estmts.add (k: def_stmt);
6833 }
6834}
6835
6836
/* Compute the scalar cost of the SLP node NODE and its children
and record it in COST_VEC. Do not account defs that are marked in
LIFE and update LIFE according to uses of NODE. */
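/* As an illustration: if lane 1 of NODE has a use outside of the
vectorized scalar stmts, (*LIFE)[1] gets set and neither that lane's
scalar stmt nor the corresponding lanes of the children are counted,
which lowers the scalar cost and thus makes vectorization harder to
justify for this subgraph. */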
6840
6841static void
6842vect_bb_slp_scalar_cost (vec_info *vinfo,
6843 slp_tree node, vec<bool, va_heap> *life,
6844 stmt_vector_for_cost *cost_vec,
6845 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6846 hash_set<slp_tree> &visited)
6847{
6848 unsigned i;
6849 stmt_vec_info stmt_info;
6850 slp_tree child;
6851
6852 if (visited.add (k: node))
6853 return;
6854
6855 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6856 {
6857 ssa_op_iter op_iter;
6858 def_operand_p def_p;
6859
6860 if ((*life)[i])
6861 continue;
6862
6863 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6864 gimple *orig_stmt = orig_stmt_info->stmt;
6865
6866 /* If there is a non-vectorized use of the defs then the scalar
6867 stmt is kept live in which case we do not account it or any
6868 required defs in the SLP children in the scalar cost. This
6869 way we make the vectorization more costly when compared to
6870 the scalar cost. */
6871 if (!STMT_VINFO_LIVE_P (stmt_info))
6872 {
6873 auto_vec<gimple *, 8> worklist;
6874 hash_set<gimple *> *worklist_visited = NULL;
6875 worklist.quick_push (obj: orig_stmt);
6876 do
6877 {
6878 gimple *work_stmt = worklist.pop ();
6879 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6880 {
6881 imm_use_iterator use_iter;
6882 gimple *use_stmt;
6883 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6884 DEF_FROM_PTR (def_p))
6885 if (!is_gimple_debug (gs: use_stmt))
6886 {
6887 stmt_vec_info use_stmt_info
6888 = vinfo->lookup_stmt (use_stmt);
6889 if (!use_stmt_info
6890 || !vectorized_scalar_stmts.contains (k: use_stmt_info))
6891 {
6892 if (use_stmt_info
6893 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6894 {
6895 /* For stmts participating in patterns we have
6896 to check its uses recursively. */
6897 if (!worklist_visited)
6898 worklist_visited = new hash_set<gimple *> ();
6899 if (!worklist_visited->add (k: use_stmt))
6900 worklist.safe_push (obj: use_stmt);
6901 continue;
6902 }
6903 (*life)[i] = true;
6904 goto next_lane;
6905 }
6906 }
6907 }
6908 }
6909 while (!worklist.is_empty ());
6910next_lane:
6911 if (worklist_visited)
6912 delete worklist_visited;
6913 if ((*life)[i])
6914 continue;
6915 }
6916
6917 /* Count scalar stmts only once. */
6918 if (gimple_visited_p (stmt: orig_stmt))
6919 continue;
6920 gimple_set_visited (stmt: orig_stmt, visited_p: true);
6921
6922 vect_cost_for_stmt kind;
6923 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6924 {
6925 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6926 kind = scalar_load;
6927 else
6928 kind = scalar_store;
6929 }
6930 else if (vect_nop_conversion_p (orig_stmt_info))
6931 continue;
6932 /* For single-argument PHIs assume coalescing which means zero cost
6933 for the scalar and the vector PHIs. This avoids artificially
6934 favoring the vector path (but may pessimize it in some cases). */
6935 else if (is_a <gphi *> (p: orig_stmt_info->stmt)
6936 && gimple_phi_num_args
6937 (gs: as_a <gphi *> (p: orig_stmt_info->stmt)) == 1)
6938 continue;
6939 else
6940 kind = scalar_stmt;
6941 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6942 SLP_TREE_VECTYPE (node), 0, vect_body);
6943 }
6944
6945 auto_vec<bool, 20> subtree_life;
6946 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6947 {
6948 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6949 {
6950 /* Do not directly pass LIFE to the recursive call, copy it to
6951 confine changes in the callee to the current child/subtree. */
6952 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6953 {
6954 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), exact: true);
6955 for (unsigned j = 0;
6956 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6957 {
6958 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6959 if (perm.first == i)
6960 subtree_life[perm.second] = (*life)[j];
6961 }
6962 }
6963 else
6964 {
6965 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6966 subtree_life.safe_splice (src: *life);
6967 }
6968 vect_bb_slp_scalar_cost (vinfo, node: child, life: &subtree_life, cost_vec,
6969 vectorized_scalar_stmts, visited);
6970 subtree_life.truncate (size: 0);
6971 }
6972 }
6973}
6974
6975/* Comparator for the loop-index sorted cost vectors. */
6976
6977static int
6978li_cost_vec_cmp (const void *a_, const void *b_)
6979{
6980 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6981 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6982 if (a->first < b->first)
6983 return -1;
6984 else if (a->first == b->first)
6985 return 0;
6986 return 1;
6987}
6988
6989/* Check if vectorization of the basic block is profitable for the
6990 subgraph denoted by SLP_INSTANCES. */
6991
6992static bool
6993vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6994 vec<slp_instance> slp_instances,
6995 loop_p orig_loop)
6996{
6997 slp_instance instance;
6998 int i;
6999 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7000 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7001
7002 if (dump_enabled_p ())
7003 {
7004 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7005 hash_set<slp_tree> visited;
7006 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7007 vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location,
7008 SLP_INSTANCE_TREE (instance), visited);
7009 }
7010
7011 /* Compute the set of scalar stmts we know will go away 'locally' when
7012 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7013 not accurate for nodes promoted extern late or for scalar stmts that
7014 are used both in extern defs and in vectorized defs. */
7015 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7016 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7017 hash_set<slp_tree> visited;
7018 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7019 {
7020 vect_slp_gather_vectorized_scalar_stmts (vinfo: bb_vinfo,
7021 SLP_INSTANCE_TREE (instance),
7022 visited,
7023 vstmts&: vectorized_scalar_stmts,
7024 estmts&: scalar_stmts_in_externs);
7025 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7026 vectorized_scalar_stmts.add (k: rstmt);
7027 }
/* Scalar stmts used as defs in external nodes need to be preserved, so
remove them from vectorized_scalar_stmts. */
7030 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7031 vectorized_scalar_stmts.remove (k: stmt);
7032
7033 /* Calculate scalar cost and sum the cost for the vector stmts
7034 previously collected. */
7035 stmt_vector_for_cost scalar_costs = vNULL;
7036 stmt_vector_for_cost vector_costs = vNULL;
7037 visited.empty ();
7038 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7039 {
7040 auto_vec<bool, 20> life;
7041 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7042 exact: true);
7043 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7044 record_stmt_cost (body_cost_vec: &scalar_costs,
7045 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7046 kind: scalar_stmt,
7047 SLP_INSTANCE_ROOT_STMTS (instance)[0], misalign: 0, where: vect_body);
7048 vect_bb_slp_scalar_cost (vinfo: bb_vinfo,
7049 SLP_INSTANCE_TREE (instance),
7050 life: &life, cost_vec: &scalar_costs, vectorized_scalar_stmts,
7051 visited);
7052 vector_costs.safe_splice (src: instance->cost_vec);
7053 instance->cost_vec.release ();
7054 }
7055
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7058
/* When costing non-loop vectorization we need to consider each covered
loop independently and make sure vectorization is profitable. For
now we assume a loop may not be entered or may execute an arbitrary
number of iterations (??? static information can provide more
precise info here), which means we can simply cost each containing
loop's stmts separately. */
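/* For example (loop numbers assumed): with scalar and vector cost
entries in loops 1 and 2 the sorted li_scalar_costs/li_vector_costs
group all loop-1 entries before the loop-2 ones; the loop below then
compares the per-loop sums in turn and rejects the whole subgraph as
soon as one loop's part is not profitable. */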
7065
7066 /* First produce cost vectors sorted by loop index. */
7067 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7068 li_scalar_costs (scalar_costs.length ());
7069 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7070 li_vector_costs (vector_costs.length ());
7071 stmt_info_for_cost *cost;
7072 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7073 {
7074 unsigned l = gimple_bb (g: cost->stmt_info->stmt)->loop_father->num;
7075 li_scalar_costs.quick_push (obj: std::make_pair (x&: l, y&: cost));
7076 }
/* Use the loop of the first scalar cost entry as a fallback in case
the first vector_costs entry does not have a stmt_info associated
with it. */
7079 unsigned l = li_scalar_costs[0].first;
7080 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7081 {
/* If COST has no stmt_info, inherit the loop from the previous
entry; invariant, external and extract costs immediately follow
the cost for the related stmt. */
7084 if (cost->stmt_info)
7085 l = gimple_bb (g: cost->stmt_info->stmt)->loop_father->num;
7086 li_vector_costs.quick_push (obj: std::make_pair (x&: l, y&: cost));
7087 }
7088 li_scalar_costs.qsort (li_cost_vec_cmp);
7089 li_vector_costs.qsort (li_cost_vec_cmp);
7090
7091 /* Now cost the portions individually. */
7092 unsigned vi = 0;
7093 unsigned si = 0;
7094 bool profitable = true;
7095 while (si < li_scalar_costs.length ()
7096 && vi < li_vector_costs.length ())
7097 {
7098 unsigned sl = li_scalar_costs[si].first;
7099 unsigned vl = li_vector_costs[vi].first;
7100 if (sl != vl)
7101 {
7102 if (dump_enabled_p ())
7103 dump_printf_loc (MSG_NOTE, vect_location,
7104 "Scalar %d and vector %d loop part do not "
7105 "match up, skipping scalar part\n", sl, vl);
7106 /* Skip the scalar part, assuming zero cost on the vector side. */
7107 do
7108 {
7109 si++;
7110 }
7111 while (si < li_scalar_costs.length ()
7112 && li_scalar_costs[si].first == sl);
7113 continue;
7114 }
7115
7116 class vector_costs *scalar_target_cost_data = init_cost (vinfo: bb_vinfo, costing_for_scalar: true);
7117 do
7118 {
7119 add_stmt_cost (costs: scalar_target_cost_data, i: li_scalar_costs[si].second);
7120 si++;
7121 }
7122 while (si < li_scalar_costs.length ()
7123 && li_scalar_costs[si].first == sl);
7124 unsigned dummy;
7125 finish_cost (costs: scalar_target_cost_data, scalar_costs: nullptr,
7126 prologue_cost: &dummy, body_cost: &scalar_cost, epilogue_cost: &dummy);
7127
7128 /* Complete the target-specific vector cost calculation. */
7129 class vector_costs *vect_target_cost_data = init_cost (vinfo: bb_vinfo, costing_for_scalar: false);
7130 do
7131 {
7132 add_stmt_cost (costs: vect_target_cost_data, i: li_vector_costs[vi].second);
7133 vi++;
7134 }
7135 while (vi < li_vector_costs.length ()
7136 && li_vector_costs[vi].first == vl);
7137 finish_cost (costs: vect_target_cost_data, scalar_costs: scalar_target_cost_data,
7138 prologue_cost: &vec_prologue_cost, body_cost: &vec_inside_cost, epilogue_cost: &vec_epilogue_cost);
7139 delete scalar_target_cost_data;
7140 delete vect_target_cost_data;
7141
7142 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7143
7144 if (dump_enabled_p ())
7145 {
7146 dump_printf_loc (MSG_NOTE, vect_location,
7147 "Cost model analysis for part in loop %d:\n", sl);
7148 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7149 vec_inside_cost + vec_outside_cost);
7150 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7151 }
7152
/* Vectorization is profitable if its cost does not exceed the cost of
the scalar version. Note that we err on the vector side for equal
cost because the cost estimate is otherwise quite pessimistic
(constant uses are free on the scalar side but cost a load on the
vector side for example). */
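/* E.g. (numbers purely illustrative): a scalar cost of 8 against a
vector cost of 5 + 3 = 8 still vectorizes; only a strictly larger
vector cost rejects this part. */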
7158 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7159 {
7160 profitable = false;
7161 break;
7162 }
7163 }
7164 if (profitable && vi < li_vector_costs.length ())
7165 {
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_NOTE, vect_location,
7168 "Excess vector cost for part in loop %d:\n",
7169 li_vector_costs[vi].first);
7170 profitable = false;
7171 }
7172
7173 /* Unset visited flag. This is delayed when the subgraph is profitable
7174 and we process the loop for remaining unvectorized if-converted code. */
7175 if (!orig_loop || !profitable)
7176 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7177 gimple_set_visited (stmt: cost->stmt_info->stmt, visited_p: false);
7178
7179 scalar_costs.release ();
7180 vector_costs.release ();
7181
7182 return profitable;
7183}
7184
7185/* qsort comparator for lane defs. */
7186
7187static int
7188vld_cmp (const void *a_, const void *b_)
7189{
7190 auto *a = (const std::pair<unsigned, tree> *)a_;
7191 auto *b = (const std::pair<unsigned, tree> *)b_;
7192 return a->first - b->first;
7193}
7194
7195/* Return true if USE_STMT is a vector lane insert into VEC and set
7196 *THIS_LANE to the lane number that is set. */
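/* For example (SSA names invented), with 32-bit vector elements the stmt
vec_2 = BIT_INSERT_EXPR <vec_1, s_5, 64>;
is a lane insert of the scalar s_5 into vec_1 and sets *THIS_LANE to 2,
since bit position 64 is twice the element size. */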
7197
7198static bool
7199vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7200{
7201 gassign *use_ass = dyn_cast <gassign *> (p: use_stmt);
7202 if (!use_ass
7203 || gimple_assign_rhs_code (gs: use_ass) != BIT_INSERT_EXPR
7204 || (vec
7205 ? gimple_assign_rhs1 (gs: use_ass) != vec
7206 : ((vec = gimple_assign_rhs1 (gs: use_ass)), false))
7207 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7208 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7209 || !constant_multiple_p
7210 (a: tree_to_poly_uint64 (gimple_assign_rhs3 (gs: use_ass)),
7211 b: tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7212 multiple: this_lane))
7213 return false;
7214 return true;
7215}
7216
7217/* Find any vectorizable constructors and add them to the grouped_store
7218 array. */
7219
7220static void
7221vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7222{
7223 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7224 for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bb_vinfo->bbs[i]);
7225 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
7226 {
7227 gassign *assign = dyn_cast<gassign *> (p: gsi_stmt (i: gsi));
7228 if (!assign)
7229 continue;
7230
7231 tree rhs = gimple_assign_rhs1 (gs: assign);
7232 enum tree_code code = gimple_assign_rhs_code (gs: assign);
7233 use_operand_p use_p;
7234 gimple *use_stmt;
7235 if (code == CONSTRUCTOR)
7236 {
7237 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7238 || maybe_ne (a: TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7239 CONSTRUCTOR_NELTS (rhs))
7240 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7241 || uniform_vector_p (rhs))
7242 continue;
7243
7244 unsigned j;
7245 tree val;
7246 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7247 if (TREE_CODE (val) != SSA_NAME
7248 || !bb_vinfo->lookup_def (val))
7249 break;
7250 if (j != CONSTRUCTOR_NELTS (rhs))
7251 continue;
7252
7253 vec<stmt_vec_info> roots = vNULL;
7254 roots.safe_push (obj: bb_vinfo->lookup_stmt (assign));
7255 vec<stmt_vec_info> stmts;
7256 stmts.create (CONSTRUCTOR_NELTS (rhs));
7257 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7258 stmts.quick_push
7259 (obj: vect_stmt_to_vectorize (stmt_info: bb_vinfo->lookup_def (val)));
7260 bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_ctor,
7261 stmts, roots));
7262 }
7263 else if (code == BIT_INSERT_EXPR
7264 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7265 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7266 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7267 && integer_zerop (gimple_assign_rhs3 (gs: assign))
7268 && useless_type_conversion_p
7269 (TREE_TYPE (TREE_TYPE (rhs)),
7270 TREE_TYPE (gimple_assign_rhs2 (assign)))
7271 && bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: assign)))
7272 {
/* We start matching at an insert to lane zero, but since the
inserts need not be ordered we have to search both
the def and the use chains. */
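/* A possible shape of such a chain (SSA names invented), here for a
four-lane vector with 32-bit elements:
x_1 = BIT_INSERT_EXPR <x_0, a_0, 0>; <- ASSIGN, lane 0, match starts here
x_2 = BIT_INSERT_EXPR <x_1, a_1, 32>; <- found via the use chain
x_3 = BIT_INSERT_EXPR <x_2, a_2, 64>;
x_4 = BIT_INSERT_EXPR <x_3, a_3, 96>; <- becomes the instance root
The lane-zero insert need not sit at either end of the chain, hence the
additional walk of the def chain below when lanes remain unfound. */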
7276 tree vectype = TREE_TYPE (rhs);
7277 unsigned nlanes = TYPE_VECTOR_SUBPARTS (node: vectype).to_constant ();
7278 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7279 auto_sbitmap lanes (nlanes);
7280 bitmap_clear (lanes);
7281 bitmap_set_bit (map: lanes, bitno: 0);
7282 tree def = gimple_assign_lhs (gs: assign);
7283 lane_defs.quick_push
7284 (obj: std::make_pair (x: 0, y: gimple_assign_rhs2 (gs: assign)));
7285 unsigned lanes_found = 1;
7286 /* Start with the use chains, the last stmt will be the root. */
7287 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7288 vec<stmt_vec_info> roots = vNULL;
7289 roots.safe_push (obj: last);
7290 do
7291 {
7292 use_operand_p use_p;
7293 gimple *use_stmt;
7294 if (!single_imm_use (var: def, use_p: &use_p, stmt: &use_stmt))
7295 break;
7296 unsigned this_lane;
7297 if (!bb_vinfo->lookup_stmt (use_stmt)
7298 || !vect_slp_is_lane_insert (use_stmt, vec: def, this_lane: &this_lane)
7299 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: use_stmt)))
7300 break;
7301 if (bitmap_bit_p (map: lanes, bitno: this_lane))
7302 break;
7303 lanes_found++;
7304 bitmap_set_bit (map: lanes, bitno: this_lane);
7305 gassign *use_ass = as_a <gassign *> (p: use_stmt);
7306 lane_defs.quick_push (obj: std::make_pair
7307 (x&: this_lane, y: gimple_assign_rhs2 (gs: use_ass)));
7308 last = bb_vinfo->lookup_stmt (use_ass);
7309 roots.safe_push (obj: last);
7310 def = gimple_assign_lhs (gs: use_ass);
7311 }
7312 while (lanes_found < nlanes);
7313 if (roots.length () > 1)
7314 std::swap(a&: roots[0], b&: roots[roots.length () - 1]);
7315 if (lanes_found < nlanes)
7316 {
7317 /* Now search the def chain. */
7318 def = gimple_assign_rhs1 (gs: assign);
7319 do
7320 {
7321 if (TREE_CODE (def) != SSA_NAME
7322 || !has_single_use (var: def))
7323 break;
7324 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7325 unsigned this_lane;
7326 if (!bb_vinfo->lookup_stmt (def_stmt)
7327 || !vect_slp_is_lane_insert (use_stmt: def_stmt,
7328 NULL_TREE, this_lane: &this_lane)
7329 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (gs: def_stmt)))
7330 break;
7331 if (bitmap_bit_p (map: lanes, bitno: this_lane))
7332 break;
7333 lanes_found++;
7334 bitmap_set_bit (map: lanes, bitno: this_lane);
7335 lane_defs.quick_push (obj: std::make_pair
7336 (x&: this_lane,
7337 y: gimple_assign_rhs2 (gs: def_stmt)));
7338 roots.safe_push (obj: bb_vinfo->lookup_stmt (def_stmt));
7339 def = gimple_assign_rhs1 (gs: def_stmt);
7340 }
7341 while (lanes_found < nlanes);
7342 }
7343 if (lanes_found == nlanes)
7344 {
/* Sort lane_defs by lane index and register the root. */
7346 lane_defs.qsort (vld_cmp);
7347 vec<stmt_vec_info> stmts;
7348 stmts.create (nelems: nlanes);
7349 for (unsigned i = 0; i < nlanes; ++i)
7350 stmts.quick_push (obj: bb_vinfo->lookup_def (lane_defs[i].second));
7351 bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_ctor,
7352 stmts, roots));
7353 }
7354 else
7355 roots.release ();
7356 }
7357 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7358 && (associative_tree_code (code) || code == MINUS_EXPR)
7359 /* ??? This pessimizes a two-element reduction. PR54400.
7360 ??? In-order reduction could be handled if we only
7361 traverse one operand chain in vect_slp_linearize_chain. */
7362 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7363 /* Ops with constants at the tail can be stripped here. */
7364 && TREE_CODE (rhs) == SSA_NAME
7365 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7366 /* Should be the chain end. */
7367 && (!single_imm_use (var: gimple_assign_lhs (gs: assign),
7368 use_p: &use_p, stmt: &use_stmt)
7369 || !is_gimple_assign (gs: use_stmt)
7370 || (gimple_assign_rhs_code (gs: use_stmt) != code
7371 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7372 || (gimple_assign_rhs_code (gs: use_stmt)
7373 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7374 {
7375 /* We start the match at the end of a possible association
7376 chain. */
7377 auto_vec<chain_op_t> chain;
7378 auto_vec<std::pair<tree_code, gimple *> > worklist;
7379 auto_vec<gimple *> chain_stmts;
7380 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7381 if (code == MINUS_EXPR)
7382 code = PLUS_EXPR;
7383 internal_fn reduc_fn;
7384 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7385 || reduc_fn == IFN_LAST)
7386 continue;
7387 vect_slp_linearize_chain (vinfo: bb_vinfo, worklist, chain, code, start: assign,
7388 /* ??? */
7389 code_stmt, alt_code_stmt, chain_stmts: &chain_stmts);
7390 if (chain.length () > 1)
7391 {
7392 /* Sort the chain according to def_type and operation. */
7393 chain.sort (cmp: dt_sort_cmp, data: bb_vinfo);
7394 /* ??? Now we'd want to strip externals and constants
7395 but record those to be handled in the epilogue. */
7396 /* ??? For now do not allow mixing ops or externs/constants. */
7397 bool invalid = false;
7398 unsigned remain_cnt = 0;
7399 for (unsigned i = 0; i < chain.length (); ++i)
7400 {
7401 if (chain[i].code != code)
7402 {
7403 invalid = true;
7404 break;
7405 }
7406 if (chain[i].dt != vect_internal_def)
7407 remain_cnt++;
7408 }
7409 if (!invalid && chain.length () - remain_cnt > 1)
7410 {
7411 vec<stmt_vec_info> stmts;
7412 vec<tree> remain = vNULL;
7413 stmts.create (nelems: chain.length ());
7414 if (remain_cnt > 0)
7415 remain.create (nelems: remain_cnt);
7416 for (unsigned i = 0; i < chain.length (); ++i)
7417 {
7418 if (chain[i].dt == vect_internal_def)
7419 stmts.quick_push (obj: bb_vinfo->lookup_def (chain[i].op));
7420 else
7421 remain.quick_push (obj: chain[i].op);
7422 }
7423 vec<stmt_vec_info> roots;
7424 roots.create (nelems: chain_stmts.length ());
7425 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7426 roots.quick_push (obj: bb_vinfo->lookup_stmt (chain_stmts[i]));
7427 bb_vinfo->roots.safe_push (obj: slp_root (slp_inst_kind_bb_reduc,
7428 stmts, roots, remain));
7429 }
7430 }
7431 }
7432 }
7433}
7434
7435/* Walk the grouped store chains and replace entries with their
7436 pattern variant if any. */
7437
7438static void
7439vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7440{
7441 stmt_vec_info first_element;
7442 unsigned i;
7443
7444 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7445 {
7446 /* We also have CTORs in this array. */
7447 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7448 continue;
7449 if (STMT_VINFO_IN_PATTERN_P (first_element))
7450 {
7451 stmt_vec_info orig = first_element;
7452 first_element = STMT_VINFO_RELATED_STMT (first_element);
7453 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7454 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7455 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7456 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7457 vinfo->grouped_stores[i] = first_element;
7458 }
7459 stmt_vec_info prev = first_element;
7460 while (DR_GROUP_NEXT_ELEMENT (prev))
7461 {
7462 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7463 if (STMT_VINFO_IN_PATTERN_P (elt))
7464 {
7465 stmt_vec_info orig = elt;
7466 elt = STMT_VINFO_RELATED_STMT (elt);
7467 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7468 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7469 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7470 }
7471 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7472 prev = elt;
7473 }
7474 }
7475}
7476
7477/* Check if the region described by BB_VINFO can be vectorized, returning
7478 true if so. When returning false, set FATAL to true if the same failure
7479 would prevent vectorization at other vector sizes, false if it is still
7480 worth trying other sizes. N_STMTS is the number of statements in the
7481 region. */
7482
7483static bool
7484vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7485 vec<int> *dataref_groups)
7486{
7487 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7488
7489 slp_instance instance;
7490 int i;
7491 poly_uint64 min_vf = 2;
7492
7493 /* The first group of checks is independent of the vector size. */
7494 fatal = true;
7495
7496 /* Analyze the data references. */
7497
7498 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7499 {
7500 if (dump_enabled_p ())
7501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7502 "not vectorized: unhandled data-ref in basic "
7503 "block.\n");
7504 return false;
7505 }
7506
7507 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7508 {
7509 if (dump_enabled_p ())
7510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7511 "not vectorized: unhandled data access in "
7512 "basic block.\n");
7513 return false;
7514 }
7515
7516 vect_slp_check_for_roots (bb_vinfo);
7517
7518 /* If there are no grouped stores and no constructors in the region
7519 there is no need to continue with pattern recog as vect_analyze_slp
7520 will fail anyway. */
7521 if (bb_vinfo->grouped_stores.is_empty ()
7522 && bb_vinfo->roots.is_empty ())
7523 {
7524 if (dump_enabled_p ())
7525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7526 "not vectorized: no grouped stores in "
7527 "basic block.\n");
7528 return false;
7529 }
7530
/* The rest of the analysis below depends on the vector size in some way. */
7532 fatal = false;
7533
7534 vect_pattern_recog (bb_vinfo);
7535
7536 /* Update store groups from pattern processing. */
7537 vect_fixup_store_groups_with_patterns (vinfo: bb_vinfo);
7538
7539 /* Check the SLP opportunities in the basic block, analyze and build SLP
7540 trees. */
7541 if (!vect_analyze_slp (vinfo: bb_vinfo, max_tree_size: n_stmts))
7542 {
7543 if (dump_enabled_p ())
7544 {
7545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7546 "Failed to SLP the basic block.\n");
7547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7548 "not vectorized: failed to find SLP opportunities "
7549 "in basic block.\n");
7550 }
7551 return false;
7552 }
7553
7554 /* Optimize permutations. */
7555 vect_optimize_slp (vinfo: bb_vinfo);
7556
7557 /* Gather the loads reachable from the SLP graph entries. */
7558 vect_gather_slp_loads (vinfo: bb_vinfo);
7559
7560 vect_record_base_alignments (bb_vinfo);
7561
7562 /* Analyze and verify the alignment of data references and the
7563 dependence in the SLP instances. */
7564 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (ix: i, ptr: &instance); )
7565 {
7566 vect_location = instance->location ();
7567 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7568 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7569 {
7570 slp_tree node = SLP_INSTANCE_TREE (instance);
7571 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7572 if (dump_enabled_p ())
7573 dump_printf_loc (MSG_NOTE, vect_location,
7574 "removing SLP instance operations starting from: %G",
7575 stmt_info->stmt);
7576 vect_free_slp_instance (instance);
7577 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (ix: i);
7578 continue;
7579 }
7580
7581 /* Mark all the statements that we want to vectorize as pure SLP and
7582 relevant. */
7583 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7584 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7585 unsigned j;
7586 stmt_vec_info root;
7587 /* Likewise consider instance root stmts as vectorized. */
7588 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7589 STMT_SLP_TYPE (root) = pure_slp;
7590
7591 i++;
7592 }
7593 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7594 return false;
7595
7596 if (!vect_slp_analyze_operations (vinfo: bb_vinfo))
7597 {
7598 if (dump_enabled_p ())
7599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7600 "not vectorized: bad operation in basic block.\n");
7601 return false;
7602 }
7603
7604 vect_bb_partition_graph (bb_vinfo);
7605
7606 return true;
7607}
7608
7609/* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7610 basic blocks in BBS, returning true on success.
7611 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7612
7613static bool
7614vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7615 vec<int> *dataref_groups, unsigned int n_stmts,
7616 loop_p orig_loop)
7617{
7618 bb_vec_info bb_vinfo;
7619 auto_vector_modes vector_modes;
7620
7621 /* Autodetect first vector size we try. */
7622 machine_mode next_vector_mode = VOIDmode;
7623 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7624 unsigned int mode_i = 0;
7625
7626 vec_info_shared shared;
7627
7628 machine_mode autodetected_vector_mode = VOIDmode;
7629 while (1)
7630 {
7631 bool vectorized = false;
7632 bool fatal = false;
7633 bb_vinfo = new _bb_vec_info (bbs, &shared);
7634
7635 bool first_time_p = shared.datarefs.is_empty ();
7636 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7637 if (first_time_p)
7638 bb_vinfo->shared->save_datarefs ();
7639 else
7640 bb_vinfo->shared->check_datarefs ();
7641 bb_vinfo->vector_mode = next_vector_mode;
7642
7643 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7644 {
7645 if (dump_enabled_p ())
7646 {
7647 dump_printf_loc (MSG_NOTE, vect_location,
7648 "***** Analysis succeeded with vector mode"
7649 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7650 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7651 }
7652
7653 bb_vinfo->shared->check_datarefs ();
7654
7655 auto_vec<slp_instance> profitable_subgraphs;
7656 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7657 {
7658 if (instance->subgraph_entries.is_empty ())
7659 continue;
7660
7661 dump_user_location_t saved_vect_location = vect_location;
7662 vect_location = instance->location ();
7663 if (!unlimited_cost_model (NULL)
7664 && !vect_bb_vectorization_profitable_p
7665 (bb_vinfo, slp_instances: instance->subgraph_entries, orig_loop))
7666 {
7667 if (dump_enabled_p ())
7668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7669 "not vectorized: vectorization is not "
7670 "profitable.\n");
7671 vect_location = saved_vect_location;
7672 continue;
7673 }
7674
7675 vect_location = saved_vect_location;
7676 if (!dbg_cnt (index: vect_slp))
7677 continue;
7678
7679 profitable_subgraphs.safe_push (obj: instance);
7680 }
7681
7682 /* When we're vectorizing an if-converted loop body make sure
7683 we vectorized all if-converted code. */
7684 if (!profitable_subgraphs.is_empty ()
7685 && orig_loop)
7686 {
7687 gcc_assert (bb_vinfo->bbs.length () == 1);
7688 for (gimple_stmt_iterator gsi = gsi_start_bb (bb: bb_vinfo->bbs[0]);
7689 !gsi_end_p (i: gsi); gsi_next (i: &gsi))
7690 {
7691 /* The costing above left us with DCEable vectorized scalar
7692 stmts having the visited flag set on profitable
7693 subgraphs. Do the delayed clearing of the flag here. */
7694 if (gimple_visited_p (stmt: gsi_stmt (i: gsi)))
7695 {
7696 gimple_set_visited (stmt: gsi_stmt (i: gsi), visited_p: false);
7697 continue;
7698 }
7699 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7700 continue;
7701
7702 if (gassign *ass = dyn_cast <gassign *> (p: gsi_stmt (i: gsi)))
7703 if (gimple_assign_rhs_code (gs: ass) == COND_EXPR)
7704 {
7705 if (!profitable_subgraphs.is_empty ()
7706 && dump_enabled_p ())
7707 dump_printf_loc (MSG_NOTE, vect_location,
7708 "not profitable because of "
7709 "unprofitable if-converted scalar "
7710 "code\n");
7711 profitable_subgraphs.truncate (size: 0);
7712 }
7713 }
7714 }
7715
7716 /* Finally schedule the profitable subgraphs. */
7717 for (slp_instance instance : profitable_subgraphs)
7718 {
7719 if (!vectorized && dump_enabled_p ())
7720 dump_printf_loc (MSG_NOTE, vect_location,
7721 "Basic block will be vectorized "
7722 "using SLP\n");
7723 vectorized = true;
7724
7725 /* Dump before scheduling as store vectorization will remove
7726 the original stores and mess with the instance tree
7727 so querying its location will eventually ICE. */
7728 if (flag_checking)
7729 for (slp_instance sub : instance->subgraph_entries)
7730 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7731 unsigned HOST_WIDE_INT bytes;
7732 if (dump_enabled_p ())
7733 for (slp_instance sub : instance->subgraph_entries)
7734 {
7735 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7736 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (const_value: &bytes))
7737 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7738 sub->location (),
7739 "basic block part vectorized using %wu "
7740 "byte vectors\n", bytes);
7741 else
7742 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7743 sub->location (),
7744 "basic block part vectorized using "
7745 "variable length vectors\n");
7746 }
7747
7748 dump_user_location_t saved_vect_location = vect_location;
7749 vect_location = instance->location ();
7750
7751 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7752
7753 vect_location = saved_vect_location;
7754 }
7755 }
7756 else
7757 {
7758 if (dump_enabled_p ())
7759 dump_printf_loc (MSG_NOTE, vect_location,
7760 "***** Analysis failed with vector mode %s\n",
7761 GET_MODE_NAME (bb_vinfo->vector_mode));
7762 }
7763
7764 if (mode_i == 0)
7765 autodetected_vector_mode = bb_vinfo->vector_mode;
7766
7767 if (!fatal)
7768 while (mode_i < vector_modes.length ()
7769 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7770 {
7771 if (dump_enabled_p ())
7772 dump_printf_loc (MSG_NOTE, vect_location,
7773 "***** The result for vector mode %s would"
7774 " be the same\n",
7775 GET_MODE_NAME (vector_modes[mode_i]));
7776 mode_i += 1;
7777 }
7778
7779 delete bb_vinfo;
7780
7781 if (mode_i < vector_modes.length ()
7782 && VECTOR_MODE_P (autodetected_vector_mode)
7783 && (related_vector_mode (vector_modes[mode_i],
7784 GET_MODE_INNER (autodetected_vector_mode))
7785 == autodetected_vector_mode)
7786 && (related_vector_mode (autodetected_vector_mode,
7787 GET_MODE_INNER (vector_modes[mode_i]))
7788 == vector_modes[mode_i]))
7789 {
7790 if (dump_enabled_p ())
7791 dump_printf_loc (MSG_NOTE, vect_location,
7792 "***** Skipping vector mode %s, which would"
7793 " repeat the analysis for %s\n",
7794 GET_MODE_NAME (vector_modes[mode_i]),
7795 GET_MODE_NAME (autodetected_vector_mode));
7796 mode_i += 1;
7797 }
7798
7799 if (vectorized
7800 || mode_i == vector_modes.length ()
7801 || autodetected_vector_mode == VOIDmode
7802 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7803 vector sizes will fail do not bother iterating. */
7804 || fatal)
7805 return vectorized;
7806
7807 /* Try the next biggest vector size. */
7808 next_vector_mode = vector_modes[mode_i++];
7809 if (dump_enabled_p ())
7810 dump_printf_loc (MSG_NOTE, vect_location,
7811 "***** Re-trying analysis with vector mode %s\n",
7812 GET_MODE_NAME (next_vector_mode));
7813 }
7814}
7815
7816
/* Main entry for the BB vectorizer. Analyze and transform BBS, returning
true if anything in the basic blocks was vectorized. */
7819
7820static bool
7821vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7822{
7823 vec<data_reference_p> datarefs = vNULL;
7824 auto_vec<int> dataref_groups;
7825 int insns = 0;
7826 int current_group = 0;
7827
7828 for (unsigned i = 0; i < bbs.length (); i++)
7829 {
7830 basic_block bb = bbs[i];
7831 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (i: gsi);
7832 gsi_next (i: &gsi))
7833 {
7834 gimple *stmt = gsi_stmt (i: gsi);
7835 if (is_gimple_debug (gs: stmt))
7836 continue;
7837
7838 insns++;
7839
7840 if (gimple_location (g: stmt) != UNKNOWN_LOCATION)
7841 vect_location = stmt;
7842
7843 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7844 &dataref_groups, current_group))
7845 ++current_group;
7846 }
7847 /* New BBs always start a new DR group. */
7848 ++current_group;
7849 }
7850
7851 return vect_slp_region (bbs, datarefs, dataref_groups: &dataref_groups, n_stmts: insns, orig_loop);
7852}
7853
7854/* Special entry for the BB vectorizer. Analyze and transform a single
if-converted BB with ORIG_LOOP's body being the not-if-converted
7856 representation. Returns true if anything in the basic-block was
7857 vectorized. */
7858
7859bool
7860vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7861{
7862 auto_vec<basic_block> bbs;
7863 bbs.safe_push (obj: bb);
7864 return vect_slp_bbs (bbs, orig_loop);
7865}
7866
/* Main entry for the BB vectorizer. Analyze and transform the basic
blocks of function FUN, returning true if anything was vectorized. */
7869
7870bool
7871vect_slp_function (function *fun)
7872{
7873 bool r = false;
7874 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7875 auto_bitmap exit_bbs;
7876 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7877 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7878 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7879 true, rpo, NULL);
7880
/* For the moment split the function into pieces to avoid making
the iteration on the vector mode moot. Split at points we know
to not handle well, which are CFG merges (SLP discovery doesn't
handle non-loop-header PHIs) and loop exits. Since pattern
recog requires reverse iteration to visit uses before defs,
simply chop RPO into pieces. */
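/* For example: if the region currently being collected starts inside
loop 2 and the RPO walk reaches the first block after that loop,
bbs[0]->loop_father no longer contains the block, so the blocks
gathered so far are handed to vect_slp_bbs and a fresh region starts
at the loop exit block. */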
7887 auto_vec<basic_block> bbs;
7888 for (unsigned i = 0; i < n; i++)
7889 {
7890 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7891 bool split = false;
7892
7893 /* Split when a BB is not dominated by the first block. */
7894 if (!bbs.is_empty ()
7895 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7896 {
7897 if (dump_enabled_p ())
7898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899 "splitting region at dominance boundary bb%d\n",
7900 bb->index);
7901 split = true;
7902 }
7903 /* Split when the loop determined by the first block
7904 is exited. This is because we eventually insert
7905 invariants at region begin. */
7906 else if (!bbs.is_empty ()
7907 && bbs[0]->loop_father != bb->loop_father
7908 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7909 {
7910 if (dump_enabled_p ())
7911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7912 "splitting region at loop %d exit at bb%d\n",
7913 bbs[0]->loop_father->num, bb->index);
7914 split = true;
7915 }
7916 else if (!bbs.is_empty ()
7917 && bb->loop_father->header == bb
7918 && bb->loop_father->dont_vectorize)
7919 {
7920 if (dump_enabled_p ())
7921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7922 "splitting region at dont-vectorize loop %d "
7923 "entry at bb%d\n",
7924 bb->loop_father->num, bb->index);
7925 split = true;
7926 }
7927
7928 if (split && !bbs.is_empty ())
7929 {
7930 r |= vect_slp_bbs (bbs, NULL);
7931 bbs.truncate (size: 0);
7932 }
7933
7934 if (bbs.is_empty ())
7935 {
/* We need to be able to insert at the head of the region, which
we cannot do for a region starting with a returns-twice call. */
7938 if (gcall *first = safe_dyn_cast <gcall *> (p: first_stmt (bb)))
7939 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7940 {
7941 if (dump_enabled_p ())
7942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7943 "skipping bb%d as start of region as it "
7944 "starts with returns-twice call\n",
7945 bb->index);
7946 continue;
7947 }
7948 /* If the loop this BB belongs to is marked as not to be vectorized
7949 honor that also for BB vectorization. */
7950 if (bb->loop_father->dont_vectorize)
7951 continue;
7952 }
7953
7954 bbs.safe_push (obj: bb);
7955
7956 /* When we have a stmt ending this block and defining a
7957 value we have to insert on edges when inserting after it for
7958 a vector containing its definition. Avoid this for now. */
7959 if (gimple *last = *gsi_last_bb (bb))
7960 if (gimple_get_lhs (last)
7961 && is_ctrl_altering_stmt (last))
7962 {
7963 if (dump_enabled_p ())
7964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7965 "splitting region at control altering "
7966 "definition %G", last);
7967 r |= vect_slp_bbs (bbs, NULL);
7968 bbs.truncate (size: 0);
7969 }
7970 }
7971
7972 if (!bbs.is_empty ())
7973 r |= vect_slp_bbs (bbs, NULL);
7974
7975 free (ptr: rpo);
7976
7977 return r;
7978}
7979
7980/* Build a variable-length vector in which the elements in ELTS are repeated
to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7982 RESULTS and add any new instructions to SEQ.
7983
7984 The approach we use is:
7985
7986 (1) Find a vector mode VM with integer elements of mode IM.
7987
7988 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7989 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7990 from small vectors to IM.
7991
7992 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7993
7994 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7995 correct byte contents.
7996
7997 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7998
7999 We try to find the largest IM for which this sequence works, in order
8000 to cut down on the number of interleaves. */
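/* A worked sketch (the modes chosen are only an assumption): to repeat
eight QImode elements s1...s8 across VNx16QI vectors we could pick
VM = VNx4SI with IM = SImode. Step (2) builds {s1,s2,s3,s4} and
{s5,s6,s7,s8} as V4QI vectors and view-converts each to one SImode
scalar, giving ELTS' = {A, B}; step (3) broadcasts A and B into two
VNx4SI vectors; step (4) then needs a single interleaving
VEC_PERM_EXPR to obtain the byte pattern s1...s8 repeated; step (5)
view-converts the result to VNx16QI. */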
8001
8002void
8003duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8004 const vec<tree> &elts, unsigned int nresults,
8005 vec<tree> &results)
8006{
8007 unsigned int nelts = elts.length ();
8008 tree element_type = TREE_TYPE (vector_type);
8009
8010 /* (1) Find a vector mode VM with integer elements of mode IM. */
8011 unsigned int nvectors = 1;
8012 tree new_vector_type;
8013 tree permutes[2];
8014 if (!can_duplicate_and_interleave_p (vinfo, count: nelts, elt_type: element_type,
8015 nvectors_out: &nvectors, vector_type_out: &new_vector_type,
8016 permutes))
8017 gcc_unreachable ();
8018
8019 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8020 unsigned int partial_nelts = nelts / nvectors;
8021 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8022
8023 tree_vector_builder partial_elts;
8024 auto_vec<tree, 32> pieces (nvectors * 2);
8025 pieces.quick_grow_cleared (len: nvectors * 2);
8026 for (unsigned int i = 0; i < nvectors; ++i)
8027 {
8028 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8029 ELTS' has mode IM. */
8030 partial_elts.new_vector (type: partial_vector_type, npatterns: partial_nelts, nelts_per_pattern: 1);
8031 for (unsigned int j = 0; j < partial_nelts; ++j)
8032 partial_elts.quick_push (obj: elts[i * partial_nelts + j]);
8033 tree t = gimple_build_vector (seq, builder: &partial_elts);
8034 t = gimple_build (seq, code: VIEW_CONVERT_EXPR,
8035 TREE_TYPE (new_vector_type), ops: t);
8036
8037 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8038 pieces[i] = gimple_build_vector_from_val (seq, type: new_vector_type, op: t);
8039 }
8040
8041 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8042 correct byte contents.
8043
8044 Conceptually, we need to repeat the following operation log2(nvectors)
8045 times, where hi_start = nvectors / 2:
8046
8047 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8048 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8049
8050 However, if each input repeats every N elements and the VF is
8051 a multiple of N * 2, the HI result is the same as the LO result.
8052 This will be true for the first N1 iterations of the outer loop,
8053 followed by N2 iterations for which both the LO and HI results
8054 are needed. I.e.:
8055
8056 N1 + N2 = log2(nvectors)
8057
8058 Each "N1 iteration" doubles the number of redundant vectors and the
8059 effect of the process as a whole is to have a sequence of nvectors/2**N1
8060 vectors that repeats 2**N1 times. Rather than generate these redundant
8061 vectors, we halve the number of vectors for each N1 iteration. */
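/* Sketch, assuming nvectors == 4 broadcast inputs (each repeating every
element) and a VF that is a multiple of 2 but not of 4: the first
round is an "N1 iteration" whose HI permutes would equal the LO ones,
so the four inputs shrink to two interleaved vectors; the second round
is an "N2 iteration" needing both LO and HI permutes and yields the
two distinct vectors whose sequence repeats across the results. */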
8062 unsigned int in_start = 0;
8063 unsigned int out_start = nvectors;
8064 unsigned int new_nvectors = nvectors;
8065 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8066 {
8067 unsigned int hi_start = new_nvectors / 2;
8068 unsigned int out_i = 0;
8069 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8070 {
8071 if ((in_i & 1) != 0
8072 && multiple_p (a: TYPE_VECTOR_SUBPARTS (node: new_vector_type),
8073 b: 2 * in_repeat))
8074 continue;
8075
8076 tree output = make_ssa_name (var: new_vector_type);
8077 tree input1 = pieces[in_start + (in_i / 2)];
8078 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8079 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8080 input1, input2,
8081 permutes[in_i & 1]);
8082 gimple_seq_add_stmt (seq, stmt);
8083 pieces[out_start + out_i] = output;
8084 out_i += 1;
8085 }
8086 std::swap (a&: in_start, b&: out_start);
8087 new_nvectors = out_i;
8088 }
8089
8090 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8091 results.reserve (nelems: nresults);
8092 for (unsigned int i = 0; i < nresults; ++i)
8093 if (i < new_nvectors)
8094 results.quick_push (obj: gimple_build (seq, code: VIEW_CONVERT_EXPR, type: vector_type,
8095 ops: pieces[in_start + i]));
8096 else
8097 results.quick_push (obj: results[i - new_nvectors]);
8098}
8099
8100
8101/* For constant and loop invariant defs in OP_NODE this function creates
8102 vector defs that will be used in the vectorized stmts and stores them
8103 to SLP_TREE_VEC_DEFS of OP_NODE. */
8104
8105static void
8106vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8107{
8108 unsigned HOST_WIDE_INT nunits;
8109 tree vec_cst;
8110 unsigned j, number_of_places_left_in_vector;
8111 tree vector_type;
8112 tree vop;
8113 int group_size = op_node->ops.length ();
8114 unsigned int vec_num, i;
8115 unsigned number_of_copies = 1;
8116 bool constant_p;
8117 gimple_seq ctor_seq = NULL;
8118 auto_vec<tree, 16> permute_results;
8119
8120 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8121 vector_type = SLP_TREE_VECTYPE (op_node);
8122
8123 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8124 SLP_TREE_VEC_DEFS (op_node).create (nelems: number_of_vectors);
8125 auto_vec<tree> voprnds (number_of_vectors);
8126
8127 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8128 created vectors. It is greater than 1 if unrolling is performed.
8129
8130 For example, we have two scalar operands, s1 and s2 (e.g., group of
8131 strided accesses of size two), while NUNITS is four (i.e., four scalars
8132 of this type can be packed in a vector). The output vector will contain
8133 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8134 will be 2).
8135
8136 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8137 containing the operands.
8138
8139 For example, NUNITS is four as before, and the group size is 8
8140 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8141 {s5, s6, s7, s8}. */
8142
8143 /* When using duplicate_and_interleave, we just need one element for
8144 each scalar statement. */
8145 if (!TYPE_VECTOR_SUBPARTS (node: vector_type).is_constant (const_value: &nunits))
8146 nunits = group_size;
8147
8148 number_of_copies = nunits * number_of_vectors / group_size;
8149
8150 number_of_places_left_in_vector = nunits;
8151 constant_p = true;
8152 tree_vector_builder elts (vector_type, nunits, 1);
8153 elts.quick_grow (len: nunits);
8154 stmt_vec_info insert_after = NULL;
8155 for (j = 0; j < number_of_copies; j++)
8156 {
8157 tree op;
8158 for (i = group_size - 1; op_node->ops.iterate (ix: i, ptr: &op); i--)
8159 {
8160 /* Create 'vect_ = {op0,op1,...,opn}'. */
8161 number_of_places_left_in_vector--;
8162 tree orig_op = op;
8163 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8164 {
8165 if (CONSTANT_CLASS_P (op))
8166 {
8167 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8168 {
8169 /* Can't use VIEW_CONVERT_EXPR for booleans because
8170 of possibly different sizes of scalar value and
8171 vector element. */
8172 if (integer_zerop (op))
8173 op = build_int_cst (TREE_TYPE (vector_type), 0);
8174 else if (integer_onep (op))
8175 op = build_all_ones_cst (TREE_TYPE (vector_type));
8176 else
8177 gcc_unreachable ();
8178 }
8179 else
8180 op = fold_unary (VIEW_CONVERT_EXPR,
8181 TREE_TYPE (vector_type), op);
8182 gcc_assert (op && CONSTANT_CLASS_P (op));
8183 }
8184 else
8185 {
8186 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8187 gimple *init_stmt;
8188 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8189 {
8190 tree true_val
8191 = build_all_ones_cst (TREE_TYPE (vector_type));
8192 tree false_val
8193 = build_zero_cst (TREE_TYPE (vector_type));
8194 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8195 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8196 op, true_val,
8197 false_val);
8198 }
8199 else
8200 {
8201 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8202 op);
8203 init_stmt
8204 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8205 op);
8206 }
8207 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8208 op = new_temp;
8209 }
8210 }
8211 elts[number_of_places_left_in_vector] = op;
8212 if (!CONSTANT_CLASS_P (op))
8213 constant_p = false;
8214 /* For BB vectorization we have to compute an insert location
8215 when a def is inside the analyzed region since we cannot
8216 simply insert at the BB start in this case. */
8217 stmt_vec_info opdef;
8218 if (TREE_CODE (orig_op) == SSA_NAME
8219 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8220 && is_a <bb_vec_info> (p: vinfo)
8221 && (opdef = vinfo->lookup_def (orig_op)))
8222 {
8223 if (!insert_after)
8224 insert_after = opdef;
8225 else
8226 insert_after = get_later_stmt (stmt1_info: insert_after, stmt2_info: opdef);
8227 }
8228
8229 if (number_of_places_left_in_vector == 0)
8230 {
8231 if (constant_p
8232 ? multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vector_type), b: nunits)
8233 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8234 vec_cst = gimple_build_vector (seq: &ctor_seq, builder: &elts);
8235 else
8236 {
8237 if (permute_results.is_empty ())
8238 duplicate_and_interleave (vinfo, seq: &ctor_seq, vector_type,
8239 elts, nresults: number_of_vectors,
8240 results&: permute_results);
8241 vec_cst = permute_results[number_of_vectors - j - 1];
8242 }
8243 if (!gimple_seq_empty_p (s: ctor_seq))
8244 {
8245 if (insert_after)
8246 {
8247 gimple_stmt_iterator gsi;
8248 if (gimple_code (g: insert_after->stmt) == GIMPLE_PHI)
8249 {
8250 gsi = gsi_after_labels (bb: gimple_bb (g: insert_after->stmt));
8251 gsi_insert_seq_before (&gsi, ctor_seq,
8252 GSI_CONTINUE_LINKING);
8253 }
8254 else if (!stmt_ends_bb_p (insert_after->stmt))
8255 {
8256 gsi = gsi_for_stmt (insert_after->stmt);
8257 gsi_insert_seq_after (&gsi, ctor_seq,
8258 GSI_CONTINUE_LINKING);
8259 }
8260 else
8261 {
		      /* When we want to insert after a def whose defining
			 stmt throws, insert on the fallthru edge instead.  */
8265 edge e = find_fallthru_edge
8266 (edges: gimple_bb (g: insert_after->stmt)->succs);
8267 basic_block new_bb
8268 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8269 gcc_assert (!new_bb);
8270 }
8271 }
8272 else
8273 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8274 ctor_seq = NULL;
8275 }
8276 voprnds.quick_push (obj: vec_cst);
8277 insert_after = NULL;
8278 number_of_places_left_in_vector = nunits;
8279 constant_p = true;
8280 elts.new_vector (type: vector_type, npatterns: nunits, nelts_per_pattern: 1);
8281 elts.quick_grow (len: nunits);
8282 }
8283 }
8284 }
8285
8286 /* Since the vectors are created in the reverse order, we should invert
8287 them. */
8288 vec_num = voprnds.length ();
8289 for (j = vec_num; j != 0; j--)
8290 {
8291 vop = voprnds[j - 1];
8292 SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop);
8293 }
8294
  /* If the VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
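  /* E.g. if NUMBER_OF_VECTORS is 4 but only {v0, v1} were created above,
     the loop below pushes v0 and v1 again so the defs become
     {v0, v1, v0, v1}.  */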
8299 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8300 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (ix: i, ptr: &vop) && i < vec_num;
8301 i++)
8302 SLP_TREE_VEC_DEFS (op_node).quick_push (obj: vop);
8303}
8304
8305/* Get the Ith vectorized definition from SLP_NODE. */
8306
8307tree
8308vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8309{
8310 return SLP_TREE_VEC_DEFS (slp_node)[i];
8311}
8312
8313/* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8314
8315void
8316vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8317{
8318 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8319 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8320}
8321
8322/* Get N vectorized definitions for SLP_NODE. */
8323
8324void
8325vect_get_slp_defs (vec_info *,
8326 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8327{
8328 if (n == -1U)
8329 n = SLP_TREE_CHILDREN (slp_node).length ();
8330
8331 for (unsigned i = 0; i < n; ++i)
8332 {
8333 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8334 vec<tree> vec_defs = vNULL;
8335 vect_get_slp_defs (slp_node: child, vec_defs: &vec_defs);
8336 vec_oprnds->quick_push (obj: vec_defs);
8337 }
8338}
8339
/* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   - PERM gives the permutation that the caller wants to use for NODE,
     which might be different from SLP_TREE_LOAD_PERMUTATION.
   - DUMP_P controls whether the function dumps information.  */
8344
8345static bool
8346vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8347 load_permutation_t &perm,
8348 const vec<tree> &dr_chain,
8349 gimple_stmt_iterator *gsi, poly_uint64 vf,
8350 bool analyze_only, bool dump_p,
8351 unsigned *n_perms, unsigned int *n_loads,
8352 bool dce_chain)
8353{
8354 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8355 int vec_index = 0;
8356 tree vectype = SLP_TREE_VECTYPE (node);
8357 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8358 unsigned int mask_element;
8359 unsigned dr_group_size;
8360 machine_mode mode;
8361
8362 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8363 dr_group_size = 1;
8364 else
8365 {
8366 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8367 dr_group_size = DR_GROUP_SIZE (stmt_info);
8368 }
8369
8370 mode = TYPE_MODE (vectype);
8371 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype);
8372 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8373
8374 /* Initialize the vect stmts of NODE to properly insert the generated
8375 stmts later. */
8376 if (! analyze_only)
8377 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8378 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8379
  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for a permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */
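  /* The conversion of the last mask above follows from how the loop below
     splits an absolute lane index I: with NUNITS == 4, lane 6 lives in
     vector 6 / 4 == 1 at element 6 % 4 == 2 and lane 9 in vector
     9 / 4 == 2 at element 9 % 4 == 1; relative to the vector pair {1, 2}
     the elements of the second vector get NUNITS added, giving
     {2,5,5,5}.  */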
8397
8398 int vect_stmts_counter = 0;
8399 unsigned int index = 0;
8400 int first_vec_index = -1;
8401 int second_vec_index = -1;
8402 bool noop_p = true;
8403 *n_perms = 0;
8404
8405 vec_perm_builder mask;
8406 unsigned int nelts_to_build;
8407 unsigned int nvectors_per_build;
8408 unsigned int in_nlanes;
8409 bool repeating_p = (group_size == dr_group_size
8410 && multiple_p (a: nunits, b: group_size));
8411 if (repeating_p)
8412 {
8413 /* A single vector contains a whole number of copies of the node, so:
8414 (a) all permutes can use the same mask; and
8415 (b) the permutes only need a single vector input. */
8416 mask.new_vector (full_nelts: nunits, npatterns: group_size, nelts_per_pattern: 3);
8417 nelts_to_build = mask.encoded_nelts ();
      /* It's possible to obtain zero nstmts during analyze_only, so make
	 it at least one to ensure the later computation for n_perms
	 proceeds.  */
8421 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8422 in_nlanes = dr_group_size * 3;
8423 }
8424 else
8425 {
8426 /* We need to construct a separate mask for each vector statement. */
8427 unsigned HOST_WIDE_INT const_nunits, const_vf;
8428 if (!nunits.is_constant (const_value: &const_nunits)
8429 || !vf.is_constant (const_value: &const_vf))
8430 return false;
8431 mask.new_vector (full_nelts: const_nunits, npatterns: const_nunits, nelts_per_pattern: 1);
8432 nelts_to_build = const_vf * group_size;
8433 nvectors_per_build = 1;
8434 in_nlanes = const_vf * dr_group_size;
8435 }
8436 auto_sbitmap used_in_lanes (in_nlanes);
8437 bitmap_clear (used_in_lanes);
8438 auto_bitmap used_defs;
8439
8440 unsigned int count = mask.encoded_nelts ();
8441 mask.quick_grow (len: count);
8442 vec_perm_indices indices;
8443
8444 for (unsigned int j = 0; j < nelts_to_build; j++)
8445 {
8446 unsigned int iter_num = j / group_size;
8447 unsigned int stmt_num = j % group_size;
8448 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8449 bitmap_set_bit (map: used_in_lanes, bitno: i);
8450 if (repeating_p)
8451 {
8452 first_vec_index = 0;
8453 mask_element = i;
8454 }
8455 else
8456 {
8457 /* Enforced before the loop when !repeating_p. */
8458 unsigned int const_nunits = nunits.to_constant ();
8459 vec_index = i / const_nunits;
8460 mask_element = i % const_nunits;
8461 if (vec_index == first_vec_index
8462 || first_vec_index == -1)
8463 {
8464 first_vec_index = vec_index;
8465 }
8466 else if (vec_index == second_vec_index
8467 || second_vec_index == -1)
8468 {
8469 second_vec_index = vec_index;
8470 mask_element += const_nunits;
8471 }
8472 else
8473 {
8474 if (dump_p)
8475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8476 "permutation requires at "
8477 "least three vectors %G",
8478 stmt_info->stmt);
8479 gcc_assert (analyze_only);
8480 return false;
8481 }
8482
8483 gcc_assert (mask_element < 2 * const_nunits);
8484 }
8485
8486 if (mask_element != index)
8487 noop_p = false;
8488 mask[index++] = mask_element;
8489
8490 if (index == count)
8491 {
8492 if (!noop_p)
8493 {
8494 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8495 if (!can_vec_perm_const_p (mode, mode, indices))
8496 {
8497 if (dump_p)
8498 {
8499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8500 "unsupported vect permute { ");
8501 for (i = 0; i < count; ++i)
8502 {
8503 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8504 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8505 }
8506 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8507 }
8508 gcc_assert (analyze_only);
8509 return false;
8510 }
8511
8512 tree mask_vec = NULL_TREE;
8513 if (!analyze_only)
8514 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8515
8516 if (second_vec_index == -1)
8517 second_vec_index = first_vec_index;
8518
8519 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8520 {
8521 ++*n_perms;
8522 if (analyze_only)
8523 continue;
8524 /* Generate the permute statement if necessary. */
8525 tree first_vec = dr_chain[first_vec_index + ri];
8526 tree second_vec = dr_chain[second_vec_index + ri];
8527 gassign *stmt = as_a<gassign *> (p: stmt_info->stmt);
8528 tree perm_dest
8529 = vect_create_destination_var (gimple_assign_lhs (gs: stmt),
8530 vectype);
8531 perm_dest = make_ssa_name (var: perm_dest);
8532 gimple *perm_stmt
8533 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8534 second_vec, mask_vec);
8535 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8536 gsi);
8537 if (dce_chain)
8538 {
8539 bitmap_set_bit (used_defs, first_vec_index + ri);
8540 bitmap_set_bit (used_defs, second_vec_index + ri);
8541 }
8542
8543 /* Store the vector statement in NODE. */
8544 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8545 }
8546 }
8547 else if (!analyze_only)
8548 {
8549 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8550 {
8551 tree first_vec = dr_chain[first_vec_index + ri];
8552 /* If mask was NULL_TREE generate the requested
8553 identity transform. */
8554 if (dce_chain)
8555 bitmap_set_bit (used_defs, first_vec_index + ri);
8556
8557 /* Store the vector statement in NODE. */
8558 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8559 }
8560 }
8561
8562 index = 0;
8563 first_vec_index = -1;
8564 second_vec_index = -1;
8565 noop_p = true;
8566 }
8567 }
8568
8569 if (n_loads)
8570 {
8571 if (repeating_p)
8572 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8573 else
8574 {
8575 /* Enforced above when !repeating_p. */
8576 unsigned int const_nunits = nunits.to_constant ();
8577 *n_loads = 0;
8578 bool load_seen = false;
8579 for (unsigned i = 0; i < in_nlanes; ++i)
8580 {
8581 if (i % const_nunits == 0)
8582 {
8583 if (load_seen)
8584 *n_loads += 1;
8585 load_seen = false;
8586 }
8587 if (bitmap_bit_p (map: used_in_lanes, bitno: i))
8588 load_seen = true;
8589 }
8590 if (load_seen)
8591 *n_loads += 1;
8592 }
8593 }
8594
8595 if (dce_chain)
8596 for (unsigned i = 0; i < dr_chain.length (); ++i)
8597 if (!bitmap_bit_p (used_defs, i))
8598 {
8599 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8600 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8601 gsi_remove (&rgsi, true);
8602 release_defs (stmt);
8603 }
8604
8605 return true;
8606}
8607
8608/* Generate vector permute statements from a list of loads in DR_CHAIN.
8609 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8610 permute statements for the SLP node NODE. Store the number of vector
8611 permute instructions in *N_PERMS and the number of vector load
8612 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8613 that were not needed. */
8614
8615bool
8616vect_transform_slp_perm_load (vec_info *vinfo,
8617 slp_tree node, const vec<tree> &dr_chain,
8618 gimple_stmt_iterator *gsi, poly_uint64 vf,
8619 bool analyze_only, unsigned *n_perms,
8620 unsigned int *n_loads, bool dce_chain)
8621{
8622 return vect_transform_slp_perm_load_1 (vinfo, node,
8623 SLP_TREE_LOAD_PERMUTATION (node),
8624 dr_chain, gsi, vf, analyze_only,
8625 dump_p: dump_enabled_p (), n_perms, n_loads,
8626 dce_chain);
8627}
8628
8629/* Produce the next vector result for SLP permutation NODE by adding a vector
8630 statement at GSI. If MASK_VEC is nonnull, add:
8631
8632 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8633
8634 otherwise add:
8635
8636 <new SSA name> = FIRST_DEF. */
8637
8638static void
8639vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8640 slp_tree node, tree first_def, tree second_def,
8641 tree mask_vec, poly_uint64 identity_offset)
8642{
8643 tree vectype = SLP_TREE_VECTYPE (node);
8644
8645 /* ??? We SLP match existing vector element extracts but
8646 allow punning which we need to re-instantiate at uses
8647 but have no good way of explicitly representing. */
8648 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8649 && !types_compatible_p (TREE_TYPE (first_def), type2: vectype))
8650 {
8651 gassign *conv_stmt
8652 = gimple_build_assign (make_ssa_name (var: vectype),
8653 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8654 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8655 first_def = gimple_assign_lhs (gs: conv_stmt);
8656 }
8657 gassign *perm_stmt;
8658 tree perm_dest = make_ssa_name (var: vectype);
8659 if (mask_vec)
8660 {
8661 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8662 TYPE_SIZE (vectype))
8663 && !types_compatible_p (TREE_TYPE (second_def), type2: vectype))
8664 {
8665 gassign *conv_stmt
8666 = gimple_build_assign (make_ssa_name (var: vectype),
8667 build1 (VIEW_CONVERT_EXPR,
8668 vectype, second_def));
8669 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8670 second_def = gimple_assign_lhs (gs: conv_stmt);
8671 }
8672 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8673 first_def, second_def,
8674 mask_vec);
8675 }
8676 else if (!types_compatible_p (TREE_TYPE (first_def), type2: vectype))
8677 {
8678 /* For identity permutes we still need to handle the case
8679 of offsetted extracts or concats. */
8680 unsigned HOST_WIDE_INT c;
8681 auto first_def_nunits
8682 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8683 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8684 {
8685 unsigned HOST_WIDE_INT elsz
8686 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8687 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8688 TYPE_SIZE (vectype),
8689 bitsize_int (identity_offset * elsz));
8690 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8691 }
8692 else if (constant_multiple_p (a: TYPE_VECTOR_SUBPARTS (node: vectype),
8693 b: first_def_nunits, multiple: &c) && c == 2)
8694 {
8695 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8696 NULL_TREE, second_def);
8697 perm_stmt = gimple_build_assign (perm_dest, ctor);
8698 }
8699 else
8700 gcc_unreachable ();
8701 }
8702 else
8703 {
8704 /* We need a copy here in case the def was external. */
8705 perm_stmt = gimple_build_assign (perm_dest, first_def);
8706 }
8707 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8708 /* Store the vector statement in NODE. */
8709 node->push_vec_def (def: perm_stmt);
8710}
8711
8712/* Subroutine of vectorizable_slp_permutation. Check whether the target
8713 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8714 If GSI is nonnull, emit the permutation there.
8715
8716 When GSI is null, the only purpose of NODE is to give properties
8717 of the result, such as the vector type and number of SLP lanes.
8718 The node does not need to be a VEC_PERM_EXPR.
8719
8720 If the target supports the operation, return the number of individual
8721 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8722 dump file if DUMP_P is true. */
8723
8724static int
8725vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8726 slp_tree node, lane_permutation_t &perm,
8727 vec<slp_tree> &children, bool dump_p)
8728{
8729 tree vectype = SLP_TREE_VECTYPE (node);
8730
8731 /* ??? We currently only support all same vector input types
8732 while the SLP IL should really do a concat + select and thus accept
8733 arbitrary mismatches. */
8734 slp_tree child;
8735 unsigned i;
8736 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (node: vectype);
8737 bool repeating_p = multiple_p (a: nunits, SLP_TREE_LANES (node));
8738 tree op_vectype = NULL_TREE;
8739 FOR_EACH_VEC_ELT (children, i, child)
8740 if (SLP_TREE_VECTYPE (child))
8741 {
8742 op_vectype = SLP_TREE_VECTYPE (child);
8743 break;
8744 }
8745 if (!op_vectype)
8746 op_vectype = vectype;
8747 FOR_EACH_VEC_ELT (children, i, child)
8748 {
8749 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8750 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8751 || !types_compatible_p (SLP_TREE_VECTYPE (child), type2: op_vectype)
8752 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8753 {
8754 if (dump_p)
8755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8756 "Unsupported vector types in lane permutation\n");
8757 return -1;
8758 }
8759 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8760 repeating_p = false;
8761 }
8762
8763 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8764 if (dump_p)
8765 {
8766 dump_printf_loc (MSG_NOTE, vect_location,
8767 "vectorizing permutation");
8768 for (unsigned i = 0; i < perm.length (); ++i)
8769 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8770 if (repeating_p)
8771 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8772 dump_printf (MSG_NOTE, "\n");
8773 }
8774
8775 /* REPEATING_P is true if every output vector is guaranteed to use the
8776 same permute vector. We can handle that case for both variable-length
8777 and constant-length vectors, but we only handle other cases for
8778 constant-length vectors.
8779
8780 Set:
8781
8782 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8783 mask vector that we want to build.
8784
8785 - NCOPIES to the number of copies of PERM that we need in order
8786 to build the necessary permute mask vectors.
8787
8788 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8789 for each permute mask vector. This is only relevant when GSI is
8790 nonnull. */
8791 uint64_t npatterns;
8792 unsigned nelts_per_pattern;
8793 uint64_t ncopies;
8794 unsigned noutputs_per_mask;
8795 if (repeating_p)
8796 {
8797 /* We need a single permute mask vector that has the form:
8798
8799 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8800
8801 In other words, the original n-element permute in PERM is
8802 "unrolled" to fill a full vector. The stepped vector encoding
8803 that we use for permutes requires 3n elements. */
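      /* E.g. for a two-lane permute selecting lanes { X1, X2 } the encoded
	 mask is the 3 * 2 == 6 elements
	 { X1, X2, X1 + 2, X2 + 2, X1 + 4, X2 + 4 }, from which the stepped
	 encoding extrapolates the rest of the vector.  */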
8804 npatterns = SLP_TREE_LANES (node);
8805 nelts_per_pattern = ncopies = 3;
8806 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8807 }
8808 else
8809 {
8810 /* Calculate every element of every permute mask vector explicitly,
8811 instead of relying on the pattern described above. */
8812 if (!nunits.is_constant (const_value: &npatterns))
8813 return -1;
8814 nelts_per_pattern = ncopies = 1;
8815 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (p: vinfo))
8816 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (const_value: &ncopies))
8817 return -1;
8818 noutputs_per_mask = 1;
8819 }
8820 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8821 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8822
  /* Compute the { { SLP operand, vector index }, lane } permutation sequence
     from the { SLP operand, scalar lane } permutation as recorded in the
     SLP node as an intermediate step.  This part should already work
     with SLP children with an arbitrary number of lanes.  */
8827 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8828 auto_vec<unsigned> active_lane;
8829 vperm.create (nelems: olanes);
8830 active_lane.safe_grow_cleared (len: children.length (), exact: true);
8831 for (unsigned i = 0; i < ncopies; ++i)
8832 {
8833 for (unsigned pi = 0; pi < perm.length (); ++pi)
8834 {
8835 std::pair<unsigned, unsigned> p = perm[pi];
8836 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8837 if (repeating_p)
8838 vperm.quick_push (obj: {{p.first, 0}, p.second + active_lane[p.first]});
8839 else
8840 {
8841 /* We checked above that the vectors are constant-length. */
8842 unsigned vnunits = TYPE_VECTOR_SUBPARTS (node: vtype).to_constant ();
8843 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8844 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8845 vperm.quick_push (obj: {{p.first, vi}, vl});
8846 }
8847 }
8848 /* Advance to the next group. */
8849 for (unsigned j = 0; j < children.length (); ++j)
8850 active_lane[j] += SLP_TREE_LANES (children[j]);
8851 }
8852
8853 if (dump_p)
8854 {
8855 dump_printf_loc (MSG_NOTE, vect_location,
8856 "vectorizing permutation");
8857 for (unsigned i = 0; i < perm.length (); ++i)
8858 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8859 if (repeating_p)
8860 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8861 dump_printf (MSG_NOTE, "\n");
8862 dump_printf_loc (MSG_NOTE, vect_location, "as");
8863 for (unsigned i = 0; i < vperm.length (); ++i)
8864 {
8865 if (i != 0
8866 && (repeating_p
8867 ? multiple_p (a: i, b: npatterns)
8868 : multiple_p (a: i, b: TYPE_VECTOR_SUBPARTS (node: vectype))))
8869 dump_printf (MSG_NOTE, ",");
8870 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8871 vperm[i].first.first, vperm[i].first.second,
8872 vperm[i].second);
8873 }
8874 dump_printf (MSG_NOTE, "\n");
8875 }
8876
  /* We can only handle two-vector permutes; everything else should
     be lowered on the SLP level.  The following is closely inspired
     by vect_transform_slp_perm_load and is supposed to eventually
     replace it.
     ??? As an intermediate step, do code-gen in the SLP tree
     representation somehow?  */
8883 std::pair<unsigned, unsigned> first_vec = std::make_pair (x: -1U, y: -1U);
8884 std::pair<unsigned, unsigned> second_vec = std::make_pair (x: -1U, y: -1U);
8885 unsigned int index = 0;
8886 poly_uint64 mask_element;
8887 vec_perm_builder mask;
8888 mask.new_vector (full_nelts: nunits, npatterns, nelts_per_pattern);
8889 unsigned int count = mask.encoded_nelts ();
8890 mask.quick_grow (len: count);
8891 vec_perm_indices indices;
8892 unsigned nperms = 0;
8893 for (unsigned i = 0; i < vperm.length (); ++i)
8894 {
8895 mask_element = vperm[i].second;
8896 if (first_vec.first == -1U
8897 || first_vec == vperm[i].first)
8898 first_vec = vperm[i].first;
8899 else if (second_vec.first == -1U
8900 || second_vec == vperm[i].first)
8901 {
8902 second_vec = vperm[i].first;
8903 mask_element += nunits;
8904 }
8905 else
8906 {
8907 if (dump_p)
8908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8909 "permutation requires at "
8910 "least three vectors\n");
8911 gcc_assert (!gsi);
8912 return -1;
8913 }
8914
8915 mask[index++] = mask_element;
8916
8917 if (index == count)
8918 {
8919 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8920 TYPE_VECTOR_SUBPARTS (node: op_vectype));
8921 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8922 && constant_multiple_p (a: mask[0], b: nunits));
8923 machine_mode vmode = TYPE_MODE (vectype);
8924 machine_mode op_vmode = TYPE_MODE (op_vectype);
8925 unsigned HOST_WIDE_INT c;
8926 if ((!identity_p
8927 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8928 || (identity_p
8929 && !known_le (nunits,
8930 TYPE_VECTOR_SUBPARTS (op_vectype))
8931 && (!constant_multiple_p (a: nunits,
8932 b: TYPE_VECTOR_SUBPARTS (node: op_vectype),
8933 multiple: &c) || c != 2)))
8934 {
8935 if (dump_p)
8936 {
8937 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8938 vect_location,
8939 "unsupported vect permute { ");
8940 for (i = 0; i < count; ++i)
8941 {
8942 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8943 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8944 }
8945 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8946 }
8947 gcc_assert (!gsi);
8948 return -1;
8949 }
8950
8951 if (!identity_p)
8952 nperms++;
8953 if (gsi)
8954 {
8955 if (second_vec.first == -1U)
8956 second_vec = first_vec;
8957
8958 slp_tree
8959 first_node = children[first_vec.first],
8960 second_node = children[second_vec.first];
8961
8962 tree mask_vec = NULL_TREE;
8963 if (!identity_p)
8964 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8965
8966 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8967 {
8968 tree first_def
8969 = vect_get_slp_vect_def (slp_node: first_node,
8970 i: first_vec.second + vi);
8971 tree second_def
8972 = vect_get_slp_vect_def (slp_node: second_node,
8973 i: second_vec.second + vi);
8974 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8975 second_def, mask_vec, identity_offset: mask[0]);
8976 }
8977 }
8978
8979 index = 0;
8980 first_vec = std::make_pair (x: -1U, y: -1U);
8981 second_vec = std::make_pair (x: -1U, y: -1U);
8982 }
8983 }
8984
8985 return nperms;
8986}
8987
/* Vectorize the SLP permutations in NODE as specified
   in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   child number and lane number.
   Interleaving of two two-lane two-child SLP subtrees (not supported):
     [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   A blend of two four-lane two-child SLP subtrees:
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   Highpart of a four-lane one-child SLP subtree (not supported):
     [ { 0, 2 }, { 0, 3 } ]
   Currently only a subset of these is supported by the code generation
   below.  */
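/* For the blend example above, with two V4SI children A and B, the code
   below ends up emitting a permute of the form
     _x = VEC_PERM_EXPR <A, B, { 0, 5, 2, 7 }>;
   with lanes of the second input referenced at an offset of NUNITS.  */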
8998
8999static bool
9000vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9001 slp_tree node, stmt_vector_for_cost *cost_vec)
9002{
9003 tree vectype = SLP_TREE_VECTYPE (node);
9004 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9005 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9006 SLP_TREE_CHILDREN (node),
9007 dump_p: dump_enabled_p ());
9008 if (nperms < 0)
9009 return false;
9010
9011 if (!gsi)
9012 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9013
9014 return true;
9015}
9016
9017/* Vectorize SLP NODE. */
9018
9019static void
9020vect_schedule_slp_node (vec_info *vinfo,
9021 slp_tree node, slp_instance instance)
9022{
9023 gimple_stmt_iterator si;
9024 int i;
9025 slp_tree child;
9026
9027 /* For existing vectors there's nothing to do. */
9028 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
9029 && SLP_TREE_VEC_DEFS (node).exists ())
9030 return;
9031
9032 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9033
9034 /* Vectorize externals and constants. */
9035 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9036 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9037 {
9038 /* ??? vectorizable_shift can end up using a scalar operand which is
9039 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9040 node in this case. */
9041 if (!SLP_TREE_VECTYPE (node))
9042 return;
9043
9044 vect_create_constant_vectors (vinfo, op_node: node);
9045 return;
9046 }
9047
9048 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9049
9050 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9051 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9052
9053 if (dump_enabled_p ())
9054 dump_printf_loc (MSG_NOTE, vect_location,
9055 "------>vectorizing SLP node starting from: %G",
9056 stmt_info->stmt);
9057
9058 if (STMT_VINFO_DATA_REF (stmt_info)
9059 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9060 {
9061 /* Vectorized loads go before the first scalar load to make it
9062 ready early, vectorized stores go before the last scalar
9063 stmt which is where all uses are ready. */
9064 stmt_vec_info last_stmt_info = NULL;
9065 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9066 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9067 else /* DR_IS_WRITE */
9068 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9069 si = gsi_for_stmt (last_stmt_info->stmt);
9070 }
9071 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9072 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9073 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9074 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9075 {
9076 /* For PHI node vectorization we do not use the insertion iterator. */
9077 si = gsi_none ();
9078 }
9079 else
9080 {
9081 /* Emit other stmts after the children vectorized defs which is
9082 earliest possible. */
9083 gimple *last_stmt = NULL;
9084 bool seen_vector_def = false;
9085 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9086 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9087 {
9088 /* For fold-left reductions we are retaining the scalar
9089 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9090 set so the representation isn't perfect. Resort to the
9091 last scalar def here. */
9092 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9093 {
9094 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9095 == cycle_phi_info_type);
9096 gphi *phi = as_a <gphi *>
9097 (p: vect_find_last_scalar_stmt_in_slp (node: child)->stmt);
9098 if (!last_stmt
9099 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9100 last_stmt = phi;
9101 }
	  /* All vectorized stmts of a child are emitted in the same place,
	     so the last vectorized def is the latest one.
	     ??? Unless we have a load permutation applied and that happens
	     to re-use an earlier generated load.  */
9106 unsigned j;
9107 tree vdef;
9108 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9109 {
9110 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9111 if (!last_stmt
9112 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9113 last_stmt = vstmt;
9114 }
9115 }
9116 else if (!SLP_TREE_VECTYPE (child))
9117 {
	  /* For externals without a vector type we look at all the
	     unvectorized scalar defs.  */
9119 unsigned j;
9120 tree def;
9121 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9122 if (TREE_CODE (def) == SSA_NAME
9123 && !SSA_NAME_IS_DEFAULT_DEF (def))
9124 {
9125 gimple *stmt = SSA_NAME_DEF_STMT (def);
9126 if (!last_stmt
9127 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9128 last_stmt = stmt;
9129 }
9130 }
9131 else
9132 {
9133 /* For externals we have to look at all defs since their
9134 insertion place is decided per vector. But beware
9135 of pre-existing vectors where we need to make sure
9136 we do not insert before the region boundary. */
9137 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9138 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9139 seen_vector_def = true;
9140 else
9141 {
9142 unsigned j;
9143 tree vdef;
9144 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9145 if (TREE_CODE (vdef) == SSA_NAME
9146 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9147 {
9148 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9149 if (!last_stmt
9150 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9151 last_stmt = vstmt;
9152 }
9153 }
9154 }
9155 /* This can happen when all children are pre-existing vectors or
9156 constants. */
9157 if (!last_stmt)
9158 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9159 if (!last_stmt)
9160 {
9161 gcc_assert (seen_vector_def);
9162 si = gsi_after_labels (bb: as_a <bb_vec_info> (p: vinfo)->bbs[0]);
9163 }
9164 else if (is_ctrl_altering_stmt (last_stmt))
9165 {
9166 /* We split regions to vectorize at control altering stmts
9167 with a definition so this must be an external which
9168 we can insert at the start of the region. */
9169 si = gsi_after_labels (bb: as_a <bb_vec_info> (p: vinfo)->bbs[0]);
9170 }
9171 else if (is_a <bb_vec_info> (p: vinfo)
9172 && gimple_bb (g: last_stmt) != gimple_bb (g: stmt_info->stmt)
9173 && gimple_could_trap_p (stmt_info->stmt))
9174 {
	  /* We've constrained possibly trapping operations to all come
	     from the same basic-block; if vectorized defs would allow
	     earlier scheduling, still force the vectorized stmts to the
	     original block.  This is only necessary for BB vectorization
	     since for loop vect all operations are in a single BB and
	     scalar stmt based placement doesn't play well with epilogue
	     vectorization.  */
9181 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9182 gimple_bb (stmt_info->stmt),
9183 gimple_bb (last_stmt)));
9184 si = gsi_after_labels (bb: gimple_bb (g: stmt_info->stmt));
9185 }
9186 else if (is_a <gphi *> (p: last_stmt))
9187 si = gsi_after_labels (bb: gimple_bb (g: last_stmt));
9188 else
9189 {
9190 si = gsi_for_stmt (last_stmt);
9191 gsi_next (i: &si);
9192 }
9193 }
9194
9195 /* Handle purely internal nodes. */
9196 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9197 {
      /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case where the stmt is only there to
	 denote the actual scalar lane defs ...).  So do not call
	 vect_transform_stmt but open-code it here (partly).  */
9203 bool done = vectorizable_slp_permutation (vinfo, gsi: &si, node, NULL);
9204 gcc_assert (done);
9205 stmt_vec_info slp_stmt_info;
9206 unsigned int i;
9207 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9208 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9209 {
9210 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9211 instance, i, true, NULL);
9212 gcc_assert (done);
9213 }
9214 }
9215 else
9216 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9217}
9218
9219/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9220 For loop vectorization this is done in vectorizable_call, but for SLP
9221 it needs to be deferred until end of vect_schedule_slp, because multiple
9222 SLP instances may refer to the same scalar stmt. */
9223
9224static void
9225vect_remove_slp_scalar_calls (vec_info *vinfo,
9226 slp_tree node, hash_set<slp_tree> &visited)
9227{
9228 gimple *new_stmt;
9229 gimple_stmt_iterator gsi;
9230 int i;
9231 slp_tree child;
9232 tree lhs;
9233 stmt_vec_info stmt_info;
9234
9235 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9236 return;
9237
9238 if (visited.add (k: node))
9239 return;
9240
9241 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9242 vect_remove_slp_scalar_calls (vinfo, node: child, visited);
9243
9244 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9245 {
9246 gcall *stmt = dyn_cast <gcall *> (p: stmt_info->stmt);
9247 if (!stmt || gimple_bb (g: stmt) == NULL)
9248 continue;
9249 if (is_pattern_stmt_p (stmt_info)
9250 || !PURE_SLP_STMT (stmt_info))
9251 continue;
9252 lhs = gimple_call_lhs (gs: stmt);
9253 if (lhs)
9254 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9255 else
9256 {
9257 new_stmt = gimple_build_nop ();
9258 unlink_stmt_vdef (stmt_info->stmt);
9259 }
9260 gsi = gsi_for_stmt (stmt);
9261 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9262 if (lhs)
9263 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9264 }
9265}
9266
9267static void
9268vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9269{
9270 hash_set<slp_tree> visited;
9271 vect_remove_slp_scalar_calls (vinfo, node, visited);
9272}
9273
9274/* Vectorize the instance root. */
9275
9276void
9277vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9278{
9279 gassign *rstmt = NULL;
9280
9281 if (instance->kind == slp_inst_kind_ctor)
9282 {
9283 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9284 {
9285 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9286 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9287 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9288 TREE_TYPE (vect_lhs)))
9289 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9290 vect_lhs);
9291 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9292 }
9293 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9294 {
9295 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9296 tree child_def;
9297 int j;
9298 vec<constructor_elt, va_gc> *v;
9299 vec_alloc (v, nelems: nelts);
9300
9301 /* A CTOR can handle V16HI composition from VNx8HI so we
9302 do not need to convert vector elements if the types
9303 do not match. */
9304 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9305 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9306 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9307 tree rtype
9308 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9309 tree r_constructor = build_constructor (rtype, v);
9310 rstmt = gimple_build_assign (lhs, r_constructor);
9311 }
9312 }
9313 else if (instance->kind == slp_inst_kind_bb_reduc)
9314 {
9315 /* Largely inspired by reduction chain epilogue handling in
9316 vect_create_epilog_for_reduction. */
9317 vec<tree> vec_defs = vNULL;
9318 vect_get_slp_defs (slp_node: node, vec_defs: &vec_defs);
9319 enum tree_code reduc_code
9320 = gimple_assign_rhs_code (gs: instance->root_stmts[0]->stmt);
9321 /* ??? We actually have to reflect signs somewhere. */
9322 if (reduc_code == MINUS_EXPR)
9323 reduc_code = PLUS_EXPR;
9324 gimple_seq epilogue = NULL;
      /* We may end up with more than one vector result; reduce them
	 to one vector.  */
9327 tree vec_def = vec_defs[0];
9328 tree vectype = TREE_TYPE (vec_def);
9329 tree compute_vectype = vectype;
9330 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9331 && TYPE_OVERFLOW_UNDEFINED (vectype)
9332 && operation_can_overflow (reduc_code));
9333 if (pun_for_overflow_p)
9334 {
9335 compute_vectype = unsigned_type_for (vectype);
9336 vec_def = gimple_build (seq: &epilogue, code: VIEW_CONVERT_EXPR,
9337 type: compute_vectype, ops: vec_def);
9338 }
9339 for (unsigned i = 1; i < vec_defs.length (); ++i)
9340 {
9341 tree def = vec_defs[i];
9342 if (pun_for_overflow_p)
9343 def = gimple_build (seq: &epilogue, code: VIEW_CONVERT_EXPR,
9344 type: compute_vectype, ops: def);
9345 vec_def = gimple_build (seq: &epilogue, code: reduc_code, type: compute_vectype,
9346 ops: vec_def, ops: def);
9347 }
9348 vec_defs.release ();
9349 /* ??? Support other schemes than direct internal fn. */
9350 internal_fn reduc_fn;
9351 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9352 || reduc_fn == IFN_LAST)
9353 gcc_unreachable ();
9354 tree scalar_def = gimple_build (seq: &epilogue, fn: as_combined_fn (fn: reduc_fn),
9355 TREE_TYPE (compute_vectype), args: vec_def);
9356 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9357 {
9358 tree rem_def = NULL_TREE;
9359 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9360 {
9361 def = gimple_convert (seq: &epilogue, TREE_TYPE (scalar_def), op: def);
9362 if (!rem_def)
9363 rem_def = def;
9364 else
9365 rem_def = gimple_build (seq: &epilogue, code: reduc_code,
9366 TREE_TYPE (scalar_def),
9367 ops: rem_def, ops: def);
9368 }
9369 scalar_def = gimple_build (seq: &epilogue, code: reduc_code,
9370 TREE_TYPE (scalar_def),
9371 ops: scalar_def, ops: rem_def);
9372 }
9373 scalar_def = gimple_convert (seq: &epilogue,
9374 TREE_TYPE (vectype), op: scalar_def);
9375 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9376 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9377 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9378 update_stmt (s: gsi_stmt (i: rgsi));
9379 return;
9380 }
9381 else
9382 gcc_unreachable ();
9383
9384 gcc_assert (rstmt);
9385
9386 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9387 gsi_replace (&rgsi, rstmt, true);
9388}
9389
9390struct slp_scc_info
9391{
9392 bool on_stack;
9393 int dfs;
9394 int lowlink;
9395};
9396
9397/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
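/* This is essentially Tarjan's SCC algorithm: each node gets a DFS number
   and a lowlink, nodes stay on an explicit stack while their SCC is
   incomplete, and an SCC is popped once a node's lowlink equals its own
   DFS number.  Members of a non-trivial SCC are then roughly scheduled
   starting from PHIs, whose backedge operands are fixed up afterwards.  */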
9398
9399static void
9400vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9401 hash_map<slp_tree, slp_scc_info> &scc_info,
9402 int &maxdfs, vec<slp_tree> &stack)
9403{
9404 bool existed_p;
9405 slp_scc_info *info = &scc_info.get_or_insert (k: node, existed: &existed_p);
9406 gcc_assert (!existed_p);
9407 info->dfs = maxdfs;
9408 info->lowlink = maxdfs;
9409 maxdfs++;
9410
9411 /* Leaf. */
9412 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9413 {
9414 info->on_stack = false;
9415 vect_schedule_slp_node (vinfo, node, instance);
9416 return;
9417 }
9418
9419 info->on_stack = true;
9420 stack.safe_push (obj: node);
9421
9422 unsigned i;
9423 slp_tree child;
9424 /* DFS recurse. */
9425 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9426 {
9427 if (!child)
9428 continue;
9429 slp_scc_info *child_info = scc_info.get (k: child);
9430 if (!child_info)
9431 {
9432 vect_schedule_scc (vinfo, node: child, instance, scc_info, maxdfs, stack);
9433 /* Recursion might have re-allocated the node. */
9434 info = scc_info.get (k: node);
9435 child_info = scc_info.get (k: child);
9436 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9437 }
9438 else if (child_info->on_stack)
9439 info->lowlink = MIN (info->lowlink, child_info->dfs);
9440 }
9441 if (info->lowlink != info->dfs)
9442 return;
9443
9444 auto_vec<slp_tree, 4> phis_to_fixup;
9445
9446 /* Singleton. */
9447 if (stack.last () == node)
9448 {
9449 stack.pop ();
9450 info->on_stack = false;
9451 vect_schedule_slp_node (vinfo, node, instance);
9452 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9453 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9454 phis_to_fixup.quick_push (obj: node);
9455 }
9456 else
9457 {
9458 /* SCC. */
9459 int last_idx = stack.length () - 1;
9460 while (stack[last_idx] != node)
9461 last_idx--;
      /* We can break the cycle at PHIs that have at least one child
	 code generated.  Then we could re-start the DFS walk until
	 all nodes in the SCC are covered (we might have new entries
	 for only back-reachable nodes).  But it's simpler to just
	 iterate and schedule those that are ready.  */
9467 unsigned todo = stack.length () - last_idx;
9468 do
9469 {
9470 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9471 {
9472 slp_tree entry = stack[idx];
9473 if (!entry)
9474 continue;
9475 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9476 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9477 bool ready = !phi;
9478 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9479 if (!child)
9480 {
9481 gcc_assert (phi);
9482 ready = true;
9483 break;
9484 }
9485 else if (scc_info.get (k: child)->on_stack)
9486 {
9487 if (!phi)
9488 {
9489 ready = false;
9490 break;
9491 }
9492 }
9493 else
9494 {
9495 if (phi)
9496 {
9497 ready = true;
9498 break;
9499 }
9500 }
9501 if (ready)
9502 {
9503 vect_schedule_slp_node (vinfo, node: entry, instance);
9504 scc_info.get (k: entry)->on_stack = false;
9505 stack[idx] = NULL;
9506 todo--;
9507 if (phi)
9508 phis_to_fixup.safe_push (obj: entry);
9509 }
9510 }
9511 }
9512 while (todo != 0);
9513
9514 /* Pop the SCC. */
9515 stack.truncate (size: last_idx);
9516 }
9517
9518 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9519 slp_tree phi_node;
9520 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9521 {
9522 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9523 edge_iterator ei;
9524 edge e;
9525 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9526 {
9527 unsigned dest_idx = e->dest_idx;
9528 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9529 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9530 continue;
9531 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9532 /* Simply fill all args. */
9533 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9534 != vect_first_order_recurrence)
9535 for (unsigned i = 0; i < n; ++i)
9536 {
9537 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9538 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9539 add_phi_arg (phi, vect_get_slp_vect_def (slp_node: child, i),
9540 e, gimple_phi_arg_location (phi, i: dest_idx));
9541 }
9542 else
9543 {
9544 /* Unless it is a first order recurrence which needs
9545 args filled in for both the PHI node and the permutes. */
9546 gimple *perm
9547 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9548 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9549 add_phi_arg (as_a <gphi *> (p: rphi),
9550 vect_get_slp_vect_def (slp_node: child, i: n - 1),
9551 e, gimple_phi_arg_location (phi, i: dest_idx));
9552 for (unsigned i = 0; i < n; ++i)
9553 {
9554 gimple *perm
9555 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9556 if (i > 0)
9557 gimple_assign_set_rhs1 (gs: perm,
9558 rhs: vect_get_slp_vect_def (slp_node: child, i: i - 1));
9559 gimple_assign_set_rhs2 (gs: perm,
9560 rhs: vect_get_slp_vect_def (slp_node: child, i));
9561 update_stmt (s: perm);
9562 }
9563 }
9564 }
9565 }
9566}
9567
9568/* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9569
9570void
9571vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9572{
9573 slp_instance instance;
9574 unsigned int i;
9575
9576 hash_map<slp_tree, slp_scc_info> scc_info;
9577 int maxdfs = 0;
9578 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9579 {
9580 slp_tree node = SLP_INSTANCE_TREE (instance);
9581 if (dump_enabled_p ())
9582 {
9583 dump_printf_loc (MSG_NOTE, vect_location,
9584 "Vectorizing SLP tree:\n");
9585 /* ??? Dump all? */
9586 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9587 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9588 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9589 vect_print_slp_graph (dump_kind: MSG_NOTE, loc: vect_location,
9590 SLP_INSTANCE_TREE (instance));
9591 }
      /* Schedule the tree of INSTANCE, scheduling SCCs in a way that has
	 a PHI be the node breaking the cycle.  */
9594 auto_vec<slp_tree> stack;
9595 if (!scc_info.get (k: node))
9596 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9597
9598 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9599 vectorize_slp_instance_root_stmt (node, instance);
9600
9601 if (dump_enabled_p ())
9602 dump_printf_loc (MSG_NOTE, vect_location,
9603 "vectorizing stmts using SLP.\n");
9604 }
9605
9606 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9607 {
9608 slp_tree root = SLP_INSTANCE_TREE (instance);
9609 stmt_vec_info store_info;
9610 unsigned int j;
9611
9612 /* Remove scalar call stmts. Do not do this for basic-block
9613 vectorization as not all uses may be vectorized.
9614 ??? Why should this be necessary? DCE should be able to
9615 remove the stmts itself.
9616 ??? For BB vectorization we can as well remove scalar
9617 stmts starting from the SLP tree root if they have no
9618 uses. */
9619 if (is_a <loop_vec_info> (p: vinfo))
9620 vect_remove_slp_scalar_calls (vinfo, node: root);
9621
      /* Remove the original scalar stmts of vectorized stores.  */
9623 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (ix: j, ptr: &store_info); j++)
9624 {
9625 if (!STMT_VINFO_DATA_REF (store_info)
9626 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9627 break;
9628
9629 store_info = vect_orig_stmt (stmt_info: store_info);
9630 /* Free the attached stmt_vec_info and remove the stmt. */
9631 vinfo->remove_stmt (store_info);
9632
9633 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9634 to not crash in vect_free_slp_tree later. */
9635 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9636 SLP_TREE_REPRESENTATIVE (root) = NULL;
9637 }
9638 }
9639}
9640
