/* Loop autoparallelization.
   Copyright (C) 2006-2023 Free Software Foundation, Inc.
   Contributed by Sebastian Pop <pop@cri.ensmp.fr>
   Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "cgraph.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-walk.h"
#include "stor-layout.h"
#include "tree-nested.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "langhooks.h"
#include "tree-vectorizer.h"
#include "tree-hasher.h"
#include "tree-parloops.h"
#include "omp-general.h"
#include "omp-low.h"
#include "tree-ssa.h"
#include "tree-ssa-alias.h"
#include "tree-eh.h"
#include "gomp-constants.h"
#include "tree-dfa.h"
#include "stringpool.h"
#include "attribs.h"

/* This pass tries to distribute iterations of loops into several threads.
   The implementation is straightforward -- for each loop we test whether its
   iterations are independent, and if it is the case (and some additional
   conditions regarding profitability and correctness are satisfied), we
   add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion
   machinery do its job.

   Most of the complexity is in bringing the code into shape expected
   by the omp expanders:
   -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
      variable and that the exit test is at the start of the loop body
   -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
      variables by accesses through pointers, and breaking up ssa chains
      by storing the values incoming to the parallelized loop to a structure
      passed to the new function as an argument (something similar is done
      in omp gimplification, unfortunately only a small part of the code
      can be shared).

   TODO:
   -- if there are several parallelizable loops in a function, it may be
      possible to generate the threads just once (using synchronization to
      ensure that cross-loop dependences are obeyed).
   -- handling of common reduction patterns for outer loops.

   More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC  */
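
/* For illustration only (a simplified sketch, not the exact code the
   pass emits): a loop such as

     for (i = 0; i < N; i++)
       a[i] = b[i] + c[i];

   whose iterations are independent is conceptually rewritten as

     #pragma omp parallel num_threads(4)
     #pragma omp for schedule(static)
     for (i = 0; i < N; i++)
       a[i] = b[i] + c[i];

   after which the omp expansion machinery outlines the loop body into
   a new function executed by the threads.  */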
/*
  Reduction handling:
  currently we use code inspired by vect_force_simple_reduction to detect
  reduction patterns.
  The code transformation will be introduced by an example.


parloop
{
  int sum=1;

  for (i = 0; i < N; i++)
   {
    x[i] = i + 3;
    sum+=x[i];
   }
}

gimple-like code:
header_bb:

  # sum_29 = PHI <sum_11(5), 1(3)>
  # i_28 = PHI <i_12(5), 0(3)>
  D.1795_8 = i_28 + 3;
  x[i_28] = D.1795_8;
  sum_11 = D.1795_8 + sum_29;
  i_12 = i_28 + 1;
  if (N_6(D) > i_12)
    goto header_bb;


exit_bb:

  # sum_21 = PHI <sum_11(4)>
  printf (&"%d"[0], sum_21);


after reduction transformation (only relevant parts):

parloop
{

....


  # Storing the initial value given by the user.  #

  .paral_data_store.32.sum.27 = 1;

  #pragma omp parallel num_threads(4)

  #pragma omp for schedule(static)

  # The neutral element corresponding to the particular
  reduction's operation, e.g. 0 for PLUS_EXPR,
  1 for MULT_EXPR, etc. replaces the user's initial value.  #

  # sum.27_29 = PHI <sum.27_11, 0>

  sum.27_11 = D.1827_8 + sum.27_29;

  GIMPLE_OMP_CONTINUE

  # Adding this reduction phi is done at create_phi_for_local_result() #
  # sum.27_56 = PHI <sum.27_11, 0>
  GIMPLE_OMP_RETURN

  # Creating the atomic operation is done at
  create_call_for_reduction_1() #

  #pragma omp atomic_load
  D.1839_59 = *&.paral_data_load.33_51->reduction.23;
  D.1840_60 = sum.27_56 + D.1839_59;
  #pragma omp atomic_store (D.1840_60);

  GIMPLE_OMP_RETURN

  # collecting the result after the join of the threads is done at
  create_loads_for_reductions().
  The value computed by the threads is loaded from the
  shared struct.  #


  .paral_data_load.33_52 = &.paral_data_store.32;
  sum_37 = .paral_data_load.33_52->sum.27;
  sum_43 = D.1795_41 + sum_37;

  exit bb:
  # sum_21 = PHI <sum_43, sum_26>
  printf (&"%d"[0], sum_21);

...

}

*/

/* Error reporting helper for parloops_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

static void
report_ploop_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
}

/* DEF_STMT_INFO occurs in a loop that contains a potential reduction
   operation.  Return true if the results of DEF_STMT_INFO are something
   that can be accumulated by such a reduction.  */

static bool
parloops_valid_reduction_input_p (stmt_vec_info def_stmt_info)
{
  return (is_gimple_assign (def_stmt_info->stmt)
	  || is_gimple_call (def_stmt_info->stmt)
	  || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
	  || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
	      && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
	      && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
}

/* Detect SLP reduction of the form:

   #a1 = phi <a5, a0>
   a2 = operation (a1)
   a3 = operation (a2)
   a4 = operation (a3)
   a5 = operation (a4)

   #a = phi <a5>

   PHI is the reduction phi node (#a1 = phi <a5, a0> above)
   FIRST_STMT is the first reduction stmt in the chain
   (a2 = operation (a1)).

   Return TRUE if a reduction chain was detected.  */

static bool
parloops_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
			   gimple *first_stmt)
{
  class loop *loop = (gimple_bb (phi))->loop_father;
  class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  enum tree_code code;
  gimple *loop_use_stmt = NULL;
  stmt_vec_info use_stmt_info;
  tree lhs;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  int nloop_uses, size = 0, n_out_of_loop_uses;
  bool found = false;

  if (loop != vect_loop)
    return false;

  auto_vec<stmt_vec_info, 8> reduc_chain;
  lhs = PHI_RESULT (phi);
  code = gimple_assign_rhs_code (first_stmt);
  while (1)
    {
      nloop_uses = 0;
      n_out_of_loop_uses = 0;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_gimple_debug (use_stmt))
	    continue;

	  /* Check if we got back to the reduction phi.  */
	  if (use_stmt == phi)
	    {
	      loop_use_stmt = use_stmt;
	      found = true;
	      break;
	    }

	  if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	    {
	      loop_use_stmt = use_stmt;
	      nloop_uses++;
	    }
	  else
	    n_out_of_loop_uses++;

	  /* There can be either a single use in the loop or two uses in
	     phi nodes.  */
	  if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
	    return false;
	}

      if (found)
	break;

      /* We reached a statement with no loop uses.  */
      if (nloop_uses == 0)
	return false;

      /* This is a loop exit phi, and we haven't reached the reduction phi.  */
      if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
	return false;

      if (!is_gimple_assign (loop_use_stmt)
	  || code != gimple_assign_rhs_code (loop_use_stmt)
	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
	return false;

      /* Insert USE_STMT into reduction chain.  */
      use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
      reduc_chain.safe_push (use_stmt_info);

      lhs = gimple_assign_lhs (loop_use_stmt);
      size++;
    }

  if (!found || loop_use_stmt != phi || size < 2)
    return false;

  /* Swap the operands, if needed, to make the reduction operand be the second
     operand.  */
  lhs = PHI_RESULT (phi);
  for (unsigned i = 0; i < reduc_chain.length (); ++i)
    {
      gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
      if (gimple_assign_rhs2 (next_stmt) == lhs)
	{
	  tree op = gimple_assign_rhs1 (next_stmt);
	  stmt_vec_info def_stmt_info = loop_info->lookup_def (op);

	  /* Check that the other def is either defined in the loop
	     ("vect_internal_def"), or it's an induction (defined by a
	     loop-header phi-node).  */
	  if (def_stmt_info
	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
	      && parloops_valid_reduction_input_p (def_stmt_info))
	    {
	      lhs = gimple_assign_lhs (next_stmt);
	      continue;
	    }

	  return false;
	}
      else
	{
	  tree op = gimple_assign_rhs2 (next_stmt);
	  stmt_vec_info def_stmt_info = loop_info->lookup_def (op);

	  /* Check that the other def is either defined in the loop
	     ("vect_internal_def"), or it's an induction (defined by a
	     loop-header phi-node).  */
	  if (def_stmt_info
	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
	      && parloops_valid_reduction_input_p (def_stmt_info))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "swapping oprnds: %G", (gimple *) next_stmt);

	      swap_ssa_operands (next_stmt,
				 gimple_assign_rhs1_ptr (next_stmt),
				 gimple_assign_rhs2_ptr (next_stmt));
	      update_stmt (next_stmt);
	    }
	  else
	    return false;
	}

      lhs = gimple_assign_lhs (next_stmt);
    }

  /* Build up the actual chain.  */
  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
    {
      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
    }
  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;

  /* Save the chain for further analysis in SLP detection.  */
  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
  REDUC_GROUP_SIZE (reduc_chain[0]) = size;

  return true;
}

/* Return true if we need an in-order reduction for operation CODE
   on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
   overflow must wrap.  */

static bool
parloops_needs_fold_left_reduction_p (tree type, tree_code code,
				      bool need_wrapping_integral_overflow)
{
  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type))
    switch (code)
      {
      case MIN_EXPR:
      case MAX_EXPR:
	return false;

      default:
	return !flag_associative_math;
      }

  if (INTEGRAL_TYPE_P (type))
    {
      if (!operation_no_trapping_overflow (type, code))
	return true;
      if (need_wrapping_integral_overflow
	  && !TYPE_OVERFLOW_WRAPS (type)
	  && operation_can_overflow (code))
	return true;
      return false;
    }

  if (SAT_FIXED_POINT_TYPE_P (type))
    return true;

  return false;
}
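
/* For example (illustrative): a sequential float accumulation

     float s = 0.f;
     for (i = 0; i < n; i++)
       s += a[i];

   performs the additions strictly in order.  Distributing the
   iterations between threads reassociates the sum and can change the
   rounded result, so unless -fassociative-math is in effect the
   function above requests an in-order (fold-left) reduction.  */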

/* Function parloops_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

   a3 = ...
   loop_header:
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   (4) Detect condition expressions, i.e.:
     for (int i = 0; i < N; i++)
       if (a[i] < val)
	 ret_val = a[i];

*/

static stmt_vec_info
parloops_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
			      bool *double_reduc,
			      bool need_wrapping_integral_overflow,
			      enum vect_reduction_type *v_reduc_type)
{
  gphi *phi = as_a <gphi *> (phi_info->stmt);
  class loop *loop = (gimple_bb (phi))->loop_father;
  class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
  gimple *phi_use_stmt = NULL;
  enum tree_code orig_code, code;
  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
  tree type;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool phi_def;

  *double_reduc = false;
  *v_reduc_type = TREE_CODE_REDUCTION;

  tree phi_name = PHI_RESULT (phi);
  /* ???  If there are no uses of the PHI result the inner loop reduction
     won't be detected as possibly double-reduction by vectorizable_reduction
     because that tries to walk the PHI arg from the preheader edge which
     can be constant.  See PR60382.  */
  if (has_zero_uses (phi_name))
    return NULL;
  unsigned nphi_def_loop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;

      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "intermediate value used outside loop.\n");

	  return NULL;
	}

      nphi_def_loop_uses++;
      phi_use_stmt = use_stmt;
    }

  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction: not ssa_name: %T\n", loop_arg);
      return NULL;
    }

  stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
  if (!def_stmt_info
      || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
    return NULL;

  if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
    {
      name = gimple_assign_lhs (def_stmt);
      phi_def = false;
    }
  else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
    {
      name = PHI_RESULT (def_stmt);
      phi_def = true;
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction: unhandled reduction operation: %G",
			 def_stmt_info->stmt);
      return NULL;
    }

  unsigned nlatch_def_loop_uses = 0;
  auto_vec<gphi *, 3> lcphis;
  bool inner_loop_of_double_reduc = false;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	nlatch_def_loop_uses++;
      else
	{
	  /* We can have more than one loop-closed PHI.  */
	  lcphis.safe_push (as_a <gphi *> (use_stmt));
	  if (nested_in_vect_loop
	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
		  == vect_double_reduction_def))
	    inner_loop_of_double_reduc = true;
	}
    }

  /* If this isn't a nested cycle or if the nested cycle reduction value
     is used outside of the inner loop we cannot handle uses of the reduction
     value.  */
  if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
      && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction used in loop.\n");
      return NULL;
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (phi_def)
    {
      gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
      op1 = PHI_ARG_DEF (def_stmt, 0);

      if (gimple_phi_num_args (def_stmt) != 1
	  || TREE_CODE (op1) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported phi node definition.\n");

	  return NULL;
	}

      gimple *def1 = SSA_NAME_DEF_STMT (op1);
      if (gimple_bb (def1)
	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
	  && loop->inner
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
	  && is_gimple_assign (def1)
	  && is_a <gphi *> (phi_use_stmt)
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
	{
	  if (dump_enabled_p ())
	    report_ploop_op (MSG_NOTE, def_stmt,
			     "detected double reduction: ");

	  *double_reduc = true;
	  return def_stmt_info;
	}

      return NULL;
    }

  /* If we are vectorizing an inner reduction we are executing that
     in the original order only in case we are not dealing with a
     double reduction.  */
  bool check_reduction = true;
  if (flow_loop_nested_p (vect_loop, loop))
    {
      gphi *lcphi;
      unsigned i;
      check_reduction = false;
      FOR_EACH_VEC_ELT (lcphis, i, lcphi)
	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
	  {
	    gimple *use_stmt = USE_STMT (use_p);
	    if (is_gimple_debug (use_stmt))
	      continue;
	    if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
	      check_reduction = true;
	  }
    }

  gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
  code = orig_code = gimple_assign_rhs_code (def_stmt);

  if (nested_in_vect_loop && !check_reduction)
    {
      /* FIXME: Even for non-reductions code generation is funneled
	 through vectorizable_reduction for the stmt defining the
	 PHI latch value.  So we have to artificially restrict ourselves
	 for the supported operations.  */
      switch (get_gimple_rhs_class (code))
	{
	case GIMPLE_BINARY_RHS:
	case GIMPLE_TERNARY_RHS:
	  break;
	default:
	  /* Not supported by vectorizable_reduction.  */
	  if (dump_enabled_p ())
	    report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			     "nested cycle: not handled operation: ");
	  return NULL;
	}
      if (dump_enabled_p ())
	report_ploop_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
      return def_stmt_info;
    }

  /* We can handle "res -= x[i]", which is non-associative, by
     simply rewriting this into "res += -x[i]".  Avoid changing
     gimple instruction for the first simple tests and only do this
     if we're allowed to change code at all.  */
  if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
    code = PLUS_EXPR;

  if (code == COND_EXPR)
    {
      if (! nested_in_vect_loop)
	*v_reduc_type = COND_REDUCTION;

      op3 = gimple_assign_rhs1 (def_stmt);
      if (COMPARISON_CLASS_P (op3))
	{
	  op4 = TREE_OPERAND (op3, 1);
	  op3 = TREE_OPERAND (op3, 0);
	}
      if (op3 == phi_name || op4 == phi_name)
	{
	  if (dump_enabled_p ())
	    report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			     "reduction: condition depends on previous"
			     " iteration: ");
	  return NULL;
	}

      op1 = gimple_assign_rhs2 (def_stmt);
      op2 = gimple_assign_rhs3 (def_stmt);
    }
  else if (!commutative_tree_code (code) || !associative_tree_code (code))
    {
      if (dump_enabled_p ())
	report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			 "reduction: not commutative/associative: ");
      return NULL;
    }
  else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
    {
      op1 = gimple_assign_rhs1 (def_stmt);
      op2 = gimple_assign_rhs2 (def_stmt);
    }
  else
    {
      if (dump_enabled_p ())
	report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			 "reduction: not handled operation: ");
      return NULL;
    }

  if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
    {
      if (dump_enabled_p ())
	report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			 "reduction: both uses not ssa_names: ");

      return NULL;
    }

  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
  if ((TREE_CODE (op1) == SSA_NAME
       && !types_compatible_p (type, TREE_TYPE (op1)))
      || (TREE_CODE (op2) == SSA_NAME
	  && !types_compatible_p (type, TREE_TYPE (op2)))
      || (op3 && TREE_CODE (op3) == SSA_NAME
	  && !types_compatible_p (type, TREE_TYPE (op3)))
      || (op4 && TREE_CODE (op4) == SSA_NAME
	  && !types_compatible_p (type, TREE_TYPE (op4))))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "reduction: multiple types: operation type: "
			   "%T, operands types: %T,%T",
			   type, TREE_TYPE (op1), TREE_TYPE (op2));
	  if (op3)
	    dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));

	  if (op4)
	    dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
	  dump_printf (MSG_NOTE, "\n");
	}

      return NULL;
    }

  /* Check whether it's ok to change the order of the computation.
     Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */
  if (check_reduction
      && *v_reduc_type == TREE_CODE_REDUCTION
      && parloops_needs_fold_left_reduction_p (type, code,
					       need_wrapping_integral_overflow))
    *v_reduc_type = FOLD_LEFT_REDUCTION;

  /* Reduction is safe.  We're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization
     3) nested cycle (i.e., outer loop vectorization).  */
  stmt_vec_info def1_info = loop_info->lookup_def (op1);
  stmt_vec_info def2_info = loop_info->lookup_def (op2);
  if (code != COND_EXPR && !def1_info && !def2_info)
    {
      if (dump_enabled_p ())
	report_ploop_op (MSG_NOTE, def_stmt,
			 "reduction: no defs for operands: ");
      return NULL;
    }

  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_internal_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2_info
      && def2_info->stmt == phi
      && (code == COND_EXPR
	  || !def1_info
	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
	  || parloops_valid_reduction_input_p (def1_info)))
    {
      if (dump_enabled_p ())
	report_ploop_op (MSG_NOTE, def_stmt, "detected reduction: ");
      return def_stmt_info;
    }

  if (def1_info
      && def1_info->stmt == phi
      && (code == COND_EXPR
	  || !def2_info
	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
	  || parloops_valid_reduction_input_p (def2_info)))
    {
      if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
	{
	  /* Check if we can swap operands (just for simplicity - so that
	     the rest of the code can assume that the reduction variable
	     is always the last (second) argument).  */
	  if (code == COND_EXPR)
	    {
	      /* Swap cond_expr by inverting the condition.  */
	      tree cond_expr = gimple_assign_rhs1 (def_stmt);
	      enum tree_code invert_code = ERROR_MARK;
	      enum tree_code cond_code = TREE_CODE (cond_expr);

	      if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}
	      if (invert_code != ERROR_MARK)
		{
		  TREE_SET_CODE (cond_expr, invert_code);
		  swap_ssa_operands (def_stmt,
				     gimple_assign_rhs2_ptr (def_stmt),
				     gimple_assign_rhs3_ptr (def_stmt));
		}
	      else
		{
		  if (dump_enabled_p ())
		    report_ploop_op (MSG_NOTE, def_stmt,
				     "detected reduction: cannot swap operands "
				     "for cond_expr");
		  return NULL;
		}
	    }
	  else
	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
			       gimple_assign_rhs2_ptr (def_stmt));

	  if (dump_enabled_p ())
	    report_ploop_op (MSG_NOTE, def_stmt,
			     "detected reduction: need to swap operands: ");
	}
      else
	{
	  if (dump_enabled_p ())
	    report_ploop_op (MSG_NOTE, def_stmt, "detected reduction: ");
	}

      return def_stmt_info;
    }

  /* Try to find SLP reduction chain.  */
  if (! nested_in_vect_loop
      && code != COND_EXPR
      && orig_code != MINUS_EXPR
      && parloops_is_slp_reduction (loop_info, phi, def_stmt))
    {
      if (dump_enabled_p ())
	report_ploop_op (MSG_NOTE, def_stmt,
			 "reduction: detected reduction chain: ");

      return def_stmt_info;
    }

  /* Look for the expression computing loop_arg from loop PHI result.  */
  if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
    return def_stmt_info;

  if (dump_enabled_p ())
    {
      report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
		       "reduction: unknown pattern: ");
    }

  return NULL;
}

/* Wrapper around parloops_is_simple_reduction, which will modify code
   in-place if it enables detection of more reductions.  Arguments are
   as for that function.  */

stmt_vec_info
parloops_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
				 bool *double_reduc,
				 bool need_wrapping_integral_overflow)
{
  enum vect_reduction_type v_reduc_type;
  stmt_vec_info def_info
    = parloops_is_simple_reduction (loop_info, phi_info, double_reduc,
				    need_wrapping_integral_overflow,
				    &v_reduc_type);
  if (def_info)
    {
      STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
      STMT_VINFO_REDUC_DEF (phi_info) = def_info;
      STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
      STMT_VINFO_REDUC_DEF (def_info) = phi_info;
    }
  return def_info;
}

/* Minimal number of iterations of a loop that should be executed in each
   thread.  */
#define MIN_PER_THREAD param_parloops_min_per_thread

/* Element of the hashtable, representing a
   reduction in the current loop.  */
struct reduction_info
{
  gimple *reduc_stmt;		/* reduction statement.  */
  gimple *reduc_phi;		/* The phi node defining the reduction.  */
  enum tree_code reduction_code;/* code for the reduction operation.  */
  unsigned reduc_version;	/* SSA_NAME_VERSION of original reduc_phi
				   result.  */
  gphi *keep_res;		/* The PHI_RESULT of this phi is the resulting
				   value of the reduction variable when exiting
				   the loop.  */
  tree initial_value;		/* The initial value of the reduction var
				   before entering the loop.  */
  tree field;			/* the name of the field in the parloop data
				   structure intended for reduction.  */
  tree reduc_addr;		/* The address of the reduction variable for
				   openacc reductions.  */
  tree init;			/* reduction initialization value.  */
  gphi *new_phi;		/* (helper field) Newly created phi node whose
				   result will be passed to the atomic
				   operation.  Represents the local result each
				   thread computed for the reduction
				   operation.  */
};

/* Reduction info hashtable helpers.  */

struct reduction_hasher : free_ptr_hash <reduction_info>
{
  static inline hashval_t hash (const reduction_info *);
  static inline bool equal (const reduction_info *, const reduction_info *);
};

/* Equality and hash functions for hashtab code.  */

inline bool
reduction_hasher::equal (const reduction_info *a, const reduction_info *b)
{
  return (a->reduc_phi == b->reduc_phi);
}

inline hashval_t
reduction_hasher::hash (const reduction_info *a)
{
  return a->reduc_version;
}

typedef hash_table<reduction_hasher> reduction_info_table_type;

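/* Return the reduction information for reduction phi PHI from
   REDUCTION_LIST, or NULL if PHI is not a known reduction phi.  The
   table is keyed on the phi's gimple uid, which holds the
   SSA_NAME_VERSION of the original reduction phi result.  */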
static struct reduction_info *
reduction_phi (reduction_info_table_type *reduction_list, gimple *phi)
{
  struct reduction_info tmpred, *red;

  if (reduction_list->is_empty () || phi == NULL)
    return NULL;

  if (gimple_uid (phi) == (unsigned int)-1
      || gimple_uid (phi) == 0)
    return NULL;

  tmpred.reduc_phi = phi;
  tmpred.reduc_version = gimple_uid (phi);
  red = reduction_list->find (&tmpred);
  gcc_assert (red == NULL || red->reduc_phi == phi);

  return red;
}

/* Element of hashtable of names to copy.  */

struct name_to_copy_elt
{
  unsigned version;	/* The version of the name to copy.  */
  tree new_name;	/* The new name used in the copy.  */
  tree field;		/* The field of the structure used to pass the
			   value.  */
};

/* Name copies hashtable helpers.  */

struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
{
  static inline hashval_t hash (const name_to_copy_elt *);
  static inline bool equal (const name_to_copy_elt *, const name_to_copy_elt *);
};

/* Equality and hash functions for hashtab code.  */

inline bool
name_to_copy_hasher::equal (const name_to_copy_elt *a, const name_to_copy_elt *b)
{
  return a->version == b->version;
}

inline hashval_t
name_to_copy_hasher::hash (const name_to_copy_elt *a)
{
  return (hashval_t) a->version;
}

typedef hash_table<name_to_copy_hasher> name_to_copy_table_type;

/* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
   matrix.  Rather than use floats, we simply keep a single DENOMINATOR that
   represents the denominator for every element in the matrix.  */
typedef struct lambda_trans_matrix_s
{
  lambda_matrix matrix;
  int rowsize;
  int colsize;
  int denominator;
} *lambda_trans_matrix;
#define LTM_MATRIX(T) ((T)->matrix)
#define LTM_ROWSIZE(T) ((T)->rowsize)
#define LTM_COLSIZE(T) ((T)->colsize)
#define LTM_DENOMINATOR(T) ((T)->denominator)

/* Allocate a new transformation matrix.  */

static lambda_trans_matrix
lambda_trans_matrix_new (int colsize, int rowsize,
			 struct obstack * lambda_obstack)
{
  lambda_trans_matrix ret;

  ret = (lambda_trans_matrix)
    obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
  LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
  LTM_ROWSIZE (ret) = rowsize;
  LTM_COLSIZE (ret) = colsize;
  LTM_DENOMINATOR (ret) = 1;
  return ret;
}

/* Multiply a vector VEC by a matrix MAT.
   MAT is an M*N matrix, and VEC is a vector with length N.  The result
   is stored in DEST which must be a vector of length M.  */

static void
lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
			   lambda_vector vec, lambda_vector dest)
{
  int i, j;

  lambda_vector_clear (dest, m);
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      dest[i] += matrix[i][j] * vec[j];
}

/* Return true if TRANS is a legal transformation matrix that respects
   the dependence vectors in DISTS and DIRS.  The conservative answer
   is false.

   "Wolfe proves that a unimodular transformation represented by the
   matrix T is legal when applied to a loop nest with a set of
   lexicographically non-negative distance vectors RDG if and only if
   for each vector d in RDG, (T.d >= 0) is lexicographically positive.
   i.e.: if and only if it transforms the lexicographically positive
   distance vectors to lexicographically positive vectors.  Note that
   a unimodular matrix must transform the zero vector (and only it) to
   the zero vector." S.Muchnick.  */

static bool
lambda_transform_legal_p (lambda_trans_matrix trans,
			  int nb_loops,
			  vec<ddr_p> dependence_relations)
{
  unsigned int i, j;
  lambda_vector distres;
  struct data_dependence_relation *ddr;

  gcc_assert (LTM_COLSIZE (trans) == nb_loops
	      && LTM_ROWSIZE (trans) == nb_loops);

  /* When there are no dependences, the transformation is correct.  */
  if (dependence_relations.length () == 0)
    return true;

  ddr = dependence_relations[0];
  if (ddr == NULL)
    return true;

  /* When there is an unknown relation in the dependence_relations, we
     know that it is not worth looking at this loop nest: give up.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    return false;

  distres = lambda_vector_new (nb_loops);

  /* For each distance vector in the dependence graph.  */
  FOR_EACH_VEC_ELT (dependence_relations, i, ddr)
    {
      /* Don't care about relations for which we know that there is no
	 dependence, nor about read-read (aka. input-dependences):
	 these data accesses can happen in any order.  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_known
	  || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
	continue;

      /* Conservatively answer: "this transformation is not valid".  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	return false;

      /* If the dependence could not be captured by a distance vector,
	 conservatively answer that the transform is not valid.  */
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
	return false;

      /* Compute trans.dist_vect.  */
      for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
	{
	  lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
				     DDR_DIST_VECT (ddr, j), distres);

	  if (!lambda_vector_lexico_pos (distres, nb_loops))
	    return false;
	}
    }
  return true;
}
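
/* Note: in this file the only transformation ever tested is the 1x1
   matrix [-1], i.e. loop reversal (see loop_parallel_p below).  If
   reversing the loop still leaves every distance vector
   lexicographically non-negative, all carried distances must be zero,
   so the iterations do not depend on each other and can be executed
   concurrently.  */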
1113
1114/* Data dependency analysis. Returns true if the iterations of LOOP
1115 are independent on each other (that is, if we can execute them
1116 in parallel). */
1117
1118static bool
1119loop_parallel_p (class loop *loop, struct obstack * parloop_obstack)
1120{
1121 vec<ddr_p> dependence_relations;
1122 vec<data_reference_p> datarefs;
1123 lambda_trans_matrix trans;
1124 bool ret = false;
1125
1126 if (dump_file && (dump_flags & TDF_DETAILS))
1127 {
1128 fprintf (stream: dump_file, format: "Considering loop %d\n", loop->num);
1129 if (!loop->inner)
1130 fprintf (stream: dump_file, format: "loop is innermost\n");
1131 else
1132 fprintf (stream: dump_file, format: "loop NOT innermost\n");
1133 }
1134
1135 /* Check for problems with dependences. If the loop can be reversed,
1136 the iterations are independent. */
1137 auto_vec<loop_p, 3> loop_nest;
1138 datarefs.create (nelems: 10);
1139 dependence_relations.create (nelems: 100);
1140 if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
1141 &dependence_relations))
1142 {
1143 if (dump_file && (dump_flags & TDF_DETAILS))
1144 fprintf (stream: dump_file, format: " FAILED: cannot analyze data dependencies\n");
1145 ret = false;
1146 goto end;
1147 }
1148 if (dump_file && (dump_flags & TDF_DETAILS))
1149 dump_data_dependence_relations (dump_file, dependence_relations);
1150
1151 trans = lambda_trans_matrix_new (colsize: 1, rowsize: 1, lambda_obstack: parloop_obstack);
1152 LTM_MATRIX (trans)[0][0] = -1;
1153
1154 if (lambda_transform_legal_p (trans, nb_loops: 1, dependence_relations))
1155 {
1156 ret = true;
1157 if (dump_file && (dump_flags & TDF_DETAILS))
1158 fprintf (stream: dump_file, format: " SUCCESS: may be parallelized\n");
1159 }
1160 else if (dump_file && (dump_flags & TDF_DETAILS))
1161 fprintf (stream: dump_file,
1162 format: " FAILED: data dependencies exist across iterations\n");
1163
1164 end:
1165 free_dependence_relations (dependence_relations);
1166 free_data_refs (datarefs);
1167
1168 return ret;
1169}
1170
1171/* Return true when LOOP contains basic blocks marked with the
1172 BB_IRREDUCIBLE_LOOP flag. */
1173
1174static inline bool
1175loop_has_blocks_with_irreducible_flag (class loop *loop)
1176{
1177 unsigned i;
1178 basic_block *bbs = get_loop_body_in_dom_order (loop);
1179 bool res = true;
1180
1181 for (i = 0; i < loop->num_nodes; i++)
1182 if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
1183 goto end;
1184
1185 res = false;
1186 end:
1187 free (ptr: bbs);
1188 return res;
1189}
1190
1191/* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
1192 The assignment statement is placed on edge ENTRY. DECL_ADDRESS maps decls
1193 to their addresses that can be reused. The address of OBJ is known to
1194 be invariant in the whole function. Other needed statements are placed
1195 right before GSI. */
1196
1197static tree
1198take_address_of (tree obj, tree type, edge entry,
1199 int_tree_htab_type *decl_address, gimple_stmt_iterator *gsi)
1200{
1201 int uid;
1202 tree *var_p, name, addr;
1203 gassign *stmt;
1204 gimple_seq stmts;
1205
1206 /* Since the address of OBJ is invariant, the trees may be shared.
1207 Avoid rewriting unrelated parts of the code. */
1208 obj = unshare_expr (obj);
1209 for (var_p = &obj;
1210 handled_component_p (t: *var_p);
1211 var_p = &TREE_OPERAND (*var_p, 0))
1212 continue;
1213
1214 /* Canonicalize the access to base on a MEM_REF. */
1215 if (DECL_P (*var_p))
1216 *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));
1217
1218 /* Assign a canonical SSA name to the address of the base decl used
1219 in the address and share it for all accesses and addresses based
1220 on it. */
1221 uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
1222 int_tree_map elt;
1223 elt.uid = uid;
1224 int_tree_map *slot = decl_address->find_slot (value: elt,
1225 insert: gsi == NULL
1226 ? NO_INSERT
1227 : INSERT);
1228 if (!slot || !slot->to)
1229 {
1230 if (gsi == NULL)
1231 return NULL;
1232 addr = TREE_OPERAND (*var_p, 0);
1233 const char *obj_name
1234 = get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
1235 if (obj_name)
1236 name = make_temp_ssa_name (TREE_TYPE (addr), NULL, name: obj_name);
1237 else
1238 name = make_ssa_name (TREE_TYPE (addr));
1239 stmt = gimple_build_assign (name, addr);
1240 gsi_insert_on_edge_immediate (entry, stmt);
1241
1242 slot->uid = uid;
1243 slot->to = name;
1244 }
1245 else
1246 name = slot->to;
1247
1248 /* Express the address in terms of the canonical SSA name. */
1249 TREE_OPERAND (*var_p, 0) = name;
1250 if (gsi == NULL)
1251 return build_fold_addr_expr_with_type (obj, type);
1252
1253 name = force_gimple_operand (build_addr (obj),
1254 &stmts, true, NULL_TREE);
1255 if (!gimple_seq_empty_p (s: stmts))
1256 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
1257
1258 if (!useless_type_conversion_p (type, TREE_TYPE (name)))
1259 {
1260 name = force_gimple_operand (fold_convert (type, name), &stmts, true,
1261 NULL_TREE);
1262 if (!gimple_seq_empty_p (s: stmts))
1263 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
1264 }
1265
1266 return name;
1267}
1268
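
/* Return the result of the reduction statement STMT: the PHI result if
   STMT is a phi, otherwise the lhs of the assignment.  */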
static tree
reduc_stmt_res (gimple *stmt)
{
  return (gimple_code (stmt) == GIMPLE_PHI
	  ? gimple_phi_result (stmt)
	  : gimple_assign_lhs (stmt));
}

/* Callback for htab_traverse.  Create the initialization statement
   for reduction described in SLOT, and place it at the preheader of
   the loop described in DATA.  */

int
initialize_reductions (reduction_info **slot, class loop *loop)
{
  tree init;
  tree type, arg;
  edge e;

  struct reduction_info *const reduc = *slot;

  /* Create initialization in preheader:
     reduction_variable = initialization value of reduction.  */

  /* In the phi node at the header, replace the argument coming
     from the preheader with the reduction initialization value.  */

  /* Initialize the reduction.  */
  type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
  init = omp_reduction_init_op (gimple_location (reduc->reduc_stmt),
				reduc->reduction_code, type);
  reduc->init = init;

  /* Replace the argument representing the initialization value
     with the initialization value for the reduction (neutral
     element for the particular operation, e.g. 0 for PLUS_EXPR,
     1 for MULT_EXPR, etc).
     Keep the old value in a new variable "reduction_initial",
     that will be taken into consideration after the parallel
     computing is done.  */

  e = loop_preheader_edge (loop);
  arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
  /* Create new variable to hold the initial value.  */

  SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
	   (reduc->reduc_phi, loop_preheader_edge (loop)), init);
  reduc->initial_value = arg;
  return 1;
}
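
/* Data passed by walk_tree/walk_gimple_op to eliminate_local_variables_1,
   describing the single-entry single-exit region being processed.  */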
struct elv_data
{
  struct walk_stmt_info info;
  edge entry;
  int_tree_htab_type *decl_address;
  gimple_stmt_iterator *gsi;
  bool changed;
  bool reset;
};

/* Eliminates references to local variables in *TP out of the single
   entry single exit region starting at DTA->ENTRY.
   DECL_ADDRESS contains addresses of the references that had their
   address taken already.  If the expression is changed, CHANGED is
   set to true.  Callback for walk_tree.  */

static tree
eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
{
  struct elv_data *const dta = (struct elv_data *) data;
  tree t = *tp, var, addr, addr_type, type, obj;

  if (DECL_P (t))
    {
      *walk_subtrees = 0;

      if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
	return NULL_TREE;

      type = TREE_TYPE (t);
      addr_type = build_pointer_type (type);
      addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  dta->reset = true;
	  return NULL_TREE;
	}

      *tp = build_simple_mem_ref (addr);

      dta->changed = true;
      return NULL_TREE;
    }

  if (TREE_CODE (t) == ADDR_EXPR)
    {
      /* ADDR_EXPR may appear in two contexts:
	 -- as a gimple operand, when the address taken is a function invariant
	 -- as gimple rhs, when the resulting address is not a function
	    invariant
	 We do not need to do anything special in the latter case (the base of
	 the memory reference whose address is taken may be replaced in the
	 DECL_P case).  The former case is more complicated, as we need to
	 ensure that the new address is still a gimple operand.  Thus, it
	 is not sufficient to replace just the base of the memory reference --
	 we need to move the whole computation of the address out of the
	 loop.  */
      if (!is_gimple_val (t))
	return NULL_TREE;

      *walk_subtrees = 0;
      obj = TREE_OPERAND (t, 0);
      var = get_base_address (obj);
      if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
	return NULL_TREE;

      addr_type = TREE_TYPE (t);
      addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  dta->reset = true;
	  return NULL_TREE;
	}
      *tp = addr;

      dta->changed = true;
      return NULL_TREE;
    }

  if (!EXPR_P (t))
    *walk_subtrees = 0;

  return NULL_TREE;
}

/* Moves the references to local variables in STMT at *GSI out of the single
   entry single exit region starting at ENTRY.  DECL_ADDRESS contains
   addresses of the references that had their address taken
   already.  */

static void
eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
				int_tree_htab_type *decl_address)
{
  struct elv_data dta;
  gimple *stmt = gsi_stmt (*gsi);

  memset (&dta.info, '\0', sizeof (dta.info));
  dta.entry = entry;
  dta.decl_address = decl_address;
  dta.changed = false;
  dta.reset = false;

  if (gimple_debug_bind_p (stmt))
    {
      dta.gsi = NULL;
      walk_tree (gimple_debug_bind_get_value_ptr (stmt),
		 eliminate_local_variables_1, &dta.info, NULL);
      if (dta.reset)
	{
	  gimple_debug_bind_reset_value (stmt);
	  dta.changed = true;
	}
    }
  else if (gimple_clobber_p (stmt))
    {
      unlink_stmt_vdef (stmt);
      stmt = gimple_build_nop ();
      gsi_replace (gsi, stmt, false);
      dta.changed = true;
    }
  else
    {
      dta.gsi = gsi;
      walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
    }

  if (dta.changed)
    update_stmt (stmt);
}

/* Eliminates the references to local variables from the single entry
   single exit region between the ENTRY and EXIT edges.

   This includes:
   1) Taking address of a local variable -- these are moved out of the
      region (and temporary variable is created to hold the address if
      necessary).

   2) Dereferencing a local variable -- these are replaced with indirect
      references.  */
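
/* As an illustration (simplified; "x_addr" is a made-up name for the
   SSA name that holds &x and is computed once on the region's entry
   edge):

     x = i;       becomes      *x_addr = i;
     p = &x;      becomes      p = x_addr;

   inside the region.  */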

static void
eliminate_local_variables (edge entry, edge exit)
{
  basic_block bb;
  auto_vec<basic_block, 3> body;
  unsigned i;
  gimple_stmt_iterator gsi;
  bool has_debug_stmt = false;
  int_tree_htab_type decl_address (10);
  basic_block entry_bb = entry->src;
  basic_block exit_bb = exit->dest;

  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  FOR_EACH_VEC_ELT (body, i, bb)
    if (bb != entry_bb && bb != exit_bb)
      {
	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	  if (is_gimple_debug (gsi_stmt (gsi)))
	    {
	      if (gimple_debug_bind_p (gsi_stmt (gsi)))
		has_debug_stmt = true;
	    }
	  else
	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
      }

  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	  if (gimple_debug_bind_p (gsi_stmt (gsi)))
	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
}

/* Returns true if expression EXPR is not defined between ENTRY and
   EXIT, i.e. if all its operands are defined outside of the region.  */

static bool
expr_invariant_in_region_p (edge entry, edge exit, tree expr)
{
  basic_block entry_bb = entry->src;
  basic_block exit_bb = exit->dest;
  basic_block def_bb;

  if (is_gimple_min_invariant (expr))
    return true;

  if (TREE_CODE (expr) == SSA_NAME)
    {
      def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
      if (def_bb
	  && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
	  && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
	return false;

      return true;
    }

  return false;
}

/* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
   The copies are stored to NAME_COPIES, if NAME was already duplicated,
   its duplicate stored in NAME_COPIES is returned.

   Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
   duplicated, storing the copies in DECL_COPIES.  */

static tree
separate_decls_in_region_name (tree name, name_to_copy_table_type *name_copies,
			       int_tree_htab_type *decl_copies,
			       bool copy_name_p)
{
  tree copy, var, var_copy;
  unsigned idx, uid, nuid;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt, *nelt;
  name_to_copy_elt **slot;
  int_tree_map *dslot;

  if (TREE_CODE (name) != SSA_NAME)
    return name;

  idx = SSA_NAME_VERSION (name);
  elt.version = idx;
  slot = name_copies->find_slot_with_hash (&elt, idx,
					   copy_name_p ? INSERT : NO_INSERT);
  if (slot && *slot)
    return (*slot)->new_name;

  if (copy_name_p)
    {
      copy = duplicate_ssa_name (name, NULL);
      nelt = XNEW (struct name_to_copy_elt);
      nelt->version = idx;
      nelt->new_name = copy;
      nelt->field = NULL_TREE;
      *slot = nelt;
    }
  else
    {
      gcc_assert (!slot);
      copy = name;
    }

  var = SSA_NAME_VAR (name);
  if (!var)
    return copy;

  uid = DECL_UID (var);
  ielt.uid = uid;
  dslot = decl_copies->find_slot_with_hash (ielt, uid, INSERT);
  if (!dslot->to)
    {
      var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
      DECL_NOT_GIMPLE_REG_P (var_copy) = DECL_NOT_GIMPLE_REG_P (var);
      dslot->uid = uid;
      dslot->to = var_copy;

      /* Ensure that when we meet this decl next time, we won't duplicate
	 it again.  */
      nuid = DECL_UID (var_copy);
      ielt.uid = nuid;
      dslot = decl_copies->find_slot_with_hash (ielt, nuid, INSERT);
      gcc_assert (!dslot->to);
      dslot->uid = nuid;
      dslot->to = var_copy;
    }
  else
    var_copy = dslot->to;

  replace_ssa_name_symbol (copy, var_copy);
  return copy;
}

/* Finds the ssa names used in STMT that are defined outside the
   region between ENTRY and EXIT and replaces such ssa names with
   their duplicates.  The duplicates are stored to NAME_COPIES.  Base
   decls of all ssa names used in STMT (including those defined in
   LOOP) are replaced with the new temporary variables; the
   replacement decls are stored in DECL_COPIES.  */

static void
separate_decls_in_region_stmt (edge entry, edge exit, gimple *stmt,
			       name_to_copy_table_type *name_copies,
			       int_tree_htab_type *decl_copies)
{
  use_operand_p use;
  def_operand_p def;
  ssa_op_iter oi;
  tree name, copy;
  bool copy_name_p;

  FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
    {
      name = DEF_FROM_PTR (def);
      gcc_assert (TREE_CODE (name) == SSA_NAME);
      copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					    false);
      gcc_assert (copy == name);
    }

  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
    {
      name = USE_FROM_PTR (use);
      if (TREE_CODE (name) != SSA_NAME)
	continue;

      copy_name_p = expr_invariant_in_region_p (entry, exit, name);
      copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					    copy_name_p);
      SET_USE (use, copy);
    }
}

/* Finds the ssa names used in the debug statement STMT and replaces
   them with their duplicates from NAME_COPIES; the variable the
   statement binds is likewise replaced with its duplicate from
   DECL_COPIES.  Returns true if the caller should instead discard
   STMT, because the decl it refers to has not been duplicated.  */

static bool
separate_decls_in_region_debug (gimple *stmt,
				name_to_copy_table_type *name_copies,
				int_tree_htab_type *decl_copies)
{
  use_operand_p use;
  ssa_op_iter oi;
  tree var, name;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt;
  name_to_copy_elt **slot;
  int_tree_map *dslot;

  if (gimple_debug_bind_p (stmt))
    var = gimple_debug_bind_get_var (stmt);
  else if (gimple_debug_source_bind_p (stmt))
    var = gimple_debug_source_bind_get_var (stmt);
  else
    return true;
  if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
    return true;
  gcc_assert (DECL_P (var) && SSA_VAR_P (var));
  ielt.uid = DECL_UID (var);
  dslot = decl_copies->find_slot_with_hash (ielt, ielt.uid, NO_INSERT);
  if (!dslot)
    return true;
  if (gimple_debug_bind_p (stmt))
    gimple_debug_bind_set_var (stmt, dslot->to);
  else if (gimple_debug_source_bind_p (stmt))
    gimple_debug_source_bind_set_var (stmt, dslot->to);

  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
    {
      name = USE_FROM_PTR (use);
      if (TREE_CODE (name) != SSA_NAME)
	continue;

      elt.version = SSA_NAME_VERSION (name);
      slot = name_copies->find_slot_with_hash (&elt, elt.version, NO_INSERT);
      if (!slot)
	{
	  gimple_debug_bind_reset_value (stmt);
	  update_stmt (stmt);
	  break;
	}

      SET_USE (use, (*slot)->new_name);
    }

  return false;
}

/* Callback for htab_traverse.  Adds a field corresponding to the reduction
   specified in SLOT.  The type is passed in DATA.  */

int
add_field_for_reduction (reduction_info **slot, tree type)
{
  struct reduction_info *const red = *slot;
  tree var = reduc_stmt_res (red->reduc_stmt);
  tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
			   SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));

  insert_field_into_struct (type, field);

  red->field = field;

  return 1;
}

/* Callback for htab_traverse.  Adds a field corresponding to a ssa name
   described in SLOT.  The type is passed in DATA.  */

int
add_field_for_name (name_to_copy_elt **slot, tree type)
{
  struct name_to_copy_elt *const elt = *slot;
  tree name = ssa_name (elt->version);
  tree field = build_decl (UNKNOWN_LOCATION,
			   FIELD_DECL, SSA_NAME_IDENTIFIER (name),
			   TREE_TYPE (name));

  insert_field_into_struct (type, field);
  elt->field = field;

  return 1;
}

/* Callback for htab_traverse.  A local result is the intermediate result
   computed by a single thread, or the initial value in case no iteration
   was executed.  This function creates a phi node reflecting these values.
   The phi's result will be stored in the NEW_PHI field of the
   reduction's data structure.  */

int
create_phi_for_local_result (reduction_info **slot, class loop *loop)
{
  struct reduction_info *const reduc = *slot;
  edge e;
  gphi *new_phi;
  basic_block store_bb, continue_bb;
  tree local_res;
  location_t locus;

  /* STORE_BB is the block where the phi
     should be stored.  It is the destination of the loop exit.
     (Find the fallthru edge from GIMPLE_OMP_CONTINUE).  */
  continue_bb = single_pred (loop->latch);
  store_bb = FALLTHRU_EDGE (continue_bb)->dest;

  /* STORE_BB has two predecessors.  One coming from the loop
     (the reduction's result is computed at the loop),
     and another coming from a block preceding the loop,
     when no iterations
     are executed (the initial value should be taken).  */
  if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (continue_bb))
    e = EDGE_PRED (store_bb, 1);
  else
    e = EDGE_PRED (store_bb, 0);
  tree lhs = reduc_stmt_res (reduc->reduc_stmt);
  local_res = copy_ssa_name (lhs);
  locus = gimple_location (reduc->reduc_stmt);
  new_phi = create_phi_node (local_res, store_bb);
  add_phi_arg (new_phi, reduc->init, e, locus);
  add_phi_arg (new_phi, lhs, FALLTHRU_EDGE (continue_bb), locus);
  reduc->new_phi = new_phi;

  return 1;
}
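
/* Describes the shared data communicated between the threads: STORE is
   the structure holding it and LOAD a pointer to that structure (cf.
   the .paral_data_store/.paral_data_load example in the file header
   comment); STORE_BB and LOAD_BB are the blocks where values are
   stored to and loaded from it.  */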
1779struct clsn_data
1780{
1781 tree store;
1782 tree load;
1783
1784 basic_block store_bb;
1785 basic_block load_bb;
1786};
1787
1788/* Callback for htab_traverse. Create an atomic instruction for the
1789 reduction described in SLOT.
1790 DATA annotates the place in memory the atomic operation relates to,
1791 and the basic block it needs to be generated in. */
1792
int
create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data)
{
  struct reduction_info *const reduc = *slot;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
  tree load_struct;
  basic_block bb;
  basic_block new_bb;
  edge e;
  tree t, addr, ref, x;
  tree tmp_load, name;
  gimple *load;

  if (reduc->reduc_addr == NULL_TREE)
    {
      load_struct = build_simple_mem_ref (clsn_data->load);
      t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);

      addr = build_addr (t);
    }
  else
    {
      /* Set the address for the atomic store.  */
      addr = reduc->reduc_addr;

      /* Remove the non-atomic store '*addr = sum'.  */
      tree res = PHI_RESULT (reduc->keep_res);
      use_operand_p use_p;
      gimple *stmt;
      bool single_use_p = single_imm_use (res, &use_p, &stmt);
      gcc_assert (single_use_p);
      replace_uses_by (gimple_vdef (stmt), gimple_vuse (stmt));
      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
      gsi_remove (&gsi, true);
    }

  /* Create phi node.  */
  bb = clsn_data->load_bb;

  gsi = gsi_last_bb (bb);
  e = split_block (bb, gsi_stmt (gsi));
  new_bb = e->dest;

  tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)));
  tmp_load = make_ssa_name (tmp_load);
  load = gimple_build_omp_atomic_load (tmp_load, addr,
				       OMP_MEMORY_ORDER_RELAXED);
  SSA_NAME_DEF_STMT (tmp_load) = load;
  gsi = gsi_start_bb (new_bb);
  gsi_insert_after (&gsi, load, GSI_NEW_STMT);

  e = split_block (new_bb, load);
  new_bb = e->dest;
  gsi = gsi_start_bb (new_bb);
  ref = tmp_load;
  x = fold_build2 (reduc->reduction_code,
		   TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
		   PHI_RESULT (reduc->new_phi));

  name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
				   GSI_CONTINUE_LINKING);

  gimple *store = gimple_build_omp_atomic_store (name,
						 OMP_MEMORY_ORDER_RELAXED);
  gsi_insert_after (&gsi, store, GSI_NEW_STMT);
  return 1;
}

/* Create the atomic operation at the join point of the threads.
   REDUCTION_LIST describes the reductions in the LOOP.
   LD_ST_DATA describes the shared data structure where
   shared data is stored in and loaded from.  */

static void
create_call_for_reduction (class loop *loop,
			   reduction_info_table_type *reduction_list,
			   struct clsn_data *ld_st_data)
{
  reduction_list->traverse <class loop *, create_phi_for_local_result> (loop);
  /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
  basic_block continue_bb = single_pred (loop->latch);
  ld_st_data->load_bb = FALLTHRU_EDGE (continue_bb)->dest;
  reduction_list
    ->traverse <struct clsn_data *, create_call_for_reduction_1> (ld_st_data);
}

/* Callback for htab_traverse.  Loads the final reduction value at the
   join point of all threads, and inserts it in the right place.  */

int
create_loads_for_reductions (reduction_info **slot, struct clsn_data *clsn_data)
{
  struct reduction_info *const red = *slot;
  gimple *stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
  tree load_struct;
  tree name;
  tree x;

  /* If there's no exit phi, the result of the reduction is unused.  */
  if (red->keep_res == NULL)
    return 1;

  gsi = gsi_after_labels (clsn_data->load_bb);
  load_struct = build_simple_mem_ref (clsn_data->load);
  load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
			NULL_TREE);

  x = load_struct;
  name = PHI_RESULT (red->keep_res);
  stmt = gimple_build_assign (name, x);

  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
       !gsi_end_p (gsi); gsi_next (&gsi))
    if (gsi_stmt (gsi) == red->keep_res)
      {
	remove_phi_node (&gsi, false);
	return 1;
      }
  gcc_unreachable ();
}

/* Load the reduction result that was stored in LD_ST_DATA.
   REDUCTION_LIST describes the list of reductions that the
   loads should be generated for.  */

static void
create_final_loads_for_reduction (reduction_info_table_type *reduction_list,
				  struct clsn_data *ld_st_data)
{
  gimple_stmt_iterator gsi;
  tree t;
  gimple *stmt;

  gsi = gsi_after_labels (ld_st_data->load_bb);
  t = build_fold_addr_expr (ld_st_data->store);
  stmt = gimple_build_assign (ld_st_data->load, t);

  gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);

  reduction_list
    ->traverse <struct clsn_data *, create_loads_for_reductions> (ld_st_data);
}

/* Callback for htab_traverse.  Store the neutral value for the
   particular reduction's operation, e.g. 0 for PLUS_EXPR,
   1 for MULT_EXPR, etc. into the reduction field.
   The reduction is specified in SLOT.  The store information is
   passed in DATA.  */

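/* E.g. for an int sum reduction this emits, in STORE_BB (a sketch):

     .paral_data_store.sum = 0;  */
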
int
create_stores_for_reduction (reduction_info **slot, struct clsn_data *clsn_data)
{
  struct reduction_info *const red = *slot;
  tree t;
  gimple *stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));

  gsi = gsi_last_bb (clsn_data->store_bb);
  t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
  stmt = gimple_build_assign (t, red->initial_value);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  return 1;
}

/* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
   store to a field of STORE in STORE_BB for the ssa name and its duplicate
   specified in SLOT.  */

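/* For an ssa name sum_1 with duplicate sum_1' this emits, in STORE_BB and
   LOAD_BB respectively (a sketch; the names are illustrative):

     .paral_data_store.sum_1 = sum_1;
     sum_1' = .paral_data_load->sum_1;  */
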
int
create_loads_and_stores_for_name (name_to_copy_elt **slot,
				  struct clsn_data *clsn_data)
{
  struct name_to_copy_elt *const elt = *slot;
  tree t;
  gimple *stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (elt->new_name);
  tree load_struct;

  gsi = gsi_last_bb (clsn_data->store_bb);
  t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
  stmt = gimple_build_assign (t, ssa_name (elt->version));
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  gsi = gsi_last_bb (clsn_data->load_bb);
  load_struct = build_simple_mem_ref (clsn_data->load);
  t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
  stmt = gimple_build_assign (elt->new_name, t);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  return 1;
}

/* Moves all the variables used in LOOP and defined outside of it (including
   the initial values of loop phi nodes, and *PER_THREAD if it is an ssa
   name) to a structure created for this purpose.  The code

     while (1)
       {
	 use (a);
	 use (b);
       }

   is transformed this way:

     bb0:
     old.a = a;
     old.b = b;

     bb1:
     a' = new->a;
     b' = new->b;
     while (1)
       {
	 use (a');
	 use (b');
       }

   `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
   pointer `new' is intentionally not initialized (the loop will be split to a
   separate function later, and `new' will be initialized from its arguments).
   LD_ST_DATA holds information about the shared data structure used to pass
   information among the threads.  It is initialized here, and
   gen_parallel_loop will pass it to create_call_for_reduction that
   needs this information.  REDUCTION_LIST describes the reductions
   in LOOP.  */

static void
separate_decls_in_region (edge entry, edge exit,
			  reduction_info_table_type *reduction_list,
			  tree *arg_struct, tree *new_arg_struct,
			  struct clsn_data *ld_st_data)
{
  basic_block bb1 = split_edge (entry);
  basic_block bb0 = single_pred (bb1);
  name_to_copy_table_type name_copies (10);
  int_tree_htab_type decl_copies (10);
  unsigned i;
  tree type, type_name, nvar;
  gimple_stmt_iterator gsi;
  struct clsn_data clsn_data;
  auto_vec<basic_block, 3> body;
  basic_block bb;
  basic_block entry_bb = bb1;
  basic_block exit_bb = exit->dest;
  bool has_debug_stmt = false;

  entry = single_succ_edge (entry_bb);
  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  FOR_EACH_VEC_ELT (body, i, bb)
    {
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
					   &name_copies, &decl_copies);

	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		has_debug_stmt = true;
	      else
		separate_decls_in_region_stmt (entry, exit, stmt,
					       &name_copies, &decl_copies);
	    }
	}
    }

  /* Now process debug bind stmts.  We must not create decls while
     processing debug stmts, so we defer their processing so as to
     make sure we will have debug info for as many variables as
     possible (all of those that were dealt with in the loop above),
     and discard those for which we know there's nothing we can
     do.  */
  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
	    {
	      gimple *stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		{
		  if (separate_decls_in_region_debug (stmt, &name_copies,
						      &decl_copies))
		    {
		      gsi_remove (&gsi, true);
		      continue;
		    }
		}

	      gsi_next (&gsi);
	    }
	}

  if (name_copies.is_empty () && reduction_list->is_empty ())
    {
      /* It may happen that there is nothing to copy (if there are only
	 loop carried and external variables in the loop).  */
      *arg_struct = NULL;
      *new_arg_struct = NULL;
    }
  else
    {
      /* Create the type for the structure to store the ssa names to.  */
      type = lang_hooks.types.make_type (RECORD_TYPE);
      type_name = build_decl (UNKNOWN_LOCATION,
			      TYPE_DECL, create_tmp_var_name (".paral_data"),
			      type);
      TYPE_NAME (type) = type_name;

      name_copies.traverse <tree, add_field_for_name> (type);
      if (reduction_list && !reduction_list->is_empty ())
	{
	  /* Create the fields for reductions.  */
	  reduction_list->traverse <tree, add_field_for_reduction> (type);
	}
      layout_type (type);

      /* Create the loads and stores.  */
      *arg_struct = create_tmp_var (type, ".paral_data_store");
      nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
      *new_arg_struct = make_ssa_name (nvar);

      ld_st_data->store = *arg_struct;
      ld_st_data->load = *new_arg_struct;
      ld_st_data->store_bb = bb0;
      ld_st_data->load_bb = bb1;

      name_copies
	.traverse <struct clsn_data *, create_loads_and_stores_for_name>
	(ld_st_data);

      /* Load the calculation from memory (after the join of the threads).  */

      if (reduction_list && !reduction_list->is_empty ())
	{
	  reduction_list
	    ->traverse <struct clsn_data *, create_stores_for_reduction>
	    (ld_st_data);
	  clsn_data.load = make_ssa_name (nvar);
	  clsn_data.load_bb = exit->dest;
	  clsn_data.store = ld_st_data->store;
	  create_final_loads_for_reduction (reduction_list, &clsn_data);
	}
    }
}

/* Returns true if FNDECL was created to run in parallel.  */

bool
parallelized_function_p (tree fndecl)
{
  cgraph_node *node = cgraph_node::get (fndecl);
  gcc_assert (node != NULL);
  return node->parallelized_function;
}

/* Creates and returns an empty function that will receive the body of
   a parallelized loop.  */

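/* Schematically the result is a declaration like (the name is mangled
   further by ASM_FORMAT_PRIVATE_NAME and is target-dependent; shown here
   only as an illustration):

     static void foo.$loopfn.0 (void *.paral_data_param);  */
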
static tree
create_loop_fn (location_t loc)
{
  char buf[100];
  char *tname;
  tree decl, type, name, t;
  struct function *act_cfun = cfun;
  static unsigned loopfn_num;

  loc = LOCATION_LOCUS (loc);
  snprintf (buf, 100, "%s.$loopfn", current_function_name ());
  ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
  clean_symbol_name (tname);
  name = get_identifier (tname);
  type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);

  decl = build_decl (loc, FUNCTION_DECL, name, type);
  TREE_STATIC (decl) = 1;
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 0;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;
  DECL_EXTERNAL (decl) = 0;
  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl;

  t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
		  ptr_type_node);
  DECL_ARTIFICIAL (t) = 1;
  DECL_ARG_TYPE (t) = ptr_type_node;
  DECL_CONTEXT (t) = decl;
  TREE_USED (t) = 1;
  DECL_ARGUMENTS (decl) = t;
  DECL_FUNCTION_SPECIFIC_TARGET (decl)
    = DECL_FUNCTION_SPECIFIC_TARGET (act_cfun->decl);
  DECL_FUNCTION_SPECIFIC_OPTIMIZATION (decl)
    = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (act_cfun->decl);

  allocate_struct_function (decl, false);

  /* The call to allocate_struct_function clobbers CFUN, so we need to restore
     it.  */
  set_cfun (act_cfun);

  return decl;
}

/* Replace uses of NAME by VAL in block BB.  */

static void
replace_uses_in_bb_by (tree name, tree val, basic_block bb)
{
  gimple *use_stmt;
  imm_use_iterator imm_iter;

  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
    {
      if (gimple_bb (use_stmt) != bb)
	continue;

      use_operand_p use_p;
      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	SET_USE (use_p, val);
    }
}

/* Do transformation from:

     <bb preheader>:
     ...
     goto <bb header>

     <bb header>:
     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
     sum_a = PHI <sum_init (preheader), sum_b (latch)>
     ...
     use (ivtmp_a)
     ...
     sum_b = sum_a + sum_update
     ...
     if (ivtmp_a < n)
       goto <bb latch>;
     else
       goto <bb exit>;

     <bb latch>:
     ivtmp_b = ivtmp_a + 1;
     goto <bb header>

     <bb exit>:
     sum_z = PHI <sum_b (cond[1]), ...>

   [1] Where <bb cond> is single_pred (bb latch); In the simplest case,
       that's <bb header>.

   to:

     <bb preheader>:
     ...
     goto <bb newheader>

     <bb header>:
     ivtmp_a = PHI <ivtmp_c (latch)>
     sum_a = PHI <sum_c (latch)>
     ...
     use (ivtmp_a)
     ...
     sum_b = sum_a + sum_update
     ...
     goto <bb latch>;

     <bb newheader>:
     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
     sum_c = PHI <sum_init (preheader), sum_b (latch)>
     if (ivtmp_c < n + 1)
       goto <bb header>;
     else
       goto <bb newexit>;

     <bb latch>:
     ivtmp_b = ivtmp_a + 1;
     goto <bb newheader>

     <bb newexit>:
     sum_y = PHI <sum_c (newheader)>

     <bb exit>:
     sum_z = PHI <sum_y (newexit), ...>

   In unified diff format:

     <bb preheader>:
     ...
-    goto <bb header>
+    goto <bb newheader>

     <bb header>:
-    ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
-    sum_a = PHI <sum_init (preheader), sum_b (latch)>
+    ivtmp_a = PHI <ivtmp_c (latch)>
+    sum_a = PHI <sum_c (latch)>
     ...
     use (ivtmp_a)
     ...
     sum_b = sum_a + sum_update
     ...
-    if (ivtmp_a < n)
-      goto <bb latch>;
+    goto <bb latch>;
+
+    <bb newheader>:
+    ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
+    sum_c = PHI <sum_init (preheader), sum_b (latch)>
+    if (ivtmp_c < n + 1)
+      goto <bb header>;
     else
       goto <bb exit>;

     <bb latch>:
     ivtmp_b = ivtmp_a + 1;
-    goto <bb header>
+    goto <bb newheader>

+    <bb newexit>:
+    sum_y = PHI <sum_c (newheader)>

     <bb exit>:
-    sum_z = PHI <sum_b (cond[1]), ...>
+    sum_z = PHI <sum_y (newexit), ...>

   Note: the example does not show any virtual phis, but these are handled
   more or less as reductions.


   Moves the exit condition of LOOP to the beginning of its header.
   REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
   bound.  */

static void
transform_to_exit_first_loop_alt (class loop *loop,
				  reduction_info_table_type *reduction_list,
				  tree bound)
{
  basic_block header = loop->header;
  basic_block latch = loop->latch;
  edge exit = single_dom_exit (loop);
  basic_block exit_block = exit->dest;
  gcond *cond_stmt = as_a <gcond *> (*gsi_last_bb (exit->src));
  tree control = gimple_cond_lhs (cond_stmt);
  edge e;

  /* Create the new_header block.  */
  basic_block new_header = split_block_before_cond_jump (exit->src);
  edge edge_at_split = single_pred_edge (new_header);

  /* Redirect entry edge to new_header.  */
  edge entry = loop_preheader_edge (loop);
  e = redirect_edge_and_branch (entry, new_header);
  gcc_assert (e == entry);

  /* Redirect post_inc_edge to new_header.  */
  edge post_inc_edge = single_succ_edge (latch);
  e = redirect_edge_and_branch (post_inc_edge, new_header);
  gcc_assert (e == post_inc_edge);

  /* Redirect post_cond_edge to header.  */
  edge post_cond_edge = single_pred_edge (latch);
  e = redirect_edge_and_branch (post_cond_edge, header);
  gcc_assert (e == post_cond_edge);

  /* Redirect edge_at_split to latch.  */
  e = redirect_edge_and_branch (edge_at_split, latch);
  gcc_assert (e == edge_at_split);

  /* Set the new loop bound.  */
  gimple_cond_set_rhs (cond_stmt, bound);
  update_stmt (cond_stmt);

  /* Repair the ssa.  */
  vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
  edge_var_map *vm;
  gphi_iterator gsi;
  int i;
  for (gsi = gsi_start_phis (header), i = 0;
       !gsi_end_p (gsi) && v->iterate (i, &vm);
       gsi_next (&gsi), i++)
    {
      gphi *phi = gsi.phi ();
      tree res_a = PHI_RESULT (phi);

      /* Create new phi.  */
      tree res_c = copy_ssa_name (res_a, phi);
      gphi *nphi = create_phi_node (res_c, new_header);

      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
      replace_uses_in_bb_by (res_a, res_c, new_header);

      /* Replace ivtmp/sum_b with ivtmp/sum_c in header phi.  */
      add_phi_arg (phi, res_c, post_cond_edge, UNKNOWN_LOCATION);

      /* Replace sum_b with sum_c in exit phi.  */
      tree res_b = redirect_edge_var_map_def (vm);
      replace_uses_in_bb_by (res_b, res_c, exit_block);

      struct reduction_info *red = reduction_phi (reduction_list, phi);
      gcc_assert (virtual_operand_p (res_a)
		  || res_a == control
		  || red != NULL);

      if (red)
	{
	  /* Register the new reduction phi.  */
	  red->reduc_phi = nphi;
	  gimple_set_uid (red->reduc_phi, red->reduc_version);
	}
    }
  gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));

  /* Set the preheader argument of the new phis to ivtmp/sum_init.  */
  flush_pending_stmts (entry);

  /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
  flush_pending_stmts (post_inc_edge);

  basic_block new_exit_block = NULL;
  if (!single_pred_p (exit->dest))
    {
      /* Create a new empty exit block, in between the new loop header and
	 the old exit block.  The function separate_decls_in_region needs
	 this block to insert code that is active on loop exit, but not on
	 any other path.  */
      new_exit_block = split_edge (exit);
    }

  /* Insert and register the reduction exit phis.  */
  for (gphi_iterator gsi = gsi_start_phis (exit_block);
       !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      gphi *nphi = NULL;
      tree res_z = PHI_RESULT (phi);
      tree res_c;

      if (new_exit_block != NULL)
	{
	  /* Now that we have a new exit block, duplicate the phi of the old
	     exit block in the new exit block to preserve loop-closed ssa.  */
	  edge succ_new_exit_block = single_succ_edge (new_exit_block);
	  edge pred_new_exit_block = single_pred_edge (new_exit_block);
	  tree res_y = copy_ssa_name (res_z, phi);
	  nphi = create_phi_node (res_y, new_exit_block);
	  res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
	  add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
	  add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
	}
      else
	res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);

      if (virtual_operand_p (res_z))
	continue;

      gimple *reduc_phi = SSA_NAME_DEF_STMT (res_c);
      struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
      if (red != NULL)
	red->keep_res = (nphi != NULL ? nphi : phi);
    }

  /* We're going to cancel the loop at the end of gen_parallel_loop, but until
     then we're still using some fields, so only bother about fields that are
     still used: header and latch.
     The loop has a new header bb, so we update it.  The latch bb stays the
     same.  */
  loop->header = new_header;

  /* Recalculate dominance info.  */
  free_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_DOMINATORS);
}

/* Tries to move the exit condition of LOOP to the beginning of its header
   without duplicating the loop body.  NIT is the number of iterations of
   the loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true
   if the transformation is successful.  */

static bool
try_transform_to_exit_first_loop_alt (class loop *loop,
				      reduction_info_table_type *reduction_list,
				      tree nit)
{
  /* Check whether the latch contains a single statement.  */
  if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
    return false;

  /* Check whether the latch contains no phis.  */
  if (phi_nodes (loop->latch) != NULL)
    return false;

  /* Check whether the latch contains the loop iv increment.  */
  edge back = single_succ_edge (loop->latch);
  edge exit = single_dom_exit (loop);
  gcond *cond_stmt = as_a <gcond *> (*gsi_last_bb (exit->src));
  tree control = gimple_cond_lhs (cond_stmt);
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
  tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
  if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
    return false;

  /* Check whether there's no code between the loop condition and the
     latch.  */
  if (!single_pred_p (loop->latch)
      || single_pred (loop->latch) != exit->src)
    return false;

  tree alt_bound = NULL_TREE;
  tree nit_type = TREE_TYPE (nit);

  /* Figure out whether nit + 1 overflows.  */
  if (poly_int_tree_p (nit))
    {
      if (!tree_int_cst_equal (nit, TYPE_MAX_VALUE (nit_type)))
	{
	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
				       nit, build_one_cst (nit_type));

	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST
		      || TREE_CODE (alt_bound) == POLY_INT_CST);
	  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
	  return true;
	}
      else
	{
	  /* TODO: Figure out whether we can trigger this, whether it is
	     worth handling optimally, and whether we can handle it
	     optimally.  */
	  return false;
	}
    }

  gcc_assert (TREE_CODE (nit) == SSA_NAME);

  /* Variable nit is the loop bound as returned by canonicalize_loop_ivs, for
     an iv with base 0 and step 1 that is incremented in the latch, like this:

       <bb header>:
       # iv_1 = PHI <0 (preheader), iv_2 (latch)>
       ...
       if (iv_1 < nit)
	 goto <bb latch>;
       else
	 goto <bb exit>;

       <bb latch>:
       iv_2 = iv_1 + 1;
       goto <bb header>;

     The range of iv_1 is [0, nit].  The latch edge is taken for
     iv_1 == [0, nit - 1] and the exit edge is taken for iv_1 == nit.  So the
     number of latch executions is equal to nit.

     The function max_loop_iterations gives us the maximum number of latch
     executions, so it gives us the maximum value of nit.  */
  widest_int nit_max;
  if (!max_loop_iterations (loop, &nit_max))
    return false;

  /* Check if nit + 1 overflows.  */
  widest_int type_max = wi::to_widest (TYPE_MAX_VALUE (nit_type));
  if (nit_max >= type_max)
    return false;

  gimple *def = SSA_NAME_DEF_STMT (nit);

  /* Try to find nit + 1, in the form of n in an assignment nit = n - 1.  */
  if (def
      && is_gimple_assign (def)
      && gimple_assign_rhs_code (def) == PLUS_EXPR)
    {
      tree op1 = gimple_assign_rhs1 (def);
      tree op2 = gimple_assign_rhs2 (def);
      if (integer_minus_onep (op1))
	alt_bound = op2;
      else if (integer_minus_onep (op2))
	alt_bound = op1;
    }

  /* If not found, insert nit + 1.  */
  if (alt_bound == NULL_TREE)
    {
      alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
			       build_int_cst_type (nit_type, 1));

      gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

      alt_bound
	= force_gimple_operand_gsi (&gsi, alt_bound, true, NULL_TREE, false,
				    GSI_CONTINUE_LINKING);
    }

  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
  return true;
}

/* Moves the exit condition of LOOP to the beginning of its header.  NIT is
   the number of iterations of the loop.  REDUCTION_LIST describes the
   reductions in LOOP.  */

static void
transform_to_exit_first_loop (class loop *loop,
			      reduction_info_table_type *reduction_list,
			      tree nit)
{
  basic_block *bbs, *nbbs, ex_bb, orig_header;
  unsigned n;
  bool ok;
  edge exit = single_dom_exit (loop), hpred;
  tree control, control_name, res, t;
  gphi *phi, *nphi;
  gassign *stmt;
  gcond *cond_stmt, *cond_nit;
  tree nit_1;

  split_block_after_labels (loop->header);
  orig_header = single_succ (loop->header);
  hpred = single_succ_edge (loop->header);

  cond_stmt = as_a <gcond *> (*gsi_last_bb (exit->src));
  control = gimple_cond_lhs (cond_stmt);
  gcc_assert (gimple_cond_rhs (cond_stmt) == nit);

  /* Make sure that we have phi nodes on exit for all loop header phis
     (create_parallel_loop requires that).  */
  for (gphi_iterator gsi = gsi_start_phis (loop->header);
       !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      phi = gsi.phi ();
      res = PHI_RESULT (phi);
      t = copy_ssa_name (res, phi);
      SET_PHI_RESULT (phi, t);
      nphi = create_phi_node (res, orig_header);
      add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);

      if (res == control)
	{
	  gimple_cond_set_lhs (cond_stmt, t);
	  update_stmt (cond_stmt);
	  control = t;
	}
    }

  bbs = get_loop_body_in_dom_order (loop);

  for (n = 0; bbs[n] != exit->src; n++)
    continue;
  nbbs = XNEWVEC (basic_block, n);
  ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
				   bbs + 1, n, nbbs);
  gcc_assert (ok);
  free (bbs);
  ex_bb = nbbs[0];
  free (nbbs);

  /* Other than reductions, the only gimple reg that should be copied
     out of the loop is the control variable.  */
  exit = single_dom_exit (loop);
  control_name = NULL_TREE;
  for (gphi_iterator gsi = gsi_start_phis (ex_bb);
       !gsi_end_p (gsi); )
    {
      phi = gsi.phi ();
      res = PHI_RESULT (phi);
      if (virtual_operand_p (res))
	{
	  gsi_next (&gsi);
	  continue;
	}

      /* Check if it is a part of reduction.  If it is,
	 keep the phi at the reduction's keep_res field.  The
	 PHI_RESULT of this phi is the resulting value of the reduction
	 variable when exiting the loop.  */

      if (!reduction_list->is_empty ())
	{
	  struct reduction_info *red;

	  tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
	  red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
	  if (red)
	    {
	      red->keep_res = phi;
	      gsi_next (&gsi);
	      continue;
	    }
	}
      gcc_assert (control_name == NULL_TREE
		  && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
      control_name = res;
      remove_phi_node (&gsi, false);
    }
  gcc_assert (control_name != NULL_TREE);

  /* Initialize the control variable to number of iterations
     according to the rhs of the exit condition.  */
  gimple_stmt_iterator gsi = gsi_after_labels (ex_bb);
  cond_nit = as_a <gcond *> (*gsi_last_bb (exit->src));
  nit_1 = gimple_cond_rhs (cond_nit);
  nit_1 = force_gimple_operand_gsi (&gsi,
				    fold_convert (TREE_TYPE (control_name),
						  nit_1),
				    false, NULL_TREE, false, GSI_SAME_STMT);
  stmt = gimple_build_assign (control_name, nit_1);
  gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
}

/* Create the parallel constructs for LOOP as described in gen_parallel_loop.
   LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
   NEW_DATA is the variable that should be initialized from the argument
   of LOOP_FN.  N_THREADS is the requested number of threads, which can be 0
   if that number is to be determined later.  */

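/* Schematically, for the non-OpenACC case the emitted skeleton is (a
   sketch; the real statements are built with the gimple_build_omp_* API):

     GIMPLE_OMP_PARALLEL (num_threads (N_THREADS)) [LOOP_FN, DATA]
       GIMPLE_OMP_FOR (i = init; i < bound; i += 1) (schedule (...))
	 loop body
       GIMPLE_OMP_CONTINUE
       GIMPLE_OMP_RETURN   -- for the GIMPLE_OMP_FOR
     GIMPLE_OMP_RETURN     -- for the GIMPLE_OMP_PARALLEL  */
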
static void
create_parallel_loop (class loop *loop, tree loop_fn, tree data,
		      tree new_data, unsigned n_threads, location_t loc,
		      bool oacc_kernels_p)
{
  gimple_stmt_iterator gsi;
  basic_block for_bb, ex_bb, continue_bb;
  tree t, param;
  gomp_parallel *omp_par_stmt;
  gimple *omp_return_stmt1, *omp_return_stmt2;
  gimple *phi;
  gcond *cond_stmt;
  gomp_for *for_stmt;
  gomp_continue *omp_cont_stmt;
  tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
  edge exit, nexit, guard, end, e;

  if (oacc_kernels_p)
    {
      gcc_checking_assert (lookup_attribute ("oacc kernels",
					     DECL_ATTRIBUTES (cfun->decl)));
      /* Indicate to later processing that this is a parallelized OpenACC
	 kernels construct.  */
      DECL_ATTRIBUTES (cfun->decl)
	= tree_cons (get_identifier ("oacc kernels parallelized"),
		     NULL_TREE, DECL_ATTRIBUTES (cfun->decl));
    }
  else
    {
      /* Prepare the GIMPLE_OMP_PARALLEL statement.  */

      basic_block bb = loop_preheader_edge (loop)->src;
      basic_block paral_bb = single_pred (bb);
      gsi = gsi_last_bb (paral_bb);

      gcc_checking_assert (n_threads != 0);
      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
      OMP_CLAUSE_NUM_THREADS_EXPR (t)
	= build_int_cst (integer_type_node, n_threads);
      omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
      gimple_set_location (omp_par_stmt, loc);

      gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);

      /* Initialize NEW_DATA.  */
      if (data)
	{
	  gassign *assign_stmt;

	  gsi = gsi_after_labels (bb);

	  param = make_ssa_name (DECL_ARGUMENTS (loop_fn));
	  assign_stmt = gimple_build_assign (param,
					     build_fold_addr_expr (data));
	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);

	  assign_stmt = gimple_build_assign (new_data,
					     fold_convert (TREE_TYPE (new_data),
							   param));
	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
	}

      /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
      bb = split_loop_exit_edge (single_dom_exit (loop));
      gsi = gsi_last_bb (bb);
      omp_return_stmt1 = gimple_build_omp_return (false);
      gimple_set_location (omp_return_stmt1, loc);
      gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
    }

  /* Extract data for GIMPLE_OMP_FOR.  */
  gcc_assert (loop->header == single_dom_exit (loop)->src);
  cond_stmt = as_a <gcond *> (*gsi_last_bb (loop->header));

  cvar = gimple_cond_lhs (cond_stmt);
  cvar_base = SSA_NAME_VAR (cvar);
  phi = SSA_NAME_DEF_STMT (cvar);
  cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
  initvar = copy_ssa_name (cvar);
  SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
	   initvar);
  cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));

  gsi = gsi_last_nondebug_bb (loop->latch);
  gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
  gsi_remove (&gsi, true);

  /* Prepare cfg.  */
  for_bb = split_edge (loop_preheader_edge (loop));
  ex_bb = split_loop_exit_edge (single_dom_exit (loop));
  extract_true_false_edges_from_block (loop->header, &nexit, &exit);
  gcc_assert (exit == single_dom_exit (loop));

  guard = make_edge (for_bb, ex_bb, 0);
  /* FIXME: What is the probability?  */
  guard->probability = profile_probability::guessed_never ();
  /* Split the latch edge, so LOOPS_HAVE_SIMPLE_LATCHES is still valid.  */
  loop->latch = split_edge (single_succ_edge (loop->latch));
  single_pred_edge (loop->latch)->flags = 0;
  end = make_single_succ_edge (single_pred (loop->latch), ex_bb, EDGE_FALLTHRU);
  rescan_loop_exit (end, true, false);

  for (gphi_iterator gpi = gsi_start_phis (ex_bb);
       !gsi_end_p (gpi); gsi_next (&gpi))
    {
      location_t locus;
      gphi *phi = gpi.phi ();
      tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
      gimple *def_stmt = SSA_NAME_DEF_STMT (def);

      /* If the exit phi is not connected to a header phi in the same loop,
	 this value is not modified in the loop, and we're done with this
	 phi.  */
      if (!(gimple_code (def_stmt) == GIMPLE_PHI
	    && gimple_bb (def_stmt) == loop->header))
	{
	  locus = gimple_phi_arg_location_from_edge (phi, exit);
	  add_phi_arg (phi, def, guard, locus);
	  add_phi_arg (phi, def, end, locus);
	  continue;
	}

      gphi *stmt = as_a <gphi *> (def_stmt);
      def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
      locus = gimple_phi_arg_location_from_edge (stmt,
						 loop_preheader_edge (loop));
      add_phi_arg (phi, def, guard, locus);

      def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
      locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
      add_phi_arg (phi, def, end, locus);
    }
  e = redirect_edge_and_branch (exit, nexit->dest);
  PENDING_STMT (e) = NULL;

  /* Emit GIMPLE_OMP_FOR.  */
  if (oacc_kernels_p)
    /* Parallelized OpenACC kernels constructs use gang parallelism.  See
       also omp-offload.cc:execute_oacc_loop_designation.  */
    t = build_omp_clause (loc, OMP_CLAUSE_GANG);
  else
    {
      t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
      int chunk_size = param_parloops_chunk_size;
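      /* Map --param parloops-schedule={static,dynamic,guided,auto,runtime}
	 to the corresponding OMP_CLAUSE_SCHEDULE_* kind; for auto and
	 runtime no chunk size is passed.  */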
      switch (param_parloops_schedule)
	{
	case PARLOOPS_SCHEDULE_STATIC:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
	  break;
	case PARLOOPS_SCHEDULE_DYNAMIC:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_DYNAMIC;
	  break;
	case PARLOOPS_SCHEDULE_GUIDED:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_GUIDED;
	  break;
	case PARLOOPS_SCHEDULE_AUTO:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_AUTO;
	  chunk_size = 0;
	  break;
	case PARLOOPS_SCHEDULE_RUNTIME:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_RUNTIME;
	  chunk_size = 0;
	  break;
	default:
	  gcc_unreachable ();
	}
      if (chunk_size != 0)
	OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t)
	  = build_int_cst (integer_type_node, chunk_size);
    }

  for_stmt = gimple_build_omp_for (NULL,
				   (oacc_kernels_p
				    ? GF_OMP_FOR_KIND_OACC_LOOP
				    : GF_OMP_FOR_KIND_FOR),
				   t, 1, NULL);

  gimple_cond_set_lhs (cond_stmt, cvar_base);
  type = TREE_TYPE (cvar);
  gimple_set_location (for_stmt, loc);
  gimple_omp_for_set_index (for_stmt, 0, initvar);
  gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
  gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
  gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
  gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
						cvar_base,
						build_int_cst (type, 1)));

  gsi = gsi_last_bb (for_bb);
  gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (initvar) = for_stmt;

  /* Emit GIMPLE_OMP_CONTINUE.  */
  continue_bb = single_pred (loop->latch);
  gsi = gsi_last_bb (continue_bb);
  omp_cont_stmt = gimple_build_omp_continue (cvar_next, cvar);
  gimple_set_location (omp_cont_stmt, loc);
  gsi_insert_after (&gsi, omp_cont_stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (cvar_next) = omp_cont_stmt;

  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
  gsi = gsi_last_bb (ex_bb);
  omp_return_stmt2 = gimple_build_omp_return (true);
  gimple_set_location (omp_return_stmt2, loc);
  gsi_insert_after (&gsi, omp_return_stmt2, GSI_NEW_STMT);

  /* After the above, dominance info is no longer valid.  Recompute it.  */
  free_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_DOMINATORS);
}

/* Return number of phis in BB.  If COUNT_VIRTUAL_P is false, don't count the
   virtual phi.  */

static unsigned int
num_phis (basic_block bb, bool count_virtual_p)
{
  unsigned int nr_phis = 0;
  gphi_iterator gsi;
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      if (!count_virtual_p && virtual_operand_p (PHI_RESULT (gsi.phi ())))
	continue;

      nr_phis++;
    }

  return nr_phis;
}

/* Generates code to execute the iterations of LOOP in N_THREADS threads in
   parallel; N_THREADS can be 0 if the number of threads is to be determined
   later.

   NITER describes the number of iterations of LOOP.
   REDUCTION_LIST describes the reductions existing in the LOOP.  */

static void
gen_parallel_loop (class loop *loop,
		   reduction_info_table_type *reduction_list,
		   unsigned n_threads, class tree_niter_desc *niter,
		   bool oacc_kernels_p)
{
  tree many_iterations_cond, type, nit;
  tree arg_struct, new_arg_struct;
  gimple_seq stmts;
  edge entry, exit;
  struct clsn_data clsn_data;
  location_t loc;
  gimple *cond_stmt;
  unsigned int m_p_thread = 2;

  /* From

     ---------------------------------------------------------------------
     loop
       {
	 IV = phi (INIT, IV + STEP)
	 BODY1;
	 if (COND)
	   break;
	 BODY2;
       }
     ---------------------------------------------------------------------

     with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
     we generate the following code:

     ---------------------------------------------------------------------

     if (MAY_BE_ZERO
	 || NITER < MIN_PER_THREAD * N_THREADS)
       goto original;

     BODY1;
     store all local loop-invariant variables used in body of the loop to DATA.
     GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
     load the variables from DATA.
     GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
     BODY2;
     BODY1;
     GIMPLE_OMP_CONTINUE;
     GIMPLE_OMP_RETURN   -- GIMPLE_OMP_FOR
     GIMPLE_OMP_RETURN   -- GIMPLE_OMP_PARALLEL
     goto end;

     original:
     loop
       {
	 IV = phi (INIT, IV + STEP)
	 BODY1;
	 if (COND)
	   break;
	 BODY2;
       }

     end:

   */

  /* Create two versions of the loop -- in the old one, we know that the
     number of iterations is large enough, and we will transform it into the
     loop that will be split to loop_fn, the new one will be used for the
     remaining iterations.  */

  /* We should compute a better number-of-iterations value for outer loops.
     That is, if we have

       for (i = 0; i < n; ++i)
	 for (j = 0; j < m; ++j)
	   ...

     we should compute nit = n * m, not nit = n.
     Also may_be_zero handling would need to be adjusted.  */

  type = TREE_TYPE (niter->niter);
  nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
			      NULL_TREE);
  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);

  if (!oacc_kernels_p)
    {
      if (loop->inner)
	m_p_thread = 2;
      else
	m_p_thread = MIN_PER_THREAD;

      gcc_checking_assert (n_threads != 0);
      many_iterations_cond =
	fold_build2 (GE_EXPR, boolean_type_node,
		     nit, build_int_cst (type, m_p_thread * n_threads - 1));

      many_iterations_cond
	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
		       invert_truthvalue (unshare_expr (niter->may_be_zero)),
		       many_iterations_cond);
      many_iterations_cond
	= force_gimple_operand (many_iterations_cond, &stmts, false,
				NULL_TREE);
      if (stmts)
	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
      if (!is_gimple_condexpr_for_cond (many_iterations_cond))
	{
	  many_iterations_cond
	    = force_gimple_operand (many_iterations_cond, &stmts,
				    true, NULL_TREE);
	  if (stmts)
	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
					      stmts);
	}

      initialize_original_copy_tables ();

      /* We assume that the loop usually iterates a lot.  */
      loop_version (loop, many_iterations_cond, NULL,
		    profile_probability::likely (),
		    profile_probability::unlikely (),
		    profile_probability::likely (),
		    profile_probability::unlikely (), true);
      update_ssa (TODO_update_ssa_no_phi);
      free_original_copy_tables ();
    }

  /* Base all the induction variables in LOOP on a single control one.  */
  canonicalize_loop_ivs (loop, &nit, true);
  if (num_phis (loop->header, false) != reduction_list->elements () + 1)
    {
      /* The call to canonicalize_loop_ivs above failed to "base all the
	 induction variables in LOOP on a single control one".  Do damage
	 control.  */
      basic_block preheader = loop_preheader_edge (loop)->src;
      basic_block cond_bb = single_pred (preheader);
      gcond *cond = as_a <gcond *> (gsi_stmt (gsi_last_bb (cond_bb)));
      gimple_cond_make_true (cond);
      update_stmt (cond);
      /* We've gotten rid of the duplicate loop created by loop_version, but
	 we can't undo whatever canonicalize_loop_ivs has done.
	 TODO: Fix this properly by ensuring that the call to
	 canonicalize_loop_ivs succeeds.  */
      if (dump_file
	  && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "canonicalize_loop_ivs failed for loop %d,"
		 " aborting transformation\n", loop->num);
      return;
    }

  /* Ensure that the exit condition is the first statement in the loop.
     The common case is that the latch of the loop is empty (apart from the
     increment) and immediately follows the loop exit test.  Attempt to move
     the entry of the loop directly before the exit check and increase the
     number of iterations of the loop by one.  */
  if (try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
    {
      if (dump_file
	  && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "alternative exit-first loop transform succeeded"
		 " for loop %d\n", loop->num);
    }
  else
    {
      if (oacc_kernels_p)
	n_threads = 1;

      /* Fall back on the method that handles more cases, but duplicates the
	 loop body: move the exit condition of LOOP to the beginning of its
	 header, and duplicate the part of the last iteration that gets
	 disabled to the exit of the loop.  */
      transform_to_exit_first_loop (loop, reduction_list, nit);
    }
  update_ssa (TODO_update_ssa_no_phi);

  /* Generate initializations for reductions.  */
  if (!reduction_list->is_empty ())
    reduction_list->traverse <class loop *, initialize_reductions> (loop);

  /* Eliminate the references to local variables from the loop.  */
  gcc_assert (single_exit (loop));
  entry = loop_preheader_edge (loop);
  exit = single_dom_exit (loop);

  /* This rewrites the body in terms of new variables.  This has already
     been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
  if (!oacc_kernels_p)
    {
      eliminate_local_variables (entry, exit);
      /* In the old loop, move all variables non-local to the loop to a
	 structure and back, and create separate decls for the variables used
	 in loop.  */
      separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
				&new_arg_struct, &clsn_data);
    }
  else
    {
      arg_struct = NULL_TREE;
      new_arg_struct = NULL_TREE;
      clsn_data.load = NULL_TREE;
      clsn_data.load_bb = exit->dest;
      clsn_data.store = NULL_TREE;
      clsn_data.store_bb = NULL;
    }

  /* Create the parallel constructs.  */
  loc = UNKNOWN_LOCATION;
  cond_stmt = last_nondebug_stmt (loop->header);
  if (cond_stmt)
    loc = gimple_location (cond_stmt);
  create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
			new_arg_struct, n_threads, loc, oacc_kernels_p);
  if (!reduction_list->is_empty ())
    create_call_for_reduction (loop, reduction_list, &clsn_data);

  scev_reset ();

  /* Free loop bound estimations that could contain references to
     removed statements.  */
  free_numbers_of_iterations_estimates (cfun);
}

/* Returns true when LOOP contains vector phi nodes.  */

static bool
loop_has_vector_phi_nodes (class loop *loop ATTRIBUTE_UNUSED)
{
  unsigned i;
  basic_block *bbs = get_loop_body_in_dom_order (loop);
  gphi_iterator gsi;
  bool res = true;

  for (i = 0; i < loop->num_nodes; i++)
    for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
      if (VECTOR_TYPE_P (TREE_TYPE (PHI_RESULT (gsi.phi ()))))
	goto end;

  res = false;
 end:
  free (bbs);
  return res;
}

/* Create a reduction_info struct, initialize it with REDUC_STMT
   and PHI, insert it to the REDUCTION_LIST.  */

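/* For example, given a sum reduction of the form

     # sum_1 = PHI <sum_init (preheader), sum_2 (latch)>
     ...
     sum_2 = sum_1 + x_i;

   REDUC_STMT is the statement defining sum_2 and PHI is the loop-header phi
   defining sum_1 (names here are illustrative).  */
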
3216static void
3217build_new_reduction (reduction_info_table_type *reduction_list,
3218 gimple *reduc_stmt, gphi *phi)
3219{
3220 reduction_info **slot;
3221 struct reduction_info *new_reduction;
3222 enum tree_code reduction_code;
3223
3224 gcc_assert (reduc_stmt);
3225
3226 if (gimple_code (g: reduc_stmt) == GIMPLE_PHI)
3227 {
3228 tree op1 = PHI_ARG_DEF (reduc_stmt, 0);
3229 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3230 reduction_code = gimple_assign_rhs_code (gs: def1);
3231 }
3232 else
3233 reduction_code = gimple_assign_rhs_code (gs: reduc_stmt);
3234 /* Check for OpenMP supported reduction. */
3235 switch (reduction_code)
3236 {
3237 case MINUS_EXPR:
3238 reduction_code = PLUS_EXPR;
3239 /* Fallthru. */
3240 case PLUS_EXPR:
3241 case MULT_EXPR:
3242 case MAX_EXPR:
3243 case MIN_EXPR:
3244 case BIT_IOR_EXPR:
3245 case BIT_XOR_EXPR:
3246 case BIT_AND_EXPR:
3247 case TRUTH_OR_EXPR:
3248 case TRUTH_XOR_EXPR:
3249 case TRUTH_AND_EXPR:
3250 break;
3251 default:
3252 return;
3253 }
3254
3255 if (dump_file && (dump_flags & TDF_DETAILS))
3256 {
3257 fprintf (stream: dump_file,
3258 format: "Detected reduction. reduction stmt is:\n");
3259 print_gimple_stmt (dump_file, reduc_stmt, 0);
3260 fprintf (stream: dump_file, format: "\n");
3261 }
3262
3263 new_reduction = XCNEW (struct reduction_info);
3264
3265 new_reduction->reduc_stmt = reduc_stmt;
3266 new_reduction->reduc_phi = phi;
3267 new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
3268 new_reduction->reduction_code = reduction_code;
3269 slot = reduction_list->find_slot (value: new_reduction, insert: INSERT);
3270 *slot = new_reduction;
3271}
3272
3273/* Callback for htab_traverse. Sets gimple_uid of reduc_phi stmts. */
3274
3275int
3276set_reduc_phi_uids (reduction_info **slot, void *data ATTRIBUTE_UNUSED)
3277{
3278 struct reduction_info *const red = *slot;
3279 gimple_set_uid (g: red->reduc_phi, uid: red->reduc_version);
3280 return 1;
3281}
3282
3283/* Return true if the type of reduction performed by STMT_INFO is suitable
3284 for this pass. */
3285
3286static bool
3287valid_reduction_p (stmt_vec_info stmt_info)
3288{
3289 /* Parallelization would reassociate the operation, which isn't
3290 allowed for in-order reductions. */
3291 vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (stmt_info);
3292 return reduc_type != FOLD_LEFT_REDUCTION;
3293}
3294
3295/* Detect all reductions in the LOOP, insert them into REDUCTION_LIST. */
3296
3297static void
3298gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list)
3299{
3300 gphi_iterator gsi;
3301 loop_vec_info simple_loop_info;
3302 auto_vec<gphi *, 4> double_reduc_phis;
3303 auto_vec<gimple *, 4> double_reduc_stmts;
3304
3305 vec_info_shared shared;
3306 vect_loop_form_info info;
3307 if (!vect_analyze_loop_form (loop, &info))
3308 goto gather_done;
3309
3310 simple_loop_info = vect_create_loop_vinfo (loop, &shared, &info);
3311 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
3312 {
3313 gphi *phi = gsi.phi ();
3314 affine_iv iv;
3315 tree res = PHI_RESULT (phi);
3316 bool double_reduc;
3317
3318 if (virtual_operand_p (op: res))
3319 continue;
3320
3321 if (simple_iv (loop, loop, res, &iv, true))
3322 continue;
3323
3324 stmt_vec_info reduc_stmt_info
3325 = parloops_force_simple_reduction (loop_info: simple_loop_info,
3326 phi_info: simple_loop_info->lookup_stmt (phi),
3327 double_reduc: &double_reduc, need_wrapping_integral_overflow: true);
3328 if (!reduc_stmt_info || !valid_reduction_p (stmt_info: reduc_stmt_info))
3329 continue;
3330
3331 if (double_reduc)
3332 {
3333 if (loop->inner->inner != NULL)
3334 continue;
3335
3336 double_reduc_phis.safe_push (obj: phi);
3337 double_reduc_stmts.safe_push (obj: reduc_stmt_info->stmt);
3338 continue;
3339 }
3340
3341 build_new_reduction (reduction_list, reduc_stmt: reduc_stmt_info->stmt, phi);
3342 }
3343 delete simple_loop_info;
3344
3345 if (!double_reduc_phis.is_empty ())
3346 {
3347 vec_info_shared shared;
3348 vect_loop_form_info info;
3349 if (vect_analyze_loop_form (loop->inner, &info))
3350 {
3351 simple_loop_info
3352 = vect_create_loop_vinfo (loop->inner, &shared, &info);
3353 gphi *phi;
3354 unsigned int i;
3355
3356 FOR_EACH_VEC_ELT (double_reduc_phis, i, phi)
3357 {
3358 affine_iv iv;
3359 tree res = PHI_RESULT (phi);
3360 bool double_reduc;
3361
3362 use_operand_p use_p;
3363 gimple *inner_stmt;
3364 bool single_use_p = single_imm_use (var: res, use_p: &use_p, stmt: &inner_stmt);
3365 gcc_assert (single_use_p);
3366 if (gimple_code (g: inner_stmt) != GIMPLE_PHI)
3367 continue;
3368 gphi *inner_phi = as_a <gphi *> (p: inner_stmt);
3369 if (simple_iv (loop->inner, loop->inner, PHI_RESULT (inner_phi),
3370 &iv, true))
3371 continue;
3372
3373 stmt_vec_info inner_phi_info
3374 = simple_loop_info->lookup_stmt (inner_phi);
3375 stmt_vec_info inner_reduc_stmt_info
3376 = parloops_force_simple_reduction (loop_info: simple_loop_info,
3377 phi_info: inner_phi_info,
3378 double_reduc: &double_reduc, need_wrapping_integral_overflow: true);
3379 gcc_assert (!double_reduc);
3380 if (!inner_reduc_stmt_info
3381 || !valid_reduction_p (stmt_info: inner_reduc_stmt_info))
3382 continue;
3383
3384 build_new_reduction (reduction_list, reduc_stmt: double_reduc_stmts[i], phi);
3385 }
3386 delete simple_loop_info;
3387 }
3388 }
3389
3390 gather_done:
3391 if (reduction_list->is_empty ())
3392 return;
3393
3394 /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
3395 and delete simple_loop_info, we can set gimple_uid of reduc_phi stmts only
3396 now. */
3397 basic_block bb;
3398 FOR_EACH_BB_FN (bb, cfun)
3399 for (gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
3400 gimple_set_uid (g: gsi_stmt (i: gsi), uid: (unsigned int)-1);
3401 reduction_list->traverse <void *, set_reduc_phi_uids> (NULL);
3402}
3403
3404/* Try to initialize NITER for code generation part. */
3405
3406static bool
3407try_get_loop_niter (loop_p loop, class tree_niter_desc *niter)
3408{
3409 edge exit = single_dom_exit (loop);
3410
3411 gcc_assert (exit);
3412
3413 /* We need to know # of iterations, and there should be no uses of values
3414 defined inside loop outside of it, unless the values are invariants of
3415 the loop. */
3416 if (!number_of_iterations_exit (loop, exit, niter, false))
3417 {
3418 if (dump_file && (dump_flags & TDF_DETAILS))
3419 fprintf (stream: dump_file, format: " FAILED: number of iterations not known\n");
3420 return false;
3421 }
3422
3423 return true;
3424}
3425
3426/* Return the default def of the first function argument. */
3427
3428static tree
3429get_omp_data_i_param (void)
3430{
3431 tree decl = DECL_ARGUMENTS (cfun->decl);
3432 gcc_assert (DECL_CHAIN (decl) == NULL_TREE);
3433 return ssa_default_def (cfun, decl);
3434}
3435
3436/* For PHI in loop header of LOOP, look for pattern:
3437
3438 <bb preheader>
3439 .omp_data_i = &.omp_data_arr;
3440 addr = .omp_data_i->sum;
3441 sum_a = *addr;
3442
3443 <bb header>:
3444 sum_b = PHI <sum_a (preheader), sum_c (latch)>
3445
3446 and return addr. Otherwise, return NULL_TREE. */
3447
3448static tree
3449find_reduc_addr (class loop *loop, gphi *phi)
3450{
3451 edge e = loop_preheader_edge (loop);
3452 tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
3453 gimple *stmt = SSA_NAME_DEF_STMT (arg);
3454 if (!gimple_assign_single_p (gs: stmt))
3455 return NULL_TREE;
3456 tree memref = gimple_assign_rhs1 (gs: stmt);
3457 if (TREE_CODE (memref) != MEM_REF)
3458 return NULL_TREE;
3459 tree addr = TREE_OPERAND (memref, 0);
3460
3461 gimple *stmt2 = SSA_NAME_DEF_STMT (addr);
3462 if (!gimple_assign_single_p (gs: stmt2))
3463 return NULL_TREE;
3464 tree compref = gimple_assign_rhs1 (gs: stmt2);
3465 if (TREE_CODE (compref) != COMPONENT_REF)
3466 return NULL_TREE;
3467 tree addr2 = TREE_OPERAND (compref, 0);
3468 if (TREE_CODE (addr2) != MEM_REF)
3469 return NULL_TREE;
3470 addr2 = TREE_OPERAND (addr2, 0);
3471 if (TREE_CODE (addr2) != SSA_NAME
3472 || addr2 != get_omp_data_i_param ())
3473 return NULL_TREE;
3474
3475 return addr;
3476}
3477
3478/* Try to initialize REDUCTION_LIST for code generation part.
3479 REDUCTION_LIST describes the reductions. */
3480
3481static bool
3482try_create_reduction_list (loop_p loop,
3483 reduction_info_table_type *reduction_list,
3484 bool oacc_kernels_p)
3485{
3486 edge exit = single_dom_exit (loop);
3487 gphi_iterator gsi;
3488
3489 gcc_assert (exit);
3490
3491 /* Try to get rid of exit phis. */
3492 final_value_replacement_loop (loop);
3493
3494 gather_scalar_reductions (loop, reduction_list);
3495
3496
3497 for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
3498 {
3499 gphi *phi = gsi.phi ();
3500 struct reduction_info *red;
3501 imm_use_iterator imm_iter;
3502 use_operand_p use_p;
3503 gimple *reduc_phi;
3504 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
3505
3506 if (!virtual_operand_p (op: val))
3507 {
3508 if (TREE_CODE (val) != SSA_NAME)
3509 {
3510 if (dump_file && (dump_flags & TDF_DETAILS))
3511 fprintf (stream: dump_file,
3512 format: " FAILED: exit PHI argument invariant.\n");
3513 return false;
3514 }
3515
3516 if (dump_file && (dump_flags & TDF_DETAILS))
3517 {
3518 fprintf (stream: dump_file, format: "phi is ");
3519 print_gimple_stmt (dump_file, phi, 0);
3520 fprintf (stream: dump_file, format: "arg of phi to exit: value ");
3521 print_generic_expr (dump_file, val);
3522 fprintf (stream: dump_file, format: " used outside loop\n");
3523 fprintf (stream: dump_file,
3524 format: " checking if it is part of reduction pattern:\n");
3525 }
3526 if (reduction_list->is_empty ())
3527 {
3528 if (dump_file && (dump_flags & TDF_DETAILS))
3529 fprintf (stream: dump_file,
3530 format: " FAILED: it is not a part of reduction.\n");
3531 return false;
3532 }
3533 reduc_phi = NULL;
3534 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
3535 {
3536 if (!gimple_debug_bind_p (USE_STMT (use_p))
3537 && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3538 {
3539 reduc_phi = USE_STMT (use_p);
3540 break;
3541 }
3542 }
3543 red = reduction_phi (reduction_list, phi: reduc_phi);
3544 if (red == NULL)
3545 {
3546 if (dump_file && (dump_flags & TDF_DETAILS))
3547 fprintf (stream: dump_file,
3548 format: " FAILED: it is not a part of reduction.\n");
3549 return false;
3550 }
3551 if (red->keep_res != NULL)
3552 {
3553 if (dump_file && (dump_flags & TDF_DETAILS))
3554 fprintf (stream: dump_file,
3555 format: " FAILED: reduction has multiple exit phis.\n");
3556 return false;
3557 }
3558 red->keep_res = phi;
3559 if (dump_file && (dump_flags & TDF_DETAILS))
3560 {
3561 fprintf (stream: dump_file, format: "reduction phi is ");
3562 print_gimple_stmt (dump_file, red->reduc_phi, 0);
3563 fprintf (stream: dump_file, format: "reduction stmt is ");
3564 print_gimple_stmt (dump_file, red->reduc_stmt, 0);
3565 }
3566 }
3567 }
3568
3569 /* The iterations of the loop may communicate only through bivs whose
3570 iteration space can be distributed efficiently. */
3571 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
3572 {
3573 gphi *phi = gsi.phi ();
3574 tree def = PHI_RESULT (phi);
3575 affine_iv iv;
3576
3577 if (!virtual_operand_p (op: def) && !simple_iv (loop, loop, def, &iv, true))
3578 {
3579 struct reduction_info *red;
3580
3581 red = reduction_phi (reduction_list, phi);
3582 if (red == NULL)
3583 {
3584 if (dump_file && (dump_flags & TDF_DETAILS))
3585 fprintf (stream: dump_file,
3586 format: " FAILED: scalar dependency between iterations\n");
3587 return false;
3588 }
3589 }
3590 }
3591
3592 if (oacc_kernels_p)
3593 {
3594 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (i: gsi);
3595 gsi_next (i: &gsi))
3596 {
3597 gphi *phi = gsi.phi ();
3598 tree def = PHI_RESULT (phi);
3599 affine_iv iv;
3600
3601 if (!virtual_operand_p (op: def)
3602 && !simple_iv (loop, loop, def, &iv, true))
3603 {
3604 tree addr = find_reduc_addr (loop, phi);
3605 if (addr == NULL_TREE)
3606 return false;
3607 struct reduction_info *red = reduction_phi (reduction_list, phi);
3608 red->reduc_addr = addr;
3609 }
3610 }
3611 }
3612
3613 return true;
3614}
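
/* Two illustrative cases for the checks above (hypothetical loops):

     for (i = 0; i < n; i++)
       sum += a[i];

   produces a header phi for sum that is not a simple induction variable but
   is recognized as a reduction, so the loop is accepted; whereas

     for (i = 0; i < n; i++)
       {
	 a[i] = t;
	 t = b[i];
       }

   carries t from one iteration to the next without forming a reduction and
   fails with "scalar dependency between iterations".  */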

/* Return true if LOOP contains phis with ADDR_EXPR in args.  */

static bool
loop_has_phi_with_address_arg (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  bool res = false;

  unsigned i, j;
  gphi_iterator gsi;
  for (i = 0; i < loop->num_nodes; i++)
    for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gphi *phi = gsi.phi ();
	for (j = 0; j < gimple_phi_num_args (phi); j++)
	  {
	    tree arg = gimple_phi_arg_def (phi, j);
	    if (TREE_CODE (arg) == ADDR_EXPR)
	      {
		/* This should be handled by eliminate_local_variables, but
		   that function currently ignores phis.  */
		res = true;
		goto end;
	      }
	  }
      }
 end:
  free (bbs);

  return res;
}
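
/* An example of what the walk above looks for (illustrative GIMPLE):

     # p_1 = PHI <&a (2), p_2 (3)>

   i.e. a phi with an ADDR_EXPR argument, as can arise from source like
   "p = flag ? &a : &b;" inside the loop.  */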

/* Return true if memory ref REF (corresponding to the stmt at GSI in
   REGION_BBS[I]) conflicts with the statements in REGION_BBS[I] after gsi,
   or the statements in REGION_BBS[I + n].  REF_IS_STORE indicates if REF is
   a store.  Ignore conflicts with SKIP_STMT.  */

static bool
ref_conflicts_with_region (gimple_stmt_iterator gsi, ao_ref *ref,
			   bool ref_is_store, vec<basic_block> region_bbs,
			   unsigned int i, gimple *skip_stmt)
{
  basic_block bb = region_bbs[i];
  gsi_next (&gsi);

  while (true)
    {
      for (; !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (stmt == skip_stmt)
	    {
	      if (dump_file)
		{
		  fprintf (dump_file, "skipping reduction store: ");
		  print_gimple_stmt (dump_file, stmt, 0);
		}
	      continue;
	    }

	  if (!gimple_vdef (stmt)
	      && !gimple_vuse (stmt))
	    continue;

	  if (gimple_code (stmt) == GIMPLE_RETURN)
	    continue;

	  if (ref_is_store)
	    {
	      if (ref_maybe_used_by_stmt_p (stmt, ref))
		{
		  if (dump_file)
		    {
		      fprintf (dump_file, "Stmt ");
		      print_gimple_stmt (dump_file, stmt, 0);
		    }
		  return true;
		}
	    }
	  else
	    {
	      if (stmt_may_clobber_ref_p_1 (stmt, ref))
		{
		  if (dump_file)
		    {
		      fprintf (dump_file, "Stmt ");
		      print_gimple_stmt (dump_file, stmt, 0);
		    }
		  return true;
		}
	    }
	}
      i++;
      if (i == region_bbs.length ())
	break;
      bb = region_bbs[i];
      gsi = gsi_start_bb (bb);
    }

  return false;
}
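
/* In other words, starting just after GSI the walk above continues through
   the remainder of REGION_BBS:

     bb REGION_BBS[i]:     stmts after GSI
     bb REGION_BBS[i+1]:   all stmts
     ...                   all stmts

   and any statement whose memory accesses may alias REF (a use for a store
   REF, a clobber for a load REF) counts as a conflict.  */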

/* Return true if the bbs in REGION_BBS but not in IN_LOOP_BBS can be
   executed in parallel with the loop itself.  Record the stores of
   reduction results in REDUCTION_STORES.  */

static bool
oacc_entry_exit_ok_1 (bitmap in_loop_bbs, const vec<basic_block> &region_bbs,
		      reduction_info_table_type *reduction_list,
		      bitmap reduction_stores)
{
  tree omp_data_i = get_omp_data_i_param ();

  unsigned i;
  basic_block bb;
  FOR_EACH_VEC_ELT (region_bbs, i, bb)
    {
      if (bitmap_bit_p (in_loop_bbs, bb->index))
	continue;

      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple *skip_stmt = NULL;

	  if (is_gimple_debug (stmt)
	      || gimple_code (stmt) == GIMPLE_COND)
	    continue;

	  ao_ref ref;
	  bool ref_is_store = false;
	  if (gimple_assign_load_p (stmt))
	    {
	      tree rhs = gimple_assign_rhs1 (stmt);
	      tree base = get_base_address (rhs);
	      if (TREE_CODE (base) == MEM_REF
		  && operand_equal_p (TREE_OPERAND (base, 0), omp_data_i, 0))
		continue;

	      tree lhs = gimple_assign_lhs (stmt);
	      if (TREE_CODE (lhs) == SSA_NAME
		  && has_single_use (lhs))
		{
		  use_operand_p use_p;
		  gimple *use_stmt;
		  struct reduction_info *red;
		  single_imm_use (lhs, &use_p, &use_stmt);
		  if (gimple_code (use_stmt) == GIMPLE_PHI
		      && (red = reduction_phi (reduction_list, use_stmt)))
		    {
		      tree val = PHI_RESULT (red->keep_res);
		      if (has_single_use (val))
			{
			  single_imm_use (val, &use_p, &use_stmt);
			  if (gimple_store_p (use_stmt))
			    {
			      unsigned int id
				= SSA_NAME_VERSION (gimple_vdef (use_stmt));
			      bitmap_set_bit (reduction_stores, id);
			      skip_stmt = use_stmt;
			      if (dump_file)
				{
				  fprintf (dump_file, "found reduction load: ");
				  print_gimple_stmt (dump_file, stmt, 0);
				}
			    }
			}
		    }
		}

	      ao_ref_init (&ref, rhs);
	    }
	  else if (gimple_store_p (stmt))
	    {
	      ao_ref_init (&ref, gimple_assign_lhs (stmt));
	      ref_is_store = true;
	    }
	  else if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
	    continue;
	  else if (!gimple_has_side_effects (stmt)
		   && !gimple_could_trap_p (stmt)
		   && !stmt_could_throw_p (cfun, stmt)
		   && !gimple_vdef (stmt)
		   && !gimple_vuse (stmt))
	    continue;
	  else if (gimple_call_internal_p (stmt, IFN_GOACC_DIM_POS))
	    continue;
	  else if (gimple_code (stmt) == GIMPLE_RETURN)
	    continue;
	  else
	    {
	      if (dump_file)
		{
		  fprintf (dump_file, "Unhandled stmt in entry/exit: ");
		  print_gimple_stmt (dump_file, stmt, 0);
		}
	      return false;
	    }

	  if (ref_conflicts_with_region (gsi, &ref, ref_is_store, region_bbs,
					 i, skip_stmt))
	    {
	      if (dump_file)
		{
		  fprintf (dump_file, "conflicts with entry/exit stmt: ");
		  print_gimple_stmt (dump_file, stmt, 0);
		}
	      return false;
	    }
	}
    }

  return true;
}
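
/* For example (illustrative), an entry-block load of a field of the shared
   data structure, e.g.

     sum_a = .omp_data_i->sum;

   is skipped as harmless above, while a statement with side effects that
   none of the cases above recognize makes the whole region ineligible.  */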

/* Find stores inside REGION_BBS and outside IN_LOOP_BBS, and guard them with
   gang_pos == 0, except when the stores are REDUCTION_STORES.  Return true
   if any changes were made.  */

static bool
oacc_entry_exit_single_gang (bitmap in_loop_bbs,
			     const vec<basic_block> &region_bbs,
			     bitmap reduction_stores)
{
  tree gang_pos = NULL_TREE;
  bool changed = false;

  unsigned i;
  basic_block bb;
  FOR_EACH_VEC_ELT (region_bbs, i, bb)
    {
      if (bitmap_bit_p (in_loop_bbs, bb->index))
	continue;

      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (!gimple_store_p (stmt))
	    {
	      /* Update gsi to point to next stmt.  */
	      gsi_next (&gsi);
	      continue;
	    }

	  if (bitmap_bit_p (reduction_stores,
			    SSA_NAME_VERSION (gimple_vdef (stmt))))
	    {
	      if (dump_file)
		{
		  fprintf (dump_file,
			   "skipped reduction store for single-gang"
			   " neutering: ");
		  print_gimple_stmt (dump_file, stmt, 0);
		}

	      /* Update gsi to point to next stmt.  */
	      gsi_next (&gsi);
	      continue;
	    }

	  changed = true;

	  if (gang_pos == NULL_TREE)
	    {
	      tree arg = build_int_cst (integer_type_node, GOMP_DIM_GANG);
	      gcall *gang_single
		= gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, arg);
	      gang_pos = make_ssa_name (integer_type_node);
	      gimple_call_set_lhs (gang_single, gang_pos);
	      gimple_stmt_iterator start
		= gsi_start_bb (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
	      tree vuse = ssa_default_def (cfun, gimple_vop (cfun));
	      gimple_set_vuse (gang_single, vuse);
	      gsi_insert_before (&start, gang_single, GSI_SAME_STMT);
	    }

	  if (dump_file)
	    {
	      fprintf (dump_file,
		       "found store that needs single-gang neutering: ");
	      print_gimple_stmt (dump_file, stmt, 0);
	    }

	  {
	    /* Split block before store.  */
	    gimple_stmt_iterator gsi2 = gsi;
	    gsi_prev (&gsi2);
	    edge e;
	    if (gsi_end_p (gsi2))
	      {
		e = split_block_after_labels (bb);
		gsi2 = gsi_last_bb (bb);
	      }
	    else
	      e = split_block (bb, gsi_stmt (gsi2));
	    basic_block bb2 = e->dest;

	    /* Split block after store.  */
	    gimple_stmt_iterator gsi3 = gsi_start_bb (bb2);
	    edge e2 = split_block (bb2, gsi_stmt (gsi3));
	    basic_block bb3 = e2->dest;

	    gimple *cond
	      = gimple_build_cond (EQ_EXPR, gang_pos, integer_zero_node,
				   NULL_TREE, NULL_TREE);
	    gsi_insert_after (&gsi2, cond, GSI_NEW_STMT);

	    edge e3 = make_edge (bb, bb3, EDGE_FALSE_VALUE);
	    /* FIXME: What is the probability?  */
	    e3->probability = profile_probability::guessed_never ();
	    e->flags = EDGE_TRUE_VALUE;

	    tree vdef = gimple_vdef (stmt);
	    tree vuse = gimple_vuse (stmt);

	    tree phi_res = copy_ssa_name (vdef);
	    gphi *new_phi = create_phi_node (phi_res, bb3);
	    replace_uses_by (vdef, phi_res);
	    add_phi_arg (new_phi, vuse, e3, UNKNOWN_LOCATION);
	    add_phi_arg (new_phi, vdef, e2, UNKNOWN_LOCATION);

	    /* Update gsi to point to next stmt.  */
	    bb = bb3;
	    gsi = gsi_start_bb (bb);
	  }
	}
    }

  return changed;
}
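
/* Sketch of the transformation performed above (illustrative GIMPLE):

     sum = sum_c;

   becomes

     _gang = GOACC_DIM_POS (GOMP_DIM_GANG);
     ...
     if (_gang == 0)
       sum = sum_c;

   with a virtual phi merging the memory states of the stored and unstored
   paths, so each such store is executed by gang 0 only.  */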

/* Return true if the statements before and after the LOOP can be executed in
   parallel with the function containing the loop.  Resolve conflicting stores
   outside LOOP by guarding them such that only a single gang executes them.  */

static bool
oacc_entry_exit_ok (class loop *loop,
		    reduction_info_table_type *reduction_list)
{
  basic_block *loop_bbs = get_loop_body_in_dom_order (loop);
  auto_vec<basic_block> region_bbs
    = get_all_dominated_blocks (CDI_DOMINATORS, ENTRY_BLOCK_PTR_FOR_FN (cfun));

  bitmap in_loop_bbs = BITMAP_ALLOC (NULL);
  bitmap_clear (in_loop_bbs);
  for (unsigned int i = 0; i < loop->num_nodes; i++)
    bitmap_set_bit (in_loop_bbs, loop_bbs[i]->index);

  bitmap reduction_stores = BITMAP_ALLOC (NULL);
  bool res = oacc_entry_exit_ok_1 (in_loop_bbs, region_bbs, reduction_list,
				   reduction_stores);

  if (res)
    {
      bool changed = oacc_entry_exit_single_gang (in_loop_bbs, region_bbs,
						  reduction_stores);
      if (changed)
	{
	  free_dominance_info (CDI_DOMINATORS);
	  calculate_dominance_info (CDI_DOMINATORS);
	}
    }

  free (loop_bbs);

  BITMAP_FREE (in_loop_bbs);
  BITMAP_FREE (reduction_stores);

  return res;
}

/* Detect parallel loops and generate parallel code using libgomp
   primitives.  Returns true if some loop was parallelized, false
   otherwise.  */

static bool
parallelize_loops (bool oacc_kernels_p)
{
  unsigned n_threads;
  bool changed = false;
  class loop *skip_loop = NULL;
  class tree_niter_desc niter_desc;
  struct obstack parloop_obstack;
  HOST_WIDE_INT estimated;

  /* Do not parallelize loops in the functions created by parallelization.  */
  if (!oacc_kernels_p
      && parallelized_function_p (cfun->decl))
    return false;

  /* Do not parallelize loops in offloaded functions.  */
  if (!oacc_kernels_p
      && oacc_get_fn_attrib (cfun->decl) != NULL)
    return false;

  if (cfun->has_nonlocal_label)
    return false;

  /* For OpenACC kernels, n_threads will be determined later; otherwise, it's
     the argument to -ftree-parallelize-loops.  */
  if (oacc_kernels_p)
    n_threads = 0;
  else
    n_threads = flag_tree_parallelize_loops;

  gcc_obstack_init (&parloop_obstack);
  reduction_info_table_type reduction_list (10);

  calculate_dominance_info (CDI_DOMINATORS);

  for (auto loop : loops_list (cfun, 0))
    {
      if (loop == skip_loop)
	{
	  if (!loop->in_oacc_kernels_region
	      && dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Skipping loop %d as inner loop of parallelized loop\n",
		     loop->num);

	  skip_loop = loop->inner;
	  continue;
	}
      else
	skip_loop = NULL;

      reduction_list.empty ();

      if (oacc_kernels_p)
	{
	  if (!loop->in_oacc_kernels_region)
	    continue;

	  /* Don't try to parallelize inner loops in an oacc kernels region.  */
	  if (loop->inner)
	    skip_loop = loop->inner;

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Trying loop %d with header bb %d in oacc kernels"
		     " region\n", loop->num, loop->header->index);
	}

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Trying loop %d as candidate\n", loop->num);
	  if (loop->inner)
	    fprintf (dump_file, "loop %d is not innermost\n", loop->num);
	  else
	    fprintf (dump_file, "loop %d is innermost\n", loop->num);
	}

      if (!single_dom_exit (loop))
	{

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "loop is !single_dom_exit\n");

	  continue;
	}

      if (/* And of course, the loop must be parallelizable.  */
	  !can_duplicate_loop_p (loop)
	  || loop_has_blocks_with_irreducible_flag (loop)
	  || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP)
	  /* FIXME: the check for vector phi nodes could be removed.  */
	  || loop_has_vector_phi_nodes (loop))
	continue;

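      /* Profitability sketch (illustrative numbers): with
	 -ftree-parallelize-loops=4 and a parloops-min-per-thread of 100
	 (assuming the default value of that param), an innermost loop must
	 have an estimated 4 * 100 - 1 = 399 iterations or more to survive
	 the check below, while an outer loop only needs 4 * 2 - 1 = 7,
	 since its body iterates on its own.  */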
      estimated = estimated_loop_iterations_int (loop);
      if (estimated == -1)
	estimated = get_likely_max_loop_iterations_int (loop);
      /* FIXME: Bypass this check as graphite doesn't update the
	 count and frequency correctly now.  */
      if (!flag_loop_parallelize_all
	  && !oacc_kernels_p
	  && ((estimated != -1
	       && (estimated
		   < ((HOST_WIDE_INT) n_threads
		      * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
	      /* Do not bother with loops in cold areas.  */
	      || optimize_loop_nest_for_size_p (loop)))
	continue;

      if (!try_get_loop_niter (loop, &niter_desc))
	continue;

      if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
	continue;

      if (loop_has_phi_with_address_arg (loop))
	continue;

      if (!loop->can_be_parallel
	  && !loop_parallel_p (loop, &parloop_obstack))
	continue;

      if (oacc_kernels_p
	  && !oacc_entry_exit_ok (loop, &reduction_list))
	{
	  if (dump_file)
	    fprintf (dump_file, "entry/exit not ok: FAILED\n");
	  continue;
	}

      changed = true;
      skip_loop = loop->inner;

      if (dump_enabled_p ())
	{
	  dump_user_location_t loop_loc = find_loop_location (loop);
	  if (loop->inner)
	    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
			     "parallelizing outer loop %d\n", loop->num);
	  else
	    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
			     "parallelizing inner loop %d\n", loop->num);
	}

      gen_parallel_loop (loop, &reduction_list,
			 n_threads, &niter_desc, oacc_kernels_p);
    }

  obstack_free (&parloop_obstack, NULL);

  /* Parallelization will cause new function calls to be inserted through
     which local variables will escape.  Reset the points-to solution
     for ESCAPED.  */
  if (changed)
    pt_solution_reset (&cfun->gimple_df->escaped);

  return changed;
}

/* Parallelization.  */

namespace {

const pass_data pass_data_parallelize_loops =
{
  GIMPLE_PASS, /* type */
  "parloops", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_parallelize_loops : public gimple_opt_pass
{
public:
  pass_parallelize_loops (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_parallelize_loops, ctxt),
      oacc_kernels_p (false)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    if (oacc_kernels_p)
      return flag_openacc;
    else
      return flag_tree_parallelize_loops > 1;
  }
  unsigned int execute (function *) final override;
  opt_pass * clone () final override
  {
    return new pass_parallelize_loops (m_ctxt);
  }
  void set_pass_param (unsigned int n, bool param) final override
  {
    gcc_assert (n == 0);
    oacc_kernels_p = param;
  }

 private:
  bool oacc_kernels_p;
}; // class pass_parallelize_loops
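
/* This one pass class implements both -ftree-parallelize-loops and the
   OpenACC kernels handling; which behavior an instance gets is chosen via
   set_pass_param when the pass is instantiated in passes.def (e.g. with a
   false or true oacc_kernels_p argument).  */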

unsigned
pass_parallelize_loops::execute (function *fun)
{
  tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
  if (nthreads == NULL_TREE)
    return 0;

  bool in_loop_pipeline = scev_initialized_p ();
  if (!in_loop_pipeline)
    loop_optimizer_init (LOOPS_NORMAL
			 | LOOPS_HAVE_RECORDED_EXITS);

  if (number_of_loops (fun) <= 1)
    return 0;

  if (!in_loop_pipeline)
    {
      rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
      scev_initialize ();
    }

  unsigned int todo = 0;
  if (parallelize_loops (oacc_kernels_p))
    {
      fun->curr_properties &= ~(PROP_gimple_eomp);

      checking_verify_loop_structure ();

      /* ??? Intermediate SSA updates with no PHIs might have lost
	 the virtual operand renaming needed by separate_decls_in_region,
	 make sure to rename them again.  */
      mark_virtual_operands_for_renaming (fun);
      update_ssa (TODO_update_ssa);
      if (in_loop_pipeline)
	rewrite_into_loop_closed_ssa (NULL, 0);
    }

  if (!in_loop_pipeline)
    {
      scev_finalize ();
      loop_optimizer_finalize ();
    }

  return todo;
}
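
/* Typical use (illustrative): compiling with
     gcc -O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details foo.c
   runs this pass with n_threads == 4 and dumps the candidate analysis done
   in parallelize_loops.  */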

} // anon namespace

gimple_opt_pass *
make_pass_parallelize_loops (gcc::context *ctxt)
{
  return new pass_parallelize_loops (ctxt);
}