1/* Loop unroll-and-jam.
2 Copyright (C) 2017-2023 Free Software Foundation, Inc.
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify it
7under the terms of the GNU General Public License as published by the
8Free Software Foundation; either version 3, or (at your option) any
9later version.
10
11GCC is distributed in the hope that it will be useful, but WITHOUT
12ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14for more details.
15
16You should have received a copy of the GNU General Public License
17along with GCC; see the file COPYING3. If not see
18<http://www.gnu.org/licenses/>. */
19
20#include "config.h"
21#include "system.h"
22#include "coretypes.h"
23#include "tree-pass.h"
24#include "backend.h"
25#include "tree.h"
26#include "gimple.h"
27#include "ssa.h"
28#include "fold-const.h"
29#include "tree-cfg.h"
30#include "tree-ssa.h"
31#include "tree-ssa-loop-niter.h"
32#include "tree-ssa-loop.h"
33#include "tree-ssa-loop-manip.h"
34#include "cfgloop.h"
35#include "tree-scalar-evolution.h"
36#include "gimple-iterator.h"
37#include "cfghooks.h"
38#include "tree-data-ref.h"
39#include "tree-ssa-loop-ivopts.h"
40#include "tree-vectorizer.h"
41#include "tree-ssa-sccvn.h"
42#include "tree-cfgcleanup.h"
43
44/* Unroll and Jam transformation
45
46 This is a combination of two transformations, where the second
47 is not always valid. It's applicable if a loop nest has redundancies
48 over the iterations of an outer loop while not having that with
49 an inner loop.
50
51 Given this nest:
52 for (i) {
53 for (j) {
54 B(i,j)
55 }
56 }
57
58 first unroll:
59 for (i by 2) {
60 for (j) {
61 B(i,j)
62 }
63 for (j) {
64 B(i+1,j)
65 }
66 }
67
68 then fuse the two adjacent inner loops resulting from that:
69 for (i by 2) {
70 for (j) {
71 B(i,j)
72 B(i+1,j)
73 }
74 }
75
76 As the order of evaluations of the body B changes this is valid
77 only in certain situations: all distance vectors need to be forward.
78 Additionally if there are multiple induction variables than just
79 a counting control IV (j above) we can also deal with some situations.
80
81 The validity is checked by unroll_jam_possible_p, and the data-dep
82 testing below.
83
84 A trivial example where the fusion is wrong would be when
85 B(i,j) == x[j-1] = x[j];
86 for (i by 2) {
87 for (j) {
88 x[j-1] = x[j];
89 }
90 for (j) {
91 x[j-1] = x[j];
92 }
93 } effect: move content to front by two elements
94 -->
95 for (i by 2) {
96 for (j) {
97 x[j-1] = x[j];
98 x[j-1] = x[j];
99 }
100 } effect: move content to front by one element
101*/
102
103/* Modify the loop tree for the fact that all code once belonging
104 to the OLD loop or the outer loop of OLD now is inside LOOP. */
105
106static void
107merge_loop_tree (class loop *loop, class loop *old)
108{
109 basic_block *bbs;
110 int i, n;
111 class loop *subloop;
112 edge e;
113 edge_iterator ei;
114
115 /* Find its nodes. */
116 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
117 n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
118
119 for (i = 0; i < n; i++)
120 {
121 /* If the block was direct child of OLD loop it's now part
122 of LOOP. If it was outside OLD, then it moved into LOOP
123 as well. This avoids changing the loop father for BBs
124 in inner loops of OLD. */
125 if (bbs[i]->loop_father == old
126 || loop_depth (loop: bbs[i]->loop_father) < loop_depth (loop: old))
127 {
128 remove_bb_from_loops (bbs[i]);
129 add_bb_to_loop (bbs[i], loop);
130 continue;
131 }
132
133 /* If we find a direct subloop of OLD, move it to LOOP. */
134 subloop = bbs[i]->loop_father;
135 if (loop_outer (loop: subloop) == old && subloop->header == bbs[i])
136 {
137 flow_loop_tree_node_remove (subloop);
138 flow_loop_tree_node_add (loop, subloop);
139 }
140 }
141
142 /* Update the information about loop exit edges. */
143 for (i = 0; i < n; i++)
144 {
145 FOR_EACH_EDGE (e, ei, bbs[i]->succs)
146 {
147 rescan_loop_exit (e, false, false);
148 }
149 }
150
151 loop->num_nodes = n;
152
153 free (ptr: bbs);
154}
155
156/* BB is part of the outer loop of an unroll-and-jam situation.
157 Check if any statements therein would prevent the transformation. */
158
159static bool
160bb_prevents_fusion_p (basic_block bb)
161{
162 gimple_stmt_iterator gsi;
163 /* BB is duplicated by outer unrolling and then all N-1 first copies
164 move into the body of the fused inner loop. If BB exits the outer loop
165 the last copy still does so, and the first N-1 copies are cancelled
166 by loop unrolling, so also after fusion it's the exit block.
167 But there might be other reasons that prevent fusion:
168 * stores or unknown side-effects prevent fusion
169 * loads don't
170 * computations into SSA names: these aren't problematic. Their
171 result will be unused on the exit edges of the first N-1 copies
172 (those aren't taken after unrolling). If they are used on the
173 other edge (the one leading to the outer latch block) they are
174 loop-carried (on the outer loop) and the Nth copy of BB will
175 compute them again (i.e. the first N-1 copies will be dead). */
176 for (gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
177 {
178 gimple *g = gsi_stmt (i: gsi);
179 if (gimple_vdef (g) || gimple_has_side_effects (g))
180 return true;
181 }
182 return false;
183}
184
185/* Given an inner loop LOOP (of some OUTER loop) determine if
186 we can safely fuse copies of it (generated by outer unrolling).
187 If so return true, otherwise return false. */
188
189static bool
190unroll_jam_possible_p (class loop *outer, class loop *loop)
191{
192 basic_block *bbs;
193 int i, n;
194 class tree_niter_desc niter;
195
196 /* When fusing the loops we skip the latch block
197 of the first one, so it mustn't have any effects to
198 preserve. */
199 if (!empty_block_p (loop->latch))
200 return false;
201
202 edge exit;
203 if (!(exit = single_exit (loop)))
204 return false;
205
206 /* We need a perfect nest. Quick check for adjacent inner loops. */
207 if (outer->inner != loop || loop->next)
208 return false;
209
210 /* Prevent head-controlled inner loops, that we usually have.
211 The guard block would need to be accepted
212 (invariant condition either entering or skipping the loop),
213 without also accepting arbitrary control flow. When unswitching
214 ran before us (as with -O3) this won't be a problem because its
215 outer loop unswitching will have moved out the invariant condition.
216
217 If we do that we need to extend fuse_loops() to cope with this
218 by threading through the (still invariant) copied condition
219 between the two loop copies. */
220 if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
221 return false;
222
223 /* The number of iterations of the inner loop must be loop invariant
224 with respect to the outer loop. */
225 if (!number_of_iterations_exit (loop, single_exit (loop), niter: &niter,
226 false, every_iteration: true)
227 || niter.cmp == ERROR_MARK
228 || !integer_zerop (niter.may_be_zero)
229 || !expr_invariant_in_loop_p (outer, niter.niter))
230 return false;
231
232 /* If the inner loop produces any values that are used inside the
233 outer loop (except the virtual op) then it can flow
234 back (perhaps indirectly) into the inner loop. This prevents
235 fusion: without fusion the value at the last iteration is used,
236 with fusion the value after the initial iteration is used.
237
238 If all uses are outside the outer loop this doesn't prevent fusion;
239 the value of the last iteration is still used (and the values from
240 all intermediate iterations are dead). */
241 gphi_iterator psi;
242 for (psi = gsi_start_phis (single_exit (loop)->dest);
243 !gsi_end_p (i: psi); gsi_next (i: &psi))
244 {
245 imm_use_iterator imm_iter;
246 use_operand_p use_p;
247 tree op = gimple_phi_result (gs: psi.phi ());
248 if (virtual_operand_p (op))
249 continue;
250 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
251 {
252 gimple *use_stmt = USE_STMT (use_p);
253 if (!is_gimple_debug (gs: use_stmt)
254 && flow_bb_inside_loop_p (outer, gimple_bb (g: use_stmt)))
255 return false;
256 }
257 }
258
259 /* And check blocks belonging to just outer loop. */
260 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
261 n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
262
263 for (i = 0; i < n; i++)
264 if (bbs[i]->loop_father == outer
265 && (bb_prevents_fusion_p (bb: bbs[i])
266 /* Outer loop exits must come after the inner loop, otherwise
267 we'll put the outer loop exit into the fused inner loop. */
268 || (loop_exits_from_bb_p (outer, bbs[i])
269 && !dominated_by_p (CDI_DOMINATORS, bbs[i], exit->src))))
270 break;
271 free (ptr: bbs);
272 if (i != n)
273 return false;
274
275 /* For now we can safely fuse copies of LOOP only if all
276 loop carried variables are inductions (or the virtual op).
277
278 We could handle reductions as well (the initial value in the second
279 body would be the after-iter value of the first body) if it's over
280 an associative and commutative operation. We wouldn't
281 be able to handle unknown cycles. */
282 for (psi = gsi_start_phis (loop->header); !gsi_end_p (i: psi); gsi_next (i: &psi))
283 {
284 affine_iv iv;
285 tree op = gimple_phi_result (gs: psi.phi ());
286
287 if (virtual_operand_p (op))
288 continue;
289 if (!simple_iv (loop, loop, op, &iv, true))
290 return false;
291 /* The inductions must be regular, loop invariant step and initial
292 value. */
293 if (!expr_invariant_in_loop_p (outer, iv.step)
294 || !expr_invariant_in_loop_p (outer, iv.base))
295 return false;
296 /* XXX With more effort we could also be able to deal with inductions
297 where the initial value is loop variant but a simple IV in the
298 outer loop. The initial value for the second body would be
299 the original initial value plus iv.base.step. The next value
300 for the fused loop would be the original next value of the first
301 copy, _not_ the next value of the second body. */
302 }
303
304 return true;
305}
306
307/* Fuse LOOP with all further neighbors. The loops are expected to
308 be in appropriate form. */
309
310static void
311fuse_loops (class loop *loop)
312{
313 class loop *next = loop->next;
314
315 while (next)
316 {
317 edge e;
318
319 remove_branch (single_pred_edge (bb: loop->latch));
320 /* Make delete_basic_block not fiddle with the loop structure. */
321 basic_block oldlatch = loop->latch;
322 loop->latch = NULL;
323 delete_basic_block (oldlatch);
324 e = redirect_edge_and_branch (loop_latch_edge (next),
325 loop->header);
326 loop->latch = e->src;
327 flush_pending_stmts (e);
328
329 gcc_assert (EDGE_COUNT (next->header->preds) == 1);
330
331 /* The PHI nodes of the second body (single-argument now)
332 need adjustments to use the right values: either directly
333 the value of the corresponding PHI in the first copy or
334 the one leaving the first body which unrolling did for us.
335
336 See also unroll_jam_possible_p() for further possibilities. */
337 gphi_iterator psi_first, psi_second;
338 e = single_pred_edge (bb: next->header);
339 for (psi_first = gsi_start_phis (loop->header),
340 psi_second = gsi_start_phis (next->header);
341 !gsi_end_p (i: psi_first);
342 gsi_next (i: &psi_first), gsi_next (i: &psi_second))
343 {
344 gphi *phi_first = psi_first.phi ();
345 gphi *phi_second = psi_second.phi ();
346 tree firstop = gimple_phi_result (gs: phi_first);
347 /* The virtual operand is correct already as it's
348 always live at exit, hence has a LCSSA node and outer
349 loop unrolling updated SSA form. */
350 if (virtual_operand_p (op: firstop))
351 continue;
352
353 /* Due to unroll_jam_possible_p() we know that this is
354 an induction. The second body goes over the same
355 iteration space. */
356 add_phi_arg (phi_second, firstop, e,
357 gimple_location (g: phi_first));
358 }
359 gcc_assert (gsi_end_p (psi_second));
360
361 merge_loop_tree (loop, old: next);
362 gcc_assert (!next->num_nodes);
363 class loop *ln = next->next;
364 delete_loop (next);
365 next = ln;
366 }
367}
368
369/* Return true if any of the access functions for dataref A
370 isn't invariant with respect to loop LOOP_NEST. */
371static bool
372any_access_function_variant_p (const struct data_reference *a,
373 const class loop *loop_nest)
374{
375 vec<tree> fns = DR_ACCESS_FNS (a);
376
377 for (tree t : fns)
378 if (!evolution_function_is_invariant_p (t, loop_nest->num))
379 return true;
380
381 return false;
382}
383
384/* Returns true if the distance in DDR can be determined and adjusts
385 the unroll factor in *UNROLL to make unrolling valid for that distance.
386 Otherwise return false. DDR is with respect to the outer loop of INNER.
387
388 If this data dep can lead to a removed memory reference, increment
389 *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
390 for this to happen. */
391
392static bool
393adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
394 unsigned *unroll, unsigned *profit_unroll,
395 unsigned *removed)
396{
397 bool ret = false;
398 if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
399 {
400 if (DDR_NUM_DIST_VECTS (ddr) == 0)
401 return false;
402 unsigned i;
403 lambda_vector dist_v;
404 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
405 {
406 /* A distance (a,b) is at worst transformed into (a/N,b) by the
407 unrolling (factor N), so the transformation is valid if
408 a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
409 factor needs to be limited so that the first condition holds.
410 That may limit the factor down to zero in the worst case. */
411 lambda_int dist = dist_v[0];
412 if (dist < 0)
413 gcc_unreachable ();
414 else if (dist >= (lambda_int)*unroll)
415 ;
416 else if (lambda_vector_zerop (vec1: dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
417 {
418 /* We have (a,0) with a < N, so this will be transformed into
419 (0,0) after unrolling by N. This might potentially be a
420 problem, if it's not a read-read dependency. */
421 if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
422 ;
423 else
424 {
425 /* So, at least one is a write, and we might reduce the
426 distance vector to (0,0). This is still no problem
427 if both data-refs are affine with respect to the inner
428 loops. But if one of them is invariant with respect
429 to an inner loop our reordering implicit in loop fusion
430 corrupts the program, as our data dependences don't
431 capture this. E.g. for:
432 for (0 <= i < n)
433 for (0 <= j < m)
434 a[i][0] = a[i+1][0] + 2; // (1)
435 b[i][j] = b[i+1][j] + 2; // (2)
436 the distance vector for both statements is (-1,0),
437 but exchanging the order for (2) is okay, while
438 for (1) it is not. To see this, write out the original
439 accesses (assume m is 2):
440 a i j original
441 0 0 0 r a[1][0] b[1][0]
442 1 0 0 w a[0][0] b[0][0]
443 2 0 1 r a[1][0] b[1][1]
444 3 0 1 w a[0][0] b[0][1]
445 4 1 0 r a[2][0] b[2][0]
446 5 1 0 w a[1][0] b[1][0]
447 after unroll-by-2 and fusion the accesses are done in
448 this order (from column a): 0,1, 4,5, 2,3, i.e. this:
449 a i j transformed
450 0 0 0 r a[1][0] b[1][0]
451 1 0 0 w a[0][0] b[0][0]
452 4 1 0 r a[2][0] b[2][0]
453 5 1 0 w a[1][0] b[1][0]
454 2 0 1 r a[1][0] b[1][1]
455 3 0 1 w a[0][0] b[0][1]
456 Note how access 2 accesses the same element as access 5
457 for array 'a' but not for array 'b'. */
458 if (any_access_function_variant_p (DDR_A (ddr), loop_nest: inner)
459 && any_access_function_variant_p (DDR_B (ddr), loop_nest: inner))
460 ;
461 else
462 /* And if any dataref of this pair is invariant with
463 respect to the inner loop, we have no chance than
464 to reduce the unroll factor. */
465 *unroll = dist;
466 }
467 }
468 else if (lambda_vector_lexico_pos (v: dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
469 ;
470 else
471 *unroll = dist;
472
473 /* With a distance (a,0) it's always profitable to unroll-and-jam
474 (by a+1), because one memory reference will go away. With
475 (a,b) and b != 0 that's less clear. We will increase the
476 number of streams without lowering the number of mem refs.
477 So for now only handle the first situation. */
478 if (lambda_vector_zerop (vec1: dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
479 {
480 *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
481 (*removed)++;
482 }
483
484 ret = true;
485 }
486 }
487 return ret;
488}
489
490/* Main entry point for the unroll-and-jam transformation
491 described above. */
492
493static unsigned int
494tree_loop_unroll_and_jam (void)
495{
496 unsigned int todo = 0;
497
498 gcc_assert (scev_initialized_p ());
499
500 /* Go through all innermost loops. */
501 for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
502 {
503 class loop *outer = loop_outer (loop);
504
505 if (loop_depth (loop) < 2
506 || optimize_loop_nest_for_size_p (outer))
507 continue;
508
509 if (!unroll_jam_possible_p (outer, loop))
510 continue;
511
512 vec<data_reference_p> datarefs = vNULL;
513 vec<ddr_p> dependences = vNULL;
514 unsigned unroll_factor, profit_unroll, removed;
515 class tree_niter_desc desc;
516 bool unroll = false;
517
518 auto_vec<loop_p, 3> loop_nest;
519 if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
520 &datarefs, &dependences))
521 {
522 if (dump_file && (dump_flags & TDF_DETAILS))
523 fprintf (stream: dump_file, format: "Cannot analyze data dependencies\n");
524 free_data_refs (datarefs);
525 free_dependence_relations (dependences);
526 continue;
527 }
528 if (!datarefs.length ())
529 continue;
530
531 if (dump_file && (dump_flags & TDF_DETAILS))
532 dump_data_dependence_relations (dump_file, dependences);
533
534 unroll_factor = (unsigned)-1;
535 profit_unroll = 1;
536 removed = 0;
537
538 /* Check all dependencies. */
539 unsigned i;
540 struct data_dependence_relation *ddr;
541 FOR_EACH_VEC_ELT (dependences, i, ddr)
542 {
543 struct data_reference *dra, *drb;
544
545 /* If the refs are independend there's nothing to do. */
546 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
547 continue;
548
549 dra = DDR_A (ddr);
550 drb = DDR_B (ddr);
551
552 /* Nothing interesting for the self dependencies, except for WAW if
553 the access function is not affine or constant because we may end
554 up reordering writes to the same location. */
555 if (dra == drb)
556 {
557 if (DR_IS_WRITE (dra)
558 && !DR_ACCESS_FNS (dra).is_empty ()
559 && DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
560 {
561 unroll_factor = 0;
562 break;
563 }
564 else
565 continue;
566 }
567
568 /* Now check the distance vector, for determining a sensible
569 outer unroll factor, and for validity of merging the inner
570 loop copies. */
571 if (!adjust_unroll_factor (inner: loop, ddr, unroll: &unroll_factor, profit_unroll: &profit_unroll,
572 removed: &removed))
573 {
574 /* Couldn't get the distance vector. For two reads that's
575 harmless (we assume we should unroll). For at least
576 one write this means we can't check the dependence direction
577 and hence can't determine safety. */
578
579 if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
580 {
581 unroll_factor = 0;
582 break;
583 }
584 }
585 }
586
587 /* We regard a user-specified minimum percentage of zero as a request
588 to ignore all profitability concerns and apply the transformation
589 always. */
590 if (!param_unroll_jam_min_percent)
591 profit_unroll = MAX(2, profit_unroll);
592 else if (removed * 100 / datarefs.length ()
593 < (unsigned)param_unroll_jam_min_percent)
594 profit_unroll = 1;
595 if (unroll_factor > profit_unroll)
596 unroll_factor = profit_unroll;
597 if (unroll_factor > (unsigned)param_unroll_jam_max_unroll)
598 unroll_factor = param_unroll_jam_max_unroll;
599 unroll = (unroll_factor > 1
600 && can_unroll_loop_p (loop: outer, factor: unroll_factor, niter: &desc));
601
602 if (unroll)
603 {
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
606 find_loop_location (outer),
607 "applying unroll and jam with factor %d\n",
608 unroll_factor);
609 initialize_original_copy_tables ();
610 tree_unroll_loop (outer, unroll_factor, &desc);
611 free_original_copy_tables ();
612 fuse_loops (loop: outer->inner);
613 todo |= TODO_cleanup_cfg;
614
615 auto_bitmap exit_bbs;
616 bitmap_set_bit (exit_bbs, single_exit (outer)->dest->index);
617 todo |= do_rpo_vn (cfun, loop_preheader_edge (outer), exit_bbs);
618 }
619
620 loop_nest.release ();
621 free_dependence_relations (dependences);
622 free_data_refs (datarefs);
623 }
624
625 if (todo)
626 {
627 free_dominance_info (CDI_DOMINATORS);
628 /* We need to cleanup the CFG first since otherwise SSA form can
629 be not up-to-date from the VN run. */
630 if (todo & TODO_cleanup_cfg)
631 {
632 cleanup_tree_cfg ();
633 todo &= ~TODO_cleanup_cfg;
634 }
635 rewrite_into_loop_closed_ssa (NULL, 0);
636 scev_reset ();
637 }
638 return todo;
639}
640
641/* Pass boilerplate */
642
643namespace {
644
645const pass_data pass_data_loop_jam =
646{
647 .type: GIMPLE_PASS, /* type */
648 .name: "unrolljam", /* name */
649 .optinfo_flags: OPTGROUP_LOOP, /* optinfo_flags */
650 .tv_id: TV_LOOP_JAM, /* tv_id */
651 PROP_cfg, /* properties_required */
652 .properties_provided: 0, /* properties_provided */
653 .properties_destroyed: 0, /* properties_destroyed */
654 .todo_flags_start: 0, /* todo_flags_start */
655 .todo_flags_finish: 0, /* todo_flags_finish */
656};
657
658class pass_loop_jam : public gimple_opt_pass
659{
660public:
661 pass_loop_jam (gcc::context *ctxt)
662 : gimple_opt_pass (pass_data_loop_jam, ctxt)
663 {}
664
665 /* opt_pass methods: */
666 bool gate (function *) final override { return flag_unroll_jam != 0; }
667 unsigned int execute (function *) final override;
668
669};
670
671unsigned int
672pass_loop_jam::execute (function *fun)
673{
674 if (number_of_loops (fn: fun) <= 1)
675 return 0;
676
677 return tree_loop_unroll_and_jam ();
678}
679
680}
681
682gimple_opt_pass *
683make_pass_loop_jam (gcc::context *ctxt)
684{
685 return new pass_loop_jam (ctxt);
686}
687

source code of gcc/gimple-loop-jam.cc