/* Natural loop analysis code for GNU compiler.
   Copyright (C) 2002-2023 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "predict.h"
#include "memmodel.h"
#include "emit-rtl.h"
#include "cfgloop.h"
#include "explow.h"
#include "expr.h"
#include "graphds.h"
#include "sreal.h"
#include "regs.h"
#include "function-abi.h"

struct target_cfgloop default_target_cfgloop;
#if SWITCHABLE_TARGET
struct target_cfgloop *this_target_cfgloop = &default_target_cfgloop;
#endif

/* Checks whether BB is executed exactly once in each LOOP iteration.  */

bool
just_once_each_iteration_p (const class loop *loop, const_basic_block bb)
{
  /* It must be executed at least once each iteration.  */
  if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
    return false;

  /* And just once.  */
  if (bb->loop_father != loop)
    return false;

  /* But this was not enough.  We might have some irreducible loop here.  */
  if (bb->flags & BB_IRREDUCIBLE_LOOP)
    return false;

  return true;
}
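
/* For example (added commentary, illustrative): in

     for (;;) { if (cond) foo (); bar (); }

   the block containing bar () dominates the latch and so runs exactly once
   per iteration, while the block containing foo () does not dominate the
   latch and just_once_each_iteration_p returns false for it.  */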

/* Marks blocks and edges that are part of non-recognized loops; i.e. we
   throw away all latch edges and mark blocks inside any remaining cycle.
   Everything is a bit complicated due to the fact that we do not want to do
   this for parts of cycles that only "pass" through some loop -- i.e. for
   each cycle, we want to mark blocks that belong directly to the innermost
   loop containing the whole cycle.

   LOOPS is the loop tree.  */

#define LOOP_REPR(LOOP) ((LOOP)->num + last_basic_block_for_fn (cfun))
#define BB_REPR(BB) ((BB)->index + 1)

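/* A note on the representation (added commentary, implied by the macros
   above): the auxiliary graph has one vertex per basic block (BB_REPR) and
   one extra vertex per loop (LOOP_REPR).  Edges entering a recognized loop
   header are redirected to the loop's representative, and latch edges are
   dropped entirely, so cycles fully explained by recognized natural loops
   disappear; any cycle that survives indicates an irreducible region.  */
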
bool
mark_irreducible_loops (void)
{
  basic_block act;
  struct graph_edge *ge;
  edge e;
  edge_iterator ei;
  int src, dest;
  unsigned depth;
  struct graph *g;
  int num = number_of_loops (cfun);
  class loop *cloop;
  bool irred_loop_found = false;
  int i;

  gcc_assert (current_loops != NULL);

  /* Reset the flags.  */
  FOR_BB_BETWEEN (act, ENTRY_BLOCK_PTR_FOR_FN (cfun),
                  EXIT_BLOCK_PTR_FOR_FN (cfun), next_bb)
    {
      act->flags &= ~BB_IRREDUCIBLE_LOOP;
      FOR_EACH_EDGE (e, ei, act->succs)
        e->flags &= ~EDGE_IRREDUCIBLE_LOOP;
    }

  /* Create the edge lists.  */
  g = new_graph (last_basic_block_for_fn (cfun) + num);

  FOR_BB_BETWEEN (act, ENTRY_BLOCK_PTR_FOR_FN (cfun),
                  EXIT_BLOCK_PTR_FOR_FN (cfun), next_bb)
    FOR_EACH_EDGE (e, ei, act->succs)
      {
        /* Ignore edges to exit.  */
        if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
          continue;

        src = BB_REPR (act);
        dest = BB_REPR (e->dest);

        /* Ignore latch edges.  */
        if (e->dest->loop_father->header == e->dest
            && dominated_by_p (CDI_DOMINATORS, act, e->dest))
          continue;

        /* Edges inside a single loop should be left where they are.  Edges
           to subloop headers should lead to the representative of the
           subloop, but from the same place.

           Edges exiting loops should lead from the representative of the
           son of the nearest common ancestor of the loops in which ACT
           lies.  */

        if (e->dest->loop_father->header == e->dest)
          dest = LOOP_REPR (e->dest->loop_father);

        if (!flow_bb_inside_loop_p (act->loop_father, e->dest))
          {
            depth = 1 + loop_depth (find_common_loop (act->loop_father,
                                                      e->dest->loop_father));
            if (depth == loop_depth (act->loop_father))
              cloop = act->loop_father;
            else
              cloop = (*act->loop_father->superloops)[depth];

            src = LOOP_REPR (cloop);
          }

        add_edge (g, src, dest)->data = e;
      }

  /* Find the strongly connected components.  */
  graphds_scc (g, NULL);

  /* Mark the irreducible loops.  */
  for (i = 0; i < g->n_vertices; i++)
    for (ge = g->vertices[i].succ; ge; ge = ge->succ_next)
      {
        edge real = (edge) ge->data;
        /* edge E in graph G is irreducible if it connects two vertices in
           the same scc.  */

        /* All edges should lead from a component with higher number to the
           one with lower one.  */
        gcc_assert (g->vertices[ge->src].component
                    >= g->vertices[ge->dest].component);

        if (g->vertices[ge->src].component
            != g->vertices[ge->dest].component)
          continue;

        real->flags |= EDGE_IRREDUCIBLE_LOOP;
        irred_loop_found = true;
        if (flow_bb_inside_loop_p (real->src->loop_father, real->dest))
          real->src->flags |= BB_IRREDUCIBLE_LOOP;
      }

  free_graph (g);

  loops_state_set (LOOPS_HAVE_MARKED_IRREDUCIBLE_REGIONS);
  return irred_loop_found;
}
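
/* An illustrative example (added commentary, not from the original
   sources): in a CFG

     A -> B;  A -> C;  B -> C;  C -> B

   the cycle B <-> C has two entries, so neither B nor C heads a natural
   loop and no latch edge is removed.  The SCC pass above then puts B and C
   into one component, both edges between them receive
   EDGE_IRREDUCIBLE_LOOP, and B and C themselves get BB_IRREDUCIBLE_LOOP.  */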

/* Counts number of insns inside LOOP.  */
int
num_loop_insns (const class loop *loop)
{
  basic_block *bbs, bb;
  unsigned i, ninsns = 0;
  rtx_insn *insn;

  bbs = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = bbs[i];
      FOR_BB_INSNS (bb, insn)
        if (NONDEBUG_INSN_P (insn))
          ninsns++;
    }
  free (bbs);

  if (!ninsns)
    ninsns = 1; /* To avoid division by zero.  */

  return ninsns;
}

/* Counts number of insns executed on average per iteration of LOOP.  */
int
average_num_loop_insns (const class loop *loop)
{
  basic_block *bbs, bb;
  unsigned i, binsns;
  sreal ninsns;
  rtx_insn *insn;

  ninsns = 0;
  bbs = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = bbs[i];

      binsns = 0;
      FOR_BB_INSNS (bb, insn)
        if (NONDEBUG_INSN_P (insn))
          binsns++;

      ninsns += (sreal) binsns * bb->count.to_sreal_scale (loop->header->count);
      /* Avoid overflows.  */
      if (ninsns > 1000000)
        {
          free (bbs);
          return 1000000;
        }
    }
  free (bbs);

  int64_t ret = ninsns.to_int ();
  if (!ret)
    ret = 1; /* To avoid division by zero.  */

  return ret;
}
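
/* Illustration of the weighting above (added commentary): if the loop
   header has profile count 1000 and a conditionally executed block in the
   body has count 250, to_sreal_scale yields 0.25 and that block contributes
   a quarter of its insns to the per-iteration average.  */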

/* Compute how many times loop is entered.  */

profile_count
loop_count_in (const class loop *loop)
{
  /* Compute number of invocations of the loop.  */
  profile_count count_in = profile_count::zero ();
  edge e;
  edge_iterator ei;
  bool found_latch = false;

  if (loops_state_satisfies_p (LOOPS_MAY_HAVE_MULTIPLE_LATCHES))
    FOR_EACH_EDGE (e, ei, loop->header->preds)
      if (!flow_bb_inside_loop_p (loop, e->src))
        count_in += e->count ();
      else
        found_latch = true;
  else
    FOR_EACH_EDGE (e, ei, loop->header->preds)
      if (e->src != loop->latch)
        count_in += e->count ();
      else
        found_latch = true;
  gcc_checking_assert (found_latch);
  return count_in;
}

/* Return true if BB profile can be used to determine the expected number of
   iterations (that is, the number of executions of latch edge(s) for each
   entry of the loop).  If this is the case, initialize RET with the number
   of iterations.

   RELIABLE is set if the profile indicates that the returned value should
   be a realistic estimate.  (This is the case if we read the profile and
   have not messed it up yet, and not the case for guessed profiles.)

   This function uses only the CFG profile.  We track more reliable info in
   the loop_info structure, and for loop optimization heuristics the
   get_estimated_loop_iterations API is more relevant.  */

bool
expected_loop_iterations_by_profile (const class loop *loop, sreal *ret,
                                     bool *reliable)
{
  profile_count header_count = loop->header->count;
  if (reliable)
    *reliable = false;

  /* TODO: For single exit loops we can use the loop exit edge probability.
     It may also remain reliable after the loop itself was adjusted.  */
  if (!header_count.initialized_p ()
      || !header_count.nonzero_p ())
    return false;

  profile_count count_in = loop_count_in (loop);

  bool known;
  /* Number of iterations is number of executions of latch edge.  */
  *ret = (header_count - count_in).to_sreal_scale (count_in, &known);
  if (!known)
    return false;
  if (reliable)
    {
      /* Header should have at least count_in many executions.
         Give up on clearly inconsistent profile.  */
      if (header_count < count_in && header_count.differs_from_p (count_in))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file, "Inconsistent bb profile of loop %i\n",
                     loop->num);
          *reliable = false;
        }
      else
        *reliable = count_in.reliable_p () && header_count.reliable_p ();
    }
  return true;
}
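
/* Worked example (added commentary): with a header count of 1000 and an
   entry count of 100, the latch edge(s) execute 1000 - 100 = 900 times,
   i.e. on average 900 / 100 = 9 iterations per entry of the loop.  */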

/* Return true if loop CFG profile may be unrealistically flat.
   This is a common case, since average loops iterate only about 5 times.
   In case we do not have profile feedback or do not know the real number of
   iterations during profile estimation, we are likely going to predict it
   with a similarly low iteration count.  For static loop profiles we also
   artificially cap profiles of loops with known large iteration counts so
   they do not appear significantly more hot than other loops with unknown
   iteration counts.

   For loop optimization heuristics we ignore the CFG profile and instead
   use the get_estimated_loop_iterations API, which returns an estimate
   only when it is realistic.  For unknown counts some optimizations,
   like the vectorizer or unroller, guess that the iteration count will
   be large.  In this case we need to avoid scaling down the profile
   after the loop transform.  */

bool
maybe_flat_loop_profile (const class loop *loop)
{
  bool reliable;
  sreal ret;

  if (!expected_loop_iterations_by_profile (loop, &ret, &reliable))
    return true;

  /* Reliable CFG estimates ought never be flat.  Sanity check with
     nb_iterations_estimate.  If those differ, it is a bug in the profile
     updating code.  */
  if (reliable)
    {
      int64_t intret = ret.to_nearest_int ();
      if (loop->any_estimate
          && (wi::ltu_p (intret * 2, loop->nb_iterations_estimate)
              || wi::gtu_p (intret, loop->nb_iterations_estimate * 2)))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file,
                     "Loop %i has inconsistent iterations estimates: "
                     "reliable CFG based iteration estimate is %f "
                     "while nb_iterations_estimate is %i\n",
                     loop->num,
                     ret.to_double (),
                     (int) loop->nb_iterations_estimate.to_shwi ());
          return true;
        }
      return false;
    }

  /* Allow some margin of error and see if we are close to known bounds.
     sreal (9, -3) is 9/8.  */
  int64_t intret = (ret * sreal (9, -3)).to_nearest_int ();
  if (loop->any_upper_bound
      && wi::geu_p (intret, loop->nb_iterations_upper_bound))
    return false;
  if (loop->any_likely_upper_bound
      && wi::geu_p (intret, loop->nb_iterations_likely_upper_bound))
    return false;
  if (loop->any_estimate
      && wi::geu_p (intret, loop->nb_iterations_estimate))
    return false;
  return true;
}

/* Returns expected number of iterations of LOOP, according to
   measured or guessed profile.

   This function attempts to return a "sane" value even if the profile
   information is not good enough to derive something.  */

gcov_type
expected_loop_iterations_unbounded (const class loop *loop,
                                    bool *read_profile_p)
{
  gcov_type expected = -1;

  if (read_profile_p)
    *read_profile_p = false;

  sreal sreal_expected;
  if (expected_loop_iterations_by_profile (loop, &sreal_expected,
                                           read_profile_p))
    expected = sreal_expected.to_nearest_int ();
  else
    expected = param_avg_loop_niter;

  HOST_WIDE_INT max = get_max_loop_iterations_int (loop);
  if (max != -1 && max < expected)
    return max;

  return expected;
}
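
/* Note (added commentary): when the CFG profile cannot be used, the
   fallback above is the --param=avg-loop-niter value, and in either case
   the result is capped by the maximum iteration bound when one is
   known.  */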

/* Returns expected number of LOOP iterations.  The returned value is
   bounded by REG_BR_PROB_BASE.  */

unsigned
expected_loop_iterations (class loop *loop)
{
  gcov_type expected = expected_loop_iterations_unbounded (loop);
  return (expected > REG_BR_PROB_BASE ? REG_BR_PROB_BASE : expected);
}

/* Returns the maximum level of nesting of subloops of LOOP.  */

unsigned
get_loop_level (const class loop *loop)
{
  const class loop *ploop;
  unsigned mx = 0, l;

  for (ploop = loop->inner; ploop; ploop = ploop->next)
    {
      l = get_loop_level (ploop);
      if (l >= mx)
        mx = l + 1;
    }
  return mx;
}

/* Initialize the constants for computing set costs.  */

void
init_set_costs (void)
{
  int speed;
  rtx_insn *seq;
  rtx reg1 = gen_raw_REG (SImode, LAST_VIRTUAL_REGISTER + 1);
  rtx reg2 = gen_raw_REG (SImode, LAST_VIRTUAL_REGISTER + 2);
  rtx addr = gen_raw_REG (Pmode, LAST_VIRTUAL_REGISTER + 3);
  rtx mem = validize_mem (gen_rtx_MEM (SImode, addr));
  unsigned i;

  target_avail_regs = 0;
  target_clobbered_regs = 0;
  for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
    if (TEST_HARD_REG_BIT (reg_class_contents[GENERAL_REGS], i)
        && !fixed_regs[i])
      {
        target_avail_regs++;
        /* ??? This is only a rough heuristic.  It doesn't cope well
           with alternative ABIs, but that's an optimization rather than
           correctness issue.  */
        if (default_function_abi.clobbers_full_reg_p (i))
          target_clobbered_regs++;
      }

  target_res_regs = 3;

  for (speed = 0; speed < 2; speed++)
    {
      crtl->maybe_hot_insn_p = speed;
      /* Set up the costs for using extra registers:

         1) If not many free registers remain, we should prefer having an
            additional move to decreasing the number of available registers.
            (TARGET_REG_COST).
         2) If no registers are available, we need to spill, which may
            require storing the old value to memory and loading it back
            (TARGET_SPILL_COST).  */

      start_sequence ();
      emit_move_insn (reg1, reg2);
      seq = get_insns ();
      end_sequence ();
      target_reg_cost [speed] = seq_cost (seq, speed);

      start_sequence ();
      emit_move_insn (mem, reg1);
      emit_move_insn (reg2, mem);
      seq = get_insns ();
      end_sequence ();
      target_spill_cost [speed] = seq_cost (seq, speed);
    }
  default_rtl_profile ();
}
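
/* Note (added commentary): the two sequences generated above -- a
   register-to-register move, and a store followed by a reload -- are
   costed once with speed == 0 (size oriented) and once with speed == 1
   (speed oriented), yielding the target_reg_cost and target_spill_cost
   constants consumed by estimate_reg_pressure_cost below.  */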

/* Estimates cost of increased register pressure caused by making N_NEW new
   registers live around the loop.  N_OLD is the number of registers live
   around the loop.  If CALL_P is true, also take into account that
   call-used registers may be clobbered in the loop body, reducing the
   number of available registers before we spill.  */

unsigned
estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed,
                            bool call_p)
{
  unsigned cost;
  unsigned regs_needed = n_new + n_old;
  unsigned available_regs = target_avail_regs;

  /* If there is a call in the loop body, the call-clobbered registers
     are not available for loop invariants.  */
  if (call_p)
    available_regs = available_regs - target_clobbered_regs;

  /* If we have enough registers, we should use them and not restrict
     the transformations unnecessarily.  */
  if (regs_needed + target_res_regs <= available_regs)
    return 0;

  if (regs_needed <= available_regs)
    /* If we are close to running out of registers, try to preserve
       them.  */
    cost = target_reg_cost [speed] * n_new;
  else
    /* If we run out of registers, it is very expensive to add another
       one.  */
    cost = target_spill_cost [speed] * n_new;

  if (optimize && (flag_ira_region == IRA_REGION_ALL
                   || flag_ira_region == IRA_REGION_MIXED)
      && number_of_loops (cfun) <= (unsigned) param_ira_max_loops_num)
    /* IRA regional allocation deals with high register pressure
       better.  So decrease the cost (to make the cost calculation more
       accurate for IRA, we would need to know how many registers live
       through the loop transparently).  */
    cost /= 2;

  return cost;
}
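
/* Worked example (added commentary, numbers hypothetical): with
   target_avail_regs == 14, target_res_regs == 3, n_old == 10 and
   n_new == 3, regs_needed is 13.  Since 13 + 3 > 14 the function does not
   return 0, but 13 <= 14, so the cheaper per-register cost applies and
   the result is target_reg_cost[speed] * 3 (possibly halved when IRA
   regional allocation is in effect).  */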

/* Sets EDGE_LOOP_EXIT flag for all loop exits.  */

void
mark_loop_exit_edges (void)
{
  basic_block bb;
  edge e;

  if (number_of_loops (cfun) <= 1)
    return;

  FOR_EACH_BB_FN (bb, cfun)
    {
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, bb->succs)
        {
          if (loop_outer (bb->loop_father)
              && loop_exit_edge_p (bb->loop_father, e))
            e->flags |= EDGE_LOOP_EXIT;
          else
            e->flags &= ~EDGE_LOOP_EXIT;
        }
    }
}

/* Return the exit edge if LOOP has only one exit that is likely
   to be executed at runtime (i.e. it is not an EH edge or an edge
   leading to a noreturn call).  */

edge
single_likely_exit (class loop *loop, const vec<edge> &exits)
{
  edge found = single_exit (loop);
  unsigned i;
  edge ex;

  if (found)
    return found;
  FOR_EACH_VEC_ELT (exits, i, ex)
    {
      if (probably_never_executed_edge_p (cfun, ex)
          /* We want to rule out paths to noreturns but not low
             probabilities resulting from adjustments or combining.
             FIXME: once we have better quality tracking, make this more
             robust.  */
          || ex->probability <= profile_probability::very_unlikely ())
        continue;
      if (!found)
        found = ex;
      else
        return NULL;
    }
  return found;
}
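
/* For example (added commentary): in a loop with two exits, one of which
   only leads to a call to abort (), the edge toward abort () is predicted
   never executed and skipped by the filter above, so the remaining exit is
   returned as the single likely one.  */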

/* Returns the likely-hot path through LOOP: starting from the header, it
   repeatedly follows the most probable successor edge that stays within
   the loop and leads to a block not visited yet, stopping when no such
   edge exists or when the best edge returns to the header.  */

auto_vec<basic_block>
get_loop_hot_path (const class loop *loop)
{
  basic_block bb = loop->header;
  auto_vec<basic_block> path;
  bitmap visited = BITMAP_ALLOC (NULL);

  while (true)
    {
      edge_iterator ei;
      edge e;
      edge best = NULL;

      path.safe_push (bb);
      bitmap_set_bit (visited, bb->index);
      FOR_EACH_EDGE (e, ei, bb->succs)
        if ((!best || e->probability > best->probability)
            && !loop_exit_edge_p (loop, e)
            && !bitmap_bit_p (visited, e->dest->index))
          best = e;
      if (!best || best->dest == loop->header)
        break;
      bb = best->dest;
    }
  BITMAP_FREE (visited);
  return path;
}