1// Pass to fuse adjacent loads/stores into paired memory accesses.
2// Copyright (C) 2023-2025 Free Software Foundation, Inc.
3//
4// This file is part of GCC.
5//
6// GCC is free software; you can redistribute it and/or modify it
7// under the terms of the GNU General Public License as published by
8// the Free Software Foundation; either version 3, or (at your option)
9// any later version.
10//
11// GCC is distributed in the hope that it will be useful, but
12// WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14// General Public License for more details.
15//
16// You should have received a copy of the GNU General Public License
17// along with GCC; see the file COPYING3. If not see
18// <http://www.gnu.org/licenses/>.
19
20#define INCLUDE_ALGORITHM
21#define INCLUDE_FUNCTIONAL
22#define INCLUDE_LIST
23#define INCLUDE_TYPE_TRAITS
24#define INCLUDE_ARRAY
25#include "config.h"
26#include "system.h"
27#include "coretypes.h"
28#include "backend.h"
29#include "rtl.h"
30#include "df.h"
31#include "rtl-iter.h"
32#include "rtl-ssa.h"
33#include "cfgcleanup.h"
34#include "tree-pass.h"
35#include "ordered-hash-map.h"
36#include "tree-dfa.h"
37#include "fold-const.h"
38#include "tree-hash-traits.h"
39#include "print-tree.h"
40#include "pair-fusion.h"
41
42using namespace rtl_ssa;
43
44// We pack these fields (load_p, fpsimd_p, and size) into an integer
45// (LFS) which we use as part of the key into the main hash tables.
46//
47// The idea is that we group candidates together only if they agree on
48// the fields below. Candidates that disagree on any of these
49// properties shouldn't be merged together.
50struct lfs_fields
51{
52 bool load_p;
53 bool fpsimd_p;
54 unsigned size;
55};
56
57using insn_list_t = std::list<insn_info *>;
58
59// Information about the accesses at a given offset from a particular
60// base. Stored in an access_group, see below.
61struct access_record
62{
63 poly_int64 offset;
64 std::list<insn_info *> cand_insns;
65 std::list<access_record>::iterator place;
66
67 access_record (poly_int64 off) : offset (off) {}
68};
69
70// A group of accesses where adjacent accesses could be ldp/stp
71// candidates. The splay tree supports efficient insertion,
72// while the list supports efficient iteration.
73struct access_group
74{
75 splay_tree<access_record *> tree;
76 std::list<access_record> list;
77
78 template<typename Alloc>
79 inline void track (Alloc node_alloc, poly_int64 offset, insn_info *insn);
80};
81
82// Test if this base candidate is viable according to HAZARDS.
83bool
84base_cand::viable () const
85{
86 return !hazards[0] || !hazards[1] || (*hazards[0] > *hazards[1]);
87}
88
89// Information about an alternate base. For a def_info D, it may
90// instead be expressed as D = BASE + OFFSET.
91struct alt_base
92{
93 def_info *base;
94 poly_int64 offset;
95};
96
97// Virtual base class for load/store walkers used in alias analysis.
98struct alias_walker
99{
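  // Return true if the access(es) this walker currently points at may
  // alias the candidate pair.  BUDGET limits how much alias-analysis work
  // we are willing to do; walkers charge the work they perform against it.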
100 virtual bool conflict_p (int &budget) const = 0;
101 virtual insn_info *insn () const = 0;
102 virtual bool valid () const = 0;
103 virtual void advance () = 0;
104};
105
106
107pair_fusion::pair_fusion ()
108{
109 calculate_dominance_info (CDI_DOMINATORS);
110 df_analyze ();
111 crtl->ssa = new rtl_ssa::function_info (cfun);
112}
113
114pair_fusion::~pair_fusion ()
115{
116 if (crtl->ssa->perform_pending_updates ())
117 cleanup_cfg (0);
118
119 free_dominance_info (CDI_DOMINATORS);
120
121 delete crtl->ssa;
122 crtl->ssa = nullptr;
123}
124
125// This is the main function to start the pass.
126void
127pair_fusion::run ()
128{
129 if (!track_loads_p () && !track_stores_p ())
130 return;
131
132 for (auto bb : crtl->ssa->bbs ())
133 process_block (bb);
134}
135
136// State used by the pass for a given basic block.
137struct pair_fusion_bb_info
138{
139 using def_hash = nofree_ptr_hash<def_info>;
140 using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>;
141 using def_key_t = pair_hash<def_hash, int_hash<int, -1, -2>>;
142
143 // Map of <tree base, LFS> -> access_group.
144 ordered_hash_map<expr_key_t, access_group> expr_map;
145
146 // Map of <RTL-SSA def_info *, LFS> -> access_group.
147 ordered_hash_map<def_key_t, access_group> def_map;
148
149 // Given the def_info for an RTL base register, express it as an offset from
150 // some canonical base instead.
151 //
152 // Canonicalizing bases in this way allows us to identify adjacent accesses
153 // even if they see different base register defs.
154 hash_map<def_hash, alt_base> canon_base_map;
155
156 static const size_t obstack_alignment = sizeof (void *);
157
158 pair_fusion_bb_info (bb_info *bb, pair_fusion *d)
159 : m_bb (bb), m_pass (d), m_emitted_tombstone (false)
160 {
161 obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
162 obstack_alignment, obstack_chunk_alloc,
163 obstack_chunk_free);
164 }
165 ~pair_fusion_bb_info ()
166 {
167 obstack_free (&m_obstack, nullptr);
168
    if (m_emitted_tombstone)
      {
        bitmap_release (&m_tombstone_bitmap);
        bitmap_obstack_release (&m_bitmap_obstack);
      }
174 }
175
176 inline void track_access (insn_info *, bool load, rtx mem);
177 inline void transform ();
178 inline void cleanup_tombstones ();
179
180private:
181 obstack m_obstack;
182 bb_info *m_bb;
183 pair_fusion *m_pass;
184
185 // State for keeping track of tombstone insns emitted for this BB.
186 bitmap_obstack m_bitmap_obstack;
187 bitmap_head m_tombstone_bitmap;
188 bool m_emitted_tombstone;
189
190 inline splay_tree_node<access_record *> *node_alloc (access_record *);
191
192 template<typename Map>
193 inline void traverse_base_map (Map &map);
194 inline void transform_for_base (int load_size, access_group &group);
195
196 inline void merge_pairs (insn_list_t &, insn_list_t &,
197 bool load_p, unsigned access_size);
198
199 inline bool try_fuse_pair (bool load_p, unsigned access_size,
200 insn_info *i1, insn_info *i2);
201
202 inline bool fuse_pair (bool load_p, unsigned access_size,
203 int writeback,
204 insn_info *i1, insn_info *i2,
205 base_cand &base,
206 const insn_range_info &move_range);
207
208 inline void track_tombstone (int uid);
209
210 inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
211};
212
213splay_tree_node<access_record *> *
214pair_fusion_bb_info::node_alloc (access_record *access)
215{
216 using T = splay_tree_node<access_record *>;
217 void *addr = obstack_alloc (&m_obstack, sizeof (T));
218 return new (addr) T (access);
219}
220
221// Given a mem MEM, if the address has side effects, return a MEM that accesses
222// the same address but without the side effects. Otherwise, return
223// MEM unchanged.
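//
// For example, a 4-byte access (mem:SI (post_inc:DI (reg:DI r0))) becomes
// (mem:SI (reg:DI r0)), while (mem:SI (pre_inc:DI (reg:DI r0))) becomes
// (mem:SI (plus:DI (reg:DI r0) (const_int 4))), i.e. the address the access
// actually uses once the side effect is stripped.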
224static rtx
225drop_writeback (rtx mem)
226{
227 rtx addr = XEXP (mem, 0);
228
229 if (!side_effects_p (addr))
230 return mem;
231
232 switch (GET_CODE (addr))
233 {
234 case PRE_MODIFY:
235 addr = XEXP (addr, 1);
236 break;
237 case POST_MODIFY:
238 case POST_INC:
239 case POST_DEC:
240 addr = XEXP (addr, 0);
241 break;
242 case PRE_INC:
243 case PRE_DEC:
244 {
245 poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem));
246 if (GET_CODE (addr) == PRE_DEC)
247 adjustment *= -1;
248 addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment);
249 break;
250 }
251 default:
252 gcc_unreachable ();
253 }
254
255 return change_address (mem, GET_MODE (mem), addr);
256}
257
258// Convenience wrapper around strip_offset that can also look through
259// RTX_AUTOINC addresses. The interface is like strip_offset except we take a
260// MEM so that we know the mode of the access.
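//
// For example, (mem:DI (plus:DI (reg:DI r1) (const_int 16))) gives base r1
// and offset 16, while (mem:DI (post_inc:DI (reg:DI r1))) gives base r1 and
// offset 8 (the size of the access).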
261static rtx
262pair_mem_strip_offset (rtx mem, poly_int64 *offset)
263{
264 rtx addr = XEXP (mem, 0);
265
266 switch (GET_CODE (addr))
267 {
268 case PRE_MODIFY:
269 case POST_MODIFY:
270 addr = strip_offset (XEXP (addr, 1), offset);
271 gcc_checking_assert (REG_P (addr));
272 gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr));
273 break;
274 case PRE_INC:
275 case POST_INC:
276 addr = XEXP (addr, 0);
277 *offset = GET_MODE_SIZE (GET_MODE (mem));
278 gcc_checking_assert (REG_P (addr));
279 break;
280 case PRE_DEC:
281 case POST_DEC:
282 addr = XEXP (addr, 0);
283 *offset = -GET_MODE_SIZE (GET_MODE (mem));
284 gcc_checking_assert (REG_P (addr));
285 break;
286
287 default:
288 addr = strip_offset (addr, offset);
289 }
290
291 return addr;
292}
293
294// Return true if X is a PRE_{INC,DEC,MODIFY} rtx.
295static bool
296any_pre_modify_p (rtx x)
297{
298 const auto code = GET_CODE (x);
299 return code == PRE_INC || code == PRE_DEC || code == PRE_MODIFY;
300}
301
302// Return true if X is a POST_{INC,DEC,MODIFY} rtx.
303static bool
304any_post_modify_p (rtx x)
305{
306 const auto code = GET_CODE (x);
307 return code == POST_INC || code == POST_DEC || code == POST_MODIFY;
308}
309
// Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these
// into an integer for use as a hash table key.
static int
encode_lfs (lfs_fields fields)
{
  int size_log2 = exact_log2 (fields.size);
  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
  return ((int)fields.load_p << 3)
    | ((int)fields.fpsimd_p << 2)
    | (size_log2 - 2);
}

// Inverse of encode_lfs.
static lfs_fields
decode_lfs (int lfs)
{
  bool load_p = (lfs & (1 << 3));
  bool fpsimd_p = (lfs & (1 << 2));
  unsigned size = 1U << ((lfs & 3) + 2);
  return { load_p, fpsimd_p, size };
}
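
// For example, a 16-byte FP/SIMD load, i.e. lfs_fields { true, true, 16 },
// encodes to (1 << 3) | (1 << 2) | (log2 (16) - 2) == 0b1110, and
// decode_lfs (0b1110) recovers the same three fields.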

// Track the access INSN at offset OFFSET in this access group.
// ALLOC_NODE is used to allocate splay tree nodes.
template<typename Alloc>
void
access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
{
  auto insert_before = [&](std::list<access_record>::iterator after)
    {
      auto it = list.emplace (after, offset);
      it->cand_insns.push_back (insn);
      it->place = it;
      return &*it;
    };

  if (!list.size ())
    {
      auto access = insert_before (list.end ());
      tree.insert_max_node (alloc_node (access));
      return;
    }

  auto compare = [&](splay_tree_node<access_record *> *node)
    {
      return compare_sizes_for_sort (offset, node->value ()->offset);
    };
  auto result = tree.lookup (compare);
  splay_tree_node<access_record *> *node = tree.root ();
  if (result == 0)
    node->value ()->cand_insns.push_back (insn);
  else
    {
      auto it = node->value ()->place;
      auto after = (result > 0) ? std::next (it) : it;
      auto access = insert_before (after);
      tree.insert_child (node, result > 0, alloc_node (access));
    }
}
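
// Note that if two accesses have the same offset, the later insn is simply
// appended to the existing access_record's cand_insns list (the result == 0
// case above) rather than creating a new record.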

// Given a candidate access INSN (with mem MEM), see if it has a suitable
// MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
// LFS is used as part of the key to the hash table, see track_access.
bool
pair_fusion_bb_info::track_via_mem_expr (insn_info *insn, rtx mem,
					 lfs_fields lfs)
{
  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
    return false;

  poly_int64 offset;
  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
						  &offset);
  if (!base_expr || !DECL_P (base_expr))
    return false;

  offset += MEM_OFFSET (mem);

  const machine_mode mem_mode = GET_MODE (mem);
  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();

  // Punt on misaligned offsets.  Paired memory access instructions require
  // offsets to be a multiple of the access size, and we believe that
  // misaligned offsets on MEM_EXPR bases are likely to lead to misaligned
  // offsets w.r.t. RTL bases.
  if (!multiple_p (offset, mem_size))
    return false;

  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
  access_group &group = expr_map.get_or_insert (key, NULL);
  auto alloc = [&](access_record *access) { return node_alloc (access); };
  group.track (alloc, offset, insn);

  if (dump_file)
    {
      fprintf (dump_file, "[bb %u] tracking insn %d via ",
	       m_bb->index (), insn->uid ());
      print_node_brief (dump_file, "mem expr", base_expr, 0);
      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
	       lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
      print_dec (offset, dump_file);
      fprintf (dump_file, "]\n");
    }

  return true;
}

// Main function to begin pair discovery.  Given a memory access INSN,
// determine whether it could be a candidate for fusing into a paired
// access, and if so, track it in the appropriate data structure for
// this basic block.  LOAD_P is true if the access is a load, and MEM
// is the mem rtx that occurs in INSN.
void
pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
{
  // We can't combine volatile MEMs, so punt on these.
  if (MEM_VOLATILE_P (mem))
    return;

  // Ignore writeback accesses if the hook says to do so.
  if (!m_pass->should_handle_writeback (writeback_type::EXISTING)
      && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
    return;

  const machine_mode mem_mode = GET_MODE (mem);
  if (!m_pass->pair_operand_mode_ok_p (mem_mode))
    return;

  rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);

  if (!m_pass->pair_reg_operand_ok_p (load_p, reg_op, mem_mode))
    return;

  // We want to segregate FP/SIMD accesses from GPR accesses.
  const bool fpsimd_op_p = m_pass->fpsimd_op_p (reg_op, mem_mode, load_p);

  // Note pair_operand_mode_ok_p already rejected VL modes.
  const unsigned mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
  const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };

  if (track_via_mem_expr (insn, mem, lfs))
    return;

  poly_int64 mem_off;
  rtx addr = XEXP (mem, 0);
  const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
  rtx base = pair_mem_strip_offset (mem, &mem_off);
  if (!REG_P (base))
    return;

  // Need to calculate two (possibly different) offsets:
  // - Offset at which the access occurs.
  // - Offset of the new base def.
  poly_int64 access_off;
  if (autoinc_p && any_post_modify_p (addr))
    access_off = 0;
  else
    access_off = mem_off;

  poly_int64 new_def_off = mem_off;

  // Punt on accesses relative to eliminable regs.  Since we don't know the
  // elimination offset pre-RA, we should postpone forming pairs on such
  // accesses until after RA.
  //
  // As it stands, addresses in range for an individual load/store but not
  // for a paired access are currently reloaded inefficiently,
  // ending up with a separate base register for each pair.
  //
  // In theory LRA should make use of
  // targetm.legitimize_address_displacement to promote sharing of
  // bases among multiple (nearby) address reloads, but the current
  // LRA code returns early from process_address_1 for operands that
  // satisfy "m", even if they don't satisfy the real (relaxed) address
  // constraint; this early return means we never get to the code
  // that calls targetm.legitimize_address_displacement.
  //
  // So for now, it's better to punt when we can't be sure that the
  // offset is in range for paired access.  On aarch64, out-of-range cases
  // can then be handled after RA by the out-of-range LDP/STP peepholes.
  // Eventually, it would be nice to handle known out-of-range opportunities
  // in the pass itself (for stack accesses, this would be in the post-RA pass).
  if (!reload_completed
      && (REGNO (base) == FRAME_POINTER_REGNUM
	  || REGNO (base) == ARG_POINTER_REGNUM))
    return;

  // Now need to find def of base register.
  use_info *base_use = find_access (insn->uses (), REGNO (base));
  gcc_assert (base_use);
  def_info *base_def = base_use->def ();
  if (!base_def)
    {
      if (dump_file)
	fprintf (dump_file,
		 "base register (regno %d) of insn %d is undefined",
		 REGNO (base), insn->uid ());
      return;
    }

  alt_base *canon_base = canon_base_map.get (base_def);
  if (canon_base)
    {
      // Express this as the combined offset from the canonical base.
      base_def = canon_base->base;
      new_def_off += canon_base->offset;
      access_off += canon_base->offset;
    }

  if (autoinc_p)
    {
      auto def = find_access (insn->defs (), REGNO (base));
      gcc_assert (def);

      // Record that DEF = BASE_DEF + MEM_OFF.
      if (dump_file)
	{
	  pretty_printer pp;
	  pp_access (&pp, def, 0);
	  pp_string (&pp, " = ");
	  pp_access (&pp, base_def, 0);
	  fprintf (dump_file, "[bb %u] recording %s + ",
		   m_bb->index (), pp_formatted_text (&pp));
	  print_dec (new_def_off, dump_file);
	  fprintf (dump_file, "\n");
	}

      alt_base base_rec { base_def, new_def_off };
      if (canon_base_map.put (def, base_rec))
	gcc_unreachable (); // Base defs should be unique.
    }

  // Punt on misaligned offsets.  Paired memory accesses require offsets
  // to be a multiple of the access size.
  if (!multiple_p (mem_off, mem_size))
    return;

  const auto key = std::make_pair (base_def, encode_lfs (lfs));
  access_group &group = def_map.get_or_insert (key, NULL);
  auto alloc = [&](access_record *access) { return node_alloc (access); };
  group.track (alloc, access_off, insn);

  if (dump_file)
    {
      pretty_printer pp;
      pp_access (&pp, base_def, 0);

      fprintf (dump_file, "[bb %u] tracking insn %d via %s",
	       m_bb->index (), insn->uid (), pp_formatted_text (&pp));
      fprintf (dump_file,
	       " [L=%d, WB=%d, FP=%d, %smode, off=",
	       lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]);
      print_dec (access_off, dump_file);
      fprintf (dump_file, "]\n");
    }
}
566
567// Return the latest dataflow hazard before INSN.
568//
569// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for
570// dataflow purposes. This is needed when considering changing the RTL base of
571// an access discovered through a MEM_EXPR base.
572//
573// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising
574// from that insn.
575//
576// IS_LOAD_STORE is true if INSN is one of the loads or stores in the
577// candidate pair. We ignore any defs/uses of memory in such instructions
578// as we deal with that separately, making use of alias disambiguation.
579static insn_info *
580latest_hazard_before (insn_info *insn, rtx *ignore,
581 insn_info *ignore_insn = nullptr,
582 bool is_load_store = true)
583{
584 insn_info *result = nullptr;
585
586 // If the insn can throw then it is at the end of a BB and we can't
587 // move it, model this by recording a hazard in the previous insn
588 // which will prevent moving the insn up.
589 if (cfun->can_throw_non_call_exceptions
590 && find_reg_note (insn->rtl (), REG_EH_REGION, NULL_RTX))
591 return insn->prev_nondebug_insn ();
592
  if (!is_load_store
      && accesses_include_memory (insn->defs ()))
    return insn->prev_nondebug_insn ();
596
597 // Return true if we registered the hazard.
598 auto hazard = [&](insn_info *h) -> bool
599 {
600 gcc_checking_assert (*h < *insn);
601 if (h == ignore_insn)
602 return false;
603
604 if (!result || *h > *result)
605 result = h;
606
607 return true;
608 };
609
  rtx pat = PATTERN (insn->rtl ());
611 auto ignore_use = [&](use_info *u)
612 {
613 if (u->is_mem ())
614 return true;
615
616 return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore);
617 };
618
619 // Find defs of uses in INSN (RaW).
620 for (auto use : insn->uses ())
621 if (!ignore_use (use) && use->def ())
622 hazard (use->def ()->insn ());
623
624 // Find previous defs (WaW) or previous uses (WaR) of defs in INSN.
625 for (auto def : insn->defs ())
626 {
627 if (def->is_mem ())
628 continue;
629
630 if (def->prev_def ())
631 {
632 hazard (def->prev_def ()->insn ()); // WaW
633
	  auto set = dyn_cast<set_info *> (def->prev_def ());
635 if (set && set->has_nondebug_insn_uses ())
636 for (auto use : set->reverse_nondebug_insn_uses ())
637 if (use->insn () != insn && hazard (use->insn ())) // WaR
638 break;
639 }
640
641 if (!HARD_REGISTER_NUM_P (def->regno ()))
642 continue;
643
644 // Also need to check backwards for call clobbers (WaW).
645 for (auto call_group : def->ebb ()->call_clobbers ())
646 {
	if (!call_group->clobbers (def->resource ()))
648 continue;
649
	auto clobber_insn = prev_call_clobbers (*call_group, def->insn (),
						ignore_nothing ());
652 if (clobber_insn)
653 hazard (clobber_insn);
654 }
655
656 }
657
658 return result;
659}
660
661// Return the first dataflow hazard after INSN.
662//
663// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for
664// dataflow purposes. This is needed when considering changing the RTL base of
665// an access discovered through a MEM_EXPR base.
666//
667// N.B. we ignore any defs/uses of memory here as we deal with that separately,
668// making use of alias disambiguation.
669static insn_info *
670first_hazard_after (insn_info *insn, rtx *ignore)
671{
672 insn_info *result = nullptr;
673 auto hazard = [insn, &result](insn_info *h)
674 {
675 gcc_checking_assert (*h > *insn);
676 if (!result || *h < *result)
677 result = h;
678 };
679
  rtx pat = PATTERN (insn->rtl ());
681 auto ignore_use = [&](use_info *u)
682 {
683 if (u->is_mem ())
684 return true;
685
686 return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore);
687 };
688
689 for (auto def : insn->defs ())
690 {
691 if (def->is_mem ())
692 continue;
693
694 if (def->next_def ())
695 hazard (def->next_def ()->insn ()); // WaW
696
      auto set = dyn_cast<set_info *> (def);
698 if (set && set->has_nondebug_insn_uses ())
699 hazard (set->first_nondebug_insn_use ()->insn ()); // RaW
700
701 if (!HARD_REGISTER_NUM_P (def->regno ()))
702 continue;
703
704 // Also check for call clobbers of this def (WaW).
705 for (auto call_group : def->ebb ()->call_clobbers ())
706 {
	  if (!call_group->clobbers (def->resource ()))
708 continue;
709
	  auto clobber_insn = next_call_clobbers (*call_group, def->insn (),
						  ignore_nothing ());
712 if (clobber_insn)
713 hazard (clobber_insn);
714 }
715 }
716
717 // Find any subsequent defs of uses in INSN (WaR).
718 for (auto use : insn->uses ())
719 {
720 if (ignore_use (use))
721 continue;
722
723 if (use->def ())
724 {
725 auto def = use->def ()->next_def ();
726 if (def && def->insn () == insn)
727 def = def->next_def ();
728
729 if (def)
730 hazard (def->insn ());
731 }
732
733 if (!HARD_REGISTER_NUM_P (use->regno ()))
734 continue;
735
736 // Also need to handle call clobbers of our uses (again WaR).
737 //
738 // See restrict_movement_for_uses for why we don't need to check
739 // backwards for call clobbers.
740 for (auto call_group : use->ebb ()->call_clobbers ())
741 {
	  if (!call_group->clobbers (use->resource ()))
743 continue;
744
	  auto clobber_insn = next_call_clobbers (*call_group, use->insn (),
						  ignore_nothing ());
747 if (clobber_insn)
748 hazard (clobber_insn);
749 }
750 }
751
752 return result;
753}
754
755// Return true iff R1 and R2 overlap.
756static bool
757ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
758{
759 // If either range is empty, then their intersection is empty.
760 if (!r1 || !r2)
761 return false;
762
763 // When do they not overlap? When one range finishes before the other
764 // starts, i.e. (*r1.last < *r2.first || *r2.last < *r1.first).
765 // Inverting this, we get the below.
766 return *r1.last >= *r2.first && *r2.last >= *r1.first;
767}
768
769// Get the range of insns that def feeds.
770static insn_range_info get_def_range (def_info *def)
771{
772 insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
773 return { def->insn (), last };
774}
775
776// Given a def (of memory), return the downwards range within which we
777// can safely move this def.
778static insn_range_info
779def_downwards_move_range (def_info *def)
780{
781 auto range = get_def_range (def);
782
  auto set = dyn_cast<set_info *> (def);
784 if (!set || !set->has_any_uses ())
785 return range;
786
787 auto use = set->first_nondebug_insn_use ();
788 if (use)
    range = move_earlier_than (range, use->insn ());
790
791 return range;
792}
793
794// Given a def (of memory), return the upwards range within which we can
795// safely move this def.
796static insn_range_info
797def_upwards_move_range (def_info *def)
798{
799 def_info *prev = def->prev_def ();
800 insn_range_info range { prev->insn (), def->insn () };
801
  auto set = dyn_cast<set_info *> (prev);
803 if (!set || !set->has_any_uses ())
804 return range;
805
806 auto use = set->last_nondebug_insn_use ();
807 if (use)
    range = move_later_than (range, use->insn ());
809
810 return range;
811}
812
813// Class that implements a state machine for building the changes needed to form
814// a store pair instruction. This allows us to easily build the changes in
815// program order, as required by rtl-ssa.
816struct store_change_builder
817{
818 enum class state
819 {
820 FIRST,
821 INSERT,
822 FIXUP_USE,
823 LAST,
824 DONE
825 };
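  // When re-purposing one of the original stores, the sequence of states is
  // FIRST -> LAST -> DONE; when inserting a brand new stp insn it is
  // FIRST -> INSERT -> FIXUP_USE* -> LAST -> DONE (see advance below).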
826
827 enum class action
828 {
829 TOMBSTONE,
830 CHANGE,
831 INSERT,
832 FIXUP_USE
833 };
834
835 struct change
836 {
837 action type;
838 insn_info *insn;
839 };
840
841 bool done () const { return m_state == state::DONE; }
842
843 store_change_builder (insn_info *insns[2],
844 insn_info *repurpose,
845 insn_info *dest)
846 : m_state (state::FIRST), m_insns { insns[0], insns[1] },
847 m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {}
848
  change get_change () const
  {
    switch (m_state)
      {
      case state::FIRST:
	return {
	  m_insns[0] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
	  m_insns[0]
	};
      case state::LAST:
	return {
	  m_insns[1] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
	  m_insns[1]
	};
      case state::INSERT:
	return { action::INSERT, m_dest };
      case state::FIXUP_USE:
	return { action::FIXUP_USE, m_use->insn () };
      case state::DONE:
	break;
      }

    gcc_unreachable ();
  }
873
874 // Transition to the next state.
875 void advance ()
876 {
877 switch (m_state)
878 {
879 case state::FIRST:
880 if (m_repurpose)
881 m_state = state::LAST;
882 else
883 m_state = state::INSERT;
884 break;
885 case state::INSERT:
886 {
	  def_info *def = memory_access (m_insns[0]->defs ());
888 while (*def->next_def ()->insn () <= *m_dest)
889 def = def->next_def ();
890
891 // Now we know DEF feeds the insertion point for the new stp.
892 // Look for any uses of DEF that will consume the new stp.
893 gcc_assert (*def->insn () <= *m_dest
894 && *def->next_def ()->insn () > *m_dest);
895
	  auto set = as_a<set_info *> (def);
897 for (auto use : set->nondebug_insn_uses ())
898 if (*use->insn () > *m_dest)
899 {
900 m_use = use;
901 break;
902 }
903
904 if (m_use)
905 m_state = state::FIXUP_USE;
906 else
907 m_state = state::LAST;
908 break;
909 }
910 case state::FIXUP_USE:
911 m_use = m_use->next_nondebug_insn_use ();
912 if (!m_use)
913 m_state = state::LAST;
914 break;
915 case state::LAST:
916 m_state = state::DONE;
917 break;
918 case state::DONE:
919 gcc_unreachable ();
920 }
921 }
922
923private:
924 state m_state;
925
926 // Original candidate stores.
927 insn_info *m_insns[2];
928
929 // If non-null, this is a candidate insn to change into an stp. Otherwise we
930 // are deleting both original insns and inserting a new insn for the stp.
931 insn_info *m_repurpose;
932
  // Destination of the stp; it will be placed immediately after m_dest.
934 insn_info *m_dest;
935
936 // Current nondebug use that needs updating due to stp insertion.
937 use_info *m_use;
938};
939
940// Given candidate store insns FIRST and SECOND, see if we can re-purpose one
941// of them (together with its def of memory) for the stp insn. If so, return
942// that insn. Otherwise, return null.
943static insn_info *
944try_repurpose_store (insn_info *first,
945 insn_info *second,
946 const insn_range_info &move_range)
947{
  def_info * const defs[2] = {
    memory_access (first->defs ()),
    memory_access (second->defs ())
  };

  if (move_range.includes (first)
      || ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
    return first;

  if (move_range.includes (second)
      || ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
    return second;
960
961 return nullptr;
962}
963
964// Generate the RTL pattern for a "tombstone"; used temporarily during this pass
965// to replace stores that are marked for deletion where we can't immediately
966// delete the store (since there are uses of mem hanging off the store).
967//
968// These are deleted at the end of the pass and uses re-parented appropriately
969// at this point.
970static rtx
971gen_tombstone (void)
972{
973 return gen_rtx_CLOBBER (VOIDmode,
974 gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)));
975}
976
977// Go through the reg notes rooted at NOTE, dropping those that we should drop,
978// and preserving those that we want to keep by prepending them to (and
979// returning) RESULT. EH_REGION is used to make sure we have at most one
980// REG_EH_REGION note in the resulting list. FR_EXPR is used to return any
981// REG_FRAME_RELATED_EXPR note we find, as these can need special handling in
982// combine_reg_notes.
983static rtx
984filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
985{
986 for (; note; note = XEXP (note, 1))
987 {
988 switch (REG_NOTE_KIND (note))
989 {
990 case REG_DEAD:
991 // REG_DEAD notes aren't required to be maintained.
992 case REG_EQUAL:
993 case REG_EQUIV:
994 case REG_UNUSED:
995 case REG_NOALIAS:
996 // These can all be dropped. For REG_EQU{AL,IV} they cannot apply to
	// non-single_set insns, and REG_UNUSED is re-computed by RTL-SSA, see
998 // rtl-ssa/changes.cc:update_notes.
999 //
1000 // Similarly, REG_NOALIAS cannot apply to a parallel.
1001 case REG_INC:
1002 // When we form the pair insn, the reg update is implemented
1003 // as just another SET in the parallel, so isn't really an
1004 // auto-increment in the RTL sense, hence we drop the note.
1005 break;
1006 case REG_EH_REGION:
1007 gcc_assert (!*eh_region);
1008 *eh_region = true;
1009 result = alloc_reg_note (REG_EH_REGION, XEXP (note, 0), result);
1010 break;
1011 case REG_CFA_DEF_CFA:
1012 case REG_CFA_OFFSET:
1013 case REG_CFA_RESTORE:
1014 result = alloc_reg_note (REG_NOTE_KIND (note),
1015 copy_rtx (XEXP (note, 0)),
1016 result);
1017 break;
1018 case REG_FRAME_RELATED_EXPR:
1019 gcc_assert (!*fr_expr);
1020 *fr_expr = copy_rtx (XEXP (note, 0));
1021 break;
1022 default:
1023 // Unexpected REG_NOTE kind.
1024 gcc_unreachable ();
1025 }
1026 }
1027
1028 return result;
1029}
1030
1031// Return the notes that should be attached to a combination of I1 and I2, where
1032// *I1 < *I2. LOAD_P is true for loads.
1033static rtx
1034combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
1035{
1036 // Temporary storage for REG_FRAME_RELATED_EXPR notes.
1037 rtx fr_expr[2] = {};
1038
1039 bool found_eh_region = false;
1040 rtx result = NULL_RTX;
  result = filter_notes (REG_NOTES (i2->rtl ()), result,
			 &found_eh_region, fr_expr + 1);
  result = filter_notes (REG_NOTES (i1->rtl ()), result,
			 &found_eh_region, fr_expr);
1045
1046 if (!load_p)
1047 {
1048 // Simple frame-related sp-relative saves don't need CFI notes, but when
1049 // we combine them into an stp we will need a CFI note as dwarf2cfi can't
1050 // interpret the unspec pair representation directly.
      if (RTX_FRAME_RELATED_P (i1->rtl ()) && !fr_expr[0])
	fr_expr[0] = copy_rtx (PATTERN (i1->rtl ()));
      if (RTX_FRAME_RELATED_P (i2->rtl ()) && !fr_expr[1])
	fr_expr[1] = copy_rtx (PATTERN (i2->rtl ()));
1055 }
1056
1057 rtx fr_pat = NULL_RTX;
1058 if (fr_expr[0] && fr_expr[1])
1059 {
1060 // Combining two frame-related insns, need to construct
1061 // a REG_FRAME_RELATED_EXPR note which represents the combined
1062 // operation.
1063 RTX_FRAME_RELATED_P (fr_expr[1]) = 1;
1064 fr_pat = gen_rtx_PARALLEL (VOIDmode,
1065 gen_rtvec (2, fr_expr[0], fr_expr[1]));
1066 }
1067 else
1068 fr_pat = fr_expr[0] ? fr_expr[0] : fr_expr[1];
1069
1070 if (fr_pat)
1071 result = alloc_reg_note (REG_FRAME_RELATED_EXPR,
1072 fr_pat, result);
1073
1074 return result;
1075}
1076
1077// Given two memory accesses in PATS, at least one of which is of a
1078// writeback form, extract two non-writeback memory accesses addressed
1079// relative to the initial value of the base register, and output these
1080// in PATS. Return an rtx that represents the overall change to the
1081// base register.
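//
// For example (using aarch64 syntax), given the pair
//   ldr x2, [x1, #8]!
//   ldr x3, [x1, #8]
// PATS would be rewritten to load from [x1, #8] and [x1, #16] respectively,
// and the returned rtx would express x1 = x1 + 8.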
1082static rtx
1083extract_writebacks (bool load_p, rtx pats[2], int changed)
1084{
1085 rtx base_reg = NULL_RTX;
1086 poly_int64 current_offset = 0;
1087
1088 poly_int64 offsets[2];
1089
1090 for (int i = 0; i < 2; i++)
1091 {
1092 rtx mem = XEXP (pats[i], load_p);
1093 rtx reg = XEXP (pats[i], !load_p);
1094
1095 rtx addr = XEXP (mem, 0);
1096 const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
1097
1098 poly_int64 offset;
      rtx this_base = pair_mem_strip_offset (mem, &offset);
1100 gcc_assert (REG_P (this_base));
1101 if (base_reg)
1102 gcc_assert (rtx_equal_p (base_reg, this_base));
1103 else
1104 base_reg = this_base;
1105
1106 // If we changed base for the current insn, then we already
1107 // derived the correct mem for this insn from the effective
1108 // address of the other access.
1109 if (i == changed)
1110 {
1111 gcc_checking_assert (!autoinc_p);
1112 offsets[i] = offset;
1113 continue;
1114 }
1115
      if (autoinc_p && any_pre_modify_p (addr))
1117 current_offset += offset;
1118
1119 poly_int64 this_off = current_offset;
1120 if (!autoinc_p)
1121 this_off += offset;
1122
1123 offsets[i] = this_off;
1124 rtx new_mem = change_address (mem, GET_MODE (mem),
1125 plus_constant (GET_MODE (base_reg),
1126 base_reg, this_off));
1127 pats[i] = load_p
1128 ? gen_rtx_SET (reg, new_mem)
1129 : gen_rtx_SET (new_mem, reg);
1130
      if (autoinc_p && any_post_modify_p (addr))
1132 current_offset += offset;
1133 }
1134
1135 if (known_eq (current_offset, 0))
1136 return NULL_RTX;
1137
1138 return gen_rtx_SET (base_reg, plus_constant (GET_MODE (base_reg),
1139 base_reg, current_offset));
1140}
1141
1142// INSNS contains either {nullptr, pair insn} (when promoting an existing
1143// non-writeback pair) or contains the candidate insns used to form the pair
1144// (when fusing a new pair).
1145//
1146// PAIR_RANGE specifies where we want to form the final pair.
1147// INITIAL_OFFSET gives the current base offset for the pair.
1148// Bit I of INITIAL_WRITEBACK is set if INSNS[I] initially had writeback.
1149// ACCESS_SIZE gives the access size for a single arm of the pair.
1150// BASE_DEF gives the initial def of the base register consumed by the pair.
1151//
1152// Given the above, this function looks for a trailing destructive update of the
1153// base register. If there is one, we choose the first such update after
1154// PAIR_DST that is still in the same BB as our pair. We return the new def in
1155// *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT.
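//
// For example, on aarch64 it is this function that allows
//   stp x0, x1, [x2]
//   add x2, x2, 16
// to be folded into a single post-index "stp x0, x1, [x2], 16", provided no
// intervening insn between the pair's destination and the add cares about x2.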
1156insn_info *
1157pair_fusion::find_trailing_add (insn_info *insns[2],
1158 const insn_range_info &pair_range,
1159 int initial_writeback,
1160 rtx *writeback_effect,
1161 def_info **add_def,
1162 def_info *base_def,
1163 poly_int64 initial_offset,
1164 unsigned access_size)
1165{
1166 // Punt on frame-related insns, it is better to be conservative and
1167 // not try to form writeback pairs here, and means we don't have to
1168 // worry about the writeback case in forming REG_FRAME_RELATED_EXPR
1169 // notes (see combine_reg_notes).
1170 if ((insns[0] && RTX_FRAME_RELATED_P (insns[0]->rtl ()))
1171 || RTX_FRAME_RELATED_P (insns[1]->rtl ()))
1172 return nullptr;
1173
1174 insn_info *pair_dst = pair_range.singleton ();
1175 gcc_assert (pair_dst);
1176
1177 def_info *def = base_def->next_def ();
1178
1179 // In the case that either of the initial pair insns had writeback,
1180 // then there will be intervening defs of the base register.
1181 // Skip over these.
1182 for (int i = 0; i < 2; i++)
1183 if (initial_writeback & (1 << i))
1184 {
1185 gcc_assert (def->insn () == insns[i]);
1186 def = def->next_def ();
1187 }
1188
1189 if (!def || def->bb () != pair_dst->bb ())
1190 return nullptr;
1191
1192 // DEF should now be the first def of the base register after PAIR_DST.
1193 insn_info *cand = def->insn ();
1194 gcc_assert (*cand > *pair_dst);
1195
1196 const auto base_regno = base_def->regno ();
1197
1198 // If CAND doesn't also use our base register,
1199 // it can't destructively update it.
  if (!find_access (cand->uses (), base_regno))
1201 return nullptr;
1202
1203 auto rti = cand->rtl ();
1204
1205 if (!INSN_P (rti))
1206 return nullptr;
1207
  auto pat = PATTERN (rti);
1209 if (GET_CODE (pat) != SET)
1210 return nullptr;
1211
1212 auto dest = XEXP (pat, 0);
1213 if (!REG_P (dest) || REGNO (dest) != base_regno)
1214 return nullptr;
1215
1216 poly_int64 offset;
1217 rtx rhs_base = strip_offset (XEXP (pat, 1), &offset);
1218 if (!REG_P (rhs_base)
1219 || REGNO (rhs_base) != base_regno
1220 || !offset.is_constant ())
1221 return nullptr;
1222
1223 // If the initial base offset is zero, we can handle any add offset
1224 // (post-inc). Otherwise, we require the offsets to match (pre-inc).
1225 if (!known_eq (initial_offset, 0) && !known_eq (offset, initial_offset))
1226 return nullptr;
1227
1228 auto off_hwi = offset.to_constant ();
1229
1230 if (off_hwi % access_size != 0)
1231 return nullptr;
1232
1233 off_hwi /= access_size;
1234
  if (!pair_mem_in_range_p (off_hwi))
1236 return nullptr;
1237
  auto dump_prefix = [&]()
    {
      if (!insns[0])
	fprintf (dump_file, "existing pair i%d: ", insns[1]->uid ());
      else
	fprintf (dump_file, " (%d,%d)",
		 insns[0]->uid (), insns[1]->uid ());
    };
1246
  insn_info *hazard = latest_hazard_before (cand, nullptr, insns[1], false);
  if (!hazard || *hazard <= *pair_dst)
    {
      if (dump_file)
	{
	  dump_prefix ();
	  fprintf (dump_file,
		   "folding in trailing add (%d) to use writeback form\n",
		   cand->uid ());
	}
1257
1258 *add_def = def;
1259 *writeback_effect = copy_rtx (pat);
1260 return cand;
1261 }
1262
  if (dump_file)
    {
      dump_prefix ();
      fprintf (dump_file,
	       "can't fold in trailing add (%d), hazard = %d\n",
	       cand->uid (), hazard->uid ());
    }
1270
1271 return nullptr;
1272}
1273
1274// We just emitted a tombstone with uid UID, track it in a bitmap for
1275// this BB so we can easily identify it later when cleaning up tombstones.
1276void
1277pair_fusion_bb_info::track_tombstone (int uid)
1278{
1279 if (!m_emitted_tombstone)
1280 {
1281 // Lazily initialize the bitmap for tracking tombstone insns.
1282 bitmap_obstack_initialize (&m_bitmap_obstack);
      bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack);
1284 m_emitted_tombstone = true;
1285 }
1286
1287 if (!bitmap_set_bit (&m_tombstone_bitmap, uid))
1288 gcc_unreachable (); // Bit should have changed.
1289}
1290
1291// Reset the debug insn containing USE (the debug insn has been
1292// optimized away).
1293static void
1294reset_debug_use (use_info *use)
1295{
1296 auto use_insn = use->insn ();
1297 auto use_rtl = use_insn->rtl ();
1298 insn_change change (use_insn);
1299 change.new_uses = {};
1300 INSN_VAR_LOCATION_LOC (use_rtl) = gen_rtx_UNKNOWN_VAR_LOC ();
1301 crtl->ssa->change_insn (change);
1302}
1303
1304// USE is a debug use that needs updating because DEF (a def of the same
1305// register) is being re-ordered over it. If BASE is non-null, then DEF
1306// is an update of the register BASE by a constant, given by WB_OFFSET,
1307// and we can preserve debug info by accounting for the change in side
1308// effects.
1309static void
1310fixup_debug_use (obstack_watermark &attempt,
1311 use_info *use,
1312 def_info *def,
1313 rtx base,
1314 poly_int64 wb_offset)
1315{
1316 auto use_insn = use->insn ();
1317 if (base)
1318 {
1319 auto use_rtl = use_insn->rtl ();
1320 insn_change change (use_insn);
1321
1322 gcc_checking_assert (REG_P (base) && use->regno () == REGNO (base));
      change.new_uses = check_remove_regno_access (attempt,
						   change.new_uses,
						   use->regno ());
1326
1327 // The effect of the writeback is to add WB_OFFSET to BASE. If
1328 // we're re-ordering DEF below USE, then we update USE by adding
1329 // WB_OFFSET to it. Otherwise, if we're re-ordering DEF above
1330 // USE, we update USE by undoing the effect of the writeback
1331 // (subtracting WB_OFFSET).
1332 use_info *new_use;
1333 if (*def->insn () > *use_insn)
1334 {
1335 // We now need USE_INSN to consume DEF. Create a new use of DEF.
1336 //
1337 // N.B. this means until we call change_insns for the main change
1338 // group we will temporarily have a debug use consuming a def that
1339 // comes after it, but RTL-SSA doesn't currently support updating
1340 // debug insns as part of the main change group (together with
1341 // nondebug changes), so we will have to live with this update
1342 // leaving the IR being temporarily inconsistent. It seems to
1343 // work out OK once the main change group is applied.
	  wb_offset *= -1;
	  new_use = crtl->ssa->create_use (attempt,
					   use_insn,
					   as_a<set_info *> (def));
	}
      else
	new_use = find_access (def->insn ()->uses (), use->regno ());
1351
      change.new_uses = insert_access (attempt, new_use, change.new_uses);
1353
      if (dump_file)
	{
	  const char *dir = (*def->insn () < *use_insn) ? "down" : "up";
	  pretty_printer pp;
	  pp_string (&pp, "[");
	  pp_access (&pp, use, 0);
	  pp_string (&pp, "]");
	  pp_string (&pp, " due to wb def ");
	  pp_string (&pp, "[");
	  pp_access (&pp, def, 0);
	  pp_string (&pp, "]");
	  fprintf (dump_file,
		   " i%d: fix up debug use %s re-ordered %s, "
		   "sub r%u -> r%u + ",
		   use_insn->uid (), pp_formatted_text (&pp),
		   dir, REGNO (base), REGNO (base));
	  print_dec (wb_offset, dump_file);
	  fprintf (dump_file, "\n");
	}
1373
1374 insn_propagation prop (use_rtl, base,
1375 plus_constant (GET_MODE (base), base, wb_offset));
1376 if (prop.apply_to_pattern (&INSN_VAR_LOCATION_LOC (use_rtl)))
1377 crtl->ssa->change_insn (change);
1378 else
1379 {
	  if (dump_file)
	    fprintf (dump_file, " i%d: RTL substitution failed (%s)"
		     ", resetting debug insn", use_insn->uid (),
		     prop.failure_reason);
	  reset_debug_use (use);
1385 }
1386 }
1387 else
1388 {
      if (dump_file)
	{
	  pretty_printer pp;
	  pp_string (&pp, "[");
	  pp_access (&pp, use, 0);
	  pp_string (&pp, "] due to re-ordered load def [");
	  pp_access (&pp, def, 0);
	  pp_string (&pp, "]");
	  fprintf (dump_file, " i%d: resetting debug use %s\n",
		   use_insn->uid (), pp_formatted_text (&pp));
	}
1400 reset_debug_use (use);
1401 }
1402}
1403
1404// Update debug uses when folding in a trailing add insn to form a
1405// writeback pair.
1406//
1407// ATTEMPT is used to allocate RTL-SSA temporaries for the changes,
1408// the final pair is placed immediately after PAIR_DST, TRAILING_ADD
1409// is a trailing add insn which is being folded into the pair to make it
1410// use writeback addressing, and WRITEBACK_EFFECT is the pattern for
1411// TRAILING_ADD.
1412static void
1413fixup_debug_uses_trailing_add (obstack_watermark &attempt,
1414 insn_info *pair_dst,
1415 insn_info *trailing_add,
1416 rtx writeback_effect)
1417{
1418 rtx base = SET_DEST (writeback_effect);
1419
1420 poly_int64 wb_offset;
1421 rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset);
1422 gcc_checking_assert (rtx_equal_p (base, base2));
1423
1424 auto defs = trailing_add->defs ();
1425 gcc_checking_assert (defs.size () == 1);
1426 def_info *def = defs[0];
1427
  if (auto set = safe_dyn_cast<set_info *> (def->prev_def ()))
    for (auto use : iterate_safely (set->debug_insn_uses ()))
      if (*use->insn () > *pair_dst)
	// DEF is getting re-ordered above USE, fix up USE accordingly.
	fixup_debug_use (attempt, use, def, base, wb_offset);
1433}
1434
1435// Called from fuse_pair, fixes up any debug uses that will be affected
1436// by the changes.
1437//
1438// ATTEMPT is the obstack watermark used to allocate RTL-SSA temporaries for
1439// the changes, INSNS gives the candidate insns: at this point the use/def
1440// information should still be as on entry to fuse_pair, but the patterns may
1441// have changed, hence we pass ORIG_RTL which contains the original patterns
1442// for the candidate insns.
1443//
1444// The final pair will be placed immediately after PAIR_DST, LOAD_P is true if
1445// it is a load pair, bit I of WRITEBACK is set if INSNS[I] originally had
1446// writeback, and WRITEBACK_EFFECT is an rtx describing the overall update to
1447// the base register in the final pair (if any). BASE_REGNO gives the register
1448// number of the base register used in the final pair.
1449static void
1450fixup_debug_uses (obstack_watermark &attempt,
1451 insn_info *insns[2],
1452 rtx orig_rtl[2],
1453 insn_info *pair_dst,
1454 insn_info *trailing_add,
1455 bool load_p,
1456 int writeback,
1457 rtx writeback_effect,
1458 unsigned base_regno)
1459{
1460 // USE is a debug use that needs updating because DEF (a def of the
1461 // resource) is being re-ordered over it. If WRITEBACK_PAT is non-NULL,
1462 // then it gives the original RTL pattern for DEF's insn, and DEF is a
1463 // writeback update of the base register.
1464 //
1465 // This simply unpacks WRITEBACK_PAT if needed and calls fixup_debug_use.
1466 auto update_debug_use = [&](use_info *use, def_info *def,
1467 rtx writeback_pat)
1468 {
1469 poly_int64 offset = 0;
1470 rtx base = NULL_RTX;
1471 if (writeback_pat)
1472 {
1473 rtx mem = XEXP (writeback_pat, load_p);
1474 gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0)))
1475 == RTX_AUTOINC);
1476
	base = pair_mem_strip_offset (mem, &offset);
	gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno);
      }
    fixup_debug_use (attempt, use, def, base, offset);
1481 };
1482
1483 // Reset any debug uses of mem over which we re-ordered a store.
1484 //
1485 // It would be nice to try and preserve debug info here, but it seems that
1486 // would require doing alias analysis to see if the store aliases with the
1487 // debug use, which seems a little extravagant just to preserve debug info.
1488 if (!load_p)
1489 {
      auto def = memory_access (insns[0]->defs ());
      auto last_def = memory_access (insns[1]->defs ());
      for (; def != last_def; def = def->next_def ())
	{
	  auto set = as_a<set_info *> (def);
	  for (auto use : iterate_safely (set->debug_insn_uses ()))
	    {
	      if (dump_file)
		fprintf (dump_file, " i%d: resetting debug use of mem\n",
			 use->insn ()->uid ());
	      reset_debug_use (use);
1501 }
1502 }
1503 }
1504
1505 // Now let's take care of register uses, starting with debug uses
1506 // attached to defs from our first insn.
1507 for (auto def : insns[0]->defs ())
1508 {
      auto set = dyn_cast<set_info *> (def);
1510 if (!set || set->is_mem () || !set->first_debug_insn_use ())
1511 continue;
1512
1513 def_info *defs[2] = {
1514 def,
	find_access (insns[1]->defs (), def->regno ())
1516 };
1517
1518 rtx writeback_pats[2] = {};
1519 if (def->regno () == base_regno)
1520 for (int i = 0; i < 2; i++)
1521 if (writeback & (1 << i))
1522 {
1523 gcc_checking_assert (defs[i]);
1524 writeback_pats[i] = orig_rtl[i];
1525 }
1526
1527 // Now that we've characterized the defs involved, go through the
1528 // debug uses and determine how to update them (if needed).
      for (auto use : iterate_safely (set->debug_insn_uses ()))
1530 {
1531 if (*pair_dst < *use->insn () && defs[1])
1532 // We're re-ordering defs[1] above a previous use of the
1533 // same resource.
1534 update_debug_use (use, defs[1], writeback_pats[1]);
1535 else if (*pair_dst >= *use->insn ())
1536 // We're re-ordering defs[0] below its use.
1537 update_debug_use (use, defs[0], writeback_pats[0]);
1538 }
1539 }
1540
1541 // Now let's look at registers which are def'd by the second insn
1542 // but not by the first insn, there may still be debug uses of a
1543 // previous def which can be affected by moving the second insn up.
1544 for (auto def : insns[1]->defs ())
1545 {
1546 // This should be M log N where N is the number of defs in
1547 // insns[0] and M is the number of defs in insns[1].
      if (def->is_mem () || find_access (insns[0]->defs (), def->regno ()))
1549 continue;
1550
      auto prev_set = safe_dyn_cast<set_info *> (def->prev_def ());
1552 if (!prev_set)
1553 continue;
1554
1555 rtx writeback_pat = NULL_RTX;
1556 if (def->regno () == base_regno && (writeback & 2))
1557 writeback_pat = orig_rtl[1];
1558
1559 // We have a def in insns[1] which isn't def'd by the first insn.
1560 // Look to the previous def and see if it has any debug uses.
      for (auto use : iterate_safely (prev_set->debug_insn_uses ()))
1562 if (*pair_dst < *use->insn ())
1563 // We're ordering DEF above a previous use of the same register.
1564 update_debug_use (use, def, writeback_pat);
1565 }
1566
1567 if ((writeback & 2) && !writeback_effect)
1568 {
1569 // If the second insn initially had writeback but the final
1570 // pair does not, then there may be trailing debug uses of the
1571 // second writeback def which need re-parenting: do that.
      auto def = find_access (insns[1]->defs (), base_regno);
      gcc_assert (def);
      auto set = as_a<set_info *> (def);
      for (auto use : iterate_safely (set->debug_insn_uses ()))
	{
	  insn_change change (use->insn ());
	  change.new_uses = check_remove_regno_access (attempt,
						       change.new_uses,
						       base_regno);
	  auto new_use = find_access (insns[0]->uses (), base_regno);
1582
1583 // N.B. insns must have already shared a common base due to writeback.
1584 gcc_assert (new_use);
1585
	  if (dump_file)
	    fprintf (dump_file,
		     " i%d: cancelling wb, re-parenting trailing debug use\n",
		     use->insn ()->uid ());

	  change.new_uses = insert_access (attempt, new_use, change.new_uses);
1592 crtl->ssa->change_insn (change);
1593 }
1594 }
1595 else if (trailing_add)
1596 fixup_debug_uses_trailing_add (attempt, pair_dst, trailing_add,
1597 writeback_effect);
1598}
1599
1600// Try and actually fuse the pair given by insns I1 and I2.
1601//
1602// Here we've done enough analysis to know this is safe, we only
1603// reject the pair at this stage if either the tuning policy says to,
1604// or recog fails on the final pair insn.
1605//
1606// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each
1607// candidate insn. Bit i of WRITEBACK is set if the ith insn (in program
1608// order) uses writeback.
1609//
1610// BASE gives the chosen base candidate for the pair and MOVE_RANGE is
1611// a singleton range which says where to place the pair.
1612bool
1613pair_fusion_bb_info::fuse_pair (bool load_p,
1614 unsigned access_size,
1615 int writeback,
1616 insn_info *i1, insn_info *i2,
1617 base_cand &base,
1618 const insn_range_info &move_range)
1619{
1620 auto attempt = crtl->ssa->new_change_attempt ();
1621
  auto make_change = [&attempt](insn_info *insn)
    {
      return crtl->ssa->change_alloc<insn_change> (attempt, insn);
    };
  auto make_delete = [&attempt](insn_info *insn)
    {
      return crtl->ssa->change_alloc<insn_change> (attempt,
						   insn,
						   insn_change::DELETE);
    };
1632
1633 insn_info *first = (*i1 < *i2) ? i1 : i2;
1634 insn_info *second = (first == i1) ? i2 : i1;
1635
1636 insn_info *pair_dst = move_range.singleton ();
1637 gcc_assert (pair_dst);
1638
1639 insn_info *insns[2] = { first, second };
1640
1641 auto_vec<insn_change *> changes;
1642 auto_vec<int, 3> tombstone_uids;
1643
  rtx pats[2] = {
    PATTERN (first->rtl ()),
    PATTERN (second->rtl ())
  };
1648
1649 // Make copies of the patterns as we might need to refer to the original RTL
1650 // later, for example when updating debug uses (which is after we've updated
1651 // one or both of the patterns in the candidate insns).
1652 rtx orig_rtl[2];
1653 for (int i = 0; i < 2; i++)
1654 orig_rtl[i] = copy_rtx (pats[i]);
1655
1656 use_array input_uses[2] = { first->uses (), second->uses () };
1657 def_array input_defs[2] = { first->defs (), second->defs () };
1658
1659 int changed_insn = -1;
1660 if (base.from_insn != -1)
1661 {
1662 // If we're not already using a shared base, we need
1663 // to re-write one of the accesses to use the base from
1664 // the other insn.
1665 gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
1666 changed_insn = !base.from_insn;
1667
1668 rtx base_pat = pats[base.from_insn];
1669 rtx change_pat = pats[changed_insn];
1670 rtx base_mem = XEXP (base_pat, load_p);
1671 rtx change_mem = XEXP (change_pat, load_p);
1672
1673 const bool lower_base_p = (insns[base.from_insn] == i1);
1674 HOST_WIDE_INT adjust_amt = access_size;
1675 if (!lower_base_p)
1676 adjust_amt *= -1;
1677
1678 rtx change_reg = XEXP (change_pat, !load_p);
      rtx effective_base = drop_writeback (base_mem);
1680 rtx adjusted_addr = plus_constant (Pmode,
1681 XEXP (effective_base, 0),
1682 adjust_amt);
1683 rtx new_mem = replace_equiv_address_nv (change_mem, adjusted_addr);
1684 rtx new_set = load_p
1685 ? gen_rtx_SET (change_reg, new_mem)
1686 : gen_rtx_SET (new_mem, change_reg);
1687
1688 pats[changed_insn] = new_set;
1689
1690 auto keep_use = [&](use_info *u)
1691 {
1692 return refers_to_regno_p (u->regno (), u->regno () + 1,
1693 change_pat, &XEXP (change_pat, load_p));
1694 };
1695
      // Drop any uses that only occur in the old address.
      input_uses[changed_insn] = filter_accesses (attempt,
						  input_uses[changed_insn],
						  keep_use);
1700 }
1701
  rtx writeback_effect = NULL_RTX;
  if (writeback)
    writeback_effect = extract_writebacks (load_p, pats, changed_insn);
1705
1706 const auto base_regno = base.def->regno ();
1707
1708 if (base.from_insn == -1 && (writeback & 1))
1709 {
1710 // If the first of the candidate insns had a writeback form, we'll need to
1711 // drop the use of the updated base register from the second insn's uses.
1712 //
1713 // N.B. we needn't worry about the base register occurring as a store
1714 // operand, as we checked that there was no non-address true dependence
1715 // between the insns in try_fuse_pair.
1716 gcc_checking_assert (find_access (input_uses[1], base_regno));
      input_uses[1] = check_remove_regno_access (attempt,
						 input_uses[1],
						 base_regno);
1720 }
1721
1722 // Go through and drop uses that only occur in register notes,
1723 // as we won't be preserving those.
1724 for (int i = 0; i < 2; i++)
1725 {
1726 auto rti = insns[i]->rtl ();
1727 if (!REG_NOTES (rti))
1728 continue;
1729
      input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
1731 }
1732
1733 // Get the uses that the pair instruction would have, and punt if
1734 // the unpaired instructions use different definitions of the same
1735 // register. That would normally be caught as a side-effect of
1736 // hazard detection below, but this check also deals with cases
1737 // in which one use is undefined and the other isn't.
  auto new_uses = merge_access_arrays (attempt,
				       drop_memory_access (input_uses[0]),
				       drop_memory_access (input_uses[1]));
1741 if (!new_uses.is_valid ())
1742 {
1743 if (dump_file)
1744 fprintf (stream: dump_file,
1745 format: " rejecting pair: i%d and i%d use different definitions of"
1746 " the same register\n",
1747 insns[0]->uid (), insns[1]->uid ());
1748 return false;
1749 }
1750
1751 // Edge case: if the first insn is a writeback load and the
1752 // second insn is a non-writeback load which transfers into the base
1753 // register, then we should drop the writeback altogether as the
1754 // update of the base register from the second load should prevail.
1755 //
1756 // For example:
1757 // ldr x2, [x1], #8
1758 // ldr x1, [x1]
1759 // -->
1760 // ldp x2, x1, [x1]
1761 if (writeback == 1
1762 && load_p
1763 && find_access (accesses: input_defs[1], regno: base_regno))
1764 {
1765 if (dump_file)
1766 fprintf (stream: dump_file,
1767 format: " load pair: i%d has wb but subsequent i%d has non-wb "
1768 "update of base (r%d), dropping wb\n",
1769 insns[0]->uid (), insns[1]->uid (), base_regno);
1770 gcc_assert (writeback_effect);
1771 writeback_effect = NULL_RTX;
1772 }
1773
1774 // So far the patterns have been in instruction order;
1775 // now we want them in offset order.
1776 if (i1 != first)
1777 std::swap (a&: pats[0], b&: pats[1]);
1778
1779 poly_int64 offsets[2];
1780 for (int i = 0; i < 2; i++)
1781 {
1782 rtx mem = XEXP (pats[i], load_p);
1783 gcc_checking_assert (MEM_P (mem));
1784 rtx base = strip_offset (XEXP (mem, 0), offsets + i);
1785 gcc_checking_assert (REG_P (base));
1786 gcc_checking_assert (base_regno == REGNO (base));
1787 }
1788
1789 // If either of the original insns had writeback, but the resulting pair insn
1790 // does not (can happen e.g. in the load pair edge case above, or if the
1791 // writeback effects cancel out), then drop the def(s) of the base register
1792 // as appropriate.
1793 //
1794 // Also drop the first def in the case that both of the original insns had
1795 // writeback. The second def could well have uses, but the first def should
1796 // only be used by the second insn (and we dropped that use above).
1797 for (int i = 0; i < 2; i++)
1798 if ((!writeback_effect && (writeback & (1 << i)))
1799 || (i == 0 && writeback == 3))
1800 input_defs[i] = check_remove_regno_access (watermark&: attempt,
1801 accesses: input_defs[i],
1802 regno: base_regno);
1803
1804 // If we don't currently have a writeback pair, and we don't have
1805 // a load that clobbers the base register, look for a trailing destructive
1806 // update of the base register and try and fold it in to make this into a
1807 // writeback pair.
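  //
  // A minimal illustration (assuming an aarch64-like target), folding a
  // trailing update of the base register into a post-index pair:
  //
  //   ldr  x2, [x1]
  //   ldr  x3, [x1, #8]
  //   add  x1, x1, #16
  //   -->
  //   ldp  x2, x3, [x1], #16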
1808 insn_info *trailing_add = nullptr;
1809 if (m_pass->should_handle_writeback (which: writeback_type::ALL)
1810 && !writeback_effect
1811 && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
1812 XEXP (pats[0], 0), nullptr)
1813 && !refers_to_regno_p (base_regno, base_regno + 1,
1814 XEXP (pats[1], 0), nullptr))))
1815 {
1816 def_info *add_def;
1817 trailing_add = m_pass->find_trailing_add (insns, pair_range: move_range, initial_writeback: writeback,
1818 writeback_effect: &writeback_effect,
1819 add_def: &add_def, base_def: base.def, initial_offset: offsets[0],
1820 access_size);
1821 if (trailing_add)
1822 {
1823 // The def of the base register from the trailing add should prevail.
1824 input_defs[0] = insert_access (watermark&: attempt, access1: add_def, accesses2: input_defs[0]);
1825 gcc_assert (input_defs[0].is_valid ());
1826 }
1827 }
1828
1829 // Now that we know what base mem we're going to use, check if it's OK
1830 // with the pair mem policy.
1831 rtx first_mem = XEXP (pats[0], load_p);
1832 if (!m_pass->pair_mem_ok_with_policy (base_mem: first_mem, load_p))
1833 {
1834 if (dump_file)
1835 fprintf (stream: dump_file,
1836 format: "punting on pair (%d,%d), pair mem policy says no\n",
1837 i1->uid (), i2->uid ());
1838 return false;
1839 }
1840
1841 rtx reg_notes = combine_reg_notes (i1: first, i2: second, load_p);
1842
1843 rtx pair_pat = m_pass->gen_pair (pats, writeback: writeback_effect, load_p);
1844 insn_change *pair_change = nullptr;
1845 auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
1846 rtx_insn *rti = change->insn ()->rtl ();
1847 validate_unshare_change (rti, &PATTERN (insn: rti), pair_pat, true);
1848 validate_change (rti, &REG_NOTES (rti), reg_notes, true);
1849 };
1850
1851 // Turn CHANGE into a memory definition tombstone.
1852 auto make_tombstone = [&](insn_change *change)
1853 {
1854 tombstone_uids.quick_push (obj: change->insn ()->uid ());
1855 rtx_insn *rti = change->insn ()->rtl ();
1856 validate_change (rti, &PATTERN (insn: rti), gen_tombstone (), true);
1857 validate_change (rti, &REG_NOTES (rti), NULL_RTX, true);
1858 change->new_uses = use_array ();
1859 };
1860
1861 if (load_p)
1862 {
1863 changes.safe_push (obj: make_delete (first));
1864 pair_change = make_change (second);
1865 changes.safe_push (obj: pair_change);
1866
1867 pair_change->move_range = move_range;
1868 pair_change->new_defs = merge_access_arrays (watermark&: attempt,
1869 accesses1: input_defs[0],
1870 accesses2: input_defs[1]);
1871 gcc_assert (pair_change->new_defs.is_valid ());
1872
1873 pair_change->new_uses = new_uses;
1874 set_pair_pat (pair_change);
1875 }
1876 else
1877 {
1878 using Action = store_change_builder::action;
1879 insn_info *store_to_change = try_repurpose_store (first, second,
1880 move_range);
1881 store_change_builder builder (insns, store_to_change, pair_dst);
1882 insn_change *change;
1883 set_info *new_set = nullptr;
1884 for (; !builder.done (); builder.advance ())
1885 {
1886 auto action = builder.get_change ();
1887 change = (action.type == Action::INSERT)
1888 ? nullptr : make_change (action.insn);
1889 switch (action.type)
1890 {
1891 case Action::CHANGE:
1892 {
1893 set_pair_pat (change);
1894 change->new_uses = new_uses;
1895 auto d1 = drop_memory_access (accesses: input_defs[0]);
1896 auto d2 = drop_memory_access (accesses: input_defs[1]);
1897 change->new_defs = merge_access_arrays (watermark&: attempt, accesses1: d1, accesses2: d2);
1898 gcc_assert (change->new_defs.is_valid ());
1899 def_info *store_def = memory_access (accesses: change->insn ()->defs ());
1900 change->new_defs = insert_access (watermark&: attempt,
1901 access1: store_def,
1902 accesses2: change->new_defs);
1903 gcc_assert (change->new_defs.is_valid ());
1904 change->move_range = move_range;
1905 pair_change = change;
1906 break;
1907 }
1908 case Action::TOMBSTONE:
1909 {
1910 make_tombstone (change);
1911 break;
1912 }
1913 case Action::INSERT:
1914 {
1915 if (dump_file)
1916 fprintf (stream: dump_file,
1917 format: " stp: cannot re-purpose candidate stores\n");
1918
1919 auto new_insn = crtl->ssa->create_insn (watermark&: attempt, insn_code: INSN, pat: pair_pat);
1920 change = make_change (new_insn);
1921 change->move_range = move_range;
1922 change->new_uses = new_uses;
1923 gcc_assert (change->new_uses.is_valid ());
1924
1925 auto d1 = drop_memory_access (accesses: input_defs[0]);
1926 auto d2 = drop_memory_access (accesses: input_defs[1]);
1927 change->new_defs = merge_access_arrays (watermark&: attempt, accesses1: d1, accesses2: d2);
1928 gcc_assert (change->new_defs.is_valid ());
1929
1930 new_set = crtl->ssa->create_set (watermark&: attempt, insn: new_insn, resource: memory);
1931 change->new_defs = insert_access (watermark&: attempt, access1: new_set,
1932 accesses2: change->new_defs);
1933 gcc_assert (change->new_defs.is_valid ());
1934 pair_change = change;
1935 break;
1936 }
1937 case Action::FIXUP_USE:
1938 {
1939 // This use now needs to consume memory from our stp.
1940 if (dump_file)
1941 fprintf (stream: dump_file,
1942 format: " stp: changing i%d to use mem from new stp "
1943 "(after i%d)\n",
1944 action.insn->uid (), pair_dst->uid ());
1945 change->new_uses = drop_memory_access (accesses: change->new_uses);
1946 gcc_assert (new_set);
1947 auto new_use = crtl->ssa->create_use (watermark&: attempt, insn: action.insn,
1948 set: new_set);
1949 change->new_uses = insert_access (watermark&: attempt, access1: new_use,
1950 accesses2: change->new_uses);
1951 break;
1952 }
1953 }
1954 changes.safe_push (obj: change);
1955 }
1956 }
1957
1958 if (trailing_add)
1959 {
1960 if (auto *mem_def = memory_access (accesses: trailing_add->defs()))
1961 {
1962 auto *change = make_change (trailing_add);
1963 change->new_defs = insert_access (watermark&: attempt, access1: mem_def, accesses2: def_array ());
1964 make_tombstone (change);
1965 changes.safe_push (obj: change);
1966 }
1967 else
1968 changes.safe_push (obj: make_delete (trailing_add));
1969 }
1970 else if ((writeback & 2) && !writeback_effect)
1971 {
1972 // The second insn initially had writeback but now the pair does not,
1973 // need to update any nondebug uses of the base register def in the
1974 // second insn. We'll take care of debug uses later.
1975 auto def = find_access (accesses: insns[1]->defs (), regno: base_regno);
1976 gcc_assert (def);
1977 auto set = dyn_cast<set_info *> (p: def);
1978 if (set && set->has_nondebug_uses ())
1979 {
1980 auto orig_use = find_access (accesses: insns[0]->uses (), regno: base_regno);
1981 for (auto use : set->nondebug_insn_uses ())
1982 {
1983 auto change = make_change (use->insn ());
1984 change->new_uses = check_remove_regno_access (watermark&: attempt,
1985 accesses: change->new_uses,
1986 regno: base_regno);
1987 change->new_uses = insert_access (watermark&: attempt,
1988 access1: orig_use,
1989 accesses2: change->new_uses);
1990 changes.safe_push (obj: change);
1991 }
1992 }
1993 }
1994
1995 auto ignore = ignore_changing_insns (changes);
1996 for (unsigned i = 0; i < changes.length (); i++)
1997 gcc_assert (changes[i]->move_range.singleton ()
1998 && rtl_ssa::restrict_movement (*changes[i], ignore));
1999
2000 // Check the pair pattern is recog'd.
2001 if (!rtl_ssa::recog (watermark&: attempt, change&: *pair_change, ignore))
2002 {
2003 if (dump_file)
2004 fprintf (stream: dump_file, format: " failed to form pair, recog failed\n");
2005
2006 // Free any reg notes we allocated.
2007 while (reg_notes)
2008 {
2009 rtx next = XEXP (reg_notes, 1);
2010 free_EXPR_LIST_node (reg_notes);
2011 reg_notes = next;
2012 }
2013 cancel_changes (0);
2014 return false;
2015 }
2016
2017 gcc_assert (crtl->ssa->verify_insn_changes (changes));
2018
2019 // Fix up any debug uses that will be affected by the changes.
2020 if (MAY_HAVE_DEBUG_INSNS)
2021 fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add,
2022 load_p, writeback, writeback_effect, base_regno);
2023
2024 confirm_change_group ();
2025 crtl->ssa->change_insns (changes);
2026
2027 for (auto uid : tombstone_uids)
2028 track_tombstone (uid);
2029
2030 return true;
2031}
2032
2033// Return true if STORE_INSN may modify mem rtx MEM. Make sure we keep
2034// within our BUDGET for alias analysis.
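// BUDGET is decremented once per real alias query; once it reaches zero we
// conservatively assume a conflict rather than paying for further queries.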
2035static bool
2036store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
2037{
2038 if (!budget)
2039 {
2040 if (dump_file)
2041 {
2042 fprintf (stream: dump_file,
2043 format: "exceeded budget, assuming store %d aliases with mem ",
2044 store_insn->uid ());
2045 print_simple_rtl (dump_file, mem);
2046 fprintf (stream: dump_file, format: "\n");
2047 }
2048
2049 return true;
2050 }
2051
2052 budget--;
2053 return memory_modified_in_insn_p (mem, store_insn->rtl ());
2054}
2055
2056// Return true if LOAD may be modified by STORE. Make sure we keep
2057// within our BUDGET for alias analysis.
2058static bool
2059load_modified_by_store_p (insn_info *load,
2060 insn_info *store,
2061 int &budget)
2062{
2063 gcc_checking_assert (budget >= 0);
2064
2065 if (!budget)
2066 {
2067 if (dump_file)
2068 {
2069 fprintf (stream: dump_file,
2070 format: "exceeded budget, assuming load %d aliases with store %d\n",
2071 load->uid (), store->uid ());
2072 }
2073 return true;
2074 }
2075
2076 // It isn't safe to re-order stores over calls.
2077 if (CALL_P (load->rtl ()))
2078 return true;
2079
2080 budget--;
2081
2082 // Iterate over all MEMs in the load, seeing if any alias with
2083 // our store.
2084 subrtx_var_iterator::array_type array;
2085 rtx pat = PATTERN (insn: load->rtl ());
2086 FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST)
2087 if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ()))
2088 return true;
2089
2090 return false;
2091}
2092
2093// Implement some common functionality used by both store_walker
2094// and load_walker.
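// The REVERSE template parameter selects the direction of the walk: with
// REVERSE false we walk forwards in program order, stopping at LIMIT, and
// with REVERSE true we walk backwards towards LIMIT.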
2095template<bool reverse>
2096class def_walker : public alias_walker
2097{
2098protected:
2099 using def_iter_t = typename std::conditional<reverse,
2100 reverse_def_iterator, def_iterator>::type;
2101
2102 static use_info *start_use_chain (def_iter_t &def_iter)
2103 {
2104 set_info *set = nullptr;
2105 for (; *def_iter; def_iter++)
2106 {
2107 set = dyn_cast<set_info *> (*def_iter);
2108 if (!set)
2109 continue;
2110
2111 use_info *use = reverse
2112 ? set->last_nondebug_insn_use ()
2113 : set->first_nondebug_insn_use ();
2114
2115 if (use)
2116 return use;
2117 }
2118
2119 return nullptr;
2120 }
2121
2122 def_iter_t def_iter;
2123 insn_info *limit;
2124
2125 // Array of register uses from the candidate insn which occur in MEMs.
2126 use_array cand_addr_uses;
2127
2128 def_walker (def_info *def, insn_info *limit, use_array addr_uses) :
2129 def_iter (def), limit (limit), cand_addr_uses (addr_uses) {}
2130
2131 virtual bool iter_valid () const { return *def_iter; }
2132
2133 // Implemented in {load,store}_walker.
2134 virtual bool alias_conflict_p (int &budget) const = 0;
2135
2136 // Return true if the current (walking) INSN () uses a register R inside a
2137 // MEM, where R is also used inside a MEM by the (static) candidate insn, and
2138 // those uses see different definitions of that register. In this case we
2139 // can't rely on RTL alias analysis, and for now we conservatively assume that
2140 // there is an alias conflict. See PR116783.
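  //
  // A hypothetical sketch of the situation: the candidate insn reads [x1]
  // under one definition of x1, while the insn we are walking over accesses
  // [x1] after x1 has been redefined.  Comparing the two MEM addresses
  // syntactically then tells us nothing, so we give up and assume a conflict.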
2141 bool addr_reg_conflict_p () const
2142 {
2143 use_array curr_insn_uses = insn ()->uses ();
2144 auto cand_use_iter = cand_addr_uses.begin ();
2145 auto insn_use_iter = curr_insn_uses.begin ();
2146 while (cand_use_iter != cand_addr_uses.end ()
2147 && insn_use_iter != curr_insn_uses.end ())
2148 {
2149 auto insn_use = *insn_use_iter;
2150 auto cand_use = *cand_use_iter;
2151 if (insn_use->regno () > cand_use->regno ())
2152 cand_use_iter++;
2153 else if (insn_use->regno () < cand_use->regno ())
2154 insn_use_iter++;
2155 else
2156 {
2157 // As it stands I believe the alias code (memory_modified_in_insn_p)
2158 // doesn't look at insn notes such as REG_EQU{IV,AL}, so it should
2159 // be safe to skip over uses that only occur in notes.
2160 if (insn_use->includes_address_uses ()
2161 && !insn_use->only_occurs_in_notes ()
2162 && insn_use->def () != cand_use->def ())
2163 {
2164 if (dump_file)
2165 {
2166 fprintf (stream: dump_file,
2167 format: "assuming aliasing of cand i%d and i%d:\n"
2168 "-> insns see different defs of common addr reg r%u\n"
2169 "-> ",
2170 cand_use->insn ()->uid (), insn_use->insn ()->uid (),
2171 insn_use->regno ());
2172
2173 // Note that while the following sequence could be made more
2174 // concise by eliding pp_string calls into the pp_printf
2175 // calls, doing so triggers -Wformat-diag.
2176 pretty_printer pp;
2177 pp_string (&pp, "[");
2178 pp_access (&pp, cand_use, flags: 0);
2179 pp_string (&pp, "] in ");
2180 pp_printf (&pp, "i%d", cand_use->insn ()->uid ());
2181 pp_string (&pp, " vs [");
2182 pp_access (&pp, insn_use, flags: 0);
2183 pp_string (&pp, "] in ");
2184 pp_printf (&pp, "i%d", insn_use->insn ()->uid ());
2185 fprintf (stream: dump_file, format: "%s\n", pp_formatted_text (&pp));
2186 }
2187 return true;
2188 }
2189
2190 cand_use_iter++;
2191 insn_use_iter++;
2192 }
2193 }
2194
2195 return false;
2196 }
2197
2198public:
2199 insn_info *insn () const override { return (*def_iter)->insn (); }
2200 void advance () override { def_iter++; }
2201 bool valid () const override final
2202 {
2203 if (!iter_valid ())
2204 return false;
2205
2206 if (reverse)
2207 return *(insn ()) > *limit;
2208 else
2209 return *(insn ()) < *limit;
2210 }
2211
2212 bool conflict_p (int &budget) const override final
2213 {
2214 if (addr_reg_conflict_p ())
2215 return true;
2216
2217 return alias_conflict_p (budget);
2218 }
2219};
2220
2221// alias_walker that iterates over stores.
2222template<bool reverse, typename InsnPredicate>
2223class store_walker : public def_walker<reverse>
2224{
2225 rtx cand_mem;
2226 InsnPredicate tombstone_p;
2227
2228public:
2229 store_walker (def_info *mem_def, rtx mem,
2230 use_array addr_uses,
2231 insn_info *limit_insn,
2232 InsnPredicate tombstone_fn) :
2233 def_walker<reverse> (mem_def, limit_insn, addr_uses),
2234 cand_mem (mem), tombstone_p (tombstone_fn) {}
2235
2236 bool alias_conflict_p (int &budget) const override final
2237 {
2238 if (tombstone_p (this->insn ()))
2239 return false;
2240
2241 return store_modifies_mem_p (cand_mem, this->insn (), budget);
2242 }
2243};
2244
2245// alias_walker that iterates over loads.
2246template<bool reverse>
2247class load_walker : public def_walker<reverse>
2248{
2249 using Base = def_walker<reverse>;
2250 using use_iter_t = typename std::conditional<reverse,
2251 reverse_use_iterator, nondebug_insn_use_iterator>::type;
2252
2253 use_iter_t use_iter;
2254 insn_info *cand_store;
2255
2256 bool iter_valid () const override final { return *use_iter; }
2257
2258public:
2259 void advance () override final
2260 {
2261 use_iter++;
2262 if (*use_iter)
2263 return;
2264 this->def_iter++;
2265 use_iter = Base::start_use_chain (this->def_iter);
2266 }
2267
2268 insn_info *insn () const override final
2269 {
2270 return (*use_iter)->insn ();
2271 }
2272
2273 bool alias_conflict_p (int &budget) const override final
2274 {
2275 return load_modified_by_store_p (insn (), cand_store, budget);
2276 }
2277
2278 load_walker (def_info *def, insn_info *store, use_array addr_uses,
2279 insn_info *limit_insn)
2280 : Base (def, limit_insn, addr_uses),
2281 use_iter (Base::start_use_chain (this->def_iter)),
2282 cand_store (store) {}
2283};
2284
2285// Process our alias_walkers in a round-robin fashion, proceeding until
2286// nothing more can be learned from alias analysis.
2287//
2288// We try to maintain the invariant that if a walker becomes invalid, we
2289// set its pointer to null.
2290void
2291pair_fusion::do_alias_analysis (insn_info *alias_hazards[4],
2292 alias_walker *walkers[4],
2293 bool load_p)
2294{
2295 const int n_walkers = 2 + (2 * !load_p);
2296 int budget = pair_mem_alias_check_limit ();
2297
2298 auto next_walker = [walkers,n_walkers](int current) -> int {
2299 for (int j = 1; j <= n_walkers; j++)
2300 {
2301 int idx = (current + j) % n_walkers;
2302 if (walkers[idx])
2303 return idx;
2304 }
2305 return -1;
2306 };
2307
2308 int i = -1;
2309 for (int j = 0; j < n_walkers; j++)
2310 {
2311 alias_hazards[j] = nullptr;
2312 if (!walkers[j])
2313 continue;
2314
2315 if (!walkers[j]->valid ())
2316 walkers[j] = nullptr;
2317 else if (i == -1)
2318 i = j;
2319 }
2320
2321 while (i >= 0)
2322 {
2323 int insn_i = i % 2;
2324 int paired_i = (i & 2) + !insn_i;
2325 int pair_fst = (i & 2);
2326 int pair_snd = (i & 2) + 1;
2327
2328 if (walkers[i]->conflict_p (budget))
2329 {
2330 alias_hazards[i] = walkers[i]->insn ();
2331
2332 // We got an aliasing conflict for this {load,store} walker,
2333 // so we don't need to walk any further.
2334 walkers[i] = nullptr;
2335
2336 // If we have a pair of alias conflicts that prevent
2337 // forming the pair, stop. There's no need to do further
2338 // analysis.
2339 if (alias_hazards[paired_i]
2340 && (*alias_hazards[pair_fst] <= *alias_hazards[pair_snd]))
2341 return;
2342
2343 if (!load_p)
2344 {
2345 int other_pair_fst = (pair_fst ? 0 : 2);
2346 int other_paired_i = other_pair_fst + !insn_i;
2347
2348 int x_pair_fst = (i == pair_fst) ? i : other_paired_i;
2349 int x_pair_snd = (i == pair_fst) ? other_paired_i : i;
2350
2351 // Similarly, handle the case where we have a {load,store}
2352 // or {store,load} alias hazard pair that prevents forming
2353 // the pair.
2354 if (alias_hazards[other_paired_i]
2355 && *alias_hazards[x_pair_fst] <= *alias_hazards[x_pair_snd])
2356 return;
2357 }
2358 }
2359
2360 if (walkers[i])
2361 {
2362 walkers[i]->advance ();
2363
2364 if (!walkers[i]->valid ())
2365 walkers[i] = nullptr;
2366 }
2367
2368 i = next_walker (i);
2369 }
2370}
2371
2372// Given INSNS (in program order) which are known to be adjacent, look
2373// to see if either insn has a suitable RTL (register) base that we can
2374// use to form a pair. Push these to BASE_CANDS if we find any. CAND_MEMs
2375// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the
2376// size of a single candidate access, and REVERSED says whether the accesses
2377// are inverted in offset order.
2378//
2379// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
2380// addressing.
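//
// As an illustrative example of the offset check below: with ACCESS_SIZE == 8
// and accesses at [base, #16] and [base, #24], the lower access yields
// 16/8 == 2 and the upper access yields 24/8 - 1 == 2, so both describe the
// same pair offset, which is then validated with pair_mem_in_range_p.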
2381int
2382pair_fusion::get_viable_bases (insn_info *insns[2],
2383 vec<base_cand> &base_cands,
2384 rtx cand_mems[2],
2385 unsigned access_size,
2386 bool reversed)
2387{
2388 // We discovered this pair through a common base. Need to ensure that
2389 // we have a common base register that is live at both locations.
2390 def_info *base_defs[2] = {};
2391 int writeback = 0;
2392 for (int i = 0; i < 2; i++)
2393 {
2394 const bool is_lower = (i == reversed);
2395 poly_int64 poly_off;
2396 rtx base = pair_mem_strip_offset (mem: cand_mems[i], offset: &poly_off);
2397 if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
2398 writeback |= (1 << i);
2399
2400 if (!REG_P (base) || !poly_off.is_constant ())
2401 continue;
2402
2403 // Punt on accesses relative to eliminable regs. See the comment in
2404 // pair_fusion_bb_info::track_access for a detailed explanation of this.
2405 if (!reload_completed
2406 && (REGNO (base) == FRAME_POINTER_REGNUM
2407 || REGNO (base) == ARG_POINTER_REGNUM))
2408 continue;
2409
2410 HOST_WIDE_INT base_off = poly_off.to_constant ();
2411
2412 // It should be unlikely that we ever punt here, since MEM_EXPR offset
2413 // alignment should be a good proxy for register offset alignment.
2414 if (base_off % access_size != 0)
2415 {
2416 if (dump_file)
2417 fprintf (stream: dump_file,
2418 format: "base not viable, offset misaligned (insn %d)\n",
2419 insns[i]->uid ());
2420 continue;
2421 }
2422
2423 base_off /= access_size;
2424
2425 if (!is_lower)
2426 base_off--;
2427
2428 if (!pair_mem_in_range_p (offset: base_off))
2429 continue;
2430
2431 use_info *use = find_access (accesses: insns[i]->uses (), REGNO (base));
2432 gcc_assert (use);
2433 base_defs[i] = use->def ();
2434 }
2435
2436 if (!base_defs[0] && !base_defs[1])
2437 {
2438 if (dump_file)
2439 fprintf (stream: dump_file, format: "no viable base register for pair (%d,%d)\n",
2440 insns[0]->uid (), insns[1]->uid ());
2441 return writeback;
2442 }
2443
2444 for (int i = 0; i < 2; i++)
2445 if ((writeback & (1 << i)) && !base_defs[i])
2446 {
2447 if (dump_file)
2448 fprintf (stream: dump_file, format: "insn %d has writeback but base isn't viable\n",
2449 insns[i]->uid ());
2450 return writeback;
2451 }
2452
2453 if (writeback == 3
2454 && base_defs[0]->regno () != base_defs[1]->regno ())
2455 {
2456 if (dump_file)
2457 fprintf (stream: dump_file,
2458 format: "pair (%d,%d): double writeback with distinct regs (%d,%d): "
2459 "punting\n",
2460 insns[0]->uid (), insns[1]->uid (),
2461 base_defs[0]->regno (), base_defs[1]->regno ());
2462 return writeback;
2463 }
2464
2465 if (base_defs[0] && base_defs[1]
2466 && base_defs[0]->regno () == base_defs[1]->regno ())
2467 {
2468 // Easy case: insns already share the same base reg.
2469 base_cands.quick_push (obj: base_defs[0]);
2470 return writeback;
2471 }
2472
2473 // Otherwise, we know that one of the bases must change.
2474 //
2475 // Note that if there is writeback we must use the writeback base
2476 // (we know now there is exactly one).
2477 for (int i = 0; i < 2; i++)
2478 if (base_defs[i] && (!writeback || (writeback & (1 << i))))
2479 base_cands.quick_push (obj: base_cand { base_defs[i], i });
2480
2481 return writeback;
2482}
2483
2484// Given two adjacent memory accesses of the same size, I1 and I2, try
2485// and see if we can merge them into a paired access.
2486//
2487// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
2488// if the accesses are both loads, otherwise they are both stores.
2489bool
2490pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
2491 insn_info *i1, insn_info *i2)
2492{
2493 if (dump_file)
2494 fprintf (stream: dump_file, format: "analyzing pair (load=%d): (%d,%d)\n",
2495 load_p, i1->uid (), i2->uid ());
2496
2497 insn_info *insns[2];
2498 bool reversed = false;
2499 if (*i1 < *i2)
2500 {
2501 insns[0] = i1;
2502 insns[1] = i2;
2503 }
2504 else
2505 {
2506 insns[0] = i2;
2507 insns[1] = i1;
2508 reversed = true;
2509 }
2510
2511 rtx cand_mems[2];
2512 rtx reg_ops[2];
2513 rtx pats[2];
2514 for (int i = 0; i < 2; i++)
2515 {
2516 pats[i] = PATTERN (insn: insns[i]->rtl ());
2517 cand_mems[i] = XEXP (pats[i], load_p);
2518 reg_ops[i] = XEXP (pats[i], !load_p);
2519 }
2520
2521 if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
2522 {
2523 if (dump_file)
2524 fprintf (stream: dump_file,
2525 format: "punting on load pair due to reg conflicts (%d,%d)\n",
2526 insns[0]->uid (), insns[1]->uid ());
2527 return false;
2528 }
2529
2530 if (cfun->can_throw_non_call_exceptions
2531 && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX)
2532 && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
2533 {
2534 if (dump_file)
2535 fprintf (stream: dump_file,
2536 format: "can't combine insns with EH side effects (%d,%d)\n",
2537 insns[0]->uid (), insns[1]->uid ());
2538 return false;
2539 }
2540
2541 auto_vec<base_cand, 2> base_cands (2);
2542
2543 int writeback = m_pass->get_viable_bases (insns, base_cands, cand_mems,
2544 access_size, reversed);
2545 if (base_cands.is_empty ())
2546 {
2547 if (dump_file)
2548 fprintf (stream: dump_file, format: "no viable base for pair (%d,%d)\n",
2549 insns[0]->uid (), insns[1]->uid ());
2550 return false;
2551 }
2552
2553 // Punt on frame-related insns with writeback. We probably won't see
2554 // these in practice, but this is conservative and ensures we don't
2555 // have to worry about these later on.
2556 if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ())
2557 || RTX_FRAME_RELATED_P (i2->rtl ())))
2558 {
2559 if (dump_file)
2560 fprintf (stream: dump_file,
2561 format: "rejecting pair (%d,%d): frame-related insn with writeback\n",
2562 i1->uid (), i2->uid ());
2563 return false;
2564 }
2565
2566 rtx *ignore = &XEXP (pats[1], load_p);
2567 for (auto use : insns[1]->uses ())
2568 if (!use->is_mem ()
2569 && refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore)
2570 && use->def () && use->def ()->insn () == insns[0])
2571 {
2572 // N.B. we allow a true dependence on the base address, as this
2573 // happens in the case of auto-inc accesses. Consider a post-increment
2574 // load followed by a regular indexed load, for example.
2575 if (dump_file)
2576 fprintf (stream: dump_file,
2577 format: "%d has non-address true dependence on %d, rejecting pair\n",
2578 insns[1]->uid (), insns[0]->uid ());
2579 return false;
2580 }
2581
2582 unsigned i = 0;
2583 while (i < base_cands.length ())
2584 {
2585 base_cand &cand = base_cands[i];
2586
2587 rtx *ignore[2] = {};
2588 for (int j = 0; j < 2; j++)
2589 if (cand.from_insn == !j)
2590 ignore[j] = &XEXP (cand_mems[j], 0);
2591
2592 insn_info *h = first_hazard_after (insn: insns[0], ignore: ignore[0]);
2593 if (h && *h < *insns[1])
2594 cand.hazards[0] = h;
2595
2596 h = latest_hazard_before (insn: insns[1], ignore: ignore[1]);
2597 if (h && *h > *insns[0])
2598 cand.hazards[1] = h;
2599
2600 if (!cand.viable ())
2601 {
2602 if (dump_file)
2603 fprintf (stream: dump_file,
2604 format: "pair (%d,%d): rejecting base %d due to dataflow "
2605 "hazards (%d,%d)\n",
2606 insns[0]->uid (),
2607 insns[1]->uid (),
2608 cand.def->regno (),
2609 cand.hazards[0]->uid (),
2610 cand.hazards[1]->uid ());
2611
2612 base_cands.ordered_remove (ix: i);
2613 }
2614 else
2615 i++;
2616 }
2617
2618 if (base_cands.is_empty ())
2619 {
2620 if (dump_file)
2621 fprintf (stream: dump_file,
2622 format: "can't form pair (%d,%d) due to dataflow hazards\n",
2623 insns[0]->uid (), insns[1]->uid ());
2624 return false;
2625 }
2626
2627 insn_info *alias_hazards[4] = {};
2628
2629 // First def of memory after the first insn, and last def of memory
2630 // before the second insn, respectively.
2631 def_info *mem_defs[2] = {};
2632 if (load_p)
2633 {
2634 if (!MEM_READONLY_P (cand_mems[0]))
2635 {
2636 mem_defs[0] = memory_access (accesses: insns[0]->uses ())->def ();
2637 gcc_checking_assert (mem_defs[0]);
2638 mem_defs[0] = mem_defs[0]->next_def ();
2639 }
2640 if (!MEM_READONLY_P (cand_mems[1]))
2641 {
2642 mem_defs[1] = memory_access (accesses: insns[1]->uses ())->def ();
2643 gcc_checking_assert (mem_defs[1]);
2644 }
2645 }
2646 else
2647 {
2648 mem_defs[0] = memory_access (accesses: insns[0]->defs ())->next_def ();
2649 mem_defs[1] = memory_access (accesses: insns[1]->defs ())->prev_def ();
2650 gcc_checking_assert (mem_defs[0]);
2651 gcc_checking_assert (mem_defs[1]);
2652 }
2653
2654 auto tombstone_p = [&](insn_info *insn) -> bool {
2655 return m_emitted_tombstone
2656 && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
2657 };
2658
2659 auto_vec<access_info *, 2> addr_use_vec[2];
2660 use_array addr_uses[2];
2661
2662 // Collect the lists of register uses that occur in the candidate MEMs.
2663 for (int i = 0; i < 2; i++)
2664 {
2665 // N.B. it's safe for us to ignore uses that only occur in notes
2666 // here (e.g. in a REG_EQUIV expression) since we only pass the
2667 // MEM down to the alias machinery, so it can't see any insn-level
2668 // notes.
2669 for (auto use : insns[i]->uses ())
2670 if (use->is_reg ()
2671 && use->includes_address_uses ()
2672 && !use->only_occurs_in_notes ())
2673 addr_use_vec[i].safe_push (obj: use);
2674
2675 addr_uses[i] = use_array (addr_use_vec[i]);
2676 }
2677
2678 store_walker<false, decltype(tombstone_p)>
2679 forward_store_walker (mem_defs[0], cand_mems[0], addr_uses[0], insns[1],
2680 tombstone_p);
2681
2682 store_walker<true, decltype(tombstone_p)>
2683 backward_store_walker (mem_defs[1], cand_mems[1], addr_uses[1], insns[0],
2684 tombstone_p);
2685
2686 alias_walker *walkers[4] = {};
2687 if (mem_defs[0])
2688 walkers[0] = &forward_store_walker;
2689 if (mem_defs[1])
2690 walkers[1] = &backward_store_walker;
2691
2692 if (load_p && (mem_defs[0] || mem_defs[1]))
2693 m_pass->do_alias_analysis (alias_hazards, walkers, load_p);
2694 else
2695 {
2696 // We want to find any loads hanging off the first store.
2697 mem_defs[0] = memory_access (accesses: insns[0]->defs ());
2698 load_walker<false> forward_load_walker (mem_defs[0], insns[0],
2699 addr_uses[0], insns[1]);
2700 load_walker<true> backward_load_walker (mem_defs[1], insns[1],
2701 addr_uses[1], insns[0]);
2702 walkers[2] = &forward_load_walker;
2703 walkers[3] = &backward_load_walker;
2704 m_pass->do_alias_analysis (alias_hazards, walkers, load_p);
2705 // Now consolidate hazards back down.
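      // That is, keep the earliest hazard in the forward direction and the
      // latest hazard in the backward direction, whichever walker found it.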
2706 if (alias_hazards[2]
2707 && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
2708 alias_hazards[0] = alias_hazards[2];
2709
2710 if (alias_hazards[3]
2711 && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
2712 alias_hazards[1] = alias_hazards[3];
2713 }
2714
2715 if (alias_hazards[0] && alias_hazards[1]
2716 && *alias_hazards[0] <= *alias_hazards[1])
2717 {
2718 if (dump_file)
2719 fprintf (stream: dump_file,
2720 format: "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
2721 i1->uid (), i2->uid (),
2722 alias_hazards[0]->uid (), alias_hazards[1]->uid ());
2723 return false;
2724 }
2725
2726 // Now narrow the hazards on each base candidate using
2727 // the alias hazards.
2728 i = 0;
2729 while (i < base_cands.length ())
2730 {
2731 base_cand &cand = base_cands[i];
2732 if (alias_hazards[0] && (!cand.hazards[0]
2733 || *alias_hazards[0] < *cand.hazards[0]))
2734 cand.hazards[0] = alias_hazards[0];
2735 if (alias_hazards[1] && (!cand.hazards[1]
2736 || *alias_hazards[1] > *cand.hazards[1]))
2737 cand.hazards[1] = alias_hazards[1];
2738
2739 if (cand.viable ())
2740 i++;
2741 else
2742 {
2743 if (dump_file)
2744 fprintf (stream: dump_file, format: "pair (%d,%d): rejecting base %d due to "
2745 "alias/dataflow hazards (%d,%d)",
2746 insns[0]->uid (), insns[1]->uid (),
2747 cand.def->regno (),
2748 cand.hazards[0]->uid (),
2749 cand.hazards[1]->uid ());
2750
2751 base_cands.ordered_remove (ix: i);
2752 }
2753 }
2754
2755 if (base_cands.is_empty ())
2756 {
2757 if (dump_file)
2758 fprintf (stream: dump_file,
2759 format: "cannot form pair (%d,%d) due to alias/dataflow hazards\n",
2760 insns[0]->uid (), insns[1]->uid ());
2761
2762 return false;
2763 }
2764
2765 base_cand *base = &base_cands[0];
2766 if (base_cands.length () > 1)
2767 {
2768 // If there are still multiple viable bases, it makes sense
2769 // to choose one that allows us to reduce register pressure:
2770 // for loads this means moving further down, for stores this
2771 // means moving further up.
2772 gcc_checking_assert (base_cands.length () == 2);
2773 const int hazard_i = !load_p;
2774 if (base->hazards[hazard_i])
2775 {
2776 if (!base_cands[1].hazards[hazard_i])
2777 base = &base_cands[1];
2778 else if (load_p
2779 && *base_cands[1].hazards[hazard_i]
2780 > *(base->hazards[hazard_i]))
2781 base = &base_cands[1];
2782 else if (!load_p
2783 && *base_cands[1].hazards[hazard_i]
2784 < *(base->hazards[hazard_i]))
2785 base = &base_cands[1];
2786 }
2787 }
2788
2789 // Otherwise, hazards[0] > hazards[1].
2790 // Pair can be formed anywhere in (hazards[1], hazards[0]).
2791 insn_range_info range (insns[0], insns[1]);
2792 if (base->hazards[1])
2793 range.first = base->hazards[1];
2794 if (base->hazards[0])
2795 range.last = base->hazards[0]->prev_nondebug_insn ();
2796
2797 // If the second insn can throw, narrow the move range to exactly that insn.
2798 // This prevents us trying to move the second insn from the end of the BB.
2799 if (cfun->can_throw_non_call_exceptions
2800 && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
2801 {
2802 gcc_assert (range.includes (insns[1]));
2803 range = insn_range_info (insns[1]);
2804 }
2805
2806 // Placement strategy: push loads down and pull stores up; this should
2807 // help register pressure by reducing live ranges.
2808 if (load_p)
2809 range.first = range.last;
2810 else
2811 range.last = range.first;
2812
2813 if (dump_file)
2814 {
2815 auto print_hazard = [](insn_info *i)
2816 {
2817 if (i)
2818 fprintf (stream: dump_file, format: "%d", i->uid ());
2819 else
2820 fprintf (stream: dump_file, format: "-");
2821 };
2822 auto print_pair = [print_hazard](insn_info **i)
2823 {
2824 print_hazard (i[0]);
2825 fprintf (stream: dump_file, format: ",");
2826 print_hazard (i[1]);
2827 };
2828
2829 fprintf (stream: dump_file, format: "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
2830 load_p, insns[0]->uid (), insns[1]->uid (),
2831 base->def->regno ());
2832 print_pair (base->hazards);
2833 fprintf (stream: dump_file, format: "), move_range: (%d,%d)\n",
2834 range.first->uid (), range.last->uid ());
2835 }
2836
2837 return fuse_pair (load_p, access_size, writeback,
2838 i1, i2, base&: *base, move_range: range);
2839}
2840
2841static void
2842dump_insn_list (FILE *f, const insn_list_t &l)
2843{
2844 fprintf (stream: f, format: "(");
2845
2846 auto i = l.begin ();
2847 auto end = l.end ();
2848
2849 if (i != end)
2850     fprintf (stream: f, format: "%d", (*i++)->uid ());
2852
2853 for (; i != end; i++)
2854 fprintf (stream: f, format: ", %d", (*i)->uid ());
2855
2856 fprintf (stream: f, format: ")");
2857}
2858
2859DEBUG_FUNCTION void
2860debug (const insn_list_t &l)
2861{
2862 dump_insn_list (stderr, l);
2863 fprintf (stderr, format: "\n");
2864}
2865
2866// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all insns
2867// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST.
2868//
2869// This function traverses the resulting 2D matrix of possible pair candidates
2870// and attempts to merge them into pairs.
2871//
2872// The algorithm is straightforward: if we consider a combined list of
2873// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program order,
2874// then we advance through X until we reach a crossing point (where X[i] and
2875// X[i+1] come from different source lists).
2876//
2877// At this point we know X[i] and X[i+1] are adjacent accesses, and we try to
2878// fuse them into a pair. If this succeeds, we remove X[i] and X[i+1] from
2879// their original lists and continue as above.
2880//
2881// In the failure case, we advance through the source list containing X[i] and
2882// continue as above (proceeding to the next crossing point).
2883//
2884// The rationale for skipping over groups of consecutive candidates from the
2885// same source list is as follows:
2886//
2887// In the store case, the insns in the group can't be re-ordered over each
2888// other as they are guaranteed to store to the same location, so we're
2889// guaranteed not to lose opportunities by doing this.
2890//
2891// In the load case, subsequent loads from the same location are either
2892// redundant (in which case they should have been cleaned up by an earlier
2893// optimization pass) or there is an intervening aliasing hazard, in which case
2894// we can't re-order them anyway, so provided earlier passes have cleaned up
2895// redundant loads, we shouldn't miss opportunities by doing this.
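//
// For example (illustrative): with LEFT_LIST = (i1, i3) and
// RIGHT_LIST = (i2, i4) in program order, the merged order is i1, i2, i3, i4
// and the first crossing point is (i1, i2).  If fusing i1 and i2 succeeds,
// both are removed and the next attempt is (i3, i4); if it fails, we advance
// past i1 and the next crossing point considered is (i2, i3).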
2896void
2897pair_fusion_bb_info::merge_pairs (insn_list_t &left_list,
2898 insn_list_t &right_list,
2899 bool load_p,
2900 unsigned access_size)
2901{
2902 if (dump_file)
2903 {
2904 fprintf (stream: dump_file, format: "merge_pairs [L=%d], cand vecs ", load_p);
2905 dump_insn_list (f: dump_file, l: left_list);
2906 fprintf (stream: dump_file, format: " x ");
2907 dump_insn_list (f: dump_file, l: right_list);
2908 fprintf (stream: dump_file, format: "\n");
2909 }
2910
2911 auto iter_l = left_list.begin ();
2912 auto iter_r = right_list.begin ();
2913
2914 while (iter_l != left_list.end () && iter_r != right_list.end ())
2915 {
2916 auto next_l = std::next (x: iter_l);
2917 auto next_r = std::next (x: iter_r);
2918 if (**iter_l < **iter_r
2919 && next_l != left_list.end ()
2920 && **next_l < **iter_r)
2921 iter_l = next_l;
2922 else if (**iter_r < **iter_l
2923 && next_r != right_list.end ()
2924 && **next_r < **iter_l)
2925 iter_r = next_r;
2926 else if (try_fuse_pair (load_p, access_size, i1: *iter_l, i2: *iter_r))
2927 {
2928 left_list.erase (position: iter_l);
2929 iter_l = next_l;
2930 right_list.erase (position: iter_r);
2931 iter_r = next_r;
2932 }
2933 else if (**iter_l < **iter_r)
2934 iter_l = next_l;
2935 else
2936 iter_r = next_r;
2937 }
2938}
2939
2940// Iterate over the accesses in GROUP, looking for adjacent sets
2941// of accesses. If we find two sets of adjacent accesses, call
2942// merge_pairs.
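//
// For example (illustrative): if a group contains candidates at offsets
// 0, 8 and 16 with an access size of 8, we call merge_pairs for the
// candidates at offsets (0, 8) and then, provided the offset-8 candidates
// were not all consumed by the first call, for those at (8, 16).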
2943void
2944pair_fusion_bb_info::transform_for_base (int encoded_lfs,
2945 access_group &group)
2946{
2947 const auto lfs = decode_lfs (lfs: encoded_lfs);
2948 const unsigned access_size = lfs.size;
2949
2950 bool skip_next = true;
2951 access_record *prev_access = nullptr;
2952
2953 for (auto &access : group.list)
2954 {
2955 if (skip_next)
2956 skip_next = false;
2957 else if (known_eq (access.offset, prev_access->offset + access_size))
2958 {
2959 merge_pairs (left_list&: prev_access->cand_insns,
2960 right_list&: access.cand_insns,
2961 load_p: lfs.load_p,
2962 access_size);
2963 skip_next = access.cand_insns.empty ();
2964 }
2965 prev_access = &access;
2966 }
2967}
2968
2969// If we emitted tombstone insns for this BB, iterate through the BB
2970// and remove all the tombstone insns, being sure to reparent any uses
2971// of mem to previous defs when we do this.
2972void
2973pair_fusion_bb_info::cleanup_tombstones ()
2974{
2975 // No need to do anything if we didn't emit a tombstone insn for this BB.
2976 if (!m_emitted_tombstone)
2977 return;
2978
2979 for (auto insn : iterate_safely (range: m_bb->nondebug_insns ()))
2980 {
2981 if (!insn->is_real ()
2982 || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()))
2983 continue;
2984
2985 auto set = as_a<set_info *> (p: memory_access (accesses: insn->defs ()));
2986 if (set->has_any_uses ())
2987 {
2988 auto prev_set = as_a<set_info *> (p: set->prev_def ());
2989 while (set->first_use ())
2990 crtl->ssa->reparent_use (use: set->first_use (), new_def: prev_set);
2991 }
2992
2993 // Now set has no uses, we can delete it.
2994 insn_change change (insn, insn_change::DELETE);
2995 crtl->ssa->change_insn (change);
2996 }
2997}
2998
2999template<typename Map>
3000void
3001pair_fusion_bb_info::traverse_base_map (Map &map)
3002{
3003 for (auto kv : map)
3004 {
3005 const auto &key = kv.first;
3006 auto &value = kv.second;
3007 transform_for_base (encoded_lfs: key.second, group&: value);
3008 }
3009}
3010
3011void
3012pair_fusion_bb_info::transform ()
3013{
3014 traverse_base_map (map&: expr_map);
3015 traverse_base_map (map&: def_map);
3016}
3017
3018// Given an existing pair insn INSN, look for a trailing update of
3019// the base register which we can fold in to make this pair use
3020// a writeback addressing mode.
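//
// A minimal illustration (assuming an aarch64-like target):
//
//   ldp  x0, x1, [x2]
//   add  x2, x2, #16
//   -->
//   ldp  x0, x1, [x2], #16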
3021void
3022pair_fusion::try_promote_writeback (insn_info *insn, bool load_p)
3023{
3024 rtx regs[2];
3025
3026 rtx mem = destructure_pair (regs, pattern: PATTERN (insn: insn->rtl ()), load_p);
3027 gcc_checking_assert (MEM_P (mem));
3028
3029 poly_int64 offset;
3030 rtx base = strip_offset (XEXP (mem, 0), &offset);
3031 gcc_assert (REG_P (base));
3032
3033 const auto access_size = GET_MODE_SIZE (GET_MODE (mem)).to_constant () / 2;
3034
3035 if (find_access (accesses: insn->defs (), REGNO (base)))
3036 {
3037 gcc_assert (load_p);
3038 if (dump_file)
3039 fprintf (stream: dump_file,
3040 format: "ldp %d clobbers base r%d, can't promote to writeback\n",
3041 insn->uid (), REGNO (base));
3042 return;
3043 }
3044
3045 auto base_use = find_access (accesses: insn->uses (), REGNO (base));
3046 gcc_assert (base_use);
3047
3048 if (!base_use->def ())
3049 {
3050 if (dump_file)
3051 fprintf (stream: dump_file,
3052 format: "found pair (i%d, L=%d): but base r%d is upwards exposed\n",
3053 insn->uid (), load_p, REGNO (base));
3054 return;
3055 }
3056
3057 auto base_def = base_use->def ();
3058
3059 rtx wb_effect = NULL_RTX;
3060 def_info *add_def;
3061 const insn_range_info pair_range (insn);
3062 insn_info *insns[2] = { nullptr, insn };
3063 insn_info *trailing_add
3064 = find_trailing_add (insns, pair_range, initial_writeback: 0, writeback_effect: &wb_effect,
3065 add_def: &add_def, base_def, initial_offset: offset,
3066 access_size);
3067 if (!trailing_add)
3068 return;
3069
3070 auto attempt = crtl->ssa->new_change_attempt ();
3071
3072 insn_change pair_change (insn);
3073 insn_change del_change (trailing_add, insn_change::DELETE);
3074 insn_change *changes[] = { &pair_change, &del_change };
3075
3076 rtx pair_pat = gen_promote_writeback_pair (wb_effect, mem, regs, load_p);
3077 validate_unshare_change (insn->rtl (), &PATTERN (insn: insn->rtl ()), pair_pat,
3078 true);
3079
3080 // The pair must gain the def of the base register from the add.
3081 pair_change.new_defs = insert_access (watermark&: attempt,
3082 access1: add_def,
3083 accesses2: pair_change.new_defs);
3084 gcc_assert (pair_change.new_defs.is_valid ());
3085
3086 auto ignore = ignore_changing_insns (changes);
3087 for (unsigned i = 0; i < ARRAY_SIZE (changes); i++)
3088 gcc_assert (changes[i]->move_range.singleton ()
3089 && rtl_ssa::restrict_movement (*changes[i], ignore));
3090
3091 if (!rtl_ssa::recog (watermark&: attempt, change&: pair_change, ignore))
3092 {
3093 if (dump_file)
3094 fprintf (stream: dump_file, format: "i%d: recog failed on wb pair, bailing out\n",
3095 insn->uid ());
3096 cancel_changes (0);
3097 return;
3098 }
3099
3100 gcc_assert (crtl->ssa->verify_insn_changes (changes));
3101
3102 if (MAY_HAVE_DEBUG_INSNS)
3103 fixup_debug_uses_trailing_add (attempt, pair_dst: insn, trailing_add, writeback_effect: wb_effect);
3104
3105 confirm_change_group ();
3106 crtl->ssa->change_insns (changes);
3107}
3108
3109// Main function for the pass. Iterate over the insns in BB looking
3110// for load/store candidates. If running after RA, also try and promote
3111// non-writeback pairs to use writeback addressing. Then try to fuse
3112// candidates into pairs.
3113void pair_fusion::process_block (bb_info *bb)
3114{
3115 const bool track_loads = track_loads_p ();
3116 const bool track_stores = track_stores_p ();
3117
3118 pair_fusion_bb_info bb_state (bb, this);
3119
3120 for (auto insn : bb->nondebug_insns ())
3121 {
3122 rtx_insn *rti = insn->rtl ();
3123
3124 if (!rti || !INSN_P (rti))
3125 continue;
3126
3127 rtx pat = PATTERN (insn: rti);
3128 bool load_p;
3129 if (reload_completed
3130 && should_handle_writeback (which: writeback_type::ALL)
3131 && pair_mem_insn_p (insn: rti, load_p))
3132 try_promote_writeback (insn, load_p);
3133
3134 if (GET_CODE (pat) != SET)
3135 continue;
3136
3137 if (track_stores && MEM_P (XEXP (pat, 0)))
3138 bb_state.track_access (insn, load_p: false, XEXP (pat, 0));
3139 else if (track_loads && MEM_P (XEXP (pat, 1)))
3140 bb_state.track_access (insn, load_p: true, XEXP (pat, 1));
3141 }
3142
3143 bb_state.transform ();
3144 bb_state.cleanup_tombstones ();
3145}
3146
