1 | //===- SILoadStoreOptimizer.cpp -------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// This pass tries to fuse DS instructions with close-by immediate offsets.
10 | // This will fuse operations such as |
11 | // ds_read_b32 v0, v2 offset:16 |
12 | // ds_read_b32 v1, v2 offset:32 |
13 | // ==> |
14 | // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 |
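// (read2/write2 offsets are encoded in units of the element size, so the
// byte offsets 16 and 32 above become offset0:4 and offset1:8 for b32
// elements.)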
15 | // |
16 | // The same is done for certain SMEM and VMEM opcodes, e.g.: |
17 | // s_buffer_load_dword s4, s[0:3], 4 |
18 | // s_buffer_load_dword s5, s[0:3], 8 |
19 | // ==> |
20 | // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 |
21 | // |
// This pass also tries to promote a constant offset into the immediate by
// adjusting the base. It tries to use a base from the nearby instructions
// that allows it to have a 13-bit constant offset, which is then promoted
// into the immediate.
26 | // E.g. |
27 | // s_movk_i32 s0, 0x1800 |
28 | // v_add_co_u32_e32 v0, vcc, s0, v2 |
29 | // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc |
30 | // |
31 | // s_movk_i32 s0, 0x1000 |
32 | // v_add_co_u32_e32 v5, vcc, s0, v2 |
33 | // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
34 | // global_load_dwordx2 v[5:6], v[5:6], off |
35 | // global_load_dwordx2 v[0:1], v[0:1], off |
36 | // => |
37 | // s_movk_i32 s0, 0x1000 |
38 | // v_add_co_u32_e32 v5, vcc, s0, v2 |
39 | // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
40 | // global_load_dwordx2 v[5:6], v[5:6], off |
41 | // global_load_dwordx2 v[0:1], v[5:6], off offset:2048 |
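// (The two bases differ by 0x800 bytes, so the second load reuses the
// 0x1000-based address in v[5:6] and folds the 0x800 difference into the
// immediate as offset:2048.)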
42 | // |
43 | // Future improvements: |
44 | // |
45 | // - This is currently missing stores of constants because loading |
46 | // the constant into the data register is placed between the stores, although |
47 | // this is arguably a scheduling problem. |
48 | // |
49 | // - Live interval recomputing seems inefficient. This currently only matches |
50 | // one pair, and recomputes live intervals and moves on to the next pair. It |
51 | // would be better to compute a list of all merges that need to occur. |
52 | // |
// - With a list of instructions to process, we can also merge more. If a
// cluster of loads have offsets that are too large to fit in the 8-bit
// offsets, but are close enough to each other to fit in 8 bits, we can add
// to the base pointer and use the new reduced offsets.
57 | // |
58 | //===----------------------------------------------------------------------===// |
59 | |
60 | #include "AMDGPU.h" |
61 | #include "GCNSubtarget.h" |
62 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
63 | #include "llvm/Analysis/AliasAnalysis.h" |
64 | #include "llvm/CodeGen/MachineFunctionPass.h" |
65 | #include "llvm/InitializePasses.h" |
66 | |
67 | using namespace llvm; |
68 | |
69 | #define DEBUG_TYPE "si-load-store-opt" |
70 | |
71 | namespace { |
72 | enum InstClassEnum { |
73 | UNKNOWN, |
74 | DS_READ, |
75 | DS_WRITE, |
76 | S_BUFFER_LOAD_IMM, |
77 | S_BUFFER_LOAD_SGPR_IMM, |
78 | S_LOAD_IMM, |
79 | BUFFER_LOAD, |
80 | BUFFER_STORE, |
81 | MIMG, |
82 | TBUFFER_LOAD, |
83 | TBUFFER_STORE, |
84 | GLOBAL_LOAD_SADDR, |
85 | GLOBAL_STORE_SADDR, |
86 | FLAT_LOAD, |
87 | FLAT_STORE, |
88 | GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of |
89 | GLOBAL_STORE // any CombineInfo, they are only ever returned by |
90 | // getCommonInstClass. |
91 | }; |
92 | |
93 | struct AddressRegs { |
94 | unsigned char NumVAddrs = 0; |
95 | bool SBase = false; |
96 | bool SRsrc = false; |
97 | bool SOffset = false; |
98 | bool SAddr = false; |
99 | bool VAddr = false; |
100 | bool Addr = false; |
101 | bool SSamp = false; |
102 | }; |
103 | |
104 | // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. |
105 | const unsigned MaxAddressRegs = 12 + 1 + 1; |
106 | |
107 | class SILoadStoreOptimizer : public MachineFunctionPass { |
108 | struct CombineInfo { |
109 | MachineBasicBlock::iterator I; |
110 | unsigned EltSize; |
111 | unsigned Offset; |
112 | unsigned Width; |
113 | unsigned Format; |
114 | unsigned BaseOff; |
115 | unsigned DMask; |
116 | InstClassEnum InstClass; |
117 | unsigned CPol = 0; |
118 | bool IsAGPR; |
119 | bool UseST64; |
120 | int AddrIdx[MaxAddressRegs]; |
121 | const MachineOperand *AddrReg[MaxAddressRegs]; |
122 | unsigned NumAddresses; |
123 | unsigned Order; |
124 | |
125 | bool hasSameBaseAddress(const CombineInfo &CI) { |
126 | if (NumAddresses != CI.NumAddresses) |
127 | return false; |
128 | |
129 | const MachineInstr &MI = *CI.I; |
130 | for (unsigned i = 0; i < NumAddresses; i++) { |
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 | |
133 | if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { |
134 | if (AddrReg[i]->isImm() != AddrRegNext.isImm() || |
135 | AddrReg[i]->getImm() != AddrRegNext.getImm()) { |
136 | return false; |
137 | } |
138 | continue; |
139 | } |
140 | |
141 | // Check same base pointer. Be careful of subregisters, which can occur |
142 | // with vectors of pointers. |
143 | if (AddrReg[i]->getReg() != AddrRegNext.getReg() || |
144 | AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { |
145 | return false; |
146 | } |
147 | } |
148 | return true; |
149 | } |
150 | |
151 | bool hasMergeableAddress(const MachineRegisterInfo &MRI) { |
152 | for (unsigned i = 0; i < NumAddresses; ++i) { |
153 | const MachineOperand *AddrOp = AddrReg[i]; |
154 | // Immediates are always OK. |
155 | if (AddrOp->isImm()) |
156 | continue; |
157 | |
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
161 | if (!AddrOp->isReg()) |
162 | return false; |
163 | |
164 | // TODO: We should be able to merge instructions with other physical reg |
165 | // addresses too. |
166 | if (AddrOp->getReg().isPhysical() && |
167 | AddrOp->getReg() != AMDGPU::SGPR_NULL) |
168 | return false; |
169 | |
170 | // If an address has only one use then there will be no other |
171 | // instructions with the same address, so we can't merge this one. |
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173 | return false; |
174 | } |
175 | return true; |
176 | } |
177 | |
178 | void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); |
179 | |
180 | // Compare by pointer order. |
181 | bool operator<(const CombineInfo& Other) const { |
182 | return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset; |
183 | } |
184 | }; |
185 | |
186 | struct BaseRegisters { |
187 | Register LoReg; |
188 | Register HiReg; |
189 | |
190 | unsigned LoSubReg = 0; |
191 | unsigned HiSubReg = 0; |
192 | }; |
193 | |
194 | struct MemAddress { |
195 | BaseRegisters Base; |
196 | int64_t Offset = 0; |
197 | }; |
198 | |
199 | using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; |
200 | |
201 | private: |
202 | const GCNSubtarget *STM = nullptr; |
203 | const SIInstrInfo *TII = nullptr; |
204 | const SIRegisterInfo *TRI = nullptr; |
205 | MachineRegisterInfo *MRI = nullptr; |
206 | AliasAnalysis *AA = nullptr; |
207 | bool OptimizeAgain; |
208 | |
209 | bool canSwapInstructions(const DenseSet<Register> &ARegDefs, |
210 | const DenseSet<Register> &ARegUses, |
211 | const MachineInstr &A, const MachineInstr &B) const; |
212 | static bool dmasksCanBeCombined(const CombineInfo &CI, |
213 | const SIInstrInfo &TII, |
214 | const CombineInfo &Paired); |
215 | static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, |
216 | CombineInfo &Paired, bool Modify = false); |
217 | static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, |
218 | const CombineInfo &Paired); |
219 | static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); |
220 | static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, |
221 | const CombineInfo &Paired); |
222 | const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, |
223 | const CombineInfo &Paired); |
224 | const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; |
225 | |
226 | CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); |
227 | |
228 | unsigned read2Opcode(unsigned EltSize) const; |
229 | unsigned read2ST64Opcode(unsigned EltSize) const; |
230 | MachineBasicBlock::iterator |
231 | mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, |
232 | MachineBasicBlock::iterator InsertBefore); |
233 | |
234 | unsigned write2Opcode(unsigned EltSize) const; |
235 | unsigned write2ST64Opcode(unsigned EltSize) const; |
236 | MachineBasicBlock::iterator |
237 | mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, |
238 | MachineBasicBlock::iterator InsertBefore); |
239 | MachineBasicBlock::iterator |
240 | mergeImagePair(CombineInfo &CI, CombineInfo &Paired, |
241 | MachineBasicBlock::iterator InsertBefore); |
242 | MachineBasicBlock::iterator |
243 | mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, |
244 | MachineBasicBlock::iterator InsertBefore); |
245 | MachineBasicBlock::iterator |
246 | mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, |
247 | MachineBasicBlock::iterator InsertBefore); |
248 | MachineBasicBlock::iterator |
249 | mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, |
250 | MachineBasicBlock::iterator InsertBefore); |
251 | MachineBasicBlock::iterator |
252 | mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, |
253 | MachineBasicBlock::iterator InsertBefore); |
254 | MachineBasicBlock::iterator |
255 | mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, |
256 | MachineBasicBlock::iterator InsertBefore); |
257 | MachineBasicBlock::iterator |
258 | mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, |
259 | MachineBasicBlock::iterator InsertBefore); |
260 | MachineBasicBlock::iterator |
261 | mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, |
262 | MachineBasicBlock::iterator InsertBefore); |
263 | |
264 | void updateBaseAndOffset(MachineInstr &I, Register NewBase, |
265 | int32_t NewOffset) const; |
266 | Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; |
267 | MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; |
268 | std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; |
269 | void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; |
270 | /// Promotes constant offset to the immediate by adjusting the base. It |
271 | /// tries to use a base from the nearby instructions that allows it to have |
  /// a 13-bit constant offset which gets promoted to the immediate.
273 | bool promoteConstantOffsetToImm(MachineInstr &CI, |
274 | MemInfoMap &Visited, |
275 | SmallPtrSet<MachineInstr *, 4> &Promoted) const; |
276 | void addInstToMergeableList(const CombineInfo &CI, |
277 | std::list<std::list<CombineInfo> > &MergeableInsts) const; |
278 | |
279 | std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( |
280 | MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, |
281 | MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, |
282 | std::list<std::list<CombineInfo>> &MergeableInsts) const; |
283 | |
284 | static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, |
285 | const CombineInfo &Paired); |
286 | |
287 | static InstClassEnum getCommonInstClass(const CombineInfo &CI, |
288 | const CombineInfo &Paired); |
289 | |
290 | public: |
291 | static char ID; |
292 | |
293 | SILoadStoreOptimizer() : MachineFunctionPass(ID) { |
294 | initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); |
295 | } |
296 | |
297 | bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, |
298 | bool &OptimizeListAgain); |
299 | bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); |
300 | |
301 | bool runOnMachineFunction(MachineFunction &MF) override; |
302 | |
  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
304 | |
305 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
306 | AU.setPreservesCFG(); |
307 | AU.addRequired<AAResultsWrapperPass>(); |
308 | |
309 | MachineFunctionPass::getAnalysisUsage(AU); |
310 | } |
311 | |
312 | MachineFunctionProperties getRequiredProperties() const override { |
313 | return MachineFunctionProperties() |
314 | .set(MachineFunctionProperties::Property::IsSSA); |
315 | } |
316 | }; |
317 | |
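// Return the width of \p MI's memory access in dwords (dmask channels for
// image instructions), or 0 for opcodes this pass does not handle.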
318 | static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { |
319 | const unsigned Opc = MI.getOpcode(); |
320 | |
  if (TII.isMUBUF(Opc)) {
322 | // FIXME: Handle d16 correctly |
323 | return AMDGPU::getMUBUFElements(Opc); |
324 | } |
325 | if (TII.isImage(MI)) { |
326 | uint64_t DMaskImm = |
327 | TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); |
    return llvm::popcount(DMaskImm);
329 | } |
  if (TII.isMTBUF(Opc)) {
331 | return AMDGPU::getMTBUFElements(Opc); |
332 | } |
333 | |
334 | switch (Opc) { |
335 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
336 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
337 | case AMDGPU::S_LOAD_DWORD_IMM: |
338 | case AMDGPU::GLOBAL_LOAD_DWORD: |
339 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
340 | case AMDGPU::GLOBAL_STORE_DWORD: |
341 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
342 | case AMDGPU::FLAT_LOAD_DWORD: |
343 | case AMDGPU::FLAT_STORE_DWORD: |
344 | return 1; |
345 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
346 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
347 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
348 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
349 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
350 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
351 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
352 | case AMDGPU::FLAT_LOAD_DWORDX2: |
353 | case AMDGPU::FLAT_STORE_DWORDX2: |
354 | return 2; |
355 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
356 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
357 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
358 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
359 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
360 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
361 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
362 | case AMDGPU::FLAT_LOAD_DWORDX3: |
363 | case AMDGPU::FLAT_STORE_DWORDX3: |
364 | return 3; |
365 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
366 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
367 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
368 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
369 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
370 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
371 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
372 | case AMDGPU::FLAT_LOAD_DWORDX4: |
373 | case AMDGPU::FLAT_STORE_DWORDX4: |
374 | return 4; |
375 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
376 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
377 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
378 | return 8; |
379 | case AMDGPU::DS_READ_B32: [[fallthrough]]; |
380 | case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]]; |
381 | case AMDGPU::DS_WRITE_B32: [[fallthrough]]; |
382 | case AMDGPU::DS_WRITE_B32_gfx9: |
383 | return 1; |
384 | case AMDGPU::DS_READ_B64: [[fallthrough]]; |
385 | case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]]; |
386 | case AMDGPU::DS_WRITE_B64: [[fallthrough]]; |
387 | case AMDGPU::DS_WRITE_B64_gfx9: |
388 | return 2; |
389 | default: |
390 | return 0; |
391 | } |
392 | } |
393 | |
394 | /// Maps instruction opcode to enum InstClassEnum. |
395 | static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { |
396 | switch (Opc) { |
397 | default: |
    if (TII.isMUBUF(Opc)) {
399 | switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { |
400 | default: |
401 | return UNKNOWN; |
402 | case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN: |
403 | case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact: |
404 | case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: |
405 | case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: |
406 | case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: |
407 | case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: |
408 | case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: |
409 | case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: |
410 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN: |
411 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact: |
412 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN: |
413 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact: |
414 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: |
415 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: |
416 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET: |
417 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact: |
418 | return BUFFER_LOAD; |
419 | case AMDGPU::BUFFER_STORE_DWORD_BOTHEN: |
420 | case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact: |
421 | case AMDGPU::BUFFER_STORE_DWORD_IDXEN: |
422 | case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: |
423 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN: |
424 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: |
425 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET: |
426 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: |
427 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN: |
428 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact: |
429 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN: |
430 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact: |
431 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN: |
432 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact: |
433 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET: |
434 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact: |
435 | return BUFFER_STORE; |
436 | } |
437 | } |
    if (TII.isImage(Opc)) {
439 | // Ignore instructions encoded without vaddr. |
440 | if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && |
441 | !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) |
442 | return UNKNOWN; |
443 | // Ignore BVH instructions |
444 | if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) |
445 | return UNKNOWN; |
446 | // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. |
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
449 | return UNKNOWN; |
450 | return MIMG; |
451 | } |
    if (TII.isMTBUF(Opc)) {
453 | switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { |
454 | default: |
455 | return UNKNOWN; |
456 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: |
457 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: |
458 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: |
459 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: |
460 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: |
461 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: |
462 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: |
463 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: |
464 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN: |
465 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact: |
466 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN: |
467 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact: |
468 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN: |
469 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact: |
470 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET: |
471 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact: |
472 | return TBUFFER_LOAD; |
473 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: |
474 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: |
475 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: |
476 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: |
477 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN: |
478 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact: |
479 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET: |
480 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact: |
481 | return TBUFFER_STORE; |
482 | } |
483 | } |
484 | return UNKNOWN; |
485 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
486 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
487 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
488 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
489 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
490 | return S_BUFFER_LOAD_IMM; |
491 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
492 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
493 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
494 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
495 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
496 | return S_BUFFER_LOAD_SGPR_IMM; |
497 | case AMDGPU::S_LOAD_DWORD_IMM: |
498 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
499 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
500 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
501 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
502 | return S_LOAD_IMM; |
503 | case AMDGPU::DS_READ_B32: |
504 | case AMDGPU::DS_READ_B32_gfx9: |
505 | case AMDGPU::DS_READ_B64: |
506 | case AMDGPU::DS_READ_B64_gfx9: |
507 | return DS_READ; |
508 | case AMDGPU::DS_WRITE_B32: |
509 | case AMDGPU::DS_WRITE_B32_gfx9: |
510 | case AMDGPU::DS_WRITE_B64: |
511 | case AMDGPU::DS_WRITE_B64_gfx9: |
512 | return DS_WRITE; |
513 | case AMDGPU::GLOBAL_LOAD_DWORD: |
514 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
515 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
516 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
517 | case AMDGPU::FLAT_LOAD_DWORD: |
518 | case AMDGPU::FLAT_LOAD_DWORDX2: |
519 | case AMDGPU::FLAT_LOAD_DWORDX3: |
520 | case AMDGPU::FLAT_LOAD_DWORDX4: |
521 | return FLAT_LOAD; |
522 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
523 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
524 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
525 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
526 | return GLOBAL_LOAD_SADDR; |
527 | case AMDGPU::GLOBAL_STORE_DWORD: |
528 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
529 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
530 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
531 | case AMDGPU::FLAT_STORE_DWORD: |
532 | case AMDGPU::FLAT_STORE_DWORDX2: |
533 | case AMDGPU::FLAT_STORE_DWORDX3: |
534 | case AMDGPU::FLAT_STORE_DWORDX4: |
535 | return FLAT_STORE; |
536 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
537 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
538 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
539 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
540 | return GLOBAL_STORE_SADDR; |
541 | } |
542 | } |
543 | |
544 | /// Determines instruction subclass from opcode. Only instructions |
545 | /// of the same subclass can be merged together. The merged instruction may have |
546 | /// a different subclass but must have the same class. |
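/// E.g. two FLAT_LOAD_DWORDs merge into a FLAT_LOAD_DWORDX2, which still has
/// subclass FLAT_LOAD_DWORD and class FLAT_LOAD, so the result remains a
/// candidate for further merging.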
547 | static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { |
548 | switch (Opc) { |
549 | default: |
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
559 | return -1; |
560 | case AMDGPU::DS_READ_B32: |
561 | case AMDGPU::DS_READ_B32_gfx9: |
562 | case AMDGPU::DS_READ_B64: |
563 | case AMDGPU::DS_READ_B64_gfx9: |
564 | case AMDGPU::DS_WRITE_B32: |
565 | case AMDGPU::DS_WRITE_B32_gfx9: |
566 | case AMDGPU::DS_WRITE_B64: |
567 | case AMDGPU::DS_WRITE_B64_gfx9: |
568 | return Opc; |
569 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
570 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
571 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
572 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
573 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
574 | return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; |
575 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
576 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
577 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
578 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
579 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
580 | return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; |
581 | case AMDGPU::S_LOAD_DWORD_IMM: |
582 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
583 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
584 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
585 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
586 | return AMDGPU::S_LOAD_DWORD_IMM; |
587 | case AMDGPU::GLOBAL_LOAD_DWORD: |
588 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
589 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
590 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
591 | case AMDGPU::FLAT_LOAD_DWORD: |
592 | case AMDGPU::FLAT_LOAD_DWORDX2: |
593 | case AMDGPU::FLAT_LOAD_DWORDX3: |
594 | case AMDGPU::FLAT_LOAD_DWORDX4: |
595 | return AMDGPU::FLAT_LOAD_DWORD; |
596 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
597 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
598 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
599 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
600 | return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; |
601 | case AMDGPU::GLOBAL_STORE_DWORD: |
602 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
603 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
604 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
605 | case AMDGPU::FLAT_STORE_DWORD: |
606 | case AMDGPU::FLAT_STORE_DWORDX2: |
607 | case AMDGPU::FLAT_STORE_DWORDX3: |
608 | case AMDGPU::FLAT_STORE_DWORDX4: |
609 | return AMDGPU::FLAT_STORE_DWORD; |
610 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
611 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
612 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
613 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
614 | return AMDGPU::GLOBAL_STORE_DWORD_SADDR; |
615 | } |
616 | } |
617 | |
618 | // GLOBAL loads and stores are classified as FLAT initially. If both combined |
619 | // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. |
// If either or both instructions are non-segment-specific FLAT the resulting
621 | // combined operation will be FLAT, potentially promoting one of the GLOBAL |
622 | // operations to FLAT. |
623 | // For other instructions return the original unmodified class. |
624 | InstClassEnum |
625 | SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, |
626 | const CombineInfo &Paired) { |
627 | assert(CI.InstClass == Paired.InstClass); |
628 | |
629 | if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && |
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
631 | return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; |
632 | |
633 | return CI.InstClass; |
634 | } |
635 | |
636 | static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { |
637 | AddressRegs Result; |
638 | |
  if (TII.isMUBUF(Opc)) {
640 | if (AMDGPU::getMUBUFHasVAddr(Opc)) |
641 | Result.VAddr = true; |
642 | if (AMDGPU::getMUBUFHasSrsrc(Opc)) |
643 | Result.SRsrc = true; |
644 | if (AMDGPU::getMUBUFHasSoffset(Opc)) |
645 | Result.SOffset = true; |
646 | |
647 | return Result; |
648 | } |
649 | |
  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
668 | if (AMDGPU::getMTBUFHasVAddr(Opc)) |
669 | Result.VAddr = true; |
670 | if (AMDGPU::getMTBUFHasSrsrc(Opc)) |
671 | Result.SRsrc = true; |
672 | if (AMDGPU::getMTBUFHasSoffset(Opc)) |
673 | Result.SOffset = true; |
674 | |
675 | return Result; |
676 | } |
677 | |
678 | switch (Opc) { |
679 | default: |
680 | return Result; |
681 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
682 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
683 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
684 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
685 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
686 | Result.SOffset = true; |
687 | [[fallthrough]]; |
688 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
689 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
690 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
691 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
692 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
693 | case AMDGPU::S_LOAD_DWORD_IMM: |
694 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
695 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
696 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
697 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
698 | Result.SBase = true; |
699 | return Result; |
700 | case AMDGPU::DS_READ_B32: |
701 | case AMDGPU::DS_READ_B64: |
702 | case AMDGPU::DS_READ_B32_gfx9: |
703 | case AMDGPU::DS_READ_B64_gfx9: |
704 | case AMDGPU::DS_WRITE_B32: |
705 | case AMDGPU::DS_WRITE_B64: |
706 | case AMDGPU::DS_WRITE_B32_gfx9: |
707 | case AMDGPU::DS_WRITE_B64_gfx9: |
708 | Result.Addr = true; |
709 | return Result; |
710 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
711 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
712 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
713 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
714 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
715 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
716 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
717 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
718 | Result.SAddr = true; |
719 | [[fallthrough]]; |
720 | case AMDGPU::GLOBAL_LOAD_DWORD: |
721 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
722 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
723 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
724 | case AMDGPU::GLOBAL_STORE_DWORD: |
725 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
726 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
727 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
728 | case AMDGPU::FLAT_LOAD_DWORD: |
729 | case AMDGPU::FLAT_LOAD_DWORDX2: |
730 | case AMDGPU::FLAT_LOAD_DWORDX3: |
731 | case AMDGPU::FLAT_LOAD_DWORDX4: |
732 | case AMDGPU::FLAT_STORE_DWORD: |
733 | case AMDGPU::FLAT_STORE_DWORDX2: |
734 | case AMDGPU::FLAT_STORE_DWORDX3: |
735 | case AMDGPU::FLAT_STORE_DWORDX4: |
736 | Result.VAddr = true; |
737 | return Result; |
738 | } |
739 | } |
740 | |
741 | void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, |
742 | const SILoadStoreOptimizer &LSO) { |
743 | I = MI; |
744 | unsigned Opc = MI->getOpcode(); |
  InstClass = getInstClass(Opc, *LSO.TII);
746 | |
747 | if (InstClass == UNKNOWN) |
748 | return; |
749 | |
  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
751 | |
752 | switch (InstClass) { |
753 | case DS_READ: |
754 | EltSize = |
755 | (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 |
756 | : 4; |
757 | break; |
758 | case DS_WRITE: |
759 | EltSize = |
760 | (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 |
761 | : 4; |
762 | break; |
763 | case S_BUFFER_LOAD_IMM: |
764 | case S_BUFFER_LOAD_SGPR_IMM: |
765 | case S_LOAD_IMM: |
766 | EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); |
767 | break; |
768 | default: |
769 | EltSize = 4; |
770 | break; |
771 | } |
772 | |
773 | if (InstClass == MIMG) { |
774 | DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); |
775 | // Offset is not considered for MIMG instructions. |
776 | Offset = 0; |
777 | } else { |
778 | int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); |
    Offset = I->getOperand(OffsetIdx).getImm();
780 | } |
781 | |
782 | if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) |
783 | Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); |
784 | |
  Width = getOpcodeWidth(*I, *LSO.TII);
786 | |
787 | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { |
788 | Offset &= 0xffff; |
789 | } else if (InstClass != MIMG) { |
790 | CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); |
791 | } |
792 | |
  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
795 | |
796 | NumAddresses = 0; |
797 | for (unsigned J = 0; J < Regs.NumVAddrs; J++) |
798 | AddrIdx[NumAddresses++] = |
799 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; |
800 | if (Regs.Addr) |
801 | AddrIdx[NumAddresses++] = |
802 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); |
803 | if (Regs.SBase) |
804 | AddrIdx[NumAddresses++] = |
805 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); |
806 | if (Regs.SRsrc) |
807 | AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( |
808 | Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc); |
809 | if (Regs.SOffset) |
810 | AddrIdx[NumAddresses++] = |
811 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); |
812 | if (Regs.SAddr) |
813 | AddrIdx[NumAddresses++] = |
814 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); |
815 | if (Regs.VAddr) |
816 | AddrIdx[NumAddresses++] = |
817 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); |
818 | if (Regs.SSamp) |
819 | AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( |
820 | Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp); |
821 | assert(NumAddresses <= MaxAddressRegs); |
822 | |
823 | for (unsigned J = 0; J < NumAddresses; J++) |
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
825 | } |
826 | |
827 | } // end anonymous namespace. |
828 | |
829 | INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, |
830 | "SI Load Store Optimizer" , false, false) |
831 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
833 | false, false) |
834 | |
835 | char SILoadStoreOptimizer::ID = 0; |
836 | |
837 | char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; |
838 | |
839 | FunctionPass *llvm::createSILoadStoreOptimizerPass() { |
840 | return new SILoadStoreOptimizer(); |
841 | } |
842 | |
843 | static void addDefsUsesToList(const MachineInstr &MI, |
844 | DenseSet<Register> &RegDefs, |
845 | DenseSet<Register> &RegUses) { |
846 | for (const auto &Op : MI.operands()) { |
847 | if (!Op.isReg()) |
848 | continue; |
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
853 | } |
854 | } |
855 | |
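// Check if it is safe to move \p B across \p A: the two must not access
// aliasing memory unless both only load, \p B must not read or redefine a
// register that \p A defines (\p ARegDefs), and \p B must not define a
// register that \p A reads (\p ARegUses).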
856 | bool SILoadStoreOptimizer::canSwapInstructions( |
857 | const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, |
858 | const MachineInstr &A, const MachineInstr &B) const { |
859 | if (A.mayLoadOrStore() && B.mayLoadOrStore() && |
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
861 | return false; |
862 | for (const auto &BOp : B.operands()) { |
863 | if (!BOp.isReg()) |
864 | continue; |
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
868 | return false; |
869 | } |
870 | return true; |
871 | } |
872 | |
// Given that \p CI and \p Paired are adjacent memory operations, produce a new
874 | // MMO for the combined operation with a new access size. |
875 | MachineMemOperand * |
876 | SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, |
877 | const CombineInfo &Paired) { |
878 | const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); |
879 | const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); |
880 | |
881 | unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue(); |
882 | |
883 | // A base pointer for the combined operation is the same as the leading |
884 | // operation's pointer. |
885 | if (Paired < CI) |
    std::swap(MMOa, MMOb);
887 | |
888 | MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); |
889 | // If merging FLAT and GLOBAL set address space to FLAT. |
890 | if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) |
891 | PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; |
892 | |
893 | MachineFunction *MF = CI.I->getMF(); |
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
895 | } |
896 | |
897 | bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, |
898 | const SIInstrInfo &TII, |
899 | const CombineInfo &Paired) { |
900 | assert(CI.InstClass == MIMG); |
901 | |
902 | // Ignore instructions with tfe/lwe set. |
903 | const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); |
904 | const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); |
905 | |
906 | if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) |
907 | return false; |
908 | |
909 | // Check other optional immediate operands for equality. |
910 | unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, |
911 | AMDGPU::OpName::unorm, AMDGPU::OpName::da, |
912 | AMDGPU::OpName::r128, AMDGPU::OpName::a16}; |
913 | |
914 | for (auto op : OperandsToMatch) { |
915 | int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); |
916 | if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) |
917 | return false; |
918 | if (Idx != -1 && |
919 | CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) |
920 | return false; |
921 | } |
922 | |
923 | // Check DMask for overlaps. |
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
926 | |
927 | if (!MaxMask) |
928 | return false; |
929 | |
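  // The two dmasks are combinable only if every bit of the smaller mask lies
  // below the lowest set bit of the larger one: e.g. 0b0011 and 0b1100
  // combine (1 << countr_zero(0b1100) == 4 > 0b0011), while 0b0011 and
  // 0b0101 do not (1 << countr_zero(0b0101) == 1 <= 0b0011).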
  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
931 | if ((1u << AllowedBitsForMin) <= MinMask) |
932 | return false; |
933 | |
934 | return true; |
935 | } |
936 | |
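// Return a buffer format that keeps \p OldFormat's bits per component and
// numeric format but has \p ComponentCount components, or 0 if no such
// format exists.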
937 | static unsigned getBufferFormatWithCompCount(unsigned OldFormat, |
938 | unsigned ComponentCount, |
939 | const GCNSubtarget &STI) { |
940 | if (ComponentCount > 4) |
941 | return 0; |
942 | |
943 | const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = |
944 | llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); |
945 | if (!OldFormatInfo) |
946 | return 0; |
947 | |
948 | const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = |
949 | llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, |
950 | ComponentCount, |
951 | OldFormatInfo->NumFormat, STI); |
952 | |
953 | if (!NewFormatInfo) |
954 | return 0; |
955 | |
956 | assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && |
957 | NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); |
958 | |
959 | return NewFormatInfo->Format; |
960 | } |
961 | |
962 | // Return the value in the inclusive range [Lo,Hi] that is aligned to the |
963 | // highest power of two. Note that the result is well defined for all inputs |
964 | // including corner cases like: |
965 | // - if Lo == Hi, return that value |
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
967 | // - if Lo > Hi, return 0 (as if the range wrapped around) |
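// E.g. mostAlignedValueInRange(5, 15) == 8: (5 - 1) ^ 15 == 0b1011 has 28
// leading zeros as a uint32_t, so the mask keeps the top 29 bits, and
// 15 & 0xfffffff8 == 8.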
968 | static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { |
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
970 | } |
971 | |
972 | bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, |
973 | const GCNSubtarget &STI, |
974 | CombineInfo &Paired, |
975 | bool Modify) { |
976 | assert(CI.InstClass != MIMG); |
977 | |
978 | // XXX - Would the same offset be OK? Is there any reason this would happen or |
979 | // be useful? |
980 | if (CI.Offset == Paired.Offset) |
981 | return false; |
982 | |
983 | // This won't be valid if the offset isn't aligned. |
984 | if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) |
985 | return false; |
986 | |
987 | if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { |
988 | |
989 | const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = |
990 | llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI); |
991 | if (!Info0) |
992 | return false; |
993 | const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = |
994 | llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI); |
995 | if (!Info1) |
996 | return false; |
997 | |
998 | if (Info0->BitsPerComp != Info1->BitsPerComp || |
999 | Info0->NumFormat != Info1->NumFormat) |
1000 | return false; |
1001 | |
1002 | // TODO: Should be possible to support more formats, but if format loads |
1003 | // are not dword-aligned, the merged load might not be valid. |
1004 | if (Info0->BitsPerComp != 32) |
1005 | return false; |
1006 | |
    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width,
                                     STI) == 0)
1008 | return false; |
1009 | } |
1010 | |
1011 | uint32_t EltOffset0 = CI.Offset / CI.EltSize; |
1012 | uint32_t EltOffset1 = Paired.Offset / CI.EltSize; |
1013 | CI.UseST64 = false; |
1014 | CI.BaseOff = 0; |
1015 | |
1016 | // Handle all non-DS instructions. |
1017 | if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { |
1018 | if (EltOffset0 + CI.Width != EltOffset1 && |
1019 | EltOffset1 + Paired.Width != EltOffset0) |
1020 | return false; |
1021 | if (CI.CPol != Paired.CPol) |
1022 | return false; |
1023 | if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || |
1024 | CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { |
1025 | // Reject cases like: |
1026 | // dword + dwordx2 -> dwordx3 |
1027 | // dword + dwordx3 -> dwordx4 |
1028 | // If we tried to combine these cases, we would fail to extract a subreg |
1029 | // for the result of the second load due to SGPR alignment requirements. |
1030 | if (CI.Width != Paired.Width && |
1031 | (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)) |
1032 | return false; |
1033 | } |
1034 | return true; |
1035 | } |
1036 | |
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
1038 | // the stride 64 versions. |
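  // E.g. element offsets 256 and 320 do not fit in 8 bits, but both are
  // multiples of 64, and 256/64 == 4 and 320/64 == 5 do fit, so the ST64
  // form can encode the pair.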
1039 | if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && |
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1041 | if (Modify) { |
1042 | CI.Offset = EltOffset0 / 64; |
1043 | Paired.Offset = EltOffset1 / 64; |
1044 | CI.UseST64 = true; |
1045 | } |
1046 | return true; |
1047 | } |
1048 | |
1049 | // Check if the new offsets fit in the reduced 8-bit range. |
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1051 | if (Modify) { |
1052 | CI.Offset = EltOffset0; |
1053 | Paired.Offset = EltOffset1; |
1054 | } |
1055 | return true; |
1056 | } |
1057 | |
1058 | // Try to shift base address to decrease offsets. |
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
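  // Mask == 0x3fc0, so this branch fires when the two offsets differ by a
  // multiple of 64 that still fits in 8 bits after the /64 scaling below.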
1063 | if (((Max - Min) & ~Mask) == 0) { |
1064 | if (Modify) { |
1065 | // From the range of values we could use for BaseOff, choose the one that |
1066 | // is aligned to the highest power of two, to maximise the chance that |
1067 | // the same offset can be reused for other load/store pairs. |
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1072 | CI.BaseOff = BaseOff * CI.EltSize; |
1073 | CI.Offset = (EltOffset0 - BaseOff) / 64; |
1074 | Paired.Offset = (EltOffset1 - BaseOff) / 64; |
1075 | CI.UseST64 = true; |
1076 | } |
1077 | return true; |
1078 | } |
1079 | |
  if (isUInt<8>(Max - Min)) {
1081 | if (Modify) { |
1082 | // From the range of values we could use for BaseOff, choose the one that |
1083 | // is aligned to the highest power of two, to maximise the chance that |
1084 | // the same offset can be reused for other load/store pairs. |
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1086 | CI.BaseOff = BaseOff * CI.EltSize; |
1087 | CI.Offset = EltOffset0 - BaseOff; |
1088 | Paired.Offset = EltOffset1 - BaseOff; |
1089 | } |
1090 | return true; |
1091 | } |
1092 | |
1093 | return false; |
1094 | } |
1095 | |
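// Check that the total width of the merged access is encodable: the default
// classes allow up to dwordx4 (dwordx3 only when the subtarget supports it),
// while the scalar load classes allow widths 2, 4, and 8 (and 3 when the
// subtarget has scalar dwordx3 loads).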
1096 | bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, |
1097 | const CombineInfo &CI, |
1098 | const CombineInfo &Paired) { |
1099 | const unsigned Width = (CI.Width + Paired.Width); |
1100 | switch (CI.InstClass) { |
1101 | default: |
1102 | return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); |
1103 | case S_BUFFER_LOAD_IMM: |
1104 | case S_BUFFER_LOAD_SGPR_IMM: |
1105 | case S_LOAD_IMM: |
1106 | switch (Width) { |
1107 | default: |
1108 | return false; |
1109 | case 2: |
1110 | case 4: |
1111 | case 8: |
1112 | return true; |
1113 | case 3: |
1114 | return STM.hasScalarDwordx3Loads(); |
1115 | } |
1116 | } |
1117 | } |
1118 | |
1119 | const TargetRegisterClass * |
1120 | SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { |
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
1136 | return nullptr; |
1137 | } |
1138 | |
1139 | /// This function assumes that CI comes before Paired in a basic block. Return |
1140 | /// an insertion point for the merged instruction or nullptr on failure. |
1141 | SILoadStoreOptimizer::CombineInfo * |
1142 | SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, |
1143 | CombineInfo &Paired) { |
1144 | // If another instruction has already been merged into CI, it may now be a |
1145 | // type that we can't do any further merging into. |
1146 | if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) |
1147 | return nullptr; |
1148 | assert(CI.InstClass == Paired.InstClass); |
1149 | |
  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
1152 | return nullptr; |
1153 | |
1154 | // Check both offsets (or masks for MIMG) can be combined and fit in the |
1155 | // reduced range. |
1156 | if (CI.InstClass == MIMG) { |
    if (!dmasksCanBeCombined(CI, *TII, Paired))
1158 | return nullptr; |
1159 | } else { |
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1161 | return nullptr; |
1162 | } |
1163 | |
1164 | DenseSet<Register> RegDefs; |
1165 | DenseSet<Register> RegUses; |
1166 | CombineInfo *Where; |
1167 | if (CI.I->mayLoad()) { |
1168 | // Try to hoist Paired up to CI. |
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1172 | return nullptr; |
1173 | } |
1174 | Where = &CI; |
1175 | } else { |
1176 | // Try to sink CI down to Paired. |
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1180 | return nullptr; |
1181 | } |
1182 | Where = &Paired; |
1183 | } |
1184 | |
1185 | // Call offsetsCanBeCombined with modify = true so that the offsets are |
1186 | // correct for the new instruction. This should return true, because |
1187 | // this function should only be called on CombineInfo objects that |
1188 | // have already been confirmed to be mergeable. |
1189 | if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) |
    offsetsCanBeCombined(CI, *STM, Paired, /*Modify=*/true);
1191 | return Where; |
1192 | } |
1193 | |
1194 | unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { |
1195 | if (STM->ldsRequiresM0Init()) |
1196 | return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; |
1197 | return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; |
1198 | } |
1199 | |
1200 | unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { |
1201 | if (STM->ldsRequiresM0Init()) |
1202 | return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; |
1203 | |
1204 | return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 |
1205 | : AMDGPU::DS_READ2ST64_B64_gfx9; |
1206 | } |
1207 | |
1208 | MachineBasicBlock::iterator |
1209 | SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, |
1210 | MachineBasicBlock::iterator InsertBefore) { |
1211 | MachineBasicBlock *MBB = CI.I->getParent(); |
1212 | |
1213 | // Be careful, since the addresses could be subregisters themselves in weird |
1214 | // cases, like vectors of pointers. |
1215 | const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); |
1216 | |
1217 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); |
1218 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); |
1219 | |
1220 | unsigned NewOffset0 = CI.Offset; |
1221 | unsigned NewOffset1 = Paired.Offset; |
1222 | unsigned Opc = |
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1224 | |
1225 | unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; |
1226 | unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; |
1227 | |
1228 | if (NewOffset0 > NewOffset1) { |
1229 | // Canonicalize the merged instruction so the smaller offset comes first. |
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
1232 | } |
1233 | |
  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1236 | |
1237 | const MCInstrDesc &Read2Desc = TII->get(Opc); |
1238 | |
1239 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
1241 | |
1242 | DebugLoc DL = CI.I->getDebugLoc(); |
1243 | |
1244 | Register BaseReg = AddrReg->getReg(); |
1245 | unsigned BaseSubReg = AddrReg->getSubReg(); |
1246 | unsigned BaseRegFlags = 0; |
1247 | if (CI.BaseOff) { |
1248 | Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1249 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) |
1250 | .addImm(CI.BaseOff); |
1251 | |
1252 | BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1253 | BaseRegFlags = RegState::Kill; |
1254 | |
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
1259 | BaseSubReg = 0; |
1260 | } |
1261 | |
  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1269 | |
1270 | (void)Read2; |
1271 | |
1272 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1273 | |
1274 | // Copy to the old destination registers. |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1281 | |
1282 | CI.I->eraseFromParent(); |
1283 | Paired.I->eraseFromParent(); |
1284 | |
1285 | LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); |
1286 | return Read2; |
1287 | } |
1288 | |
1289 | unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { |
1290 | if (STM->ldsRequiresM0Init()) |
1291 | return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; |
1292 | return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 |
1293 | : AMDGPU::DS_WRITE2_B64_gfx9; |
1294 | } |
1295 | |
1296 | unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { |
1297 | if (STM->ldsRequiresM0Init()) |
1298 | return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 |
1299 | : AMDGPU::DS_WRITE2ST64_B64; |
1300 | |
1301 | return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 |
1302 | : AMDGPU::DS_WRITE2ST64_B64_gfx9; |
1303 | } |
1304 | |
1305 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( |
1306 | CombineInfo &CI, CombineInfo &Paired, |
1307 | MachineBasicBlock::iterator InsertBefore) { |
1308 | MachineBasicBlock *MBB = CI.I->getParent(); |
1309 | |
1310 | // Be sure to use .addOperand(), and not .addReg() with these. We want to be |
1311 | // sure we preserve the subregister index and any register flags set on them. |
1312 | const MachineOperand *AddrReg = |
1313 | TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); |
1314 | const MachineOperand *Data0 = |
1315 | TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); |
1316 | const MachineOperand *Data1 = |
1317 | TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); |
1318 | |
1319 | unsigned NewOffset0 = CI.Offset; |
1320 | unsigned NewOffset1 = Paired.Offset; |
1321 | unsigned Opc = |
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1323 | |
1324 | if (NewOffset0 > NewOffset1) { |
1325 | // Canonicalize the merged instruction so the smaller offset comes first. |
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
1328 | } |
1329 | |
  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1332 | |
1333 | const MCInstrDesc &Write2Desc = TII->get(Opc); |
1334 | DebugLoc DL = CI.I->getDebugLoc(); |
1335 | |
1336 | Register BaseReg = AddrReg->getReg(); |
1337 | unsigned BaseSubReg = AddrReg->getSubReg(); |
1338 | unsigned BaseRegFlags = 0; |
1339 | if (CI.BaseOff) { |
1340 | Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1341 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) |
1342 | .addImm(CI.BaseOff); |
1343 | |
1344 | BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1345 | BaseRegFlags = RegState::Kill; |
1346 | |
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
1351 | BaseSubReg = 0; |
1352 | } |
1353 | |
  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1363 | |
1364 | CI.I->eraseFromParent(); |
1365 | Paired.I->eraseFromParent(); |
1366 | |
1367 | LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); |
1368 | return Write2; |
1369 | } |
1370 | |
1371 | MachineBasicBlock::iterator |
1372 | SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, |
1373 | MachineBasicBlock::iterator InsertBefore) { |
1374 | MachineBasicBlock *MBB = CI.I->getParent(); |
1375 | DebugLoc DL = CI.I->getDebugLoc(); |
1376 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1377 | |
1378 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1379 | |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
1381 | unsigned MergedDMask = CI.DMask | Paired.DMask; |
1382 | unsigned DMaskIdx = |
1383 | AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); |
1384 | |
1385 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); |
1386 | for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { |
1387 | if (I == DMaskIdx) |
1388 | MIB.addImm(MergedDMask); |
1389 | else |
      MIB.add((*CI.I).getOperand(I));
1391 | } |
1392 | |
1393 | // It shouldn't be possible to get this far if the two instructions |
1394 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1395 | // will return true if this is the case. |
1396 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1397 | |
1398 | MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1399 | |
1400 | unsigned SubRegIdx0, SubRegIdx1; |
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1402 | |
1403 | // Copy to the old destination registers. |
1404 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1405 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1406 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1407 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1414 | |
1415 | CI.I->eraseFromParent(); |
1416 | Paired.I->eraseFromParent(); |
1417 | return New; |
1418 | } |
1419 | |
1420 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( |
1421 | CombineInfo &CI, CombineInfo &Paired, |
1422 | MachineBasicBlock::iterator InsertBefore) { |
1423 | MachineBasicBlock *MBB = CI.I->getParent(); |
1424 | DebugLoc DL = CI.I->getDebugLoc(); |
1425 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1426 | |
1427 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1428 | |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1431 | |
1432 | // It shouldn't be possible to get this far if the two instructions |
1433 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1434 | // will return true if this is the case. |
1435 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1436 | |
1437 | MachineInstrBuilder New = |
1438 | BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) |
1439 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); |
1440 | if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) |
1441 | New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); |
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1444 | |
1445 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1448 | |
1449 | // Copy to the old destination registers. |
1450 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1451 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); |
1452 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); |
1453 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1460 | |
1461 | CI.I->eraseFromParent(); |
1462 | Paired.I->eraseFromParent(); |
1463 | return New; |
1464 | } |
1465 | |
1466 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( |
1467 | CombineInfo &CI, CombineInfo &Paired, |
1468 | MachineBasicBlock::iterator InsertBefore) { |
1469 | MachineBasicBlock *MBB = CI.I->getParent(); |
1470 | DebugLoc DL = CI.I->getDebugLoc(); |
1471 | |
1472 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1473 | |
1474 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1475 | |
1476 | // Copy to the new source register. |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1479 | |
1480 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); |
1481 | |
  AddressRegs Regs = getRegs(Opcode, *TII);
1483 | |
1484 | if (Regs.VAddr) |
1485 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); |
1486 | |
1487 | // It shouldn't be possible to get this far if the two instructions |
1488 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1489 | // will return true if this is the case. |
1490 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1491 | |
1492 | MachineInstr *New = |
1493 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) |
1494 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) |
1495 | .addImm(MergedOffset) // offset |
1496 | .addImm(CI.CPol) // cpol |
1497 | .addImm(0) // swz |
1498 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1499 | |
1500 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1503 | |
1504 | // Copy to the old destination registers. |
1505 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1506 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1507 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1508 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1515 | |
1516 | CI.I->eraseFromParent(); |
1517 | Paired.I->eraseFromParent(); |
1518 | return New; |
1519 | } |
1520 | |
1521 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( |
1522 | CombineInfo &CI, CombineInfo &Paired, |
1523 | MachineBasicBlock::iterator InsertBefore) { |
1524 | MachineBasicBlock *MBB = CI.I->getParent(); |
1525 | DebugLoc DL = CI.I->getDebugLoc(); |
1526 | |
1527 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1528 | |
1529 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1530 | |
1531 | // Copy to the new source register. |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1534 | |
1535 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); |
1536 | |
  AddressRegs Regs = getRegs(Opcode, *TII);
1538 | |
1539 | if (Regs.VAddr) |
1540 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); |
1541 | |
  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1544 | |
1545 | // It shouldn't be possible to get this far if the two instructions |
1546 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1547 | // will return true if this is the case. |
1548 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1549 | |
1550 | MachineInstr *New = |
1551 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) |
1552 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) |
1553 | .addImm(MergedOffset) // offset |
1554 | .addImm(JoinedFormat) // format |
1555 | .addImm(CI.CPol) // cpol |
1556 | .addImm(0) // swz |
1557 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1558 | |
1559 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1562 | |
1563 | // Copy to the old destination registers. |
1564 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1565 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1566 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1567 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1574 | |
1575 | CI.I->eraseFromParent(); |
1576 | Paired.I->eraseFromParent(); |
1577 | return New; |
1578 | } |
1579 | |
1580 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( |
1581 | CombineInfo &CI, CombineInfo &Paired, |
1582 | MachineBasicBlock::iterator InsertBefore) { |
1583 | MachineBasicBlock *MBB = CI.I->getParent(); |
1584 | DebugLoc DL = CI.I->getDebugLoc(); |
1585 | |
1586 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1587 | |
1588 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1591 | |
1592 | // Copy to the new source register. |
1593 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1595 | |
1596 | const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1597 | const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1598 | |
1599 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) |
1600 | .add(*Src0) |
1601 | .addImm(SubRegIdx0) |
1602 | .add(*Src1) |
1603 | .addImm(SubRegIdx1); |
1604 | |
1605 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) |
1606 | .addReg(SrcReg, RegState::Kill); |
1607 | |
  AddressRegs Regs = getRegs(Opcode, *TII);
1609 | |
1610 | if (Regs.VAddr) |
1611 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); |
1612 | |
  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1615 | |
1616 | // It shouldn't be possible to get this far if the two instructions |
1617 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1618 | // will return true if this is the case. |
1619 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1620 | |
1621 | MachineInstr *New = |
1622 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) |
1623 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) |
1624 | .addImm(std::min(CI.Offset, Paired.Offset)) // offset |
1625 | .addImm(JoinedFormat) // format |
1626 | .addImm(CI.CPol) // cpol |
1627 | .addImm(0) // swz |
1628 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1629 | |
1630 | CI.I->eraseFromParent(); |
1631 | Paired.I->eraseFromParent(); |
1632 | return New; |
1633 | } |
1634 | |
1635 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( |
1636 | CombineInfo &CI, CombineInfo &Paired, |
1637 | MachineBasicBlock::iterator InsertBefore) { |
1638 | MachineBasicBlock *MBB = CI.I->getParent(); |
1639 | DebugLoc DL = CI.I->getDebugLoc(); |
1640 | |
1641 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1642 | |
1643 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
1645 | |
1646 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); |
1647 | |
1648 | if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) |
1649 | MIB.add(*SAddr); |
1650 | |
1651 | MachineInstr *New = |
1652 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) |
1653 | .addImm(std::min(CI.Offset, Paired.Offset)) |
1654 | .addImm(CI.CPol) |
1655 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1656 | |
1657 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1660 | |
1661 | // Copy to the old destination registers. |
1662 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1663 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); |
1664 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); |
1665 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1672 | |
1673 | CI.I->eraseFromParent(); |
1674 | Paired.I->eraseFromParent(); |
1675 | return New; |
1676 | } |
1677 | |
1678 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( |
1679 | CombineInfo &CI, CombineInfo &Paired, |
1680 | MachineBasicBlock::iterator InsertBefore) { |
1681 | MachineBasicBlock *MBB = CI.I->getParent(); |
1682 | DebugLoc DL = CI.I->getDebugLoc(); |
1683 | |
1684 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1685 | |
1686 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1689 | |
1690 | // Copy to the new source register. |
1691 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1693 | |
1694 | const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1695 | const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1696 | |
1697 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) |
1698 | .add(*Src0) |
1699 | .addImm(SubRegIdx0) |
1700 | .add(*Src1) |
1701 | .addImm(SubRegIdx1); |
1702 | |
1703 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) |
1704 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) |
1705 | .addReg(SrcReg, RegState::Kill); |
1706 | |
1707 | if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) |
1708 | MIB.add(*SAddr); |
1709 | |
1710 | MachineInstr *New = |
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
1712 | .addImm(CI.CPol) |
1713 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1714 | |
1715 | CI.I->eraseFromParent(); |
1716 | Paired.I->eraseFromParent(); |
1717 | return New; |
1718 | } |
1719 | |
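// Map a mergeable pair onto the opcode that covers the combined width.
// Returns 0 when the combined width has no single merged opcode.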
1720 | unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, |
1721 | const CombineInfo &Paired) { |
1722 | const unsigned Width = CI.Width + Paired.Width; |
1723 | |
1724 | switch (getCommonInstClass(CI, Paired)) { |
1725 | default: |
1726 | assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); |
1727 | // FIXME: Handle d16 correctly |
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
1730 | case TBUFFER_LOAD: |
1731 | case TBUFFER_STORE: |
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
1734 | |
1735 | case UNKNOWN: |
    llvm_unreachable("Unknown instruction class");
1737 | case S_BUFFER_LOAD_IMM: |
1738 | switch (Width) { |
1739 | default: |
1740 | return 0; |
1741 | case 2: |
1742 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; |
1743 | case 3: |
1744 | return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; |
1745 | case 4: |
1746 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; |
1747 | case 8: |
1748 | return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; |
1749 | } |
1750 | case S_BUFFER_LOAD_SGPR_IMM: |
1751 | switch (Width) { |
1752 | default: |
1753 | return 0; |
1754 | case 2: |
1755 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; |
1756 | case 3: |
1757 | return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; |
1758 | case 4: |
1759 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; |
1760 | case 8: |
1761 | return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; |
1762 | } |
1763 | case S_LOAD_IMM: |
1764 | switch (Width) { |
1765 | default: |
1766 | return 0; |
1767 | case 2: |
1768 | return AMDGPU::S_LOAD_DWORDX2_IMM; |
1769 | case 3: |
1770 | return AMDGPU::S_LOAD_DWORDX3_IMM; |
1771 | case 4: |
1772 | return AMDGPU::S_LOAD_DWORDX4_IMM; |
1773 | case 8: |
1774 | return AMDGPU::S_LOAD_DWORDX8_IMM; |
1775 | } |
1776 | case GLOBAL_LOAD: |
1777 | switch (Width) { |
1778 | default: |
1779 | return 0; |
1780 | case 2: |
1781 | return AMDGPU::GLOBAL_LOAD_DWORDX2; |
1782 | case 3: |
1783 | return AMDGPU::GLOBAL_LOAD_DWORDX3; |
1784 | case 4: |
1785 | return AMDGPU::GLOBAL_LOAD_DWORDX4; |
1786 | } |
1787 | case GLOBAL_LOAD_SADDR: |
1788 | switch (Width) { |
1789 | default: |
1790 | return 0; |
1791 | case 2: |
1792 | return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; |
1793 | case 3: |
1794 | return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; |
1795 | case 4: |
1796 | return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; |
1797 | } |
1798 | case GLOBAL_STORE: |
1799 | switch (Width) { |
1800 | default: |
1801 | return 0; |
1802 | case 2: |
1803 | return AMDGPU::GLOBAL_STORE_DWORDX2; |
1804 | case 3: |
1805 | return AMDGPU::GLOBAL_STORE_DWORDX3; |
1806 | case 4: |
1807 | return AMDGPU::GLOBAL_STORE_DWORDX4; |
1808 | } |
1809 | case GLOBAL_STORE_SADDR: |
1810 | switch (Width) { |
1811 | default: |
1812 | return 0; |
1813 | case 2: |
1814 | return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; |
1815 | case 3: |
1816 | return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; |
1817 | case 4: |
1818 | return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; |
1819 | } |
1820 | case FLAT_LOAD: |
1821 | switch (Width) { |
1822 | default: |
1823 | return 0; |
1824 | case 2: |
1825 | return AMDGPU::FLAT_LOAD_DWORDX2; |
1826 | case 3: |
1827 | return AMDGPU::FLAT_LOAD_DWORDX3; |
1828 | case 4: |
1829 | return AMDGPU::FLAT_LOAD_DWORDX4; |
1830 | } |
1831 | case FLAT_STORE: |
1832 | switch (Width) { |
1833 | default: |
1834 | return 0; |
1835 | case 2: |
1836 | return AMDGPU::FLAT_STORE_DWORDX2; |
1837 | case 3: |
1838 | return AMDGPU::FLAT_STORE_DWORDX3; |
1839 | case 4: |
1840 | return AMDGPU::FLAT_STORE_DWORDX4; |
1841 | } |
1842 | case MIMG: |
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1846 | } |
1847 | } |
1848 | |
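// Compute the sub-register indices used to copy each original value out of
// (or into) the merged super-register, e.g. widths 2 and 1 carve a 96-bit
// register into sub0_sub1 and sub2, with the access at the lower offset
// taking the leading sub-registers.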
1849 | std::pair<unsigned, unsigned> |
1850 | SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, |
1851 | const CombineInfo &Paired) { |
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");
1856 | |
1857 | unsigned Idx0; |
1858 | unsigned Idx1; |
1859 | |
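  // Idxs[First][Width - 1]: the row is the index of the first sub-register
  // covered and the column is the operand's width in dwords minus one.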
  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
       AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
       AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
       AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
       AMDGPU::sub4_sub5_sub6_sub7},
  };
1867 | |
1868 | assert(CI.Width >= 1 && CI.Width <= 4); |
1869 | assert(Paired.Width >= 1 && Paired.Width <= 4); |
1870 | |
1871 | if (Paired < CI) { |
1872 | Idx1 = Idxs[0][Paired.Width - 1]; |
1873 | Idx0 = Idxs[Paired.Width][CI.Width - 1]; |
1874 | } else { |
1875 | Idx0 = Idxs[0][CI.Width - 1]; |
1876 | Idx1 = Idxs[CI.Width][Paired.Width - 1]; |
1877 | } |
1878 | |
1879 | return std::pair(Idx0, Idx1); |
1880 | } |
1881 | |
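// Select the register class for the merged result: an SGPR class for the
// scalar load classes, otherwise an AGPR or VGPR class wide enough for the
// combined width.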
1882 | const TargetRegisterClass * |
1883 | SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, |
1884 | const CombineInfo &Paired) { |
1885 | if (CI.InstClass == S_BUFFER_LOAD_IMM || |
1886 | CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { |
1887 | switch (CI.Width + Paired.Width) { |
1888 | default: |
1889 | return nullptr; |
1890 | case 2: |
1891 | return &AMDGPU::SReg_64_XEXECRegClass; |
1892 | case 3: |
1893 | return &AMDGPU::SGPR_96RegClass; |
1894 | case 4: |
1895 | return &AMDGPU::SGPR_128RegClass; |
1896 | case 8: |
1897 | return &AMDGPU::SGPR_256RegClass; |
1898 | case 16: |
1899 | return &AMDGPU::SGPR_512RegClass; |
1900 | } |
1901 | } |
1902 | |
1903 | unsigned BitWidth = 32 * (CI.Width + Paired.Width); |
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
1907 | } |
1908 | |
1909 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( |
1910 | CombineInfo &CI, CombineInfo &Paired, |
1911 | MachineBasicBlock::iterator InsertBefore) { |
1912 | MachineBasicBlock *MBB = CI.I->getParent(); |
1913 | DebugLoc DL = CI.I->getDebugLoc(); |
1914 | |
1915 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1916 | |
1917 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1920 | |
1921 | // Copy to the new source register. |
1922 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1924 | |
1925 | const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1926 | const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1927 | |
1928 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) |
1929 | .add(*Src0) |
1930 | .addImm(SubRegIdx0) |
1931 | .add(*Src1) |
1932 | .addImm(SubRegIdx1); |
1933 | |
1934 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) |
1935 | .addReg(SrcReg, RegState::Kill); |
1936 | |
  AddressRegs Regs = getRegs(Opcode, *TII);
1938 | |
1939 | if (Regs.VAddr) |
1940 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); |
1941 | |
1943 | // It shouldn't be possible to get this far if the two instructions |
1944 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1945 | // will return true if this is the case. |
1946 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1947 | |
1948 | MachineInstr *New = |
1949 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) |
1950 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) |
1951 | .addImm(std::min(CI.Offset, Paired.Offset)) // offset |
1952 | .addImm(CI.CPol) // cpol |
1953 | .addImm(0) // swz |
1954 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1955 | |
1956 | CI.I->eraseFromParent(); |
1957 | Paired.I->eraseFromParent(); |
1958 | return New; |
1959 | } |
1960 | |
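// Return Val as an immediate operand if it is an inline constant; otherwise
// materialize it into a fresh SGPR with S_MOV_B32 and return that register.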
1961 | MachineOperand |
1962 | SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { |
1963 | APInt V(32, Val, true); |
  if (TII->isInlineConstant(V))
1965 | return MachineOperand::CreateImm(Val); |
1966 | |
1967 | Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1968 | MachineInstr *Mov = |
1969 | BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), |
1970 | TII->get(AMDGPU::S_MOV_B32), Reg) |
1971 | .addImm(Val); |
1972 | (void)Mov; |
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
1975 | } |
1976 | |
1977 | // Compute base address using Addr and return the final register. |
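// The 64-bit add is split into a V_ADD_CO_U32_e64 on the low half and a
// V_ADDC_U32_e64 consuming its carry on the high half; the two halves are
// then recombined into a 64-bit VGPR pair with REG_SEQUENCE.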
1978 | Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, |
1979 | const MemAddress &Addr) const { |
1980 | MachineBasicBlock *MBB = MI.getParent(); |
1981 | MachineBasicBlock::iterator MBBI = MI.getIterator(); |
1982 | DebugLoc DL = MI.getDebugLoc(); |
1983 | |
  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");
1987 | |
  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");
1991 | |
  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1996 | |
1997 | const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); |
1998 | Register CarryReg = MRI->createVirtualRegister(CarryRC); |
1999 | Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); |
2000 | |
2001 | Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
2002 | Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
2003 | MachineInstr *LoHalf = |
2004 | BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) |
2005 | .addReg(CarryReg, RegState::Define) |
2006 | .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) |
2007 | .add(OffsetLo) |
2008 | .addImm(0); // clamp bit |
2009 | (void)LoHalf; |
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
2011 | |
2012 | MachineInstr *HiHalf = |
2013 | BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) |
2014 | .addReg(DeadCarryReg, RegState::Define | RegState::Dead) |
2015 | .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) |
2016 | .add(OffsetHi) |
2017 | .addReg(CarryReg, RegState::Kill) |
2018 | .addImm(0); // clamp bit |
2019 | (void)HiHalf; |
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
2021 | |
  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2023 | MachineInstr *FullBase = |
2024 | BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) |
2025 | .addReg(DestSub0) |
2026 | .addImm(AMDGPU::sub0) |
2027 | .addReg(DestSub1) |
2028 | .addImm(AMDGPU::sub1); |
2029 | (void)FullBase; |
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
2031 | |
2032 | return FullDestReg; |
2033 | } |
2034 | |
2035 | // Update base and offset with the NewBase and NewOffset in MI. |
2036 | void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, |
2037 | Register NewBase, |
2038 | int32_t NewOffset) const { |
2039 | auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); |
2040 | Base->setReg(NewBase); |
2041 | Base->setIsKill(false); |
2042 | TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); |
2043 | } |
2044 | |
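// Return the constant behind Op: the value of an immediate operand, or the
// immediate moved by the unique S_MOV_B32 defining a register operand;
// std::nullopt if neither applies.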
2045 | std::optional<int32_t> |
2046 | SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { |
2047 | if (Op.isImm()) |
2048 | return Op.getImm(); |
2049 | |
2050 | if (!Op.isReg()) |
2051 | return std::nullopt; |
2052 | |
  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2054 | if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || |
2055 | !Def->getOperand(1).isImm()) |
2056 | return std::nullopt; |
2057 | |
  return Def->getOperand(1).getImm();
2059 | } |
2060 | |
// Analyze Base and extract:
// - 32-bit base registers and subregisters
// - 64-bit constant offset
2064 | // Expecting base computation as: |
2065 | // %OFFSET0:sgpr_32 = S_MOV_B32 8000 |
2066 | // %LO:vgpr_32, %c:sreg_64_xexec = |
2067 | // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, |
2068 | // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec |
2069 | // %Base:vreg_64 = |
2070 | // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 |
2071 | void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, |
2072 | MemAddress &Addr) const { |
2073 | if (!Base.isReg()) |
2074 | return; |
2075 | |
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2077 | if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE |
2078 | || Def->getNumOperands() != 5) |
2079 | return; |
2080 | |
  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
2083 | if (!BaseLo.isReg() || !BaseHi.isReg()) |
2084 | return; |
2085 | |
  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2088 | |
2089 | if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || |
2090 | !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) |
2091 | return; |
2092 | |
2093 | const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); |
2094 | const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); |
2095 | |
  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
2103 | } |
2104 | |
2105 | Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); |
2106 | Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); |
2107 | |
2108 | if (Src0->isImm()) |
2109 | std::swap(Src0, Src1); |
2110 | |
2111 | if (!Src1->isImm()) |
2112 | return; |
2113 | |
2114 | uint64_t Offset1 = Src1->getImm(); |
2115 | BaseHi = *Src0; |
2116 | |
2117 | Addr.Base.LoReg = BaseLo.getReg(); |
2118 | Addr.Base.HiReg = BaseHi.getReg(); |
2119 | Addr.Base.LoSubReg = BaseLo.getSubReg(); |
2120 | Addr.Base.HiSubReg = BaseHi.getSubReg(); |
2121 | Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); |
2122 | } |
2123 | |
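// Try to fold part of MI's constant address computation into its immediate
// offset field by re-using a nearby instruction's base, following the three
// steps described in the comments below.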
2124 | bool SILoadStoreOptimizer::promoteConstantOffsetToImm( |
2125 | MachineInstr &MI, |
2126 | MemInfoMap &Visited, |
2127 | SmallPtrSet<MachineInstr *, 4> &AnchorList) const { |
2128 | |
2129 | if (!(MI.mayLoad() ^ MI.mayStore())) |
2130 | return false; |
2131 | |
2132 | // TODO: Support flat and scratch. |
2133 | if (AMDGPU::getGlobalSaddrOp(Opcode: MI.getOpcode()) < 0) |
2134 | return false; |
2135 | |
2136 | if (MI.mayLoad() && |
2137 | TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr) |
2138 | return false; |
2139 | |
  if (AnchorList.count(&MI))
2141 | return false; |
2142 | |
  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2144 | |
2145 | if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) { |
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2147 | return false; |
2148 | } |
2149 | |
  // Step 1: Find the base registers and a 64-bit constant offset.
2151 | MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); |
2152 | MemAddress MAddr; |
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
2155 | Visited[&MI] = MAddr; |
2156 | } else |
2157 | MAddr = Visited[&MI]; |
2158 | |
2159 | if (MAddr.Offset == 0) { |
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
2162 | return false; |
2163 | } |
2164 | |
2165 | LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", " |
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2167 | |
  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the highest
  // 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
2171 | // bb: |
2172 | // addr1 = &a + 4096; load1 = load(addr1, 0) |
2173 | // addr2 = &a + 6144; load2 = load(addr2, 0) |
2174 | // addr3 = &a + 8192; load3 = load(addr3, 0) |
2175 | // addr4 = &a + 10240; load4 = load(addr4, 0) |
2176 | // addr5 = &a + 12288; load5 = load(addr5, 0) |
2177 | // |
2178 | // Starting from the first load, the optimization will try to find a new base |
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because its distance is maximal, which can
  // presumably accommodate more intermediate bases.
2183 | // |
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
2185 | // (&a + 8192) for load1, load2, load4. |
2186 | // addr = &a + 8192 |
2187 | // load1 = load(addr, -4096) |
2188 | // load2 = load(addr, -2048) |
2189 | // load3 = load(addr, 0) |
2190 | // load4 = load(addr, 2048) |
2191 | // addr5 = &a + 12288; load5 = load(addr5, 0) |
2192 | // |
2193 | MachineInstr *AnchorInst = nullptr; |
2194 | MemAddress AnchorAddr; |
2195 | uint32_t MaxDist = std::numeric_limits<uint32_t>::min(); |
2196 | SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase; |
2197 | |
2198 | MachineBasicBlock *MBB = MI.getParent(); |
2199 | MachineBasicBlock::iterator E = MBB->end(); |
2200 | MachineBasicBlock::iterator MBBI = MI.getIterator(); |
2201 | ++MBBI; |
2202 | const SITargetLowering *TLI = |
2203 | static_cast<const SITargetLowering *>(STM->getTargetLowering()); |
2204 | |
2205 | for ( ; MBBI != E; ++MBBI) { |
2206 | MachineInstr &MINext = *MBBI; |
2207 | // TODO: Support finding an anchor(with same base) from store addresses or |
2208 | // any other load addresses where the opcodes are different. |
2209 | if (MINext.getOpcode() != MI.getOpcode() || |
2210 | TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) |
2211 | continue; |
2212 | |
2213 | const MachineOperand &BaseNext = |
2214 | *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); |
2215 | MemAddress MAddrNext; |
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
2218 | Visited[&MINext] = MAddrNext; |
2219 | } else |
2220 | MAddrNext = Visited[&MINext]; |
2221 | |
2222 | if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || |
2223 | MAddrNext.Base.HiReg != MAddr.Base.HiReg || |
2224 | MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || |
2225 | MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) |
2226 | continue; |
2227 | |
    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2229 | |
2230 | int64_t Dist = MAddr.Offset - MAddrNext.Offset; |
2231 | TargetLoweringBase::AddrMode AM; |
2232 | AM.HasBaseReg = true; |
2233 | AM.BaseOffs = Dist; |
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
2237 | |
2238 | AnchorAddr = MAddrNext; |
2239 | AnchorInst = &MINext; |
2240 | } |
2241 | } |
2242 | |
2243 | if (AnchorInst) { |
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");
2248 | |
2249 | // Instead of moving up, just re-compute anchor-instruction's base address. |
    Register Base = computeBase(MI, AnchorAddr);
2251 | |
    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2254 | |
2255 | for (auto P : InstsWCommonBase) { |
2256 | TargetLoweringBase::AddrMode AM; |
2257 | AM.HasBaseReg = true; |
2258 | AM.BaseOffs = P.second - AnchorAddr.Offset; |
2259 | |
2260 | if (TLI->isLegalGlobalAddressingMode(AM)) { |
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
2265 | } |
2266 | } |
    AnchorList.insert(AnchorInst);
2268 | return true; |
2269 | } |
2270 | |
2271 | return false; |
2272 | } |
2273 | |
2274 | void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, |
2275 | std::list<std::list<CombineInfo> > &MergeableInsts) const { |
2276 | for (std::list<CombineInfo> &AddrList : MergeableInsts) { |
2277 | if (AddrList.front().InstClass == CI.InstClass && |
2278 | AddrList.front().IsAGPR == CI.IsAGPR && |
2279 | AddrList.front().hasSameBaseAddress(CI)) { |
      AddrList.emplace_back(CI);
2281 | return; |
2282 | } |
2283 | } |
2284 | |
2285 | // Base address not found, so add a new list. |
  MergeableInsts.emplace_back(1, CI);
2287 | } |
2288 | |
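// Scan [Begin, End) and bucket mergeable instructions by base address,
// stopping early at instructions that act as memory barriers. Returns the
// iterator where the scan stopped and whether any constant offsets were
// promoted along the way.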
2289 | std::pair<MachineBasicBlock::iterator, bool> |
2290 | SILoadStoreOptimizer::collectMergeableInsts( |
2291 | MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, |
2292 | MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, |
2293 | std::list<std::list<CombineInfo>> &MergeableInsts) const { |
2294 | bool Modified = false; |
2295 | |
2296 | // Sort potential mergeable instructions into lists. One list per base address. |
2297 | unsigned Order = 0; |
2298 | MachineBasicBlock::iterator BlockI = Begin; |
2299 | for (; BlockI != End; ++BlockI) { |
2300 | MachineInstr &MI = *BlockI; |
2301 | |
2302 | // We run this before checking if an address is mergeable, because it can produce |
2303 | // better code even if the instructions aren't mergeable. |
2304 | if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) |
2305 | Modified = true; |
2306 | |
2307 | // Treat volatile accesses, ordered accesses and unmodeled side effects as |
2308 | // barriers. We can look after this barrier for separate merges. |
2309 | if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) { |
2310 | LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI); |
2311 | |
2312 | // Search will resume after this instruction in a separate merge list. |
2313 | ++BlockI; |
2314 | break; |
2315 | } |
2316 | |
    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2318 | if (InstClass == UNKNOWN) |
2319 | continue; |
2320 | |
2321 | // Do not merge VMEM buffer instructions with "swizzled" bit set. |
2322 | int Swizzled = |
2323 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz); |
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2325 | continue; |
2326 | |
2327 | CombineInfo CI; |
    CI.setMI(MI, *this);
2329 | CI.Order = Order++; |
2330 | |
    if (!CI.hasMergeableAddress(*MRI))
2332 | continue; |
2333 | |
2334 | if (CI.InstClass == DS_WRITE && CI.IsAGPR) { |
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
2341 | continue; |
2342 | } |
2343 | |
2344 | LLVM_DEBUG(dbgs() << "Mergeable: " << MI); |
2345 | |
2346 | addInstToMergeableList(CI, MergeableInsts); |
2347 | } |
2348 | |
2349 | // At this point we have lists of Mergeable instructions. |
2350 | // |
2351 | // Part 2: Sort lists by offset and then for each CombineInfo object in the |
2352 | // list try to find an instruction that can be merged with I. If an instruction |
2353 | // is found, it is stored in the Paired field. If no instructions are found, then |
2354 | // the CombineInfo object is deleted from the list. |
2355 | |
2356 | for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), |
2357 | E = MergeableInsts.end(); I != E;) { |
2358 | |
2359 | std::list<CombineInfo> &MergeList = *I; |
2360 | if (MergeList.size() <= 1) { |
2361 | // This means we have found only one instruction with a given address |
2362 | // that can be merged, and we need at least 2 instructions to do a merge, |
2363 | // so this list can be discarded. |
      I = MergeableInsts.erase(I);
2365 | continue; |
2366 | } |
2367 | |
2368 | // Sort the lists by offsets, this way mergeable instructions will be |
2369 | // adjacent to each other in the list, which will make it easier to find |
2370 | // matches. |
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
2375 | ++I; |
2376 | } |
2377 | |
2378 | return std::pair(BlockI, Modified); |
2379 | } |
2380 | |
2381 | // Scan through looking for adjacent LDS operations with constant offsets from |
2382 | // the same base register. We rely on the scheduler to do the hard work of |
2383 | // clustering nearby loads, and assume these are all adjacent. |
2384 | bool SILoadStoreOptimizer::optimizeBlock( |
2385 | std::list<std::list<CombineInfo> > &MergeableInsts) { |
2386 | bool Modified = false; |
2387 | |
2388 | for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), |
2389 | E = MergeableInsts.end(); I != E;) { |
2390 | std::list<CombineInfo> &MergeList = *I; |
2391 | |
2392 | bool OptimizeListAgain = false; |
2393 | if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { |
2394 | // We weren't able to make any changes, so delete the list so we don't |
2395 | // process the same instructions the next time we try to optimize this |
2396 | // block. |
      I = MergeableInsts.erase(I);
2398 | continue; |
2399 | } |
2400 | |
2401 | Modified = true; |
2402 | |
    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
2405 | if (!OptimizeListAgain) { |
      I = MergeableInsts.erase(I);
2407 | continue; |
2408 | } |
2409 | OptimizeAgain = true; |
2410 | } |
2411 | return Modified; |
2412 | } |
2413 | |
2414 | bool |
2415 | SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( |
2416 | std::list<CombineInfo> &MergeList, |
2417 | bool &OptimizeListAgain) { |
2418 | if (MergeList.empty()) |
2419 | return false; |
2420 | |
2421 | bool Modified = false; |
2422 | |
  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {
2425 | |
2426 | auto First = I; |
2427 | auto Second = Next; |
2428 | |
2429 | if ((*First).Order > (*Second).Order) |
      std::swap(First, Second);
2431 | CombineInfo &CI = *First; |
2432 | CombineInfo &Paired = *Second; |
2433 | |
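    // checkAndPrepareMerge returns the CombineInfo at whose position the
    // merged instruction should be inserted, or nullptr if the pair cannot
    // be merged.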
2434 | CombineInfo *Where = checkAndPrepareMerge(CI, Paired); |
2435 | if (!Where) { |
2436 | ++I; |
2437 | continue; |
2438 | } |
2439 | |
2440 | Modified = true; |
2441 | |
2442 | LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); |
2443 | |
2444 | MachineBasicBlock::iterator NewMI; |
2445 | switch (CI.InstClass) { |
2446 | default: |
      llvm_unreachable("unknown InstClass");
2448 | break; |
2449 | case DS_READ: |
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
2451 | break; |
2452 | case DS_WRITE: |
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2454 | break; |
2455 | case S_BUFFER_LOAD_IMM: |
2456 | case S_BUFFER_LOAD_SGPR_IMM: |
2457 | case S_LOAD_IMM: |
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
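      // SMEM merges can reach dwordx8, so a narrower result may merge again
      // on a later iteration.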
2459 | OptimizeListAgain |= CI.Width + Paired.Width < 8; |
2460 | break; |
2461 | case BUFFER_LOAD: |
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2463 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2464 | break; |
2465 | case BUFFER_STORE: |
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2467 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2468 | break; |
2469 | case MIMG: |
      NewMI = mergeImagePair(CI, Paired, Where->I);
2471 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2472 | break; |
2473 | case TBUFFER_LOAD: |
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2475 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2476 | break; |
2477 | case TBUFFER_STORE: |
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2479 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2480 | break; |
2481 | case FLAT_LOAD: |
2482 | case GLOBAL_LOAD: |
2483 | case GLOBAL_LOAD_SADDR: |
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2485 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2486 | break; |
2487 | case FLAT_STORE: |
2488 | case GLOBAL_STORE: |
2489 | case GLOBAL_STORE_SADDR: |
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2491 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2492 | break; |
2493 | } |
    CI.setMI(NewMI, *this);
2495 | CI.Order = Where->Order; |
2496 | if (I == Second) |
2497 | I = Next; |
2498 | |
    MergeList.erase(Second);
2500 | } |
2501 | |
2502 | return Modified; |
2503 | } |
2504 | |
2505 | bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { |
  if (skipFunction(MF.getFunction()))
2507 | return false; |
2508 | |
2509 | STM = &MF.getSubtarget<GCNSubtarget>(); |
2510 | if (!STM->loadStoreOptEnabled()) |
2511 | return false; |
2512 | |
2513 | TII = STM->getInstrInfo(); |
2514 | TRI = &TII->getRegisterInfo(); |
2515 | |
2516 | MRI = &MF.getRegInfo(); |
2517 | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
2518 | |
  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2520 | |
2521 | bool Modified = false; |
2522 | |
2523 | // Contains the list of instructions for which constant offsets are being |
  // promoted to the IMM. This is tracked for an entire block at a time.
2525 | SmallPtrSet<MachineInstr *, 4> AnchorList; |
2526 | MemInfoMap Visited; |
2527 | |
2528 | for (MachineBasicBlock &MBB : MF) { |
2529 | MachineBasicBlock::iterator SectionEnd; |
2530 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
2531 | I = SectionEnd) { |
2532 | bool CollectModified; |
2533 | std::list<std::list<CombineInfo>> MergeableInsts; |
2534 | |
2535 | // First pass: Collect list of all instructions we know how to merge in a |
2536 | // subset of the block. |
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2539 | |
2540 | Modified |= CollectModified; |
2541 | |
2542 | do { |
2543 | OptimizeAgain = false; |
2544 | Modified |= optimizeBlock(MergeableInsts); |
2545 | } while (OptimizeAgain); |
2546 | } |
2547 | |
2548 | Visited.clear(); |
2549 | AnchorList.clear(); |
2550 | } |
2551 | |
2552 | return Modified; |
2553 | } |
2554 | |