1 | //===- SILoadStoreOptimizer.cpp -------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// This pass tries to fuse DS instructions with close-by immediate offsets.
10 | // This will fuse operations such as |
11 | // ds_read_b32 v0, v2 offset:16 |
12 | // ds_read_b32 v1, v2 offset:32 |
13 | // ==> |
14 | // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 |
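// (read2/write2 offsets are encoded in units of the element size, so the
// byte offsets 16 and 32 above become offset0:4 and offset1:8 for b32
// elements.)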
15 | // |
16 | // The same is done for certain SMEM and VMEM opcodes, e.g.: |
17 | // s_buffer_load_dword s4, s[0:3], 4 |
18 | // s_buffer_load_dword s5, s[0:3], 8 |
19 | // ==> |
20 | // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 |
21 | // |
// This pass also tries to promote a constant offset into the immediate by
// adjusting the base. It tries to use a base from the nearby instructions
// that allows it to have a 13-bit constant offset, which is then promoted
// into the immediate.
26 | // E.g. |
27 | // s_movk_i32 s0, 0x1800 |
28 | // v_add_co_u32_e32 v0, vcc, s0, v2 |
29 | // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc |
30 | // |
31 | // s_movk_i32 s0, 0x1000 |
32 | // v_add_co_u32_e32 v5, vcc, s0, v2 |
33 | // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
34 | // global_load_dwordx2 v[5:6], v[5:6], off |
35 | // global_load_dwordx2 v[0:1], v[0:1], off |
36 | // => |
37 | // s_movk_i32 s0, 0x1000 |
38 | // v_add_co_u32_e32 v5, vcc, s0, v2 |
39 | // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
40 | // global_load_dwordx2 v[5:6], v[5:6], off |
41 | // global_load_dwordx2 v[0:1], v[5:6], off offset:2048 |
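// (The two bases differ by 0x800 bytes, so the second load reuses the
// 0x1000-based address in v[5:6] and folds the 0x800 difference into the
// immediate as offset:2048.)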
42 | // |
43 | // Future improvements: |
44 | // |
45 | // - This is currently missing stores of constants because loading |
46 | // the constant into the data register is placed between the stores, although |
47 | // this is arguably a scheduling problem. |
48 | // |
49 | // - Live interval recomputing seems inefficient. This currently only matches |
50 | // one pair, and recomputes live intervals and moves on to the next pair. It |
51 | // would be better to compute a list of all merges that need to occur. |
52 | // |
// - With a list of instructions to process, we can also merge more. If a
// cluster of loads have offsets that are too large to fit in the 8-bit
// offsets, but are close enough to each other to fit in 8 bits, we can add
// to the base pointer and use the new reduced offsets.
57 | // |
58 | //===----------------------------------------------------------------------===// |
59 | |
60 | #include "AMDGPU.h" |
61 | #include "GCNSubtarget.h" |
62 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
63 | #include "llvm/Analysis/AliasAnalysis.h" |
64 | #include "llvm/CodeGen/MachineFunctionPass.h" |
65 | #include "llvm/InitializePasses.h" |
66 | |
67 | using namespace llvm; |
68 | |
69 | #define DEBUG_TYPE "si-load-store-opt" |
70 | |
71 | namespace { |
72 | enum InstClassEnum { |
73 | UNKNOWN, |
74 | DS_READ, |
75 | DS_WRITE, |
76 | S_BUFFER_LOAD_IMM, |
77 | S_BUFFER_LOAD_SGPR_IMM, |
78 | S_LOAD_IMM, |
79 | BUFFER_LOAD, |
80 | BUFFER_STORE, |
81 | MIMG, |
82 | TBUFFER_LOAD, |
83 | TBUFFER_STORE, |
84 | GLOBAL_LOAD_SADDR, |
85 | GLOBAL_STORE_SADDR, |
86 | FLAT_LOAD, |
87 | FLAT_STORE, |
88 | GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of |
89 | GLOBAL_STORE // any CombineInfo, they are only ever returned by |
90 | // getCommonInstClass. |
91 | }; |
92 | |
93 | struct AddressRegs { |
94 | unsigned char NumVAddrs = 0; |
95 | bool SBase = false; |
96 | bool SRsrc = false; |
97 | bool SOffset = false; |
98 | bool SAddr = false; |
99 | bool VAddr = false; |
100 | bool Addr = false; |
101 | bool SSamp = false; |
102 | }; |
103 | |
104 | // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. |
105 | const unsigned MaxAddressRegs = 12 + 1 + 1; |
106 | |
107 | class SILoadStoreOptimizer : public MachineFunctionPass { |
108 | struct CombineInfo { |
109 | MachineBasicBlock::iterator I; |
110 | unsigned EltSize; |
111 | unsigned Offset; |
112 | unsigned Width; |
113 | unsigned Format; |
114 | unsigned BaseOff; |
115 | unsigned DMask; |
116 | InstClassEnum InstClass; |
117 | unsigned CPol = 0; |
118 | bool IsAGPR; |
119 | bool UseST64; |
120 | int AddrIdx[MaxAddressRegs]; |
121 | const MachineOperand *AddrReg[MaxAddressRegs]; |
122 | unsigned NumAddresses; |
123 | unsigned Order; |
124 | |
125 | bool hasSameBaseAddress(const CombineInfo &CI) { |
126 | if (NumAddresses != CI.NumAddresses) |
127 | return false; |
128 | |
129 | const MachineInstr &MI = *CI.I; |
130 | for (unsigned i = 0; i < NumAddresses; i++) { |
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 | |
133 | if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { |
134 | if (AddrReg[i]->isImm() != AddrRegNext.isImm() || |
135 | AddrReg[i]->getImm() != AddrRegNext.getImm()) { |
136 | return false; |
137 | } |
138 | continue; |
139 | } |
140 | |
141 | // Check same base pointer. Be careful of subregisters, which can occur |
142 | // with vectors of pointers. |
143 | if (AddrReg[i]->getReg() != AddrRegNext.getReg() || |
144 | AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { |
145 | return false; |
146 | } |
147 | } |
148 | return true; |
149 | } |
150 | |
151 | bool hasMergeableAddress(const MachineRegisterInfo &MRI) { |
152 | for (unsigned i = 0; i < NumAddresses; ++i) { |
153 | const MachineOperand *AddrOp = AddrReg[i]; |
154 | // Immediates are always OK. |
155 | if (AddrOp->isImm()) |
156 | continue; |
157 | |
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
161 | if (!AddrOp->isReg()) |
162 | return false; |
163 | |
164 | // TODO: We should be able to merge instructions with other physical reg |
165 | // addresses too. |
166 | if (AddrOp->getReg().isPhysical() && |
167 | AddrOp->getReg() != AMDGPU::SGPR_NULL) |
168 | return false; |
169 | |
170 | // If an address has only one use then there will be no other |
171 | // instructions with the same address, so we can't merge this one. |
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173 | return false; |
174 | } |
175 | return true; |
176 | } |
177 | |
178 | void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); |
179 | |
180 | // Compare by pointer order. |
181 | bool operator<(const CombineInfo& Other) const { |
182 | return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset; |
183 | } |
184 | }; |
185 | |
186 | struct BaseRegisters { |
187 | Register LoReg; |
188 | Register HiReg; |
189 | |
190 | unsigned LoSubReg = 0; |
191 | unsigned HiSubReg = 0; |
192 | }; |
193 | |
194 | struct MemAddress { |
195 | BaseRegisters Base; |
196 | int64_t Offset = 0; |
197 | }; |
198 | |
199 | using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; |
200 | |
201 | private: |
202 | const GCNSubtarget *STM = nullptr; |
203 | const SIInstrInfo *TII = nullptr; |
204 | const SIRegisterInfo *TRI = nullptr; |
205 | MachineRegisterInfo *MRI = nullptr; |
206 | AliasAnalysis *AA = nullptr; |
207 | bool OptimizeAgain; |
208 | |
209 | bool canSwapInstructions(const DenseSet<Register> &ARegDefs, |
210 | const DenseSet<Register> &ARegUses, |
211 | const MachineInstr &A, const MachineInstr &B) const; |
212 | static bool dmasksCanBeCombined(const CombineInfo &CI, |
213 | const SIInstrInfo &TII, |
214 | const CombineInfo &Paired); |
215 | static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, |
216 | CombineInfo &Paired, bool Modify = false); |
217 | static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, |
218 | const CombineInfo &Paired); |
219 | static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); |
220 | static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, |
221 | const CombineInfo &Paired); |
222 | const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, |
223 | const CombineInfo &Paired); |
224 | const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; |
225 | |
226 | CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); |
227 | |
228 | unsigned read2Opcode(unsigned EltSize) const; |
229 | unsigned read2ST64Opcode(unsigned EltSize) const; |
230 | MachineBasicBlock::iterator |
231 | mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, |
232 | MachineBasicBlock::iterator InsertBefore); |
233 | |
234 | unsigned write2Opcode(unsigned EltSize) const; |
235 | unsigned write2ST64Opcode(unsigned EltSize) const; |
236 | MachineBasicBlock::iterator |
237 | mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, |
238 | MachineBasicBlock::iterator InsertBefore); |
239 | MachineBasicBlock::iterator |
240 | mergeImagePair(CombineInfo &CI, CombineInfo &Paired, |
241 | MachineBasicBlock::iterator InsertBefore); |
242 | MachineBasicBlock::iterator |
243 | mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, |
244 | MachineBasicBlock::iterator InsertBefore); |
245 | MachineBasicBlock::iterator |
246 | mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, |
247 | MachineBasicBlock::iterator InsertBefore); |
248 | MachineBasicBlock::iterator |
249 | mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, |
250 | MachineBasicBlock::iterator InsertBefore); |
251 | MachineBasicBlock::iterator |
252 | mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, |
253 | MachineBasicBlock::iterator InsertBefore); |
254 | MachineBasicBlock::iterator |
255 | mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, |
256 | MachineBasicBlock::iterator InsertBefore); |
257 | MachineBasicBlock::iterator |
258 | mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, |
259 | MachineBasicBlock::iterator InsertBefore); |
260 | MachineBasicBlock::iterator |
261 | mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, |
262 | MachineBasicBlock::iterator InsertBefore); |
263 | |
264 | void updateBaseAndOffset(MachineInstr &I, Register NewBase, |
265 | int32_t NewOffset) const; |
266 | Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; |
267 | MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; |
268 | std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; |
269 | void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; |
270 | /// Promotes constant offset to the immediate by adjusting the base. It |
271 | /// tries to use a base from the nearby instructions that allows it to have |
  /// a 13-bit constant offset which gets promoted to the immediate.
273 | bool promoteConstantOffsetToImm(MachineInstr &CI, |
274 | MemInfoMap &Visited, |
275 | SmallPtrSet<MachineInstr *, 4> &Promoted) const; |
276 | void addInstToMergeableList(const CombineInfo &CI, |
277 | std::list<std::list<CombineInfo> > &MergeableInsts) const; |
278 | |
279 | std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( |
280 | MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, |
281 | MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, |
282 | std::list<std::list<CombineInfo>> &MergeableInsts) const; |
283 | |
284 | static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, |
285 | const CombineInfo &Paired); |
286 | |
287 | static InstClassEnum getCommonInstClass(const CombineInfo &CI, |
288 | const CombineInfo &Paired); |
289 | |
290 | public: |
291 | static char ID; |
292 | |
293 | SILoadStoreOptimizer() : MachineFunctionPass(ID) { |
294 | initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); |
295 | } |
296 | |
297 | bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, |
298 | bool &OptimizeListAgain); |
299 | bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); |
300 | |
301 | bool runOnMachineFunction(MachineFunction &MF) override; |
302 | |
  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
304 | |
305 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
306 | AU.setPreservesCFG(); |
307 | AU.addRequired<AAResultsWrapperPass>(); |
308 | |
309 | MachineFunctionPass::getAnalysisUsage(AU); |
310 | } |
311 | |
312 | MachineFunctionProperties getRequiredProperties() const override { |
313 | return MachineFunctionProperties() |
314 | .set(MachineFunctionProperties::Property::IsSSA); |
315 | } |
316 | }; |
317 | |
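// Return the width of \p MI's memory access in dwords (dmask channels for
// image instructions), or 0 for opcodes this pass does not handle.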
318 | static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { |
319 | const unsigned Opc = MI.getOpcode(); |
320 | |
  if (TII.isMUBUF(Opc)) {
322 | // FIXME: Handle d16 correctly |
323 | return AMDGPU::getMUBUFElements(Opc); |
324 | } |
325 | if (TII.isImage(MI)) { |
326 | uint64_t DMaskImm = |
327 | TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); |
    return llvm::popcount(DMaskImm);
329 | } |
  if (TII.isMTBUF(Opc)) {
331 | return AMDGPU::getMTBUFElements(Opc); |
332 | } |
333 | |
334 | switch (Opc) { |
335 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
336 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
337 | case AMDGPU::S_LOAD_DWORD_IMM: |
338 | case AMDGPU::GLOBAL_LOAD_DWORD: |
339 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
340 | case AMDGPU::GLOBAL_STORE_DWORD: |
341 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
342 | case AMDGPU::FLAT_LOAD_DWORD: |
343 | case AMDGPU::FLAT_STORE_DWORD: |
344 | return 1; |
345 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
346 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
347 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
348 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
349 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
350 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
351 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
352 | case AMDGPU::FLAT_LOAD_DWORDX2: |
353 | case AMDGPU::FLAT_STORE_DWORDX2: |
354 | return 2; |
355 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
356 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
357 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
358 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
359 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
360 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
361 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
362 | case AMDGPU::FLAT_LOAD_DWORDX3: |
363 | case AMDGPU::FLAT_STORE_DWORDX3: |
364 | return 3; |
365 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
366 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
367 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
368 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
369 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
370 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
371 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
372 | case AMDGPU::FLAT_LOAD_DWORDX4: |
373 | case AMDGPU::FLAT_STORE_DWORDX4: |
374 | return 4; |
375 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
376 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
377 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
378 | return 8; |
379 | case AMDGPU::DS_READ_B32: [[fallthrough]]; |
380 | case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]]; |
381 | case AMDGPU::DS_WRITE_B32: [[fallthrough]]; |
382 | case AMDGPU::DS_WRITE_B32_gfx9: |
383 | return 1; |
384 | case AMDGPU::DS_READ_B64: [[fallthrough]]; |
385 | case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]]; |
386 | case AMDGPU::DS_WRITE_B64: [[fallthrough]]; |
387 | case AMDGPU::DS_WRITE_B64_gfx9: |
388 | return 2; |
389 | default: |
390 | return 0; |
391 | } |
392 | } |
393 | |
394 | /// Maps instruction opcode to enum InstClassEnum. |
395 | static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { |
396 | switch (Opc) { |
397 | default: |
    if (TII.isMUBUF(Opc)) {
399 | switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { |
400 | default: |
401 | return UNKNOWN; |
402 | case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN: |
403 | case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact: |
404 | case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: |
405 | case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: |
406 | case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: |
407 | case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: |
408 | case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: |
409 | case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: |
410 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN: |
411 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact: |
412 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN: |
413 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact: |
414 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: |
415 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: |
416 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET: |
417 | case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact: |
418 | return BUFFER_LOAD; |
419 | case AMDGPU::BUFFER_STORE_DWORD_BOTHEN: |
420 | case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact: |
421 | case AMDGPU::BUFFER_STORE_DWORD_IDXEN: |
422 | case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: |
423 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN: |
424 | case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: |
425 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET: |
426 | case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: |
427 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN: |
428 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact: |
429 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN: |
430 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact: |
431 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN: |
432 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact: |
433 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET: |
434 | case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact: |
435 | return BUFFER_STORE; |
436 | } |
437 | } |
    if (TII.isImage(Opc)) {
439 | // Ignore instructions encoded without vaddr. |
440 | if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && |
441 | !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) |
442 | return UNKNOWN; |
443 | // Ignore BVH instructions |
444 | if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) |
445 | return UNKNOWN; |
446 | // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. |
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
449 | return UNKNOWN; |
450 | return MIMG; |
451 | } |
    if (TII.isMTBUF(Opc)) {
453 | switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { |
454 | default: |
455 | return UNKNOWN; |
456 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: |
457 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: |
458 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: |
459 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: |
460 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: |
461 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: |
462 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: |
463 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: |
464 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN: |
465 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact: |
466 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN: |
467 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact: |
468 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN: |
469 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact: |
470 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET: |
471 | case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact: |
472 | return TBUFFER_LOAD; |
473 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: |
474 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: |
475 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: |
476 | case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: |
477 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN: |
478 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact: |
479 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET: |
480 | case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact: |
481 | return TBUFFER_STORE; |
482 | } |
483 | } |
484 | return UNKNOWN; |
485 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
486 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
487 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
488 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
489 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
490 | return S_BUFFER_LOAD_IMM; |
491 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
492 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
493 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
494 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
495 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
496 | return S_BUFFER_LOAD_SGPR_IMM; |
497 | case AMDGPU::S_LOAD_DWORD_IMM: |
498 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
499 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
500 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
501 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
502 | return S_LOAD_IMM; |
503 | case AMDGPU::DS_READ_B32: |
504 | case AMDGPU::DS_READ_B32_gfx9: |
505 | case AMDGPU::DS_READ_B64: |
506 | case AMDGPU::DS_READ_B64_gfx9: |
507 | return DS_READ; |
508 | case AMDGPU::DS_WRITE_B32: |
509 | case AMDGPU::DS_WRITE_B32_gfx9: |
510 | case AMDGPU::DS_WRITE_B64: |
511 | case AMDGPU::DS_WRITE_B64_gfx9: |
512 | return DS_WRITE; |
513 | case AMDGPU::GLOBAL_LOAD_DWORD: |
514 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
515 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
516 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
517 | case AMDGPU::FLAT_LOAD_DWORD: |
518 | case AMDGPU::FLAT_LOAD_DWORDX2: |
519 | case AMDGPU::FLAT_LOAD_DWORDX3: |
520 | case AMDGPU::FLAT_LOAD_DWORDX4: |
521 | return FLAT_LOAD; |
522 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
523 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
524 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
525 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
526 | return GLOBAL_LOAD_SADDR; |
527 | case AMDGPU::GLOBAL_STORE_DWORD: |
528 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
529 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
530 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
531 | case AMDGPU::FLAT_STORE_DWORD: |
532 | case AMDGPU::FLAT_STORE_DWORDX2: |
533 | case AMDGPU::FLAT_STORE_DWORDX3: |
534 | case AMDGPU::FLAT_STORE_DWORDX4: |
535 | return FLAT_STORE; |
536 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
537 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
538 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
539 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
540 | return GLOBAL_STORE_SADDR; |
541 | } |
542 | } |
543 | |
544 | /// Determines instruction subclass from opcode. Only instructions |
545 | /// of the same subclass can be merged together. The merged instruction may have |
546 | /// a different subclass but must have the same class. |
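/// E.g. two FLAT_LOAD_DWORDs merge into a FLAT_LOAD_DWORDX2, which still has
/// subclass FLAT_LOAD_DWORD and class FLAT_LOAD, so the result remains a
/// candidate for further merging.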
547 | static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { |
548 | switch (Opc) { |
549 | default: |
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
559 | return -1; |
560 | case AMDGPU::DS_READ_B32: |
561 | case AMDGPU::DS_READ_B32_gfx9: |
562 | case AMDGPU::DS_READ_B64: |
563 | case AMDGPU::DS_READ_B64_gfx9: |
564 | case AMDGPU::DS_WRITE_B32: |
565 | case AMDGPU::DS_WRITE_B32_gfx9: |
566 | case AMDGPU::DS_WRITE_B64: |
567 | case AMDGPU::DS_WRITE_B64_gfx9: |
568 | return Opc; |
569 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
570 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
571 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
572 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
573 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
574 | return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; |
575 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
576 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
577 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
578 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
579 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
580 | return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; |
581 | case AMDGPU::S_LOAD_DWORD_IMM: |
582 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
583 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
584 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
585 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
586 | return AMDGPU::S_LOAD_DWORD_IMM; |
587 | case AMDGPU::GLOBAL_LOAD_DWORD: |
588 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
589 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
590 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
591 | case AMDGPU::FLAT_LOAD_DWORD: |
592 | case AMDGPU::FLAT_LOAD_DWORDX2: |
593 | case AMDGPU::FLAT_LOAD_DWORDX3: |
594 | case AMDGPU::FLAT_LOAD_DWORDX4: |
595 | return AMDGPU::FLAT_LOAD_DWORD; |
596 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
597 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
598 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
599 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
600 | return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; |
601 | case AMDGPU::GLOBAL_STORE_DWORD: |
602 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
603 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
604 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
605 | case AMDGPU::FLAT_STORE_DWORD: |
606 | case AMDGPU::FLAT_STORE_DWORDX2: |
607 | case AMDGPU::FLAT_STORE_DWORDX3: |
608 | case AMDGPU::FLAT_STORE_DWORDX4: |
609 | return AMDGPU::FLAT_STORE_DWORD; |
610 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
611 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
612 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
613 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
614 | return AMDGPU::GLOBAL_STORE_DWORD_SADDR; |
615 | } |
616 | } |
617 | |
618 | // GLOBAL loads and stores are classified as FLAT initially. If both combined |
619 | // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. |
// If either or both instructions are non-segment-specific FLAT the resulting
621 | // combined operation will be FLAT, potentially promoting one of the GLOBAL |
622 | // operations to FLAT. |
623 | // For other instructions return the original unmodified class. |
624 | InstClassEnum |
625 | SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, |
626 | const CombineInfo &Paired) { |
627 | assert(CI.InstClass == Paired.InstClass); |
628 | |
629 | if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && |
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
631 | return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; |
632 | |
633 | return CI.InstClass; |
634 | } |
635 | |
636 | static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { |
637 | AddressRegs Result; |
638 | |
  if (TII.isMUBUF(Opc)) {
640 | if (AMDGPU::getMUBUFHasVAddr(Opc)) |
641 | Result.VAddr = true; |
642 | if (AMDGPU::getMUBUFHasSrsrc(Opc)) |
643 | Result.SRsrc = true; |
644 | if (AMDGPU::getMUBUFHasSoffset(Opc)) |
645 | Result.SOffset = true; |
646 | |
647 | return Result; |
648 | } |
649 | |
  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
668 | if (AMDGPU::getMTBUFHasVAddr(Opc)) |
669 | Result.VAddr = true; |
670 | if (AMDGPU::getMTBUFHasSrsrc(Opc)) |
671 | Result.SRsrc = true; |
672 | if (AMDGPU::getMTBUFHasSoffset(Opc)) |
673 | Result.SOffset = true; |
674 | |
675 | return Result; |
676 | } |
677 | |
678 | switch (Opc) { |
679 | default: |
680 | return Result; |
681 | case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: |
682 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: |
683 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: |
684 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: |
685 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: |
686 | Result.SOffset = true; |
687 | [[fallthrough]]; |
688 | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: |
689 | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: |
690 | case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: |
691 | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: |
692 | case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: |
693 | case AMDGPU::S_LOAD_DWORD_IMM: |
694 | case AMDGPU::S_LOAD_DWORDX2_IMM: |
695 | case AMDGPU::S_LOAD_DWORDX3_IMM: |
696 | case AMDGPU::S_LOAD_DWORDX4_IMM: |
697 | case AMDGPU::S_LOAD_DWORDX8_IMM: |
698 | Result.SBase = true; |
699 | return Result; |
700 | case AMDGPU::DS_READ_B32: |
701 | case AMDGPU::DS_READ_B64: |
702 | case AMDGPU::DS_READ_B32_gfx9: |
703 | case AMDGPU::DS_READ_B64_gfx9: |
704 | case AMDGPU::DS_WRITE_B32: |
705 | case AMDGPU::DS_WRITE_B64: |
706 | case AMDGPU::DS_WRITE_B32_gfx9: |
707 | case AMDGPU::DS_WRITE_B64_gfx9: |
708 | Result.Addr = true; |
709 | return Result; |
710 | case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: |
711 | case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: |
712 | case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: |
713 | case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: |
714 | case AMDGPU::GLOBAL_STORE_DWORD_SADDR: |
715 | case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: |
716 | case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: |
717 | case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: |
718 | Result.SAddr = true; |
719 | [[fallthrough]]; |
720 | case AMDGPU::GLOBAL_LOAD_DWORD: |
721 | case AMDGPU::GLOBAL_LOAD_DWORDX2: |
722 | case AMDGPU::GLOBAL_LOAD_DWORDX3: |
723 | case AMDGPU::GLOBAL_LOAD_DWORDX4: |
724 | case AMDGPU::GLOBAL_STORE_DWORD: |
725 | case AMDGPU::GLOBAL_STORE_DWORDX2: |
726 | case AMDGPU::GLOBAL_STORE_DWORDX3: |
727 | case AMDGPU::GLOBAL_STORE_DWORDX4: |
728 | case AMDGPU::FLAT_LOAD_DWORD: |
729 | case AMDGPU::FLAT_LOAD_DWORDX2: |
730 | case AMDGPU::FLAT_LOAD_DWORDX3: |
731 | case AMDGPU::FLAT_LOAD_DWORDX4: |
732 | case AMDGPU::FLAT_STORE_DWORD: |
733 | case AMDGPU::FLAT_STORE_DWORDX2: |
734 | case AMDGPU::FLAT_STORE_DWORDX3: |
735 | case AMDGPU::FLAT_STORE_DWORDX4: |
736 | Result.VAddr = true; |
737 | return Result; |
738 | } |
739 | } |
740 | |
741 | void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, |
742 | const SILoadStoreOptimizer &LSO) { |
743 | I = MI; |
744 | unsigned Opc = MI->getOpcode(); |
  InstClass = getInstClass(Opc, *LSO.TII);
746 | |
747 | if (InstClass == UNKNOWN) |
748 | return; |
749 | |
  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
751 | |
752 | switch (InstClass) { |
753 | case DS_READ: |
754 | EltSize = |
755 | (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 |
756 | : 4; |
757 | break; |
758 | case DS_WRITE: |
759 | EltSize = |
760 | (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 |
761 | : 4; |
762 | break; |
763 | case S_BUFFER_LOAD_IMM: |
764 | case S_BUFFER_LOAD_SGPR_IMM: |
765 | case S_LOAD_IMM: |
766 | EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); |
767 | break; |
768 | default: |
769 | EltSize = 4; |
770 | break; |
771 | } |
772 | |
773 | if (InstClass == MIMG) { |
774 | DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); |
775 | // Offset is not considered for MIMG instructions. |
776 | Offset = 0; |
777 | } else { |
778 | int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); |
    Offset = I->getOperand(OffsetIdx).getImm();
780 | } |
781 | |
782 | if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) |
783 | Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); |
784 | |
  Width = getOpcodeWidth(*I, *LSO.TII);
786 | |
787 | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { |
788 | Offset &= 0xffff; |
789 | } else if (InstClass != MIMG) { |
790 | CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); |
791 | } |
792 | |
  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
795 | |
796 | NumAddresses = 0; |
797 | for (unsigned J = 0; J < Regs.NumVAddrs; J++) |
798 | AddrIdx[NumAddresses++] = |
799 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; |
800 | if (Regs.Addr) |
801 | AddrIdx[NumAddresses++] = |
802 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); |
803 | if (Regs.SBase) |
804 | AddrIdx[NumAddresses++] = |
805 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); |
806 | if (Regs.SRsrc) |
807 | AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( |
808 | Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc); |
809 | if (Regs.SOffset) |
810 | AddrIdx[NumAddresses++] = |
811 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); |
812 | if (Regs.SAddr) |
813 | AddrIdx[NumAddresses++] = |
814 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); |
815 | if (Regs.VAddr) |
816 | AddrIdx[NumAddresses++] = |
817 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); |
818 | if (Regs.SSamp) |
819 | AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( |
820 | Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp); |
821 | assert(NumAddresses <= MaxAddressRegs); |
822 | |
823 | for (unsigned J = 0; J < NumAddresses; J++) |
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
825 | } |
826 | |
827 | } // end anonymous namespace. |
828 | |
829 | INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, |
830 | "SI Load Store Optimizer" , false, false) |
831 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
833 | false, false) |
834 | |
835 | char SILoadStoreOptimizer::ID = 0; |
836 | |
837 | char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; |
838 | |
839 | FunctionPass *llvm::createSILoadStoreOptimizerPass() { |
840 | return new SILoadStoreOptimizer(); |
841 | } |
842 | |
843 | static void addDefsUsesToList(const MachineInstr &MI, |
844 | DenseSet<Register> &RegDefs, |
845 | DenseSet<Register> &RegUses) { |
846 | for (const auto &Op : MI.operands()) { |
847 | if (!Op.isReg()) |
848 | continue; |
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
853 | } |
854 | } |
855 | |
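// Check if it is safe to move \p B across \p A: the two must not access
// aliasing memory unless both only load, \p B must not read or redefine a
// register that \p A defines (\p ARegDefs), and \p B must not define a
// register that \p A reads (\p ARegUses).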
856 | bool SILoadStoreOptimizer::canSwapInstructions( |
857 | const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, |
858 | const MachineInstr &A, const MachineInstr &B) const { |
859 | if (A.mayLoadOrStore() && B.mayLoadOrStore() && |
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
861 | return false; |
862 | for (const auto &BOp : B.operands()) { |
863 | if (!BOp.isReg()) |
864 | continue; |
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
868 | return false; |
869 | } |
870 | return true; |
871 | } |
872 | |
// Given that \p CI and \p Paired are adjacent memory operations, produce a new
874 | // MMO for the combined operation with a new access size. |
875 | MachineMemOperand * |
876 | SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, |
877 | const CombineInfo &Paired) { |
878 | const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); |
879 | const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); |
880 | |
881 | unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue(); |
882 | |
883 | // A base pointer for the combined operation is the same as the leading |
884 | // operation's pointer. |
885 | if (Paired < CI) |
    std::swap(MMOa, MMOb);
887 | |
888 | MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); |
889 | // If merging FLAT and GLOBAL set address space to FLAT. |
890 | if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) |
891 | PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; |
892 | |
893 | MachineFunction *MF = CI.I->getMF(); |
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
895 | } |
896 | |
897 | bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, |
898 | const SIInstrInfo &TII, |
899 | const CombineInfo &Paired) { |
900 | assert(CI.InstClass == MIMG); |
901 | |
902 | // Ignore instructions with tfe/lwe set. |
903 | const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); |
904 | const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); |
905 | |
906 | if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) |
907 | return false; |
908 | |
909 | // Check other optional immediate operands for equality. |
910 | unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, |
911 | AMDGPU::OpName::unorm, AMDGPU::OpName::da, |
912 | AMDGPU::OpName::r128, AMDGPU::OpName::a16}; |
913 | |
914 | for (auto op : OperandsToMatch) { |
915 | int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); |
916 | if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) |
917 | return false; |
918 | if (Idx != -1 && |
919 | CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) |
920 | return false; |
921 | } |
922 | |
923 | // Check DMask for overlaps. |
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
926 | |
927 | if (!MaxMask) |
928 | return false; |
929 | |
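  // The two dmasks are combinable only if every bit of the smaller mask lies
  // below the lowest set bit of the larger one: e.g. 0b0011 and 0b1100
  // combine (1 << countr_zero(0b1100) == 4 > 0b0011), while 0b0011 and
  // 0b0101 do not (1 << countr_zero(0b0101) == 1 <= 0b0011).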
  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
931 | if ((1u << AllowedBitsForMin) <= MinMask) |
932 | return false; |
933 | |
934 | return true; |
935 | } |
936 | |
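// Return a buffer format that keeps \p OldFormat's bits per component and
// numeric format but has \p ComponentCount components, or 0 if no such
// format exists.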
937 | static unsigned getBufferFormatWithCompCount(unsigned OldFormat, |
938 | unsigned ComponentCount, |
939 | const GCNSubtarget &STI) { |
940 | if (ComponentCount > 4) |
941 | return 0; |
942 | |
943 | const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = |
944 | llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); |
945 | if (!OldFormatInfo) |
946 | return 0; |
947 | |
948 | const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = |
949 | llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, |
950 | ComponentCount, |
951 | OldFormatInfo->NumFormat, STI); |
952 | |
953 | if (!NewFormatInfo) |
954 | return 0; |
955 | |
956 | assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && |
957 | NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); |
958 | |
959 | return NewFormatInfo->Format; |
960 | } |
961 | |
962 | // Return the value in the inclusive range [Lo,Hi] that is aligned to the |
963 | // highest power of two. Note that the result is well defined for all inputs |
964 | // including corner cases like: |
965 | // - if Lo == Hi, return that value |
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
967 | // - if Lo > Hi, return 0 (as if the range wrapped around) |
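// E.g. mostAlignedValueInRange(5, 15) == 8: (5 - 1) ^ 15 == 0b1011 has 28
// leading zeros as a uint32_t, so the mask keeps the top 29 bits, and
// 15 & 0xfffffff8 == 8.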
968 | static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { |
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
970 | } |
971 | |
972 | bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, |
973 | const GCNSubtarget &STI, |
974 | CombineInfo &Paired, |
975 | bool Modify) { |
976 | assert(CI.InstClass != MIMG); |
977 | |
978 | // XXX - Would the same offset be OK? Is there any reason this would happen or |
979 | // be useful? |
980 | if (CI.Offset == Paired.Offset) |
981 | return false; |
982 | |
983 | // This won't be valid if the offset isn't aligned. |
984 | if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) |
985 | return false; |
986 | |
987 | if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { |
988 | |
989 | const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = |
990 | llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI); |
991 | if (!Info0) |
992 | return false; |
993 | const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = |
994 | llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI); |
995 | if (!Info1) |
996 | return false; |
997 | |
998 | if (Info0->BitsPerComp != Info1->BitsPerComp || |
999 | Info0->NumFormat != Info1->NumFormat) |
1000 | return false; |
1001 | |
1002 | // TODO: Should be possible to support more formats, but if format loads |
1003 | // are not dword-aligned, the merged load might not be valid. |
1004 | if (Info0->BitsPerComp != 32) |
1005 | return false; |
1006 | |
    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width,
                                     STI) == 0)
1008 | return false; |
1009 | } |
1010 | |
1011 | uint32_t EltOffset0 = CI.Offset / CI.EltSize; |
1012 | uint32_t EltOffset1 = Paired.Offset / CI.EltSize; |
1013 | CI.UseST64 = false; |
1014 | CI.BaseOff = 0; |
1015 | |
1016 | // Handle all non-DS instructions. |
1017 | if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { |
1018 | if (EltOffset0 + CI.Width != EltOffset1 && |
1019 | EltOffset1 + Paired.Width != EltOffset0) |
1020 | return false; |
1021 | if (CI.CPol != Paired.CPol) |
1022 | return false; |
1023 | if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || |
1024 | CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { |
1025 | // Reject cases like: |
1026 | // dword + dwordx2 -> dwordx3 |
1027 | // dword + dwordx3 -> dwordx4 |
1028 | // If we tried to combine these cases, we would fail to extract a subreg |
1029 | // for the result of the second load due to SGPR alignment requirements. |
1030 | if (CI.Width != Paired.Width && |
1031 | (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)) |
1032 | return false; |
1033 | } |
1034 | return true; |
1035 | } |
1036 | |
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
1038 | // the stride 64 versions. |
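  // E.g. element offsets 256 and 320 do not fit in 8 bits, but both are
  // multiples of 64, and 256/64 == 4 and 320/64 == 5 do fit, so the ST64
  // form can encode the pair.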
1039 | if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && |
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1041 | if (Modify) { |
1042 | CI.Offset = EltOffset0 / 64; |
1043 | Paired.Offset = EltOffset1 / 64; |
1044 | CI.UseST64 = true; |
1045 | } |
1046 | return true; |
1047 | } |
1048 | |
1049 | // Check if the new offsets fit in the reduced 8-bit range. |
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1051 | if (Modify) { |
1052 | CI.Offset = EltOffset0; |
1053 | Paired.Offset = EltOffset1; |
1054 | } |
1055 | return true; |
1056 | } |
1057 | |
1058 | // Try to shift base address to decrease offsets. |
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
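  // Mask == 0x3fc0, so this branch fires when the two offsets differ by a
  // multiple of 64 that still fits in 8 bits after the /64 scaling below.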
1063 | if (((Max - Min) & ~Mask) == 0) { |
1064 | if (Modify) { |
1065 | // From the range of values we could use for BaseOff, choose the one that |
1066 | // is aligned to the highest power of two, to maximise the chance that |
1067 | // the same offset can be reused for other load/store pairs. |
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1072 | CI.BaseOff = BaseOff * CI.EltSize; |
1073 | CI.Offset = (EltOffset0 - BaseOff) / 64; |
1074 | Paired.Offset = (EltOffset1 - BaseOff) / 64; |
1075 | CI.UseST64 = true; |
1076 | } |
1077 | return true; |
1078 | } |
1079 | |
  if (isUInt<8>(Max - Min)) {
1081 | if (Modify) { |
1082 | // From the range of values we could use for BaseOff, choose the one that |
1083 | // is aligned to the highest power of two, to maximise the chance that |
1084 | // the same offset can be reused for other load/store pairs. |
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1086 | CI.BaseOff = BaseOff * CI.EltSize; |
1087 | CI.Offset = EltOffset0 - BaseOff; |
1088 | Paired.Offset = EltOffset1 - BaseOff; |
1089 | } |
1090 | return true; |
1091 | } |
1092 | |
1093 | return false; |
1094 | } |
1095 | |
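// Check that the total width of the merged access is encodable: the default
// classes allow up to dwordx4 (dwordx3 only when the subtarget supports it),
// while the scalar load classes allow widths 2, 4, and 8 (and 3 when the
// subtarget has scalar dwordx3 loads).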
1096 | bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, |
1097 | const CombineInfo &CI, |
1098 | const CombineInfo &Paired) { |
1099 | const unsigned Width = (CI.Width + Paired.Width); |
1100 | switch (CI.InstClass) { |
1101 | default: |
1102 | return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); |
1103 | case S_BUFFER_LOAD_IMM: |
1104 | case S_BUFFER_LOAD_SGPR_IMM: |
1105 | case S_LOAD_IMM: |
1106 | switch (Width) { |
1107 | default: |
1108 | return false; |
1109 | case 2: |
1110 | case 4: |
1111 | case 8: |
1112 | return true; |
1113 | case 3: |
1114 | return STM.hasScalarDwordx3Loads(); |
1115 | } |
1116 | } |
1117 | } |
1118 | |
1119 | const TargetRegisterClass * |
1120 | SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { |
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
1136 | return nullptr; |
1137 | } |
1138 | |
1139 | /// This function assumes that CI comes before Paired in a basic block. Return |
1140 | /// an insertion point for the merged instruction or nullptr on failure. |
1141 | SILoadStoreOptimizer::CombineInfo * |
1142 | SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, |
1143 | CombineInfo &Paired) { |
1144 | // If another instruction has already been merged into CI, it may now be a |
1145 | // type that we can't do any further merging into. |
1146 | if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) |
1147 | return nullptr; |
1148 | assert(CI.InstClass == Paired.InstClass); |
1149 | |
  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
1152 | return nullptr; |
1153 | |
1154 | // Check both offsets (or masks for MIMG) can be combined and fit in the |
1155 | // reduced range. |
1156 | if (CI.InstClass == MIMG) { |
    if (!dmasksCanBeCombined(CI, *TII, Paired))
1158 | return nullptr; |
1159 | } else { |
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1161 | return nullptr; |
1162 | } |
1163 | |
1164 | DenseSet<Register> RegDefs; |
1165 | DenseSet<Register> RegUses; |
1166 | CombineInfo *Where; |
1167 | if (CI.I->mayLoad()) { |
1168 | // Try to hoist Paired up to CI. |
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1172 | return nullptr; |
1173 | } |
1174 | Where = &CI; |
1175 | } else { |
1176 | // Try to sink CI down to Paired. |
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1180 | return nullptr; |
1181 | } |
1182 | Where = &Paired; |
1183 | } |
1184 | |
1185 | // Call offsetsCanBeCombined with modify = true so that the offsets are |
1186 | // correct for the new instruction. This should return true, because |
1187 | // this function should only be called on CombineInfo objects that |
1188 | // have already been confirmed to be mergeable. |
1189 | if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) |
    offsetsCanBeCombined(CI, *STM, Paired, /*Modify=*/true);
1191 | return Where; |
1192 | } |
1193 | |
1194 | unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { |
1195 | if (STM->ldsRequiresM0Init()) |
1196 | return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; |
1197 | return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; |
1198 | } |
1199 | |
1200 | unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { |
1201 | if (STM->ldsRequiresM0Init()) |
1202 | return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; |
1203 | |
1204 | return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 |
1205 | : AMDGPU::DS_READ2ST64_B64_gfx9; |
1206 | } |
1207 | |
1208 | MachineBasicBlock::iterator |
1209 | SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, |
1210 | MachineBasicBlock::iterator InsertBefore) { |
1211 | MachineBasicBlock *MBB = CI.I->getParent(); |
1212 | |
1213 | // Be careful, since the addresses could be subregisters themselves in weird |
1214 | // cases, like vectors of pointers. |
1215 | const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); |
1216 | |
1217 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); |
1218 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); |
1219 | |
1220 | unsigned NewOffset0 = CI.Offset; |
1221 | unsigned NewOffset1 = Paired.Offset; |
1222 | unsigned Opc = |
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1224 | |
1225 | unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; |
1226 | unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; |
1227 | |
1228 | if (NewOffset0 > NewOffset1) { |
1229 | // Canonicalize the merged instruction so the smaller offset comes first. |
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
1232 | } |
1233 | |
  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1236 | |
1237 | const MCInstrDesc &Read2Desc = TII->get(Opc); |
1238 | |
1239 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
1241 | |
1242 | DebugLoc DL = CI.I->getDebugLoc(); |
1243 | |
1244 | Register BaseReg = AddrReg->getReg(); |
1245 | unsigned BaseSubReg = AddrReg->getSubReg(); |
1246 | unsigned BaseRegFlags = 0; |
1247 | if (CI.BaseOff) { |
1248 | Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1249 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) |
1250 | .addImm(CI.BaseOff); |
1251 | |
1252 | BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1253 | BaseRegFlags = RegState::Kill; |
1254 | |
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
1259 | BaseSubReg = 0; |
1260 | } |
1261 | |
  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1269 | |
1270 | (void)Read2; |
1271 | |
1272 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1273 | |
1274 | // Copy to the old destination registers. |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1281 | |
1282 | CI.I->eraseFromParent(); |
1283 | Paired.I->eraseFromParent(); |
1284 | |
1285 | LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); |
1286 | return Read2; |
1287 | } |
1288 | |
1289 | unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { |
1290 | if (STM->ldsRequiresM0Init()) |
1291 | return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; |
1292 | return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 |
1293 | : AMDGPU::DS_WRITE2_B64_gfx9; |
1294 | } |
1295 | |
1296 | unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { |
1297 | if (STM->ldsRequiresM0Init()) |
1298 | return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 |
1299 | : AMDGPU::DS_WRITE2ST64_B64; |
1300 | |
1301 | return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 |
1302 | : AMDGPU::DS_WRITE2ST64_B64_gfx9; |
1303 | } |
1304 | |
1305 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( |
1306 | CombineInfo &CI, CombineInfo &Paired, |
1307 | MachineBasicBlock::iterator InsertBefore) { |
1308 | MachineBasicBlock *MBB = CI.I->getParent(); |
1309 | |
1310 | // Be sure to use .addOperand(), and not .addReg() with these. We want to be |
1311 | // sure we preserve the subregister index and any register flags set on them. |
1312 | const MachineOperand *AddrReg = |
1313 | TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); |
1314 | const MachineOperand *Data0 = |
1315 | TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); |
1316 | const MachineOperand *Data1 = |
1317 | TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); |
1318 | |
1319 | unsigned NewOffset0 = CI.Offset; |
1320 | unsigned NewOffset1 = Paired.Offset; |
1321 | unsigned Opc = |
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1323 | |
1324 | if (NewOffset0 > NewOffset1) { |
1325 | // Canonicalize the merged instruction so the smaller offset comes first. |
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
1328 | } |
1329 | |
  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1332 | |
1333 | const MCInstrDesc &Write2Desc = TII->get(Opc); |
1334 | DebugLoc DL = CI.I->getDebugLoc(); |
1335 | |
1336 | Register BaseReg = AddrReg->getReg(); |
1337 | unsigned BaseSubReg = AddrReg->getSubReg(); |
1338 | unsigned BaseRegFlags = 0; |
1339 | if (CI.BaseOff) { |
1340 | Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1341 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) |
1342 | .addImm(CI.BaseOff); |
1343 | |
1344 | BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1345 | BaseRegFlags = RegState::Kill; |
1346 | |
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
1351 | BaseSubReg = 0; |
1352 | } |
1353 | |
  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1363 | |
1364 | CI.I->eraseFromParent(); |
1365 | Paired.I->eraseFromParent(); |
1366 | |
1367 | LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); |
1368 | return Write2; |
1369 | } |
1370 | |
1371 | MachineBasicBlock::iterator |
1372 | SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, |
1373 | MachineBasicBlock::iterator InsertBefore) { |
1374 | MachineBasicBlock *MBB = CI.I->getParent(); |
1375 | DebugLoc DL = CI.I->getDebugLoc(); |
1376 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1377 | |
1378 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1379 | |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
1381 | unsigned MergedDMask = CI.DMask | Paired.DMask; |
1382 | unsigned DMaskIdx = |
1383 | AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); |
1384 | |
1385 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); |
1386 | for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { |
1387 | if (I == DMaskIdx) |
1388 | MIB.addImm(MergedDMask); |
1389 | else |
      MIB.add((*CI.I).getOperand(I));
1391 | } |
1392 | |
1393 | // It shouldn't be possible to get this far if the two instructions |
1394 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1395 | // will return true if this is the case. |
1396 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1397 | |
1398 | MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1399 | |
1400 | unsigned SubRegIdx0, SubRegIdx1; |
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1402 | |
1403 | // Copy to the old destination registers. |
1404 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1405 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1406 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1407 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1414 | |
1415 | CI.I->eraseFromParent(); |
1416 | Paired.I->eraseFromParent(); |
1417 | return New; |
1418 | } |
1419 | |
1420 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( |
1421 | CombineInfo &CI, CombineInfo &Paired, |
1422 | MachineBasicBlock::iterator InsertBefore) { |
1423 | MachineBasicBlock *MBB = CI.I->getParent(); |
1424 | DebugLoc DL = CI.I->getDebugLoc(); |
1425 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1426 | |
1427 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1428 | |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1431 | |
1432 | // It shouldn't be possible to get this far if the two instructions |
1433 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1434 | // will return true if this is the case. |
1435 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1436 | |
1437 | MachineInstrBuilder New = |
1438 | BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) |
1439 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); |
1440 | if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) |
1441 | New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); |
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1444 | |
1445 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1448 | |
1449 | // Copy to the old destination registers. |
1450 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1451 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); |
1452 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); |
1453 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1460 | |
1461 | CI.I->eraseFromParent(); |
1462 | Paired.I->eraseFromParent(); |
1463 | return New; |
1464 | } |
1465 | |
1466 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( |
1467 | CombineInfo &CI, CombineInfo &Paired, |
1468 | MachineBasicBlock::iterator InsertBefore) { |
1469 | MachineBasicBlock *MBB = CI.I->getParent(); |
1470 | DebugLoc DL = CI.I->getDebugLoc(); |
1471 | |
1472 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1473 | |
1474 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1475 | |
1476 | // Copy to the new source register. |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1479 | |
1480 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); |
1481 | |
  AddressRegs Regs = getRegs(Opcode, *TII);
1483 | |
1484 | if (Regs.VAddr) |
1485 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); |
1486 | |
1487 | // It shouldn't be possible to get this far if the two instructions |
1488 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1489 | // will return true if this is the case. |
1490 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1491 | |
1492 | MachineInstr *New = |
1493 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) |
1494 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) |
1495 | .addImm(MergedOffset) // offset |
1496 | .addImm(CI.CPol) // cpol |
1497 | .addImm(0) // swz |
1498 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1499 | |
1500 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1503 | |
1504 | // Copy to the old destination registers. |
1505 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1506 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1507 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1508 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1515 | |
1516 | CI.I->eraseFromParent(); |
1517 | Paired.I->eraseFromParent(); |
1518 | return New; |
1519 | } |
1520 | |
1521 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( |
1522 | CombineInfo &CI, CombineInfo &Paired, |
1523 | MachineBasicBlock::iterator InsertBefore) { |
1524 | MachineBasicBlock *MBB = CI.I->getParent(); |
1525 | DebugLoc DL = CI.I->getDebugLoc(); |
1526 | |
1527 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1528 | |
1529 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
1530 | |
1531 | // Copy to the new source register. |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1534 | |
1535 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); |
1536 | |
  AddressRegs Regs = getRegs(Opcode, *TII);
1538 | |
1539 | if (Regs.VAddr) |
1540 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); |
1541 | |
  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1544 | |
1545 | // It shouldn't be possible to get this far if the two instructions |
1546 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1547 | // will return true if this is the case. |
1548 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1549 | |
1550 | MachineInstr *New = |
1551 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) |
1552 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) |
1553 | .addImm(MergedOffset) // offset |
1554 | .addImm(JoinedFormat) // format |
1555 | .addImm(CI.CPol) // cpol |
1556 | .addImm(0) // swz |
1557 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1558 | |
1559 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1562 | |
1563 | // Copy to the old destination registers. |
1564 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1565 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1566 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1567 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1574 | |
1575 | CI.I->eraseFromParent(); |
1576 | Paired.I->eraseFromParent(); |
1577 | return New; |
1578 | } |
1579 | |
1580 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( |
1581 | CombineInfo &CI, CombineInfo &Paired, |
1582 | MachineBasicBlock::iterator InsertBefore) { |
1583 | MachineBasicBlock *MBB = CI.I->getParent(); |
1584 | DebugLoc DL = CI.I->getDebugLoc(); |
1585 | |
1586 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1587 | |
1588 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1591 | |
1592 | // Copy to the new source register. |
1593 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1595 | |
1596 | const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1597 | const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1598 | |
1599 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) |
1600 | .add(*Src0) |
1601 | .addImm(SubRegIdx0) |
1602 | .add(*Src1) |
1603 | .addImm(SubRegIdx1); |
1604 | |
1605 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) |
1606 | .addReg(SrcReg, RegState::Kill); |
1607 | |
  AddressRegs Regs = getRegs(Opcode, *TII);
1609 | |
1610 | if (Regs.VAddr) |
1611 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); |
1612 | |
  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1615 | |
1616 | // It shouldn't be possible to get this far if the two instructions |
1617 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1618 | // will return true if this is the case. |
1619 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1620 | |
1621 | MachineInstr *New = |
1622 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) |
1623 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) |
1624 | .addImm(std::min(CI.Offset, Paired.Offset)) // offset |
1625 | .addImm(JoinedFormat) // format |
1626 | .addImm(CI.CPol) // cpol |
1627 | .addImm(0) // swz |
1628 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1629 | |
1630 | CI.I->eraseFromParent(); |
1631 | Paired.I->eraseFromParent(); |
1632 | return New; |
1633 | } |
1634 | |
1635 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( |
1636 | CombineInfo &CI, CombineInfo &Paired, |
1637 | MachineBasicBlock::iterator InsertBefore) { |
1638 | MachineBasicBlock *MBB = CI.I->getParent(); |
1639 | DebugLoc DL = CI.I->getDebugLoc(); |
1640 | |
1641 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1642 | |
1643 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register DestReg = MRI->createVirtualRegister(SuperRC);
1645 | |
1646 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); |
1647 | |
1648 | if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) |
1649 | MIB.add(*SAddr); |
1650 | |
1651 | MachineInstr *New = |
1652 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) |
1653 | .addImm(std::min(CI.Offset, Paired.Offset)) |
1654 | .addImm(CI.CPol) |
1655 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1656 | |
1657 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1660 | |
1661 | // Copy to the old destination registers. |
1662 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); |
1663 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); |
1664 | const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); |
1665 | |
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
1672 | |
1673 | CI.I->eraseFromParent(); |
1674 | Paired.I->eraseFromParent(); |
1675 | return New; |
1676 | } |
1677 | |
1678 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( |
1679 | CombineInfo &CI, CombineInfo &Paired, |
1680 | MachineBasicBlock::iterator InsertBefore) { |
1681 | MachineBasicBlock *MBB = CI.I->getParent(); |
1682 | DebugLoc DL = CI.I->getDebugLoc(); |
1683 | |
1684 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1685 | |
1686 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1689 | |
1690 | // Copy to the new source register. |
1691 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1693 | |
1694 | const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1695 | const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1696 | |
1697 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) |
1698 | .add(*Src0) |
1699 | .addImm(SubRegIdx0) |
1700 | .add(*Src1) |
1701 | .addImm(SubRegIdx1); |
1702 | |
1703 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) |
1704 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) |
1705 | .addReg(SrcReg, RegState::Kill); |
1706 | |
1707 | if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) |
1708 | MIB.add(*SAddr); |
1709 | |
1710 | MachineInstr *New = |
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
1712 | .addImm(CI.CPol) |
1713 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1714 | |
1715 | CI.I->eraseFromParent(); |
1716 | Paired.I->eraseFromParent(); |
1717 | return New; |
1718 | } |
1719 | |
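// Map a mergeable pair onto the opcode that covers the combined width.
// Returns 0 when the combined width has no single merged opcode.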
1720 | unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, |
1721 | const CombineInfo &Paired) { |
1722 | const unsigned Width = CI.Width + Paired.Width; |
1723 | |
1724 | switch (getCommonInstClass(CI, Paired)) { |
1725 | default: |
1726 | assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); |
1727 | // FIXME: Handle d16 correctly |
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
1730 | case TBUFFER_LOAD: |
1731 | case TBUFFER_STORE: |
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
1734 | |
1735 | case UNKNOWN: |
    llvm_unreachable("Unknown instruction class");
1737 | case S_BUFFER_LOAD_IMM: |
1738 | switch (Width) { |
1739 | default: |
1740 | return 0; |
1741 | case 2: |
1742 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; |
1743 | case 3: |
1744 | return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; |
1745 | case 4: |
1746 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; |
1747 | case 8: |
1748 | return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; |
1749 | } |
1750 | case S_BUFFER_LOAD_SGPR_IMM: |
1751 | switch (Width) { |
1752 | default: |
1753 | return 0; |
1754 | case 2: |
1755 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; |
1756 | case 3: |
1757 | return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; |
1758 | case 4: |
1759 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; |
1760 | case 8: |
1761 | return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; |
1762 | } |
1763 | case S_LOAD_IMM: |
1764 | switch (Width) { |
1765 | default: |
1766 | return 0; |
1767 | case 2: |
1768 | return AMDGPU::S_LOAD_DWORDX2_IMM; |
1769 | case 3: |
1770 | return AMDGPU::S_LOAD_DWORDX3_IMM; |
1771 | case 4: |
1772 | return AMDGPU::S_LOAD_DWORDX4_IMM; |
1773 | case 8: |
1774 | return AMDGPU::S_LOAD_DWORDX8_IMM; |
1775 | } |
1776 | case GLOBAL_LOAD: |
1777 | switch (Width) { |
1778 | default: |
1779 | return 0; |
1780 | case 2: |
1781 | return AMDGPU::GLOBAL_LOAD_DWORDX2; |
1782 | case 3: |
1783 | return AMDGPU::GLOBAL_LOAD_DWORDX3; |
1784 | case 4: |
1785 | return AMDGPU::GLOBAL_LOAD_DWORDX4; |
1786 | } |
1787 | case GLOBAL_LOAD_SADDR: |
1788 | switch (Width) { |
1789 | default: |
1790 | return 0; |
1791 | case 2: |
1792 | return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; |
1793 | case 3: |
1794 | return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; |
1795 | case 4: |
1796 | return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; |
1797 | } |
1798 | case GLOBAL_STORE: |
1799 | switch (Width) { |
1800 | default: |
1801 | return 0; |
1802 | case 2: |
1803 | return AMDGPU::GLOBAL_STORE_DWORDX2; |
1804 | case 3: |
1805 | return AMDGPU::GLOBAL_STORE_DWORDX3; |
1806 | case 4: |
1807 | return AMDGPU::GLOBAL_STORE_DWORDX4; |
1808 | } |
1809 | case GLOBAL_STORE_SADDR: |
1810 | switch (Width) { |
1811 | default: |
1812 | return 0; |
1813 | case 2: |
1814 | return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; |
1815 | case 3: |
1816 | return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; |
1817 | case 4: |
1818 | return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; |
1819 | } |
1820 | case FLAT_LOAD: |
1821 | switch (Width) { |
1822 | default: |
1823 | return 0; |
1824 | case 2: |
1825 | return AMDGPU::FLAT_LOAD_DWORDX2; |
1826 | case 3: |
1827 | return AMDGPU::FLAT_LOAD_DWORDX3; |
1828 | case 4: |
1829 | return AMDGPU::FLAT_LOAD_DWORDX4; |
1830 | } |
1831 | case FLAT_STORE: |
1832 | switch (Width) { |
1833 | default: |
1834 | return 0; |
1835 | case 2: |
1836 | return AMDGPU::FLAT_STORE_DWORDX2; |
1837 | case 3: |
1838 | return AMDGPU::FLAT_STORE_DWORDX3; |
1839 | case 4: |
1840 | return AMDGPU::FLAT_STORE_DWORDX4; |
1841 | } |
1842 | case MIMG: |
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1846 | } |
1847 | } |
1848 | |
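// Compute the sub-register indices used to copy each original value out of
// (or into) the merged super-register, e.g. widths 2 and 1 carve a 96-bit
// register into sub0_sub1 and sub2, with the access at the lower offset
// taking the leading sub-registers.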
1849 | std::pair<unsigned, unsigned> |
1850 | SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, |
1851 | const CombineInfo &Paired) { |
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");
1856 | |
1857 | unsigned Idx0; |
1858 | unsigned Idx1; |
1859 | |
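  // Idxs[First][Width - 1]: the row is the index of the first sub-register
  // covered and the column is the operand's width in dwords minus one.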
  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
       AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
       AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
       AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
       AMDGPU::sub4_sub5_sub6_sub7},
  };
1867 | |
1868 | assert(CI.Width >= 1 && CI.Width <= 4); |
1869 | assert(Paired.Width >= 1 && Paired.Width <= 4); |
1870 | |
1871 | if (Paired < CI) { |
1872 | Idx1 = Idxs[0][Paired.Width - 1]; |
1873 | Idx0 = Idxs[Paired.Width][CI.Width - 1]; |
1874 | } else { |
1875 | Idx0 = Idxs[0][CI.Width - 1]; |
1876 | Idx1 = Idxs[CI.Width][Paired.Width - 1]; |
1877 | } |
1878 | |
1879 | return std::pair(Idx0, Idx1); |
1880 | } |
1881 | |
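// Select the register class for the merged result: an SGPR class for the
// scalar load classes, otherwise an AGPR or VGPR class wide enough for the
// combined width.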
1882 | const TargetRegisterClass * |
1883 | SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, |
1884 | const CombineInfo &Paired) { |
1885 | if (CI.InstClass == S_BUFFER_LOAD_IMM || |
1886 | CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { |
1887 | switch (CI.Width + Paired.Width) { |
1888 | default: |
1889 | return nullptr; |
1890 | case 2: |
1891 | return &AMDGPU::SReg_64_XEXECRegClass; |
1892 | case 3: |
1893 | return &AMDGPU::SGPR_96RegClass; |
1894 | case 4: |
1895 | return &AMDGPU::SGPR_128RegClass; |
1896 | case 8: |
1897 | return &AMDGPU::SGPR_256RegClass; |
1898 | case 16: |
1899 | return &AMDGPU::SGPR_512RegClass; |
1900 | } |
1901 | } |
1902 | |
1903 | unsigned BitWidth = 32 * (CI.Width + Paired.Width); |
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
1907 | } |
1908 | |
1909 | MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( |
1910 | CombineInfo &CI, CombineInfo &Paired, |
1911 | MachineBasicBlock::iterator InsertBefore) { |
1912 | MachineBasicBlock *MBB = CI.I->getParent(); |
1913 | DebugLoc DL = CI.I->getDebugLoc(); |
1914 | |
1915 | const unsigned Opcode = getNewOpcode(CI, Paired); |
1916 | |
1917 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); |
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1920 | |
1921 | // Copy to the new source register. |
1922 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); |
  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1924 | |
1925 | const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); |
1926 | const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); |
1927 | |
1928 | BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) |
1929 | .add(*Src0) |
1930 | .addImm(SubRegIdx0) |
1931 | .add(*Src1) |
1932 | .addImm(SubRegIdx1); |
1933 | |
1934 | auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) |
1935 | .addReg(SrcReg, RegState::Kill); |
1936 | |
  AddressRegs Regs = getRegs(Opcode, *TII);
1938 | |
1939 | if (Regs.VAddr) |
1940 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); |
1941 | |
1943 | // It shouldn't be possible to get this far if the two instructions |
1944 | // don't have a single memoperand, because MachineInstr::mayAlias() |
1945 | // will return true if this is the case. |
1946 | assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); |
1947 | |
1948 | MachineInstr *New = |
1949 | MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) |
1950 | .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) |
1951 | .addImm(std::min(CI.Offset, Paired.Offset)) // offset |
1952 | .addImm(CI.CPol) // cpol |
1953 | .addImm(0) // swz |
1954 | .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); |
1955 | |
1956 | CI.I->eraseFromParent(); |
1957 | Paired.I->eraseFromParent(); |
1958 | return New; |
1959 | } |
1960 | |
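// Return Val as an immediate operand if it is an inline constant; otherwise
// materialize it into a fresh SGPR with S_MOV_B32 and return that register.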
1961 | MachineOperand |
1962 | SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { |
1963 | APInt V(32, Val, true); |
  if (TII->isInlineConstant(V))
1965 | return MachineOperand::CreateImm(Val); |
1966 | |
1967 | Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1968 | MachineInstr *Mov = |
1969 | BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), |
1970 | TII->get(AMDGPU::S_MOV_B32), Reg) |
1971 | .addImm(Val); |
1972 | (void)Mov; |
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
1975 | } |
1976 | |
1977 | // Compute base address using Addr and return the final register. |
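// The 64-bit add is split into a V_ADD_CO_U32_e64 on the low half and a
// V_ADDC_U32_e64 consuming its carry on the high half; the two halves are
// then recombined into a 64-bit VGPR pair with REG_SEQUENCE.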
1978 | Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, |
1979 | const MemAddress &Addr) const { |
1980 | MachineBasicBlock *MBB = MI.getParent(); |
1981 | MachineBasicBlock::iterator MBBI = MI.getIterator(); |
1982 | DebugLoc DL = MI.getDebugLoc(); |
1983 | |
  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");
1987 | |
  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");
1991 | |
  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1996 | |
1997 | const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); |
1998 | Register CarryReg = MRI->createVirtualRegister(CarryRC); |
1999 | Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); |
2000 | |
2001 | Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
2002 | Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
2003 | MachineInstr *LoHalf = |
2004 | BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) |
2005 | .addReg(CarryReg, RegState::Define) |
2006 | .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) |
2007 | .add(OffsetLo) |
2008 | .addImm(0); // clamp bit |
2009 | (void)LoHalf; |
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
2011 | |
2012 | MachineInstr *HiHalf = |
2013 | BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) |
2014 | .addReg(DeadCarryReg, RegState::Define | RegState::Dead) |
2015 | .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) |
2016 | .add(OffsetHi) |
2017 | .addReg(CarryReg, RegState::Kill) |
2018 | .addImm(0); // clamp bit |
2019 | (void)HiHalf; |
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
2021 | |
  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2023 | MachineInstr *FullBase = |
2024 | BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) |
2025 | .addReg(DestSub0) |
2026 | .addImm(AMDGPU::sub0) |
2027 | .addReg(DestSub1) |
2028 | .addImm(AMDGPU::sub1); |
2029 | (void)FullBase; |
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
2031 | |
2032 | return FullDestReg; |
2033 | } |
2034 | |
2035 | // Update base and offset with the NewBase and NewOffset in MI. |
2036 | void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, |
2037 | Register NewBase, |
2038 | int32_t NewOffset) const { |
2039 | auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); |
2040 | Base->setReg(NewBase); |
2041 | Base->setIsKill(false); |
2042 | TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); |
2043 | } |
2044 | |
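// Return the constant behind Op: the value of an immediate operand, or the
// immediate moved by the unique S_MOV_B32 defining a register operand;
// std::nullopt if neither applies.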
2045 | std::optional<int32_t> |
2046 | SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { |
2047 | if (Op.isImm()) |
2048 | return Op.getImm(); |
2049 | |
2050 | if (!Op.isReg()) |
2051 | return std::nullopt; |
2052 | |
  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2054 | if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || |
2055 | !Def->getOperand(1).isImm()) |
2056 | return std::nullopt; |
2057 | |
  return Def->getOperand(1).getImm();
2059 | } |
2060 | |
// Analyze Base and extract:
// - 32-bit base registers and subregisters
// - 64-bit constant offset
2064 | // Expecting base computation as: |
2065 | // %OFFSET0:sgpr_32 = S_MOV_B32 8000 |
2066 | // %LO:vgpr_32, %c:sreg_64_xexec = |
2067 | // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, |
2068 | // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec |
2069 | // %Base:vreg_64 = |
2070 | // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 |
2071 | void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, |
2072 | MemAddress &Addr) const { |
2073 | if (!Base.isReg()) |
2074 | return; |
2075 | |
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2077 | if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE |
2078 | || Def->getNumOperands() != 5) |
2079 | return; |
2080 | |
  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
2083 | if (!BaseLo.isReg() || !BaseHi.isReg()) |
2084 | return; |
2085 | |
  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2088 | |
2089 | if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || |
2090 | !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) |
2091 | return; |
2092 | |
2093 | const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); |
2094 | const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); |
2095 | |
  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
2103 | } |
2104 | |
2105 | Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); |
2106 | Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); |
2107 | |
2108 | if (Src0->isImm()) |
2109 | std::swap(Src0, Src1); |
2110 | |
2111 | if (!Src1->isImm()) |
2112 | return; |
2113 | |
2114 | uint64_t Offset1 = Src1->getImm(); |
2115 | BaseHi = *Src0; |
2116 | |
2117 | Addr.Base.LoReg = BaseLo.getReg(); |
2118 | Addr.Base.HiReg = BaseHi.getReg(); |
2119 | Addr.Base.LoSubReg = BaseLo.getSubReg(); |
2120 | Addr.Base.HiSubReg = BaseHi.getSubReg(); |
2121 | Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); |
2122 | } |
2123 | |
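// Try to fold part of MI's constant address computation into its immediate
// offset field by re-using a nearby instruction's base, following the three
// steps described in the comments below.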
2124 | bool SILoadStoreOptimizer::promoteConstantOffsetToImm( |
2125 | MachineInstr &MI, |
2126 | MemInfoMap &Visited, |
2127 | SmallPtrSet<MachineInstr *, 4> &AnchorList) const { |
2128 | |
2129 | if (!(MI.mayLoad() ^ MI.mayStore())) |
2130 | return false; |
2131 | |
2132 | // TODO: Support flat and scratch. |
2133 | if (AMDGPU::getGlobalSaddrOp(Opcode: MI.getOpcode()) < 0) |
2134 | return false; |
2135 | |
2136 | if (MI.mayLoad() && |
2137 | TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr) |
2138 | return false; |
2139 | |
  if (AnchorList.count(&MI))
2141 | return false; |
2142 | |
  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2144 | |
2145 | if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) { |
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2147 | return false; |
2148 | } |
2149 | |
  // Step 1: Find the base registers and a 64-bit constant offset.
2151 | MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); |
2152 | MemAddress MAddr; |
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
2155 | Visited[&MI] = MAddr; |
2156 | } else |
2157 | MAddr = Visited[&MI]; |
2158 | |
2159 | if (MAddr.Offset == 0) { |
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
2162 | return false; |
2163 | } |
2164 | |
2165 | LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", " |
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2167 | |
  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the highest
  // 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
2171 | // bb: |
2172 | // addr1 = &a + 4096; load1 = load(addr1, 0) |
2173 | // addr2 = &a + 6144; load2 = load(addr2, 0) |
2174 | // addr3 = &a + 8192; load3 = load(addr3, 0) |
2175 | // addr4 = &a + 10240; load4 = load(addr4, 0) |
2176 | // addr5 = &a + 12288; load5 = load(addr5, 0) |
2177 | // |
2178 | // Starting from the first load, the optimization will try to find a new base |
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because its distance is maximal, which can
  // presumably accommodate more intermediate bases.
2183 | // |
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
2185 | // (&a + 8192) for load1, load2, load4. |
2186 | // addr = &a + 8192 |
2187 | // load1 = load(addr, -4096) |
2188 | // load2 = load(addr, -2048) |
2189 | // load3 = load(addr, 0) |
2190 | // load4 = load(addr, 2048) |
2191 | // addr5 = &a + 12288; load5 = load(addr5, 0) |
2192 | // |
2193 | MachineInstr *AnchorInst = nullptr; |
2194 | MemAddress AnchorAddr; |
2195 | uint32_t MaxDist = std::numeric_limits<uint32_t>::min(); |
2196 | SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase; |
2197 | |
2198 | MachineBasicBlock *MBB = MI.getParent(); |
2199 | MachineBasicBlock::iterator E = MBB->end(); |
2200 | MachineBasicBlock::iterator MBBI = MI.getIterator(); |
2201 | ++MBBI; |
2202 | const SITargetLowering *TLI = |
2203 | static_cast<const SITargetLowering *>(STM->getTargetLowering()); |
2204 | |
2205 | for ( ; MBBI != E; ++MBBI) { |
2206 | MachineInstr &MINext = *MBBI; |
2207 | // TODO: Support finding an anchor(with same base) from store addresses or |
2208 | // any other load addresses where the opcodes are different. |
2209 | if (MINext.getOpcode() != MI.getOpcode() || |
2210 | TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) |
2211 | continue; |
2212 | |
2213 | const MachineOperand &BaseNext = |
2214 | *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); |
2215 | MemAddress MAddrNext; |
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
2218 | Visited[&MINext] = MAddrNext; |
2219 | } else |
2220 | MAddrNext = Visited[&MINext]; |
2221 | |
2222 | if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || |
2223 | MAddrNext.Base.HiReg != MAddr.Base.HiReg || |
2224 | MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || |
2225 | MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) |
2226 | continue; |
2227 | |
    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2229 | |
2230 | int64_t Dist = MAddr.Offset - MAddrNext.Offset; |
2231 | TargetLoweringBase::AddrMode AM; |
2232 | AM.HasBaseReg = true; |
2233 | AM.BaseOffs = Dist; |
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
2237 | |
2238 | AnchorAddr = MAddrNext; |
2239 | AnchorInst = &MINext; |
2240 | } |
2241 | } |
2242 | |
2243 | if (AnchorInst) { |
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");
2248 | |
2249 | // Instead of moving up, just re-compute anchor-instruction's base address. |
    Register Base = computeBase(MI, AnchorAddr);
2251 | |
    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2254 | |
2255 | for (auto P : InstsWCommonBase) { |
2256 | TargetLoweringBase::AddrMode AM; |
2257 | AM.HasBaseReg = true; |
2258 | AM.BaseOffs = P.second - AnchorAddr.Offset; |
2259 | |
2260 | if (TLI->isLegalGlobalAddressingMode(AM)) { |
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
2265 | } |
2266 | } |
    AnchorList.insert(AnchorInst);
2268 | return true; |
2269 | } |
2270 | |
2271 | return false; |
2272 | } |
2273 | |
2274 | void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, |
2275 | std::list<std::list<CombineInfo> > &MergeableInsts) const { |
2276 | for (std::list<CombineInfo> &AddrList : MergeableInsts) { |
2277 | if (AddrList.front().InstClass == CI.InstClass && |
2278 | AddrList.front().IsAGPR == CI.IsAGPR && |
2279 | AddrList.front().hasSameBaseAddress(CI)) { |
      AddrList.emplace_back(CI);
2281 | return; |
2282 | } |
2283 | } |
2284 | |
2285 | // Base address not found, so add a new list. |
  MergeableInsts.emplace_back(1, CI);
2287 | } |
2288 | |
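// Scan [Begin, End) and bucket mergeable instructions by base address,
// stopping early at instructions that act as memory barriers. Returns the
// iterator where the scan stopped and whether any constant offsets were
// promoted along the way.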
2289 | std::pair<MachineBasicBlock::iterator, bool> |
2290 | SILoadStoreOptimizer::collectMergeableInsts( |
2291 | MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, |
2292 | MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, |
2293 | std::list<std::list<CombineInfo>> &MergeableInsts) const { |
2294 | bool Modified = false; |
2295 | |
2296 | // Sort potential mergeable instructions into lists. One list per base address. |
2297 | unsigned Order = 0; |
2298 | MachineBasicBlock::iterator BlockI = Begin; |
2299 | for (; BlockI != End; ++BlockI) { |
2300 | MachineInstr &MI = *BlockI; |
2301 | |
2302 | // We run this before checking if an address is mergeable, because it can produce |
2303 | // better code even if the instructions aren't mergeable. |
2304 | if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) |
2305 | Modified = true; |
2306 | |
2307 | // Treat volatile accesses, ordered accesses and unmodeled side effects as |
2308 | // barriers. We can look after this barrier for separate merges. |
2309 | if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) { |
2310 | LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI); |
2311 | |
2312 | // Search will resume after this instruction in a separate merge list. |
2313 | ++BlockI; |
2314 | break; |
2315 | } |
2316 | |
    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2318 | if (InstClass == UNKNOWN) |
2319 | continue; |
2320 | |
2321 | // Do not merge VMEM buffer instructions with "swizzled" bit set. |
2322 | int Swizzled = |
2323 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz); |
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2325 | continue; |
2326 | |
2327 | CombineInfo CI; |
    CI.setMI(MI, *this);
2329 | CI.Order = Order++; |
2330 | |
    if (!CI.hasMergeableAddress(*MRI))
2332 | continue; |
2333 | |
2334 | if (CI.InstClass == DS_WRITE && CI.IsAGPR) { |
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
2341 | continue; |
2342 | } |
2343 | |
2344 | LLVM_DEBUG(dbgs() << "Mergeable: " << MI); |
2345 | |
2346 | addInstToMergeableList(CI, MergeableInsts); |
2347 | } |
2348 | |
2349 | // At this point we have lists of Mergeable instructions. |
2350 | // |
2351 | // Part 2: Sort lists by offset and then for each CombineInfo object in the |
2352 | // list try to find an instruction that can be merged with I. If an instruction |
2353 | // is found, it is stored in the Paired field. If no instructions are found, then |
2354 | // the CombineInfo object is deleted from the list. |
2355 | |
2356 | for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), |
2357 | E = MergeableInsts.end(); I != E;) { |
2358 | |
2359 | std::list<CombineInfo> &MergeList = *I; |
2360 | if (MergeList.size() <= 1) { |
2361 | // This means we have found only one instruction with a given address |
2362 | // that can be merged, and we need at least 2 instructions to do a merge, |
2363 | // so this list can be discarded. |
      I = MergeableInsts.erase(I);
2365 | continue; |
2366 | } |
2367 | |
2368 | // Sort the lists by offsets, this way mergeable instructions will be |
2369 | // adjacent to each other in the list, which will make it easier to find |
2370 | // matches. |
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
2375 | ++I; |
2376 | } |
2377 | |
2378 | return std::pair(BlockI, Modified); |
2379 | } |
2380 | |
2381 | // Scan through looking for adjacent LDS operations with constant offsets from |
2382 | // the same base register. We rely on the scheduler to do the hard work of |
2383 | // clustering nearby loads, and assume these are all adjacent. |
2384 | bool SILoadStoreOptimizer::optimizeBlock( |
2385 | std::list<std::list<CombineInfo> > &MergeableInsts) { |
2386 | bool Modified = false; |
2387 | |
2388 | for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), |
2389 | E = MergeableInsts.end(); I != E;) { |
2390 | std::list<CombineInfo> &MergeList = *I; |
2391 | |
2392 | bool OptimizeListAgain = false; |
2393 | if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { |
2394 | // We weren't able to make any changes, so delete the list so we don't |
2395 | // process the same instructions the next time we try to optimize this |
2396 | // block. |
      I = MergeableInsts.erase(I);
2398 | continue; |
2399 | } |
2400 | |
2401 | Modified = true; |
2402 | |
    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
2405 | if (!OptimizeListAgain) { |
      I = MergeableInsts.erase(I);
2407 | continue; |
2408 | } |
2409 | OptimizeAgain = true; |
2410 | } |
2411 | return Modified; |
2412 | } |
2413 | |
2414 | bool |
2415 | SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( |
2416 | std::list<CombineInfo> &MergeList, |
2417 | bool &OptimizeListAgain) { |
2418 | if (MergeList.empty()) |
2419 | return false; |
2420 | |
2421 | bool Modified = false; |
2422 | |
  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {
2425 | |
2426 | auto First = I; |
2427 | auto Second = Next; |
2428 | |
2429 | if ((*First).Order > (*Second).Order) |
      std::swap(First, Second);
2431 | CombineInfo &CI = *First; |
2432 | CombineInfo &Paired = *Second; |
2433 | |
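    // checkAndPrepareMerge returns the CombineInfo at whose position the
    // merged instruction should be inserted, or nullptr if the pair cannot
    // be merged.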
2434 | CombineInfo *Where = checkAndPrepareMerge(CI, Paired); |
2435 | if (!Where) { |
2436 | ++I; |
2437 | continue; |
2438 | } |
2439 | |
2440 | Modified = true; |
2441 | |
2442 | LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); |
2443 | |
2444 | MachineBasicBlock::iterator NewMI; |
2445 | switch (CI.InstClass) { |
2446 | default: |
      llvm_unreachable("unknown InstClass");
2448 | break; |
2449 | case DS_READ: |
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
2451 | break; |
2452 | case DS_WRITE: |
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2454 | break; |
2455 | case S_BUFFER_LOAD_IMM: |
2456 | case S_BUFFER_LOAD_SGPR_IMM: |
2457 | case S_LOAD_IMM: |
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
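      // SMEM merges can reach dwordx8, so a narrower result may merge again
      // on a later iteration.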
2459 | OptimizeListAgain |= CI.Width + Paired.Width < 8; |
2460 | break; |
2461 | case BUFFER_LOAD: |
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2463 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2464 | break; |
2465 | case BUFFER_STORE: |
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2467 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2468 | break; |
2469 | case MIMG: |
      NewMI = mergeImagePair(CI, Paired, Where->I);
2471 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2472 | break; |
2473 | case TBUFFER_LOAD: |
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2475 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2476 | break; |
2477 | case TBUFFER_STORE: |
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2479 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2480 | break; |
2481 | case FLAT_LOAD: |
2482 | case GLOBAL_LOAD: |
2483 | case GLOBAL_LOAD_SADDR: |
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2485 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2486 | break; |
2487 | case FLAT_STORE: |
2488 | case GLOBAL_STORE: |
2489 | case GLOBAL_STORE_SADDR: |
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2491 | OptimizeListAgain |= CI.Width + Paired.Width < 4; |
2492 | break; |
2493 | } |
    CI.setMI(NewMI, *this);
2495 | CI.Order = Where->Order; |
2496 | if (I == Second) |
2497 | I = Next; |
2498 | |
    MergeList.erase(Second);
2500 | } |
2501 | |
2502 | return Modified; |
2503 | } |
2504 | |
2505 | bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { |
  if (skipFunction(MF.getFunction()))
2507 | return false; |
2508 | |
2509 | STM = &MF.getSubtarget<GCNSubtarget>(); |
2510 | if (!STM->loadStoreOptEnabled()) |
2511 | return false; |
2512 | |
2513 | TII = STM->getInstrInfo(); |
2514 | TRI = &TII->getRegisterInfo(); |
2515 | |
2516 | MRI = &MF.getRegInfo(); |
2517 | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
2518 | |
  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2520 | |
2521 | bool Modified = false; |
2522 | |
2523 | // Contains the list of instructions for which constant offsets are being |
  // promoted to the IMM. This is tracked for an entire block at a time.
2525 | SmallPtrSet<MachineInstr *, 4> AnchorList; |
2526 | MemInfoMap Visited; |
2527 | |
2528 | for (MachineBasicBlock &MBB : MF) { |
2529 | MachineBasicBlock::iterator SectionEnd; |
2530 | for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; |
2531 | I = SectionEnd) { |
2532 | bool CollectModified; |
2533 | std::list<std::list<CombineInfo>> MergeableInsts; |
2534 | |
2535 | // First pass: Collect list of all instructions we know how to merge in a |
2536 | // subset of the block. |
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2539 | |
2540 | Modified |= CollectModified; |
2541 | |
2542 | do { |
2543 | OptimizeAgain = false; |
2544 | Modified |= optimizeBlock(MergeableInsts); |
2545 | } while (OptimizeAgain); |
2546 | } |
2547 | |
2548 | Visited.clear(); |
2549 | AnchorList.clear(); |
2550 | } |
2551 | |
2552 | return Modified; |
2553 | } |
2554 | |