1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIMachineFunctionInfo.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
23#include "llvm/ADT/FloatingPointMode.h"
24#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26#include "llvm/Analysis/UniformityAnalysis.h"
27#include "llvm/BinaryFormat/ELF.h"
28#include "llvm/CodeGen/Analysis.h"
29#include "llvm/CodeGen/ByteProvider.h"
30#include "llvm/CodeGen/FunctionLoweringInfo.h"
31#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
32#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
33#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineLoopInfo.h"
37#include "llvm/IR/DiagnosticInfo.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/KnownBits.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
static cl::opt<bool> DisableLoopAlignment(
    "amdgpu-disable-loop-alignment",
    cl::desc("Do not align and prefetch loops"),
    cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
    "amdgpu-use-divergent-register-indexing",
    cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));

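// Helpers for querying the function's denormal mode: return true if the
// respective denormal mode is preserve-sign, i.e. denormals are flushed.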
static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
}

static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
}

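// Find the first SGPR_32 that has not yet been allocated by the calling
// convention state; used below when allocating special input SGPRs.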
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);

  const SIRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();

  addRegisterClass(MVT::f64, V64RegClass);
  addRegisterClass(MVT::v2f32, V64RegClass);
  addRegisterClass(MVT::Untyped, V64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));

  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));

  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));

  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));

127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
    // Unless there are also VOP3P operations, no operations on these types are
    // really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
183 computeRegisterProperties(Subtarget->getRegisterInfo());
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
189 setBooleanContents(ZeroOrOneBooleanContent);
190 setBooleanVectorContents(ZeroOrOneBooleanContent);
191
192 // We need to custom lower vector stores from local memory
193 setOperationAction(ISD::LOAD,
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
200 setOperationAction(ISD::STORE,
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
209 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
210 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
211 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
212 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
213 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
214 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
215 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
216 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
217 ISD::SETCC}) {
      // FIXME: The promoted-to type shouldn't need to be explicit.
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
223 setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);
224
225 setOperationAction(ISD::SELECT, MVT::bf16, Promote);
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
228 // TODO: Could make these legal
229 setOperationAction(ISD::FABS, MVT::bf16, Expand);
230 setOperationAction(ISD::FNEG, MVT::bf16, Expand);
231 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
232
233 // We only need to custom lower because we can't specify an action for bf16
234 // sources.
235 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
236 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
237
238 setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote);
239 AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
240 }
241
242 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
243 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
248 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
253 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
258
259 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
260 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
261 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
262 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
263 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
264 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
265 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
266
267 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
268
269 setOperationAction(ISD::SELECT, MVT::i1, Promote);
270 setOperationAction(ISD::SELECT, MVT::i64, Custom);
271 setOperationAction(ISD::SELECT, MVT::f64, Promote);
272 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
273
274 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
275
276 setOperationAction(ISD::SELECT_CC,
277 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
278
279 setOperationAction(ISD::SETCC, MVT::i1, Promote);
280 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
281 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
282
283 setOperationAction(ISD::TRUNCATE,
284 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
285 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
286 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
287 Expand);
288 setOperationAction(ISD::FP_ROUND,
289 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
290 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
291 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
292 Expand);
293
294 setOperationAction(ISD::SIGN_EXTEND_INREG,
295 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
296 MVT::v3i16, MVT::v4i16, MVT::Other},
297 Custom);
298
299 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
300 setOperationAction(ISD::BR_CC,
301 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
302
303 setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
304
305 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);
306
307 setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
308 Expand);
309
310#if 0
311 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
312#endif
313
314 // We only support LOAD/STORE and vector manipulation ops for vectors
315 // with > 4 elements.
316 for (MVT VT :
317 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
318 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
319 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
320 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
321 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
322 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
323 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
324 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
325 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
326 switch (Op) {
327 case ISD::LOAD:
328 case ISD::STORE:
329 case ISD::BUILD_VECTOR:
330 case ISD::BITCAST:
331 case ISD::UNDEF:
332 case ISD::EXTRACT_VECTOR_ELT:
333 case ISD::INSERT_VECTOR_ELT:
334 case ISD::SCALAR_TO_VECTOR:
335 case ISD::IS_FPCLASS:
336 break;
337 case ISD::EXTRACT_SUBVECTOR:
338 case ISD::INSERT_SUBVECTOR:
339 case ISD::CONCAT_VECTORS:
340 setOperationAction(Op, VT, Custom);
341 break;
342 default:
343 setOperationAction(Op, VT, Expand);
344 break;
345 }
346 }
347 }
348
349 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
350
351 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
352 // is expanded to avoid having two separate loops in case the index is a VGPR.
353
354 // Most operations are naturally 32-bit vector operations. We only support
355 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
356 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
357 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
358 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
359
360 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
361 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
362
363 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
364 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
365
366 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
367 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
368 }
369
370 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
371 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
372 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
373
374 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
375 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
376
377 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
378 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
379
380 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
381 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
382 }
383
384 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
385 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
386 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
387
388 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
389 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
390
391 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
392 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
393
394 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
395 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
396 }
397
398 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
399 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
400 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
401
402 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
403 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
404
405 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
406 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
407
408 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
409 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
410 }
411
412 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
413 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
414 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
415
416 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
417 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
418
419 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
420 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
421
422 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
423 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
424 }
425
426 setOperationAction(ISD::VECTOR_SHUFFLE,
427 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
428 Expand);
429
430 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
431 Custom);
432
433 // Avoid stack access for these.
434 // TODO: Generalize to more vector types.
435 setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
436 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
437 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
438 Custom);
439
440 // Deal with vec3 vector operations when widened to vec4.
441 setOperationAction(ISD::INSERT_SUBVECTOR,
442 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
443
444 // Deal with vec5/6/7 vector operations when widened to vec8.
445 setOperationAction(ISD::INSERT_SUBVECTOR,
446 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Custom);
451
452 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
453 // and output demarshalling
454 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
455
  // We can't return success/failure, only the old value,
  // so let LLVM add the comparison.
458 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
459 Expand);
460
461 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
462
463 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
464
465 // FIXME: This should be narrowed to i32, but that only happens if i64 is
466 // illegal.
467 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
468 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
469
  // This is s_memtime on SI and s_memrealtime on VI.
471 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
472
473 if (Subtarget->hasSMemRealTime() ||
474 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
475 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
476 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
477
478 if (Subtarget->has16BitInsts()) {
479 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
480 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
481 } else {
482 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
483 }
484
485 if (Subtarget->hasMadMacF32Insts())
486 setOperationAction(ISD::FMAD, MVT::f32, Legal);
487
488 if (!Subtarget->hasBFI())
489 // fcopysign can be done in a single instruction with BFI.
490 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
491
492 if (!Subtarget->hasBCNT(32))
493 setOperationAction(ISD::CTPOP, MVT::i32, Expand);
494
495 if (!Subtarget->hasBCNT(64))
496 setOperationAction(ISD::CTPOP, MVT::i64, Expand);
497
498 if (Subtarget->hasFFBH())
499 setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
500
501 if (Subtarget->hasFFBL())
502 setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
503
504 // We only really have 32-bit BFE instructions (and 16-bit on VI).
505 //
506 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
507 // effort to match them now. We want this to be false for i64 cases when the
508 // extraction isn't restricted to the upper or lower half. Ideally we would
509 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
510 // span the midpoint are probably relatively rare, so don't worry about them
511 // for now.
512 if (Subtarget->hasBFE())
513 setHasExtractBitsInsn(true);
514
515 // Clamp modifier on add/sub
516 if (Subtarget->hasIntClamp())
517 setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
518
519 if (Subtarget->hasAddNoCarry())
520 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
521 Legal);
522
523 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
524 Custom);
525
526 // These are really only legal for ieee_mode functions. We should be avoiding
527 // them for functions that don't have ieee_mode enabled, so just say they are
528 // legal.
529 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
530 {MVT::f32, MVT::f64}, Legal);
531
532 if (Subtarget->haveRoundOpsF64())
533 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
534 Legal);
535 else
536 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
537 MVT::f64, Custom);
538
539 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
540 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
541 Legal);
542 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
543
544 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
545 setOperationAction(ISD::FDIV, MVT::f64, Custom);
546
547 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
548 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
549
550 // Custom lower these because we can't specify a rule based on an illegal
551 // source bf16.
552 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
553 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
554
555 if (Subtarget->has16BitInsts()) {
556 setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
557 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
558 MVT::i16, Legal);
559
560 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
561
562 setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
563 MVT::i16, Expand);
564
565 setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
566 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
567 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
568 ISD::CTPOP},
569 MVT::i16, Promote);
570
571 setOperationAction(ISD::LOAD, MVT::i16, Custom);
572
573 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
574
575 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
576 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
577 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
578 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
579
580 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
581 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
582 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
583
584 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);
585
586 // F16 - Constant Actions.
587 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
588 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
589
590 // F16 - Load/Store Actions.
591 setOperationAction(ISD::LOAD, MVT::f16, Promote);
592 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
593 setOperationAction(ISD::STORE, MVT::f16, Promote);
594 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
595
596 // BF16 - Load/Store Actions.
597 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
598 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
599 setOperationAction(ISD::STORE, MVT::bf16, Promote);
600 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
601
602 // F16 - VOP1 Actions.
603 setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
604 ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
605 MVT::f16, Custom);
606
607 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
608 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
609
610 // F16 - VOP2 Actions.
611 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
612 Expand);
613 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
614 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
615 setOperationAction(ISD::FDIV, MVT::f16, Custom);
616
617 // F16 - VOP3 Actions.
618 setOperationAction(ISD::FMA, MVT::f16, Legal);
619 if (STI.hasMadF16())
620 setOperationAction(ISD::FMAD, MVT::f16, Legal);
621
622 for (MVT VT :
623 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
624 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
625 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
626 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
627 switch (Op) {
628 case ISD::LOAD:
629 case ISD::STORE:
630 case ISD::BUILD_VECTOR:
631 case ISD::BITCAST:
632 case ISD::UNDEF:
633 case ISD::EXTRACT_VECTOR_ELT:
634 case ISD::INSERT_VECTOR_ELT:
635 case ISD::INSERT_SUBVECTOR:
636 case ISD::EXTRACT_SUBVECTOR:
637 case ISD::SCALAR_TO_VECTOR:
638 case ISD::IS_FPCLASS:
639 break;
640 case ISD::CONCAT_VECTORS:
641 setOperationAction(Op, VT, Custom);
642 break;
643 default:
644 setOperationAction(Op, VT, Expand);
645 break;
646 }
647 }
648 }
649
650 // v_perm_b32 can handle either of these.
651 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
652 setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
653
654 // XXX - Do these do anything? Vector constants turn into build_vector.
655 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
656
657 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
658 Legal);
659
660 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
661 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
662 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
663 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
664
665 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
666 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
667 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
668 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
669
670 setOperationAction(ISD::AND, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
672 setOperationAction(ISD::OR, MVT::v2i16, Promote);
673 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
674 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
675 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
676
677 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
678 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
679 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
680 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
681 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
682 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
683
684 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
685 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
686 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
687 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
688 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
689 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
690
691 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
693 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
694 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
695 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
696 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
697
698 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
699 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
700 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
701 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
702
703 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
704 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
705 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
706 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
707 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
708 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
709
710 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
712 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
713 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
714 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
716
717 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
719 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
720 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
721 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
722 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
723
724 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
726 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
727 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
728 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
730
731 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
733 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
734 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
735 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
737
738 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
739 MVT::v2i32, Expand);
740 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
741
742 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
743 MVT::v4i32, Expand);
744
745 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
746 MVT::v8i32, Expand);
747
748 if (!Subtarget->hasVOP3PInsts())
749 setOperationAction(ISD::BUILD_VECTOR,
750 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
751
752 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
753 // This isn't really legal, but this avoids the legalizer unrolling it (and
754 // allows matching fneg (fabs x) patterns)
755 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
756
757 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
758 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
759
760 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
761 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
762 Custom);
763
764 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
765 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
766 Expand);
767
768 for (MVT Vec16 :
769 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
770 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
771 setOperationAction(
772 {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
773 Vec16, Custom);
774 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
775 }
776 }
777
778 if (Subtarget->hasVOP3PInsts()) {
779 setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
780 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
781 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
782 MVT::v2i16, Legal);
783
784 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
785 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
786 MVT::v2f16, Legal);
787
788 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
789 Custom);
790
791 setOperationAction(ISD::VECTOR_SHUFFLE,
792 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
793 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
794 Custom);
795
796 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
797 // Split vector operations.
798 setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
799 ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
800 ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
801 ISD::SSUBSAT},
802 VT, Custom);
803
804 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
805 // Split vector operations.
806 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
807 VT, Custom);
808
809 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
810 Custom);
811
812 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
813 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
814 Custom);
815
816 if (Subtarget->hasPackedFP32Ops()) {
817 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
818 MVT::v2f32, Legal);
819 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
820 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
821 Custom);
822 }
823 }
824
825 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
826
827 if (Subtarget->has16BitInsts()) {
828 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
829 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
830 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
831 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
832 } else {
833 // Legalization hack.
834 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
835
836 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
837 }
838
839 setOperationAction(ISD::SELECT,
840 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
841 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
842 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
843 MVT::v32f16, MVT::v32bf16},
844 Custom);
845
846 setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
847
848 if (Subtarget->hasScalarSMulU64())
849 setOperationAction(ISD::MUL, MVT::i64, Custom);
850
851 if (Subtarget->hasMad64_32())
852 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
853
854 if (Subtarget->hasPrefetch())
855 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
856
857 if (Subtarget->hasIEEEMinMax())
858 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
859 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
860
861 setOperationAction(ISD::INTRINSIC_WO_CHAIN,
862 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
863 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
864 Custom);
865
866 setOperationAction(ISD::INTRINSIC_W_CHAIN,
867 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
868 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
869 MVT::i16, MVT::i8, MVT::i128},
870 Custom);
871
872 setOperationAction(ISD::INTRINSIC_VOID,
873 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
874 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
875 MVT::i8, MVT::i128},
876 Custom);
877
878 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
879 setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
880 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
881 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
882
883 // TODO: Could move this to custom lowering, could benefit from combines on
884 // extract of relevant bits.
885 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
886
887 setOperationAction(ISD::MUL, MVT::i1, Promote);
888
889 setTargetDAGCombine({ISD::ADD,
890 ISD::UADDO_CARRY,
891 ISD::SUB,
892 ISD::USUBO_CARRY,
893 ISD::FADD,
894 ISD::FSUB,
895 ISD::FDIV,
896 ISD::FMINNUM,
897 ISD::FMAXNUM,
898 ISD::FMINNUM_IEEE,
899 ISD::FMAXNUM_IEEE,
900 ISD::FMINIMUM,
901 ISD::FMAXIMUM,
902 ISD::FMA,
903 ISD::SMIN,
904 ISD::SMAX,
905 ISD::UMIN,
906 ISD::UMAX,
907 ISD::SETCC,
908 ISD::AND,
909 ISD::OR,
910 ISD::XOR,
911 ISD::FSHR,
912 ISD::SINT_TO_FP,
913 ISD::UINT_TO_FP,
914 ISD::FCANONICALIZE,
915 ISD::SCALAR_TO_VECTOR,
916 ISD::ZERO_EXTEND,
917 ISD::SIGN_EXTEND_INREG,
918 ISD::EXTRACT_VECTOR_ELT,
919 ISD::INSERT_VECTOR_ELT,
920 ISD::FCOPYSIGN});
921
922 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
923 setTargetDAGCombine(ISD::FP_ROUND);
924
925 // All memory operations. Some folding on the pointer operand is done to help
926 // matching the constant offsets in the addressing modes.
927 setTargetDAGCombine({ISD::LOAD,
928 ISD::STORE,
929 ISD::ATOMIC_LOAD,
930 ISD::ATOMIC_STORE,
931 ISD::ATOMIC_CMP_SWAP,
932 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
933 ISD::ATOMIC_SWAP,
934 ISD::ATOMIC_LOAD_ADD,
935 ISD::ATOMIC_LOAD_SUB,
936 ISD::ATOMIC_LOAD_AND,
937 ISD::ATOMIC_LOAD_OR,
938 ISD::ATOMIC_LOAD_XOR,
939 ISD::ATOMIC_LOAD_NAND,
940 ISD::ATOMIC_LOAD_MIN,
941 ISD::ATOMIC_LOAD_MAX,
942 ISD::ATOMIC_LOAD_UMIN,
943 ISD::ATOMIC_LOAD_UMAX,
944 ISD::ATOMIC_LOAD_FADD,
945 ISD::ATOMIC_LOAD_UINC_WRAP,
946 ISD::ATOMIC_LOAD_UDEC_WRAP,
947 ISD::INTRINSIC_VOID,
948 ISD::INTRINSIC_W_CHAIN});
949
950 // FIXME: In other contexts we pretend this is a per-function property.
951 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
952
953 setSchedulingPreference(Sched::RegPressure);
954}
955
956const GCNSubtarget *SITargetLowering::getSubtarget() const {
957 return Subtarget;
958}
959
960//===----------------------------------------------------------------------===//
961// TargetLowering queries
962//===----------------------------------------------------------------------===//
963
// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case when denormals are enabled, which we don't
// currently handle, where this is OK to use.
968bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
969 EVT DestVT, EVT SrcVT) const {
970 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
971 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
972 DestVT.getScalarType() == MVT::f32 &&
973 SrcVT.getScalarType() == MVT::f16 &&
974 // TODO: This probably only requires no input flushing?
975 denormalModeIsFlushAllF32(DAG.getMachineFunction());
976}
977
978bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
979 LLT DestTy, LLT SrcTy) const {
980 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
982 DestTy.getScalarSizeInBits() == 32 &&
983 SrcTy.getScalarSizeInBits() == 16 &&
984 // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(*MI.getMF());
986}
987
988bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
989 // SI has some legal vector types, but no legal vector operations. Say no
990 // shuffles are legal in order to prefer scalarizing some vector operations.
991 return false;
992}
993
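// Note: for the non-kernel calling conventions, pairs of 16-bit vector
// elements are packed into a single 32-bit register (v2i16/v2f16), and
// anything wider than 32 bits is split into i32 pieces.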
994MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
995 CallingConv::ID CC,
996 EVT VT) const {
997 if (CC == CallingConv::AMDGPU_KERNEL)
998 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
999
1000 if (VT.isVector()) {
1001 EVT ScalarVT = VT.getScalarType();
1002 unsigned Size = ScalarVT.getSizeInBits();
1003 if (Size == 16) {
1004 if (Subtarget->has16BitInsts()) {
1005 if (VT.isInteger())
1006 return MVT::v2i16;
1007 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1008 }
1009 return VT.isInteger() ? MVT::i32 : MVT::f32;
1010 }
1011
1012 if (Size < 16)
1013 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1014 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1015 }
1016
1017 if (VT.getSizeInBits() > 32)
1018 return MVT::i32;
1019
1020 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1021}
1022
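// Mirrors getRegisterTypeForCallingConv: two 16-bit elements share one
// register, and scalars wider than 32 bits take one register per 32-bit piece.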
1023unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1024 CallingConv::ID CC,
1025 EVT VT) const {
1026 if (CC == CallingConv::AMDGPU_KERNEL)
1027 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1028
1029 if (VT.isVector()) {
1030 unsigned NumElts = VT.getVectorNumElements();
1031 EVT ScalarVT = VT.getScalarType();
1032 unsigned Size = ScalarVT.getSizeInBits();
1033
1034 // FIXME: Should probably promote 8-bit vectors to i16.
1035 if (Size == 16 && Subtarget->has16BitInsts())
1036 return (NumElts + 1) / 2;
1037
1038 if (Size <= 32)
1039 return NumElts;
1040
1041 if (Size > 32)
1042 return NumElts * ((Size + 31) / 32);
1043 } else if (VT.getSizeInBits() > 32)
1044 return (VT.getSizeInBits() + 31) / 32;
1045
1046 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1047}
1048
1049unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1050 LLVMContext &Context, CallingConv::ID CC,
1051 EVT VT, EVT &IntermediateVT,
1052 unsigned &NumIntermediates, MVT &RegisterVT) const {
1053 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1054 unsigned NumElts = VT.getVectorNumElements();
1055 EVT ScalarVT = VT.getScalarType();
1056 unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
1060 if (Size == 16 && Subtarget->has16BitInsts()) {
1061 if (ScalarVT == MVT::bf16) {
1062 RegisterVT = MVT::i32;
1063 IntermediateVT = MVT::v2bf16;
1064 } else {
1065 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1066 IntermediateVT = RegisterVT;
1067 }
1068 NumIntermediates = (NumElts + 1) / 2;
1069 return NumIntermediates;
1070 }
1071
1072 if (Size == 32) {
1073 RegisterVT = ScalarVT.getSimpleVT();
1074 IntermediateVT = RegisterVT;
1075 NumIntermediates = NumElts;
1076 return NumIntermediates;
1077 }
1078
1079 if (Size < 16 && Subtarget->has16BitInsts()) {
1080 // FIXME: Should probably form v2i16 pieces
1081 RegisterVT = MVT::i16;
1082 IntermediateVT = ScalarVT;
1083 NumIntermediates = NumElts;
1084 return NumIntermediates;
1085 }
1086
1087
1088 if (Size != 16 && Size <= 32) {
1089 RegisterVT = MVT::i32;
1090 IntermediateVT = ScalarVT;
1091 NumIntermediates = NumElts;
1092 return NumIntermediates;
1093 }
1094
1095 if (Size > 32) {
1096 RegisterVT = MVT::i32;
1097 IntermediateVT = RegisterVT;
1098 NumIntermediates = NumElts * ((Size + 31) / 32);
1099 return NumIntermediates;
1100 }
1101 }
1102
1103 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1104 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1105}
1106
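// Compute the memory type for a load intrinsic's data, clamping the number of
// vector lanes to MaxNumLanes (e.g. the number of lanes enabled by an image
// dmask).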
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
    return EVT::getVectorVT(Ty->getContext(),
                            EVT::getEVT(VT->getElementType()),
                            NumElts);
  }

  return EVT::getEVT(Ty);
}

// Peek through TFE struct returns to only use the data size.
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  if (!ST)
    return memVTFromLoadIntrData(Ty, MaxNumLanes);

  // TFE intrinsics return an aggregate type.
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
  return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
}
1131
1132/// Map address space 7 to MVT::v5i32 because that's its in-memory
1133/// representation. This return value is vector-typed because there is no
1134/// MVT::i160 and it is not clear if one can be added. While this could
1135/// cause issues during codegen, these address space 7 pointers will be
1136/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1137/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1138/// modeling, to work.
1139MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1140 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1141 return MVT::v5i32;
1142 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1143 DL.getPointerSizeInBits(AS) == 192)
1144 return MVT::v6i32;
1145 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1146}
1147/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1148/// v8i32 when padding is added.
1149/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1150/// also v8i32 with padding.
1151MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1152 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1153 DL.getPointerSizeInBits(AS) == 160) ||
1154 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1155 DL.getPointerSizeInBits(AS) == 192))
1156 return MVT::v8i32;
1157 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1158}
1159
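// Describe the memory access performed by a target intrinsic (memVT, pointer
// value, and MachineMemOperand flags) so that an accurate memory operand can
// be attached to the selected node.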
1160bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1161 const CallInst &CI,
1162 MachineFunction &MF,
1163 unsigned IntrID) const {
1164 Info.flags = MachineMemOperand::MONone;
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
    Info.flags |= MachineMemOperand::MOInvariant;

  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                  (Intrinsic::ID)IntrID);
1172 MemoryEffects ME = Attr.getMemoryEffects();
1173 if (ME.doesNotAccessMemory())
1174 return false;
1175
1176 // TODO: Should images get their own address space?
1177 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1178
1179 if (RsrcIntr->IsImage)
1180 Info.align.reset();
1181
    Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1184 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1185 // We conservatively set the memory operand of a buffer intrinsic to the
1186 // base resource pointer, so that we can access alias information about
1187 // those pointers. Cases like "this points at the same value
1188 // but with a different offset" are handled in
1189 // areMemAccessesTriviallyDisjoint.
1190 Info.ptrVal = RsrcArg;
1191 }
1192
    auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
    if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1195 Info.flags |= MachineMemOperand::MOVolatile;
1196 Info.flags |= MachineMemOperand::MODereferenceable;
1197 if (ME.onlyReadsMemory()) {
1198 unsigned MaxNumLanes = 4;
1199
      if (RsrcIntr->IsImage) {
        const AMDGPU::ImageDimIntrinsicInfo *Intr
          = AMDGPU::getImageDimIntrinsicInfo(IntrID);
        const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
            AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

        if (!BaseOpcode->Gather4) {
          // If this isn't a gather, we may have excess loaded elements in the
          // IR type. Check the dmask for the real number of elements loaded.
          unsigned DMask
            = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
          MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
        }
      }

      Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1216
1217 // FIXME: What does alignment mean for an image?
1218 Info.opc = ISD::INTRINSIC_W_CHAIN;
1219 Info.flags |= MachineMemOperand::MOLoad;
1220 } else if (ME.onlyWritesMemory()) {
1221 Info.opc = ISD::INTRINSIC_VOID;
1222
      Type *DataTy = CI.getArgOperand(0)->getType();
      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
        unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
        Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
      } else
        Info.memVT = EVT::getEVT(DataTy);
1230
1231 Info.flags |= MachineMemOperand::MOStore;
1232 } else {
1233 // Atomic
1234 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1235 ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1237 Info.flags |= MachineMemOperand::MOLoad |
1238 MachineMemOperand::MOStore |
1239 MachineMemOperand::MODereferenceable;
1240
1241 switch (IntrID) {
1242 default:
1243 // XXX - Should this be volatile without known ordering?
1244 Info.flags |= MachineMemOperand::MOVolatile;
1245 break;
1246 case Intrinsic::amdgcn_raw_buffer_load_lds:
1247 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1248 case Intrinsic::amdgcn_struct_buffer_load_lds:
1249 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
        unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
        Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
        Info.ptrVal = CI.getArgOperand(1);
1253 return true;
1254 }
1255 }
1256 }
1257 return true;
1258 }
1259
1260 switch (IntrID) {
1261 case Intrinsic::amdgcn_ds_ordered_add:
1262 case Intrinsic::amdgcn_ds_ordered_swap:
1263 case Intrinsic::amdgcn_ds_fadd:
1264 case Intrinsic::amdgcn_ds_fmin:
1265 case Intrinsic::amdgcn_ds_fmax: {
1266 Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1273 if (!Vol->isZero())
1274 Info.flags |= MachineMemOperand::MOVolatile;
1275
1276 return true;
1277 }
1278 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1279 Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1286 if (!Vol || !Vol->isZero())
1287 Info.flags |= MachineMemOperand::MOVolatile;
1288
1289 return true;
1290 }
1291 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1292 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1293 Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1295 Info.ptrVal = nullptr;
1296 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1297 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1298 return true;
1299 }
1300 case Intrinsic::amdgcn_ds_append:
1301 case Intrinsic::amdgcn_ds_consume: {
1302 Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1309 if (!Vol->isZero())
1310 Info.flags |= MachineMemOperand::MOVolatile;
1311
1312 return true;
1313 }
1314 case Intrinsic::amdgcn_global_atomic_csub: {
1315 Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
1318 Info.align.reset();
1319 Info.flags |= MachineMemOperand::MOLoad |
1320 MachineMemOperand::MOStore |
1321 MachineMemOperand::MOVolatile;
1322 return true;
1323 }
1324 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1325 Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1327
1328 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1329 Info.align.reset();
1330 Info.flags |= MachineMemOperand::MOLoad |
1331 MachineMemOperand::MODereferenceable;
1332 return true;
1333 }
1334 case Intrinsic::amdgcn_global_atomic_fadd:
1335 case Intrinsic::amdgcn_global_atomic_fmin:
1336 case Intrinsic::amdgcn_global_atomic_fmax:
1337 case Intrinsic::amdgcn_global_atomic_fmin_num:
1338 case Intrinsic::amdgcn_global_atomic_fmax_num:
1339 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1340 case Intrinsic::amdgcn_flat_atomic_fadd:
1341 case Intrinsic::amdgcn_flat_atomic_fmin:
1342 case Intrinsic::amdgcn_flat_atomic_fmax:
1343 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1344 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1345 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1346 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1347 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1348 Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
1351 Info.align.reset();
1352 Info.flags |= MachineMemOperand::MOLoad |
1353 MachineMemOperand::MOStore |
1354 MachineMemOperand::MODereferenceable |
1355 MachineMemOperand::MOVolatile;
1356 return true;
1357 }
1358 case Intrinsic::amdgcn_global_load_tr_b64:
1359 case Intrinsic::amdgcn_global_load_tr_b128: {
1360 Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
1363 Info.align.reset();
1364 Info.flags |= MachineMemOperand::MOLoad;
1365 return true;
1366 }
1367 case Intrinsic::amdgcn_ds_gws_init:
1368 case Intrinsic::amdgcn_ds_gws_barrier:
1369 case Intrinsic::amdgcn_ds_gws_sema_v:
1370 case Intrinsic::amdgcn_ds_gws_sema_br:
1371 case Intrinsic::amdgcn_ds_gws_sema_p:
1372 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1373 Info.opc = ISD::INTRINSIC_VOID;
1374
1375 const GCNTargetMachine &TM =
1376 static_cast<const GCNTargetMachine &>(getTargetMachine());
1377
1378 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1379 Info.ptrVal = MFI->getGWSPSV(TM);
1380
1381 // This is an abstract access, but we need to specify a type and size.
1382 Info.memVT = MVT::i32;
1383 Info.size = 4;
1384 Info.align = Align(4);
1385
1386 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1387 Info.flags |= MachineMemOperand::MOLoad;
1388 else
1389 Info.flags |= MachineMemOperand::MOStore;
1390 return true;
1391 }
1392 case Intrinsic::amdgcn_global_load_lds: {
1393 Info.opc = ISD::INTRINSIC_VOID;
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
    Info.ptrVal = CI.getArgOperand(1);
1397 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1398 return true;
1399 }
1400 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1401 Info.opc = ISD::INTRINSIC_W_CHAIN;
1402
1403 const GCNTargetMachine &TM =
1404 static_cast<const GCNTargetMachine &>(getTargetMachine());
1405
1406 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1407 Info.ptrVal = MFI->getGWSPSV(TM);
1408
1409 // This is an abstract access, but we need to specify a type and size.
1410 Info.memVT = MVT::i32;
1411 Info.size = 4;
1412 Info.align = Align(4);
1413
1414 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1415 return true;
1416 }
1417 default:
1418 return false;
1419 }
1420}
1421
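// Add any extra operands the DAG needs to select this intrinsic; currently
// this only appends the source and destination address spaces for
// amdgcn.addrspacecast.nonnull, which are otherwise lost in the DAG types.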
1422void SITargetLowering::CollectTargetIntrinsicOperands(
1423 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
  switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1425 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1426 // The DAG's ValueType loses the addrspaces.
1427 // Add them as 2 extra Constant operands "from" and "to".
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1429 unsigned DstAS = I.getType()->getPointerAddressSpace();
1430 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1431 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1432 break;
1433 }
1434 default:
1435 break;
1436 }
1437}
1438
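// Report the pointer operand and access type of recognized memory intrinsics
// so that addressing-mode optimizations (e.g. offset folding) can reason
// about them.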
1439bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
1440 SmallVectorImpl<Value*> &Ops,
1441 Type *&AccessTy) const {
1442 Value *Ptr = nullptr;
1443 switch (II->getIntrinsicID()) {
1444 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1445 case Intrinsic::amdgcn_ds_append:
1446 case Intrinsic::amdgcn_ds_consume:
1447 case Intrinsic::amdgcn_ds_fadd:
1448 case Intrinsic::amdgcn_ds_fmax:
1449 case Intrinsic::amdgcn_ds_fmin:
1450 case Intrinsic::amdgcn_ds_ordered_add:
1451 case Intrinsic::amdgcn_ds_ordered_swap:
1452 case Intrinsic::amdgcn_flat_atomic_fadd:
1453 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1454 case Intrinsic::amdgcn_flat_atomic_fmax:
1455 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1456 case Intrinsic::amdgcn_flat_atomic_fmin:
1457 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1458 case Intrinsic::amdgcn_global_atomic_csub:
1459 case Intrinsic::amdgcn_global_atomic_fadd:
1460 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1461 case Intrinsic::amdgcn_global_atomic_fmax:
1462 case Intrinsic::amdgcn_global_atomic_fmax_num:
1463 case Intrinsic::amdgcn_global_atomic_fmin:
1464 case Intrinsic::amdgcn_global_atomic_fmin_num:
1465 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1468 Ptr = II->getArgOperand(i: 0);
1469 break;
1470 case Intrinsic::amdgcn_global_load_lds:
1471 Ptr = II->getArgOperand(i: 1);
1472 break;
1473 default:
1474 return false;
1475 }
1476 AccessTy = II->getType();
1477 Ops.push_back(Elt: Ptr);
1478 return true;
1479}
1480
1481bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1482 unsigned AddrSpace,
1483 uint64_t FlatVariant) const {
1484 if (!Subtarget->hasFlatInstOffsets()) {
1485 // Flat instructions do not have offsets, and only have the register
1486 // address.
1487 return AM.BaseOffs == 0 && AM.Scale == 0;
1488 }
1489
1490 return AM.Scale == 0 &&
1491 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1492 Offset: AM.BaseOffs, AddrSpace, FlatVariant));
1493}
1494
1495bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1496 if (Subtarget->hasFlatGlobalInsts())
1497 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
1498 FlatVariant: SIInstrFlags::FlatGlobal);
1499
1500 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1501 // Assume we will use FLAT for all global memory accesses
1502 // on VI.
1503 // FIXME: This assumption is currently wrong. On VI we still use
1504 // MUBUF instructions for the r + i addressing mode. As currently
1505 // implemented, the MUBUF instructions only work on buffers < 4GB.
1506 // It may be possible to support > 4GB buffers with MUBUF instructions,
1507 // by setting the stride value in the resource descriptor which would
1508 // increase the size limit to (stride * 4GB). However, this is risky,
1509 // because it has never been validated.
1510 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS,
1511 FlatVariant: SIInstrFlags::FLAT);
1512 }
1513
1514 return isLegalMUBUFAddressingMode(AM);
1515}
1516
1517bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1518 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1519 // additionally can do r + r + i with addr64. 32-bit has more addressing
1520 // mode options. Depending on the resource constant, it can also do
1521 // (i64 r0) + (i32 r1) * (i14 i).
1522 //
1523 // Private arrays end up using a scratch buffer most of the time, so also
1524 // assume those use MUBUF instructions. Scratch loads / stores are currently
1525 // implemented as mubuf instructions with offen bit set, so slightly
1526 // different than the normal addr64.
1527 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1528 if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs))
1529 return false;
1530
1531 // FIXME: Since we can split the immediate into an soffset and an immediate
1532 // offset, would it make sense to allow any immediate?
1533
1534 switch (AM.Scale) {
1535 case 0: // r + i or just i, depending on HasBaseReg.
1536 return true;
1537 case 1:
1538 return true; // We have r + r or r + i.
1539 case 2:
1540 if (AM.HasBaseReg) {
1541 // Reject 2 * r + r.
1542 return false;
1543 }
1544
1545 // Allow 2 * r as r + r,
1546 // and 2 * r + i as r + r + i.
1547 return true;
1548 default: // Don't allow n * r
1549 return false;
1550 }
1551}
1552
1553bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1554 const AddrMode &AM, Type *Ty,
1555 unsigned AS, Instruction *I) const {
1556 // No global is ever allowed as a base.
1557 if (AM.BaseGV)
1558 return false;
1559
1560 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1561 return isLegalGlobalAddressingMode(AM);
1562
1563 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1564 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1565 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1566 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1567 // If the offset isn't a multiple of 4, it probably isn't going to be
1568 // correctly aligned.
1569 // FIXME: Can we get the real alignment here?
1570 if (AM.BaseOffs % 4 != 0)
1571 return isLegalMUBUFAddressingMode(AM);
1572
1573 if (!Subtarget->hasScalarSubwordLoads()) {
1574 // There are no SMRD extloads, so if we have to do a small type access we
1575 // will use a MUBUF load.
1576 // FIXME?: We also need to do this if unaligned, but we don't know the
1577 // alignment here.
1578 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1579 return isLegalGlobalAddressingMode(AM);
1580 }
1581
1582 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1583 // SMRD instructions have an 8-bit, dword offset on SI.
1584 if (!isUInt<8>(x: AM.BaseOffs / 4))
1585 return false;
1586 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1587 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1588 // in 8-bits, it can use a smaller encoding.
1589 if (!isUInt<32>(x: AM.BaseOffs / 4))
1590 return false;
1591 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1592 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1593 if (!isUInt<20>(x: AM.BaseOffs))
1594 return false;
1595 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1596 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1597 // for S_BUFFER_* instructions).
1598 if (!isInt<21>(x: AM.BaseOffs))
1599 return false;
1600 } else {
1601 // On GFX12, all offsets are signed 24-bit in bytes.
1602 if (!isInt<24>(x: AM.BaseOffs))
1603 return false;
1604 }
1605
1606 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1607 return true;
1608
1609 if (AM.Scale == 1 && AM.HasBaseReg)
1610 return true;
1611
1612 return false;
1613 }
1614
1615 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1616 return Subtarget->enableFlatScratch()
1617 ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1618 FlatVariant: SIInstrFlags::FlatScratch)
1619 : isLegalMUBUFAddressingMode(AM);
1620
1621 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1622 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1623 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1624 // field.
1625 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1626 // an 8-bit dword offset but we don't know the alignment here.
1627 if (!isUInt<16>(x: AM.BaseOffs))
1628 return false;
1629
1630 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1631 return true;
1632
1633 if (AM.Scale == 1 && AM.HasBaseReg)
1634 return true;
1635
1636 return false;
1637 }
1638
1639 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1640 // For an unknown address space, this usually means that this is for some
1641 // reason being used for pure arithmetic, and not based on some addressing
1642 // computation. We don't have instructions that compute pointers with any
1643 // addressing modes, so treat them as having no offset like flat
1644 // instructions.
1645 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS,
1646 FlatVariant: SIInstrFlags::FLAT);
1647 }
1648
1649 // Assume a user alias of global for unknown address spaces.
1650 return isLegalGlobalAddressingMode(AM);
1651}
1652
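// Limit merged stores to the widest access we can actually select for the
// address space: 4 dwords for flat/global, the subtarget's maximum private
// element size for scratch, and 2 dwords for LDS/GDS.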
1653bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1654 const MachineFunction &MF) const {
1655 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1656 return (MemVT.getSizeInBits() <= 4 * 32);
1657 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1658 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1659 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1660 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1661 return (MemVT.getSizeInBits() <= 2 * 32);
1662 }
1663 return true;
1664}
1665
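// Decide whether a misaligned access of \p Size bits in \p AddrSpace is
// allowed, and report a relative "speed rank" through \p IsFast (see the
// comments on the LDS cases below for how that rank is interpreted).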
1666bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1667 unsigned Size, unsigned AddrSpace, Align Alignment,
1668 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1669 if (IsFast)
1670 *IsFast = 0;
1671
1672 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1673 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1674 // Check if alignment requirements for ds_read/write instructions are
1675 // disabled.
1676 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1677 return false;
1678
1679 Align RequiredAlignment(PowerOf2Ceil(A: Size/8)); // Natural alignment.
1680 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1681 Alignment < RequiredAlignment)
1682 return false;
1683
1684 // Either the alignment requirements are "enabled", or there is an
1685 // unaligned-LDS-access hardware bug even though the alignment requirements
1686 // are "disabled". In either case, we still need to check for proper
1687 // alignment.
1688 //
1689 switch (Size) {
1690 case 64:
1691 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1692 // address is negative, then the instruction is incorrectly treated as
1693 // out-of-bounds even if base + offset is in bounds. Split vectorized
1694 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1695 // load later in the SILoadStoreOptimizer.
1696 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1697 return false;
1698
1699 // An 8-byte access via ds_read/write_b64 requires 8-byte alignment, but we
1700 // can do a 4-byte aligned, 8-byte access in a single operation using
1701 // ds_read2/write2_b32 with adjacent offsets.
1702 RequiredAlignment = Align(4);
1703
1704 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1705 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1706 // ds_write2_b32 depending on the alignment. In either case with either
1707 // alignment there is no faster way of doing this.
1708
1709 // The numbers returned here and below are not additive, they form a 'speed
1710 // rank'. They are only meant to be compared to decide if a certain way
1711 // of lowering an operation is faster than another. For that purpose a
1712 // naturally aligned operation gets its bitsize to indicate that "it
1713 // operates with a speed comparable to an N-bit wide load". With full
1714 // alignment ds128 is slower than ds96, for example. If underaligned it
1715 // is comparable to the speed of a single dword access, which would then
1716 // mean 32 < 128 and it is faster to issue a wide load regardless.
1717 // 1 simply means "slow, don't do it"; i.e. when comparing an aligned load
1718 // to a wider load that will no longer be aligned, the latter is slower.
1719 if (IsFast)
1720 *IsFast = (Alignment >= RequiredAlignment) ? 64
1721 : (Alignment < Align(4)) ? 32
1722 : 1;
1723 return true;
1724 }
1725
1726 break;
1727 case 96:
1728 if (!Subtarget->hasDS96AndDS128())
1729 return false;
1730
1731 // A 12-byte access via ds_read/write_b96 requires 16-byte alignment on
1732 // gfx8 and older.
1733
1734 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1735 // Naturally aligned access is fastest. However, also report it as fast
1736 // if the memory is aligned to less than a dword. A narrow load or store
1737 // will be equally slow as a single ds_read_b96/ds_write_b96, but there
1738 // will be more of them, so overall we will pay less penalty issuing a
1739 // single instruction.
1740
1741 // See comment on the values above.
1742 if (IsFast)
1743 *IsFast = (Alignment >= RequiredAlignment) ? 96
1744 : (Alignment < Align(4)) ? 32
1745 : 1;
1746 return true;
1747 }
1748
1749 break;
1750 case 128:
1751 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1752 return false;
1753
1754 // A 16-byte access via ds_read/write_b128 requires 16-byte alignment on
1755 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1756 // single operation using ds_read2/write2_b64.
1757 RequiredAlignment = Align(8);
1758
1759 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1760 // Naturally aligned access is fastest. However, also report it as fast
1761 // if the memory is aligned to less than a dword. A narrow load or store
1762 // will be equally slow as a single ds_read_b128/ds_write_b128, but there
1763 // will be more of them, so overall we will pay less penalty issuing a
1764 // single instruction.
1765
1766 // See comment on the values above.
1767 if (IsFast)
1768 *IsFast = (Alignment >= RequiredAlignment) ? 128
1769 : (Alignment < Align(4)) ? 32
1770 : 1;
1771 return true;
1772 }
1773
1774 break;
1775 default:
1776 if (Size > 32)
1777 return false;
1778
1779 break;
1780 }
1781
1782 // See comment on the values above.
1783 // Note that we have a single-dword or sub-dword access here, so if
1784 // underaligned it is the slowest possible access, hence the returned value is 0.
1785 if (IsFast)
1786 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1787
1788 return Alignment >= RequiredAlignment ||
1789 Subtarget->hasUnalignedDSAccessEnabled();
1790 }
1791
1792 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1793 bool AlignedBy4 = Alignment >= Align(4);
1794 if (IsFast)
1795 *IsFast = AlignedBy4;
1796
1797 return AlignedBy4 ||
1798 Subtarget->enableFlatScratch() ||
1799 Subtarget->hasUnalignedScratchAccess();
1800 }
1801
1802 // FIXME: We have to be conservative here and assume that flat operations
1803 // will access scratch. If we had access to the IR function, then we
1804 // could determine if any private memory was used in the function.
1805 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1806 !Subtarget->hasUnalignedScratchAccess()) {
1807 bool AlignedBy4 = Alignment >= Align(4);
1808 if (IsFast)
1809 *IsFast = AlignedBy4;
1810
1811 return AlignedBy4;
1812 }
1813
1814 // So long as they are correct, wide global memory operations perform better
1815 // than multiple smaller memory ops -- even when misaligned.
1816 if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
1817 if (IsFast)
1818 *IsFast = Size;
1819
1820 return Alignment >= Align(4) ||
1821 Subtarget->hasUnalignedBufferAccessEnabled();
1822 }
1823
1824 // Values smaller than a dword must be aligned.
1825 if (Size < 32)
1826 return false;
1827
1828 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1829 // byte-address are ignored, thus forcing Dword alignment.
1830 // This applies to private, global, and constant memory.
1831 if (IsFast)
1832 *IsFast = 1;
1833
1834 return Size >= 32 && Alignment >= Align(4);
1835}
1836
1837bool SITargetLowering::allowsMisalignedMemoryAccesses(
1838 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1839 unsigned *IsFast) const {
1840 return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace,
1841 Alignment, Flags, IsFast);
1842}
1843
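// Prefer wide vector types when expanding memcpy/memset: 4 dwords for copies
// of 16 bytes or more with a dword-aligned destination, otherwise 2 dwords
// for copies of 8 bytes or more.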
1844EVT SITargetLowering::getOptimalMemOpType(
1845 const MemOp &Op, const AttributeList &FuncAttributes) const {
1846 // FIXME: Should account for address space here.
1847
1848 // The default fallback uses the private pointer size as a guess for a type to
1849 // use. Make sure we switch these to 64-bit accesses.
1850
1851 if (Op.size() >= 16 &&
1852 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1853 return MVT::v4i32;
1854
1855 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1856 return MVT::v2i32;
1857
1858 // Use the default.
1859 return MVT::Other;
1860}
1861
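// True if the memory operand was tagged with the target-specific MONoClobber
// flag, i.e. no potentially clobbering store is known to reach it.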
1862bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1863 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
1864 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1865}
1866
1867bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
1868 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1869 AS == AMDGPUAS::PRIVATE_ADDRESS;
1870}
1871
1872bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1873 unsigned DestAS) const {
1874 // Flat -> private/local is a simple truncate.
1875 // Flat -> global is no-op
1876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1877 return true;
1878
1879 const GCNTargetMachine &TM =
1880 static_cast<const GCNTargetMachine &>(getTargetMachine());
1881 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1882}
1883
1884bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1885 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
1886
1887 return AMDGPUInstrInfo::isUniformMMO(MMO: MemNode->getMemOperand());
1888}
1889
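// Split power-of-2 vectors with small (<= 16-bit) elements, widen the other
// small-element vectors, and otherwise use the default legalization
// preference.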
1890TargetLoweringBase::LegalizeTypeAction
1891SITargetLowering::getPreferredVectorAction(MVT VT) const {
1892 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1893 VT.getScalarType().bitsLE(MVT::i16))
1894 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
1895 return TargetLoweringBase::getPreferredVectorAction(VT);
1896}
1897
1898bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1899 Type *Ty) const {
1900 // FIXME: Could be smarter if called for vector constants.
1901 return true;
1902}
1903
1904bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1905 unsigned Index) const {
1906 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
1907 return false;
1908
1909 // TODO: Add more cases that are cheap.
1910 return Index == 0;
1911}
1912
1913bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1914 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1915 switch (Op) {
1916 case ISD::LOAD:
1917 case ISD::STORE:
1918
1919 // These operations are done with 32-bit instructions anyway.
1920 case ISD::AND:
1921 case ISD::OR:
1922 case ISD::XOR:
1923 case ISD::SELECT:
1924 // TODO: Extensions?
1925 return true;
1926 default:
1927 return false;
1928 }
1929 }
1930
1931 // SimplifySetCC uses this function to determine whether or not it should
1932 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1933 if (VT == MVT::i1 && Op == ISD::SETCC)
1934 return false;
1935
1936 return TargetLowering::isTypeDesirableForOp(Op, VT);
1937}
1938
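// Build a pointer to the byte at \p Offset within the kernarg segment, using
// the preloaded KERNARG_SEGMENT_PTR register when it is available and just
// the raw offset otherwise.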
1939SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1940 const SDLoc &SL,
1941 SDValue Chain,
1942 uint64_t Offset) const {
1943 const DataLayout &DL = DAG.getDataLayout();
1944 MachineFunction &MF = DAG.getMachineFunction();
1945 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1946
1947 const ArgDescriptor *InputPtrReg;
1948 const TargetRegisterClass *RC;
1949 LLT ArgTy;
1950 MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);
1951
1952 std::tie(args&: InputPtrReg, args&: RC, args&: ArgTy) =
1953 Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1954
1955 // We may not have the kernarg segment argument if we have no kernel
1956 // arguments.
1957 if (!InputPtrReg)
1958 return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);
1959
1960 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1961 SDValue BasePtr = DAG.getCopyFromReg(Chain, dl: SL,
1962 Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);
1963
1964 return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
1965}
1966
1967SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1968 const SDLoc &SL) const {
1969 uint64_t Offset = getImplicitParameterOffset(MF: DAG.getMachineFunction(),
1970 Param: FIRST_IMPLICIT);
1971 return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset);
1972}
1973
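// Return the LDS kernel id as a constant when it is known from the
// function's metadata; otherwise return a null SDValue.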
1974SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1975 const SDLoc &SL) const {
1976
1977 Function &F = DAG.getMachineFunction().getFunction();
1978 std::optional<uint32_t> KnownSize =
1979 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
1980 if (KnownSize.has_value())
1981 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1982 return SDValue();
1983}
1984
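// Convert a value loaded with the in-memory argument type \p MemVT into the
// expected argument type \p VT: narrow widened vectors, honor sext/zext
// assertions, then extend, truncate or FP-round as needed.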
1985SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1986 const SDLoc &SL, SDValue Val,
1987 bool Signed,
1988 const ISD::InputArg *Arg) const {
1989 // First, if it is a widened vector, narrow it.
1990 if (VT.isVector() &&
1991 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1992 EVT NarrowedVT =
1993 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
1994 NumElements: VT.getVectorNumElements());
1995 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1996 DAG.getConstant(0, SL, MVT::i32));
1997 }
1998
1999 // Then convert the vector elements or scalar value.
2000 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2001 VT.bitsLT(VT: MemVT)) {
2002 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2003 Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
2004 }
2005
2006 if (MemVT.isFloatingPoint())
2007 Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
2008 else if (Signed)
2009 Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
2010 else
2011 Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);
2012
2013 return Val;
2014}
2015
2016SDValue SITargetLowering::lowerKernargMemParameter(
2017 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2018 uint64_t Offset, Align Alignment, bool Signed,
2019 const ISD::InputArg *Arg) const {
2020 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2021
2022 // Try to avoid using an extload by loading earlier than the argument address,
2023 // and extracting the relevant bits. The load should hopefully be merged with
2024 // the previous argument.
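// For example, an i16 argument at byte offset 2 is handled by loading the
// i32 dword at offset 0 and shifting the result right by 16 bits.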
2025 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2026 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2027 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
2028 int64_t OffsetDiff = Offset - AlignDownOffset;
2029
2030 EVT IntVT = MemVT.changeTypeToInteger();
2031
2032 // TODO: If we passed in the base kernel offset we could have a better
2033 // alignment than 4, but we don't really need it.
2034 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset);
2035 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2036 MachineMemOperand::MODereferenceable |
2037 MachineMemOperand::MOInvariant);
2038
2039 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2040 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2041
2042 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract);
2043 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal);
2044 ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg);
2045
2047 return DAG.getMergeValues(Ops: { ArgVal, Load.getValue(R: 1) }, dl: SL);
2048 }
2049
2050 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2051 SDValue Load = DAG.getLoad(VT: MemVT, dl: SL, Chain, Ptr, PtrInfo, Alignment,
2052 MMOFlags: MachineMemOperand::MODereferenceable |
2053 MachineMemOperand::MOInvariant);
2054
2055 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg);
2056 return DAG.getMergeValues(Ops: { Val, Load.getValue(R: 1) }, dl: SL);
2057}
2058
2059SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2060 const SDLoc &SL, SDValue Chain,
2061 const ISD::InputArg &Arg) const {
2062 MachineFunction &MF = DAG.getMachineFunction();
2063 MachineFrameInfo &MFI = MF.getFrameInfo();
2064
2065 if (Arg.Flags.isByVal()) {
2066 unsigned Size = Arg.Flags.getByValSize();
2067 int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
2068 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2069 }
2070
2071 unsigned ArgOffset = VA.getLocMemOffset();
2072 unsigned ArgSize = VA.getValVT().getStoreSize();
2073
2074 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);
2075
2076 // Create load nodes to retrieve arguments from the stack.
2077 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2078 SDValue ArgValue;
2079
2080 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2081 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2082 MVT MemVT = VA.getValVT();
2083
2084 switch (VA.getLocInfo()) {
2085 default:
2086 break;
2087 case CCValAssign::BCvt:
2088 MemVT = VA.getLocVT();
2089 break;
2090 case CCValAssign::SExt:
2091 ExtType = ISD::SEXTLOAD;
2092 break;
2093 case CCValAssign::ZExt:
2094 ExtType = ISD::ZEXTLOAD;
2095 break;
2096 case CCValAssign::AExt:
2097 ExtType = ISD::EXTLOAD;
2098 break;
2099 }
2100
2101 ArgValue = DAG.getExtLoad(
2102 ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
2103 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI),
2104 MemVT);
2105 return ArgValue;
2106}
2107
2108SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2109 const SIMachineFunctionInfo &MFI,
2110 EVT VT,
2111 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2112 const ArgDescriptor *Reg = nullptr;
2113 const TargetRegisterClass *RC;
2114 LLT Ty;
2115
2116 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2117 const ArgDescriptor WorkGroupIDX =
2118 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2119 // If GridZ is not programmed in an entry function then the hardware will set
2120 // it to all zeros, so there is no need to mask the GridY value in the low
2121 // order bits.
2122 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2123 AMDGPU::TTMP7,
2124 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2125 const ArgDescriptor WorkGroupIDZ =
2126 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2127 if (Subtarget->hasArchitectedSGPRs() &&
2128 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2129 switch (PVID) {
2130 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2131 Reg = &WorkGroupIDX;
2132 RC = &AMDGPU::SReg_32RegClass;
2133 Ty = LLT::scalar(SizeInBits: 32);
2134 break;
2135 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2136 Reg = &WorkGroupIDY;
2137 RC = &AMDGPU::SReg_32RegClass;
2138 Ty = LLT::scalar(SizeInBits: 32);
2139 break;
2140 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2141 Reg = &WorkGroupIDZ;
2142 RC = &AMDGPU::SReg_32RegClass;
2143 Ty = LLT::scalar(SizeInBits: 32);
2144 break;
2145 default:
2146 break;
2147 }
2148 }
2149
2150 if (!Reg)
2151 std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
2152 if (!Reg) {
2153 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2154 // It's possible for a kernarg intrinsic call to appear in a kernel with
2155 // no allocated segment, in which case we do not add the user sgpr
2156 // argument, so just return null.
2157 return DAG.getConstant(Val: 0, DL: SDLoc(), VT);
2158 }
2159
2160 // It's undefined behavior if a function marked with the amdgpu-no-*
2161 // attributes uses the corresponding intrinsic.
2162 return DAG.getUNDEF(VT);
2163 }
2164
2165 return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg);
2166}
2167
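// Filter the incoming PS arguments: interpolation inputs that are neither
// used nor already allocated are skipped entirely, while the rest are
// recorded in \p Info as allocated (and, if used, enabled) PS inputs.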
2168static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2169 CallingConv::ID CallConv,
2170 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2171 FunctionType *FType,
2172 SIMachineFunctionInfo *Info) {
2173 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2174 const ISD::InputArg *Arg = &Ins[I];
2175
2176 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2177 "vector type argument should have been split");
2178
2179 // First check if it's a PS input addr.
2180 if (CallConv == CallingConv::AMDGPU_PS &&
2181 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2182 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);
2183
2184 // Inconveniently only the first part of the split is marked as isSplit,
2185 // so skip to the end. We only want to increment PSInputNum once for the
2186 // entire split argument.
2187 if (Arg->Flags.isSplit()) {
2188 while (!Arg->Flags.isSplitEnd()) {
2189 assert((!Arg->VT.isVector() ||
2190 Arg->VT.getScalarSizeInBits() == 16) &&
2191 "unexpected vector split in ps argument type");
2192 if (!SkipArg)
2193 Splits.push_back(Elt: *Arg);
2194 Arg = &Ins[++I];
2195 }
2196 }
2197
2198 if (SkipArg) {
2199 // We can safely skip PS inputs.
2200 Skipped.set(Arg->getOrigArgIndex());
2201 ++PSInputNum;
2202 continue;
2203 }
2204
2205 Info->markPSInputAllocated(Index: PSInputNum);
2206 if (Arg->Used)
2207 Info->markPSInputEnabled(Index: PSInputNum);
2208
2209 ++PSInputNum;
2210 }
2211
2212 Splits.push_back(Elt: *Arg);
2213 }
2214}
2215
2216// Allocate special inputs passed in VGPRs.
2217void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
2218 MachineFunction &MF,
2219 const SIRegisterInfo &TRI,
2220 SIMachineFunctionInfo &Info) const {
2221 const LLT S32 = LLT::scalar(SizeInBits: 32);
2222 MachineRegisterInfo &MRI = MF.getRegInfo();
2223
2224 if (Info.hasWorkItemIDX()) {
2225 Register Reg = AMDGPU::VGPR0;
2226 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2227
2228 CCInfo.AllocateReg(Reg);
2229 unsigned Mask = (Subtarget->hasPackedTID() &&
2230 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2231 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2232 }
2233
2234 if (Info.hasWorkItemIDY()) {
2235 assert(Info.hasWorkItemIDX());
2236 if (Subtarget->hasPackedTID()) {
2237 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2238 0x3ff << 10));
2239 } else {
2240 unsigned Reg = AMDGPU::VGPR1;
2241 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2242
2243 CCInfo.AllocateReg(Reg);
2244 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2245 }
2246 }
2247
2248 if (Info.hasWorkItemIDZ()) {
2249 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2250 if (Subtarget->hasPackedTID()) {
2251 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2252 0x3ff << 20));
2253 } else {
2254 unsigned Reg = AMDGPU::VGPR2;
2255 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2256
2257 CCInfo.AllocateReg(Reg);
2258 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2259 }
2260 }
2261}
2262
2263 // Try to allocate a VGPR at the end of the argument list, or if no argument
2264 // VGPRs are left, allocate a stack slot.
2265 // If \p Mask is given, it indicates the bitfield position in the register.
2266 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2267static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2268 ArgDescriptor Arg = ArgDescriptor()) {
2269 if (Arg.isSet())
2270 return ArgDescriptor::createArg(Arg, Mask);
2271
2272 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2273 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs);
2274 if (RegIdx == ArgVGPRs.size()) {
2275 // Spill to stack required.
2276 int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
2277
2278 return ArgDescriptor::createStack(Offset, Mask);
2279 }
2280
2281 unsigned Reg = ArgVGPRs[RegIdx];
2282 Reg = CCInfo.AllocateReg(Reg);
2283 assert(Reg != AMDGPU::NoRegister);
2284
2285 MachineFunction &MF = CCInfo.getMachineFunction();
2286 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2287 MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32));
2288 return ArgDescriptor::createRegister(Reg, Mask);
2289}
2290
2291static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2292 const TargetRegisterClass *RC,
2293 unsigned NumArgRegs) {
2294 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2295 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs);
2296 if (RegIdx == ArgSGPRs.size())
2297 report_fatal_error(reason: "ran out of SGPRs for arguments");
2298
2299 unsigned Reg = ArgSGPRs[RegIdx];
2300 Reg = CCInfo.AllocateReg(Reg);
2301 assert(Reg != AMDGPU::NoRegister);
2302
2303 MachineFunction &MF = CCInfo.getMachineFunction();
2304 MF.addLiveIn(PReg: Reg, RC);
2305 return ArgDescriptor::createRegister(Reg);
2306}
2307
2308// If this has a fixed position, we still should allocate the register in the
2309// CCInfo state. Technically we could get away with this for values passed
2310// outside of the normal argument range.
2311static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2312 const TargetRegisterClass *RC,
2313 MCRegister Reg) {
2314 Reg = CCInfo.AllocateReg(Reg);
2315 assert(Reg != AMDGPU::NoRegister);
2316 MachineFunction &MF = CCInfo.getMachineFunction();
2317 MF.addLiveIn(PReg: Reg, RC);
2318}
2319
2320static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2321 if (Arg) {
2322 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2323 Arg.getRegister());
2324 } else
2325 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2326}
2327
2328static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2329 if (Arg) {
2330 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2331 Arg.getRegister());
2332 } else
2333 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2334}
2335
2336/// Allocate implicit function VGPR arguments at the end of allocated user
2337/// arguments.
2338void SITargetLowering::allocateSpecialInputVGPRs(
2339 CCState &CCInfo, MachineFunction &MF,
2340 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2341 const unsigned Mask = 0x3ff;
2342 ArgDescriptor Arg;
2343
2344 if (Info.hasWorkItemIDX()) {
2345 Arg = allocateVGPR32Input(CCInfo, Mask);
2346 Info.setWorkItemIDX(Arg);
2347 }
2348
2349 if (Info.hasWorkItemIDY()) {
2350 Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg);
2351 Info.setWorkItemIDY(Arg);
2352 }
2353
2354 if (Info.hasWorkItemIDZ())
2355 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg));
2356}
2357
2358/// Allocate implicit function VGPR arguments in fixed registers.
2359void SITargetLowering::allocateSpecialInputVGPRsFixed(
2360 CCState &CCInfo, MachineFunction &MF,
2361 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2362 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2363 if (!Reg)
2364 report_fatal_error(reason: "failed to allocate VGPR for implicit arguments");
2365
2366 const unsigned Mask = 0x3ff;
2367 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2368 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10));
2369 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20));
2370}
2371
2372void SITargetLowering::allocateSpecialInputSGPRs(
2373 CCState &CCInfo,
2374 MachineFunction &MF,
2375 const SIRegisterInfo &TRI,
2376 SIMachineFunctionInfo &Info) const {
2377 auto &ArgInfo = Info.getArgInfo();
2378 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2379
2380 // TODO: Unify handling with private memory pointers.
2381 if (UserSGPRInfo.hasDispatchPtr())
2382 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);
2383
2384 const Module *M = MF.getFunction().getParent();
2385 if (UserSGPRInfo.hasQueuePtr() &&
2386 AMDGPU::getAMDHSACodeObjectVersion(M: *M) < AMDGPU::AMDHSA_COV5)
2387 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);
2388
2389 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2390 // constant offset from the kernarg segment.
2391 if (Info.hasImplicitArgPtr())
2392 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);
2393
2394 if (UserSGPRInfo.hasDispatchID())
2395 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);
2396
2397 // flat_scratch_init is not applicable for non-kernel functions.
2398
2399 if (Info.hasWorkGroupIDX())
2400 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);
2401
2402 if (Info.hasWorkGroupIDY())
2403 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);
2404
2405 if (Info.hasWorkGroupIDZ())
2406 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);
2407
2408 if (Info.hasLDSKernelId())
2409 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
2410}
2411
2412// Allocate special inputs passed in user SGPRs.
2413void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2414 MachineFunction &MF,
2415 const SIRegisterInfo &TRI,
2416 SIMachineFunctionInfo &Info) const {
2417 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2418 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2419 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2420 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2421 CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
2422 }
2423
2424 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2425 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2426 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2427 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2428 CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
2429 }
2430
2431 if (UserSGPRInfo.hasDispatchPtr()) {
2432 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2433 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2434 CCInfo.AllocateReg(Reg: DispatchPtrReg);
2435 }
2436
2437 const Module *M = MF.getFunction().getParent();
2438 if (UserSGPRInfo.hasQueuePtr() &&
2439 AMDGPU::getAMDHSACodeObjectVersion(M: *M) < AMDGPU::AMDHSA_COV5) {
2440 Register QueuePtrReg = Info.addQueuePtr(TRI);
2441 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2442 CCInfo.AllocateReg(Reg: QueuePtrReg);
2443 }
2444
2445 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2446 MachineRegisterInfo &MRI = MF.getRegInfo();
2447 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2448 CCInfo.AllocateReg(Reg: InputPtrReg);
2449
2450 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2451 MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2452 }
2453
2454 if (UserSGPRInfo.hasDispatchID()) {
2455 Register DispatchIDReg = Info.addDispatchID(TRI);
2456 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2457 CCInfo.AllocateReg(Reg: DispatchIDReg);
2458 }
2459
2460 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2461 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2462 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2463 CCInfo.AllocateReg(Reg: FlatScratchInitReg);
2464 }
2465
2466 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2467 // these from the dispatch pointer.
2468}
2469
2470 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2471 // sequential, starting from the first argument.
2472void SITargetLowering::allocatePreloadKernArgSGPRs(
2473 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2474 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2475 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2476 Function &F = MF.getFunction();
2477 unsigned LastExplicitArgOffset =
2478 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2479 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2480 bool InPreloadSequence = true;
2481 unsigned InIdx = 0;
2482 for (auto &Arg : F.args()) {
2483 if (!InPreloadSequence || !Arg.hasInRegAttr())
2484 break;
2485
2486 int ArgIdx = Arg.getArgNo();
2487 // Don't preload non-original args or parts not in the current preload
2488 // sequence.
2489 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2490 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2491 break;
2492
2493 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2494 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2495 InIdx++) {
2496 assert(ArgLocs[ArgIdx].isMemLoc());
2497 auto &ArgLoc = ArgLocs[InIdx];
2498 const Align KernelArgBaseAlign = Align(16);
2499 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2500 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
2501 unsigned NumAllocSGPRs =
2502 alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32;
2503
2504 // Arg is preloaded into the previous SGPR.
2505 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2506 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2507 Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2508 continue;
2509 }
2510
2511 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2512 unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4;
2513 // Check for free user SGPRs for preloading.
2514 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2515 SGPRInfo.getNumFreeUserSGPRs()) {
2516 InPreloadSequence = false;
2517 break;
2518 }
2519
2520 // Preload this argument.
2521 const TargetRegisterClass *RC =
2522 TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32);
2523 SmallVectorImpl<MCRegister> *PreloadRegs =
2524 Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);
2525
2526 if (PreloadRegs->size() > 1)
2527 RC = &AMDGPU::SGPR_32RegClass;
2528 for (auto &Reg : *PreloadRegs) {
2529 assert(Reg);
2530 MF.addLiveIn(Reg, RC);
2531 CCInfo.AllocateReg(Reg);
2532 }
2533
2534 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2535 }
2536 }
2537}
2538
2539void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2540 const SIRegisterInfo &TRI,
2541 SIMachineFunctionInfo &Info) const {
2542 // Always allocate this last since it is a synthetic preload.
2543 if (Info.hasLDSKernelId()) {
2544 Register Reg = Info.addLDSKernelId();
2545 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2546 CCInfo.AllocateReg(Reg);
2547 }
2548}
2549
2550// Allocate special input registers that are initialized per-wave.
2551void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
2552 MachineFunction &MF,
2553 SIMachineFunctionInfo &Info,
2554 CallingConv::ID CallConv,
2555 bool IsShader) const {
2556 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2557 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2558 // Note: user SGPRs are handled by the front-end for graphics shaders.
2559 // Pad up the used user SGPRs with dead inputs.
2560
2561 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2562 // before enabling architected SGPRs for workgroup IDs.
2563 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2564
2565 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2566 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2567 // rely on it to reach 16 since if we end up having no stack usage, it will
2568 // not really be added.
2569 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2570 Info.hasWorkGroupIDY() +
2571 Info.hasWorkGroupIDZ() +
2572 Info.hasWorkGroupInfo();
2573 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2574 Register Reg = Info.addReservedUserSGPR();
2575 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2576 CCInfo.AllocateReg(Reg);
2577 }
2578 }
2579
2580 if (!HasArchitectedSGPRs) {
2581 if (Info.hasWorkGroupIDX()) {
2582 Register Reg = Info.addWorkGroupIDX();
2583 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2584 CCInfo.AllocateReg(Reg);
2585 }
2586
2587 if (Info.hasWorkGroupIDY()) {
2588 Register Reg = Info.addWorkGroupIDY();
2589 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2590 CCInfo.AllocateReg(Reg);
2591 }
2592
2593 if (Info.hasWorkGroupIDZ()) {
2594 Register Reg = Info.addWorkGroupIDZ();
2595 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2596 CCInfo.AllocateReg(Reg);
2597 }
2598 }
2599
2600 if (Info.hasWorkGroupInfo()) {
2601 Register Reg = Info.addWorkGroupInfo();
2602 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2603 CCInfo.AllocateReg(Reg);
2604 }
2605
2606 if (Info.hasPrivateSegmentWaveByteOffset()) {
2607 // Scratch wave offset passed in system SGPR.
2608 unsigned PrivateSegmentWaveByteOffsetReg;
2609
2610 if (IsShader) {
2611 PrivateSegmentWaveByteOffsetReg =
2612 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2613
2614 // This is true if the scratch wave byte offset doesn't have a fixed
2615 // location.
2616 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2617 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2618 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2619 }
2620 } else
2621 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2622
2623 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2624 CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
2625 }
2626
2627 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2628 Info.getNumPreloadedSGPRs() >= 16);
2629}
2630
2631static void reservePrivateMemoryRegs(const TargetMachine &TM,
2632 MachineFunction &MF,
2633 const SIRegisterInfo &TRI,
2634 SIMachineFunctionInfo &Info) {
2635 // Now that we've figured out where the scratch register inputs are, see if
2636 // we should reserve the arguments and use them directly.
2637 MachineFrameInfo &MFI = MF.getFrameInfo();
2638 bool HasStackObjects = MFI.hasStackObjects();
2639 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2640
2641 // Record that we know we have non-spill stack objects so we don't need to
2642 // check all stack objects later.
2643 if (HasStackObjects)
2644 Info.setHasNonSpillStackObjects(true);
2645
2646 // Everything live out of a block is spilled with fast regalloc, so it's
2647 // almost certain that spilling will be required.
2648 if (TM.getOptLevel() == CodeGenOptLevel::None)
2649 HasStackObjects = true;
2650
2651 // For now assume stack access is needed in any callee functions, so we need
2652 // the scratch registers to pass in.
2653 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2654
2655 if (!ST.enableFlatScratch()) {
2656 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2657 // If we have stack objects, we unquestionably need the private buffer
2658 // resource. For the Code Object V2 ABI, this will be the first 4 user
2659 // SGPR inputs. We can reserve those and use them directly.
2660
2661 Register PrivateSegmentBufferReg =
2662 Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2663 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2664 } else {
2665 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2666 // We tentatively reserve the last registers (skipping those which may
2667 // contain VCC, FLAT_SCR, and XNACK). After register allocation, we'll
2668 // replace these with the ones immediately after those which were really
2669 // allocated. In the prologue, copies will be inserted from the argument
2670 // to these reserved registers.
2671
2672 // Without HSA, relocations are used for the scratch pointer and the
2673 // buffer resource setup is always inserted in the prologue. Scratch wave
2674 // offset is still in an input SGPR.
2675 Info.setScratchRSrcReg(ReservedBufferReg);
2676 }
2677 }
2678
2679 MachineRegisterInfo &MRI = MF.getRegInfo();
2680
2681 // For entry functions we have to set up the stack pointer if we use it,
2682 // whereas non-entry functions get this "for free". This means there is no
2683 // intrinsic advantage to using S32 over S34 in cases where we do not have
2684 // calls but do need a frame pointer (i.e. if we are requested to have one
2685 // because frame pointer elimination is disabled). To keep things simple we
2686 // only ever use S32 as the call ABI stack pointer, and so using it does not
2687 // imply we need a separate frame pointer.
2688 //
2689 // Try to use s32 as the SP, but move it if it would interfere with input
2690 // arguments. This won't work with calls though.
2691 //
2692 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2693 // registers.
2694 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2695 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2696 } else {
2697 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2698
2699 if (MFI.hasCalls())
2700 report_fatal_error(reason: "call in graphics shader with too many input SGPRs");
2701
2702 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2703 if (!MRI.isLiveIn(Reg)) {
2704 Info.setStackPtrOffsetReg(Reg);
2705 break;
2706 }
2707 }
2708
2709 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2710 report_fatal_error(reason: "failed to find register for SP");
2711 }
2712
2713 // hasFP should be accurate for entry functions even before the frame is
2714 // finalized, because it does not rely on the known stack size, only
2715 // properties like whether variable sized objects are present.
2716 if (ST.getFrameLowering()->hasFP(MF)) {
2717 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2718 }
2719}
2720
2721bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2722 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2723 return !Info->isEntryFunction();
2724}
2725
2726void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2727
2728}
2729
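// For split CSR, copy each callee-saved register handled via copy into a
// fresh virtual register on entry, and copy it back immediately before the
// terminator of every exit block.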
2730void SITargetLowering::insertCopiesSplitCSR(
2731 MachineBasicBlock *Entry,
2732 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2733 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2734
2735 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
2736 if (!IStart)
2737 return;
2738
2739 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2740 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2741 MachineBasicBlock::iterator MBBI = Entry->begin();
2742 for (const MCPhysReg *I = IStart; *I; ++I) {
2743 const TargetRegisterClass *RC = nullptr;
2744 if (AMDGPU::SReg_64RegClass.contains(*I))
2745 RC = &AMDGPU::SGPR_64RegClass;
2746 else if (AMDGPU::SReg_32RegClass.contains(*I))
2747 RC = &AMDGPU::SGPR_32RegClass;
2748 else
2749 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2750
2751 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
2752 // Create copy from CSR to a virtual register.
2753 Entry->addLiveIn(PhysReg: *I);
2754 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
2755 .addReg(RegNo: *I);
2756
2757 // Insert the copy-back instructions right before the terminator.
2758 for (auto *Exit : Exits)
2759 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
2760 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
2761 .addReg(RegNo: NewVR);
2762 }
2763}
2764
2765SDValue SITargetLowering::LowerFormalArguments(
2766 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2767 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2768 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2769 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2770
2771 MachineFunction &MF = DAG.getMachineFunction();
2772 const Function &Fn = MF.getFunction();
2773 FunctionType *FType = MF.getFunction().getFunctionType();
2774 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2775
2776 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) {
2777 DiagnosticInfoUnsupported NoGraphicsHSA(
2778 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2779 DAG.getContext()->diagnose(DI: NoGraphicsHSA);
2780 return DAG.getEntryNode();
2781 }
2782
2783 SmallVector<ISD::InputArg, 16> Splits;
2784 SmallVector<CCValAssign, 16> ArgLocs;
2785 BitVector Skipped(Ins.size());
2786 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2787 *DAG.getContext());
2788
2789 bool IsGraphics = AMDGPU::isGraphics(CC: CallConv);
2790 bool IsKernel = AMDGPU::isKernel(CC: CallConv);
2791 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv);
2792
2793 if (IsGraphics) {
2794 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2795 assert(!UserSGPRInfo.hasDispatchPtr() &&
2796 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2797 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2798 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2799 (void)UserSGPRInfo;
2800 if (!Subtarget->enableFlatScratch())
2801 assert(!UserSGPRInfo.hasFlatScratchInit());
2802 if ((CallConv != CallingConv::AMDGPU_CS &&
2803 CallConv != CallingConv::AMDGPU_Gfx) ||
2804 !Subtarget->hasArchitectedSGPRs())
2805 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2806 !Info->hasWorkGroupIDZ());
2807 }
2808
2809 if (CallConv == CallingConv::AMDGPU_PS) {
2810 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2811
2812 // At least one interpolation mode must be enabled or else the GPU will
2813 // hang.
2814 //
2815 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2816 // set PSInputAddr, the user wants to enable some bits after the compilation
2817 // based on run-time states. Since we can't know what the final PSInputEna
2818 // will look like, we shouldn't do anything here and the user should take
2819 // responsibility for the correct programming.
2820 //
2821 // Otherwise, the following restrictions apply:
2822 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2823 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2824 // enabled too.
2825 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2826 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) {
2827 CCInfo.AllocateReg(AMDGPU::VGPR0);
2828 CCInfo.AllocateReg(AMDGPU::VGPR1);
2829 Info->markPSInputAllocated(Index: 0);
2830 Info->markPSInputEnabled(Index: 0);
2831 }
2832 if (Subtarget->isAmdPalOS()) {
2833 // For isAmdPalOS, the user does not enable some bits after compilation
2834 // based on run-time states; the register values being generated here are
2835 // the final ones set in hardware. Therefore we need to apply the
2836 // workaround to PSInputAddr and PSInputEnable together. (The case where
2837 // a bit is set in PSInputAddr but not PSInputEnable is where the
2838 // frontend set up an input arg for a particular interpolation mode, but
2839 // nothing uses that input arg. Really we should have an earlier pass
2840 // that removes such an arg.)
2841 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2842 if ((PsInputBits & 0x7F) == 0 ||
2843 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2844 Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
2845 }
2846 } else if (IsKernel) {
2847 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2848 } else {
2849 Splits.append(in_start: Ins.begin(), in_end: Ins.end());
2850 }
2851
2852 if (IsKernel)
2853 analyzeFormalArgumentsCompute(State&: CCInfo, Ins);
2854
2855 if (IsEntryFunc) {
2856 allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2857 allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2858 if (IsKernel && Subtarget->hasKernargPreload())
2859 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info);
2860
2861 allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info);
2862 } else if (!IsGraphics) {
2863 // For the fixed ABI, pass workitem IDs in the last argument register.
2864 allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info);
2865
2866 // FIXME: Sink this into allocateSpecialInputSGPRs
2867 if (!Subtarget->enableFlatScratch())
2868 CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
2869
2870 allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2871 }
2872
2873 if (!IsKernel) {
2874 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg);
2875 CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn);
2876 }
2877
2878 SmallVector<SDValue, 16> Chains;
2879
2880 // FIXME: This is the minimum kernel argument alignment. We should improve
2881 // this to the maximum alignment of the arguments.
2882 //
2883 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2884 // kern arg offset.
2885 const Align KernelArgBaseAlign = Align(16);
2886
2887 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2888 const ISD::InputArg &Arg = Ins[i];
2889 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2890 InVals.push_back(Elt: DAG.getUNDEF(VT: Arg.VT));
2891 continue;
2892 }
2893
2894 CCValAssign &VA = ArgLocs[ArgIdx++];
2895 MVT VT = VA.getLocVT();
2896
2897 if (IsEntryFunc && VA.isMemLoc()) {
2898 VT = Ins[i].VT;
2899 EVT MemVT = VA.getLocVT();
2900
2901 const uint64_t Offset = VA.getLocMemOffset();
2902 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset);
2903
2904 if (Arg.Flags.isByRef()) {
2905 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset);
2906
2907 const GCNTargetMachine &TM =
2908 static_cast<const GCNTargetMachine &>(getTargetMachine());
2909 if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
2910 DestAS: Arg.Flags.getPointerAddrSpace())) {
2911 Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
2912 DestAS: Arg.Flags.getPointerAddrSpace());
2913 }
2914
2915 InVals.push_back(Elt: Ptr);
2916 continue;
2917 }
2918
2919 SDValue NewArg;
2920 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) {
2921 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2922 // In this case the argument is packed into the previous preload SGPR.
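          // For example (illustrative), an i16 kernarg at byte offset 6 shares
          // the preload SGPR holding bytes 4..7: AlignDownOffset is 4,
          // OffsetDiff is 2, and the value is recovered below by shifting the
          // 32-bit copy right by OffsetDiff * 8 = 16 bits and truncating.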
2923 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
2924 int64_t OffsetDiff = Offset - AlignDownOffset;
2925 EVT IntVT = MemVT.changeTypeToInteger();
2926
2927 const SIMachineFunctionInfo *Info =
2928 MF.getInfo<SIMachineFunctionInfo>();
2929 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2930 Register Reg =
2931 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0];
2932
2933 assert(Reg);
2934 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
2935 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2936
2937 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2938 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2939
2940 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract);
2941 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal);
2942 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal,
2943 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
2944
2945 NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL);
2946 } else {
2947 const SIMachineFunctionInfo *Info =
2948 MF.getInfo<SIMachineFunctionInfo>();
2949 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2950 const SmallVectorImpl<MCRegister> &PreloadRegs =
2951 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs;
2952
2953 SDValue Copy;
2954 if (PreloadRegs.size() == 1) {
2955 Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]);
2956 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg);
2957 NewArg = DAG.getCopyFromReg(
2958 Chain, DL, VReg,
2959 EVT::getIntegerVT(Context&: *DAG.getContext(),
2960 BitWidth: TRI->getRegSizeInBits(*RC)));
2961
2962 } else {
2963 // If the kernarg alignment does not match the alignment of the SGPR
2964 // tuple RC that can accommodate this argument, it will be built up
2965           // via copies from the individual SGPRs that the argument was
2966 // preloaded to.
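          // For example, a 64-bit argument preloaded into two SGPRs is
          // reassembled here from two 32-bit copies via the build_vector below.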
2967 SmallVector<SDValue, 4> Elts;
2968 for (auto Reg : PreloadRegs) {
2969 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
2970 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2971 Elts.push_back(Elt: Copy);
2972 }
2973 NewArg =
2974 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2975 PreloadRegs.size()),
2976 DL, Elts);
2977 }
2978
2979 SDValue CMemVT;
2980 if (VT.isScalarInteger() && VT.bitsLT(VT: NewArg.getSimpleValueType()))
2981 CMemVT = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewArg);
2982 else
2983 CMemVT = DAG.getBitcast(VT: MemVT, V: NewArg);
2984 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: CMemVT,
2985 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
2986 NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL);
2987 }
2988 } else {
2989 NewArg =
2990 lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset,
2991 Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
2992 }
2993 Chains.push_back(Elt: NewArg.getValue(R: 1));
2994
2995 auto *ParamTy =
2996 dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex()));
2997 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2998 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2999 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3000       // On SI, local pointers are just offsets into LDS, so they are always
3001       // less than 16 bits. On CI and newer they could potentially be
3002 // real pointers, so we can't guarantee their size.
3003 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3004 DAG.getValueType(MVT::i16));
3005 }
3006
3007 InVals.push_back(Elt: NewArg);
3008 continue;
3009 } else if (!IsEntryFunc && VA.isMemLoc()) {
3010 SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg);
3011 InVals.push_back(Elt: Val);
3012 if (!Arg.Flags.isByVal())
3013 Chains.push_back(Elt: Val.getValue(R: 1));
3014 continue;
3015 }
3016
3017 assert(VA.isRegLoc() && "Parameter must be in a register!");
3018
3019 Register Reg = VA.getLocReg();
3020 const TargetRegisterClass *RC = nullptr;
3021 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3022 RC = &AMDGPU::VGPR_32RegClass;
3023 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3024 RC = &AMDGPU::SGPR_32RegClass;
3025 else
3026 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3027 EVT ValVT = VA.getValVT();
3028
3029 Reg = MF.addLiveIn(PReg: Reg, RC);
3030 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);
3031
3032 if (Arg.Flags.isSRet()) {
3033 // The return object should be reasonably addressable.
3034
3035     // FIXME: This helps when the return is a real sret. If it is an
3036 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3037 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3038 unsigned NumBits
3039 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3040 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val,
3041 N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits)));
3042 }
3043
3044 // If this is an 8 or 16-bit value, it is really passed promoted
3045 // to 32 bits. Insert an assert[sz]ext to capture this, then
3046 // truncate to the right size.
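    // For example, an i8 argument arrives in the low 8 bits of a 32-bit
    // register; AssertZext/AssertSext records that the upper bits are known,
    // and the TRUNCATE then recovers the original i8 value.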
3047 switch (VA.getLocInfo()) {
3048 case CCValAssign::Full:
3049 break;
3050 case CCValAssign::BCvt:
3051 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ValVT, Operand: Val);
3052 break;
3053 case CCValAssign::SExt:
3054 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT, N1: Val,
3055 N2: DAG.getValueType(ValVT));
3056 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3057 break;
3058 case CCValAssign::ZExt:
3059 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val,
3060 N2: DAG.getValueType(ValVT));
3061 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3062 break;
3063 case CCValAssign::AExt:
3064 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3065 break;
3066 default:
3067 llvm_unreachable("Unknown loc info!");
3068 }
3069
3070 InVals.push_back(Elt: Val);
3071 }
3072
3073 // Start adding system SGPRs.
3074 if (IsEntryFunc)
3075 allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics);
3076
3077 auto &ArgUsageInfo =
3078 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3079 ArgUsageInfo.setFuncArgInfo(F: Fn, ArgInfo: Info->getArgInfo());
3080
3081 unsigned StackArgSize = CCInfo.getStackSize();
3082 Info->setBytesInStackArgArea(StackArgSize);
3083
3084 return Chains.empty() ? Chain :
3085 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3086}
3087
3088// TODO: If return values can't fit in registers, we should return as many as
3089// possible in registers before passing on stack.
3090bool SITargetLowering::CanLowerReturn(
3091 CallingConv::ID CallConv,
3092 MachineFunction &MF, bool IsVarArg,
3093 const SmallVectorImpl<ISD::OutputArg> &Outs,
3094 LLVMContext &Context) const {
3095 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3096 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3097 // for shaders. Vector types should be explicitly handled by CC.
3098 if (AMDGPU::isEntryFunctionCC(CC: CallConv))
3099 return true;
3100
3101 SmallVector<CCValAssign, 16> RVLocs;
3102 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3103 if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg)))
3104 return false;
3105
3106 // We must use the stack if return would require unavailable registers.
3107 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3108 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3109 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3110 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3111 return false;
3112
3113 return true;
3114}
3115
3116SDValue
3117SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3118 bool isVarArg,
3119 const SmallVectorImpl<ISD::OutputArg> &Outs,
3120 const SmallVectorImpl<SDValue> &OutVals,
3121 const SDLoc &DL, SelectionDAG &DAG) const {
3122 MachineFunction &MF = DAG.getMachineFunction();
3123 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3124
3125 if (AMDGPU::isKernel(CC: CallConv)) {
3126 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3127 OutVals, DL, DAG);
3128 }
3129
3130 bool IsShader = AMDGPU::isShader(CC: CallConv);
3131
3132 Info->setIfReturnsVoid(Outs.empty());
3133 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3134
3135 // CCValAssign - represent the assignment of the return value to a location.
3136 SmallVector<CCValAssign, 48> RVLocs;
3137 SmallVector<ISD::OutputArg, 48> Splits;
3138
3139 // CCState - Info about the registers and stack slots.
3140 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3141 *DAG.getContext());
3142
3143 // Analyze outgoing return values.
3144 CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));
3145
3146 SDValue Glue;
3147 SmallVector<SDValue, 48> RetOps;
3148 RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
3149
3150 // Copy the result values into the output registers.
3151 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3152 ++I, ++RealRVLocIdx) {
3153 CCValAssign &VA = RVLocs[I];
3154 assert(VA.isRegLoc() && "Can only return in registers!");
3155 // TODO: Partially return in registers if return values don't fit.
3156 SDValue Arg = OutVals[RealRVLocIdx];
3157
3158 // Copied from other backends.
3159 switch (VA.getLocInfo()) {
3160 case CCValAssign::Full:
3161 break;
3162 case CCValAssign::BCvt:
3163 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3164 break;
3165 case CCValAssign::SExt:
3166 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3167 break;
3168 case CCValAssign::ZExt:
3169 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3170 break;
3171 case CCValAssign::AExt:
3172 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3173 break;
3174 default:
3175 llvm_unreachable("Unknown loc info!");
3176 }
3177
3178 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
3179 Glue = Chain.getValue(R: 1);
3180 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3181 }
3182
3183 // FIXME: Does sret work properly?
3184 if (!Info->isEntryFunction()) {
3185 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3186 const MCPhysReg *I =
3187 TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
3188 if (I) {
3189 for (; *I; ++I) {
3190 if (AMDGPU::SReg_64RegClass.contains(*I))
3191 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3192 else if (AMDGPU::SReg_32RegClass.contains(*I))
3193 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3194 else
3195 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3196 }
3197 }
3198 }
3199
3200 // Update chain and glue.
3201 RetOps[0] = Chain;
3202 if (Glue.getNode())
3203 RetOps.push_back(Elt: Glue);
3204
3205 unsigned Opc = AMDGPUISD::ENDPGM;
3206 if (!IsWaveEnd)
3207 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3208 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3209}
3210
3211SDValue SITargetLowering::LowerCallResult(
3212 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3213 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3214 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3215 SDValue ThisVal) const {
3216 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg);
3217
3218 // Assign locations to each value returned by this call.
3219 SmallVector<CCValAssign, 16> RVLocs;
3220 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3221 *DAG.getContext());
3222 CCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
3223
3224 // Copy all of the result registers out of their specified physreg.
3225 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3226 CCValAssign VA = RVLocs[i];
3227 SDValue Val;
3228
3229 if (VA.isRegLoc()) {
3230 Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
3231 Chain = Val.getValue(R: 1);
3232 InGlue = Val.getValue(R: 2);
3233 } else if (VA.isMemLoc()) {
3234 report_fatal_error(reason: "TODO: return values in memory");
3235 } else
3236 llvm_unreachable("unknown argument location type");
3237
3238 switch (VA.getLocInfo()) {
3239 case CCValAssign::Full:
3240 break;
3241 case CCValAssign::BCvt:
3242 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
3243 break;
3244 case CCValAssign::ZExt:
3245 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val,
3246 N2: DAG.getValueType(VA.getValVT()));
3247 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3248 break;
3249 case CCValAssign::SExt:
3250 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val,
3251 N2: DAG.getValueType(VA.getValVT()));
3252 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3253 break;
3254 case CCValAssign::AExt:
3255 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3256 break;
3257 default:
3258 llvm_unreachable("Unknown loc info!");
3259 }
3260
3261 InVals.push_back(Elt: Val);
3262 }
3263
3264 return Chain;
3265}
3266
3267// Add code to pass the special inputs required by the features in use, separate
3268// from the explicit user arguments present in the IR.
3269void SITargetLowering::passSpecialInputs(
3270 CallLoweringInfo &CLI,
3271 CCState &CCInfo,
3272 const SIMachineFunctionInfo &Info,
3273 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3274 SmallVectorImpl<SDValue> &MemOpChains,
3275 SDValue Chain) const {
3276 // If we don't have a call site, this was a call inserted by
3277 // legalization. These can never use special inputs.
3278 if (!CLI.CB)
3279 return;
3280
3281 SelectionDAG &DAG = CLI.DAG;
3282 const SDLoc &DL = CLI.DL;
3283 const Function &F = DAG.getMachineFunction().getFunction();
3284
3285 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3286 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3287
3288 const AMDGPUFunctionArgInfo *CalleeArgInfo
3289 = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3290 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3291 auto &ArgUsageInfo =
3292 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3293 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(F: *CalleeFunc);
3294 }
3295
3296 // TODO: Unify with private memory register handling. This is complicated by
3297 // the fact that at least in kernels, the input argument is not necessarily
3298 // in the same location as the input.
3299 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3300 StringLiteral> ImplicitAttrs[] = {
3301    {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3302    {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
3303    {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3304    {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3305    {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3306    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3307    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3308    {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3309 };
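  // For example, if the call site is marked "amdgpu-no-dispatch-ptr", the
  // dispatch pointer is simply not forwarded; otherwise it is forwarded from
  // the caller's incoming value (or passed as undef if the caller has no such
  // input).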
3310
3311 for (auto Attr : ImplicitAttrs) {
3312 const ArgDescriptor *OutgoingArg;
3313 const TargetRegisterClass *ArgRC;
3314 LLT ArgTy;
3315
3316 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3317
3318 // If the callee does not use the attribute value, skip copying the value.
3319 if (CLI.CB->hasFnAttr(Kind: Attr.second))
3320 continue;
3321
3322 std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
3323 CalleeArgInfo->getPreloadedValue(Value: InputID);
3324 if (!OutgoingArg)
3325 continue;
3326
3327 const ArgDescriptor *IncomingArg;
3328 const TargetRegisterClass *IncomingArgRC;
3329 LLT Ty;
3330 std::tie(args&: IncomingArg, args&: IncomingArgRC, args&: Ty) =
3331 CallerArgInfo.getPreloadedValue(Value: InputID);
3332 assert(IncomingArgRC == ArgRC);
3333
3334 // All special arguments are ints for now.
3335 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3336 SDValue InputReg;
3337
3338 if (IncomingArg) {
3339 InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
3340 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3341 // The implicit arg ptr is special because it doesn't have a corresponding
3342 // input for kernels, and is computed from the kernarg segment pointer.
3343 InputReg = getImplicitArgPtr(DAG, SL: DL);
3344 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3345 std::optional<uint32_t> Id =
3346 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3347 if (Id.has_value()) {
3348 InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
3349 } else {
3350 InputReg = DAG.getUNDEF(VT: ArgVT);
3351 }
3352 } else {
3353      // We may have proven the input wasn't needed, although the ABI still
3354      // requires it. We just need to allocate the register appropriately.
3355 InputReg = DAG.getUNDEF(VT: ArgVT);
3356 }
3357
3358 if (OutgoingArg->isRegister()) {
3359 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3360 if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
3361 report_fatal_error(reason: "failed to allocate implicit input argument");
3362 } else {
3363 unsigned SpecialArgOffset =
3364 CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4));
3365 SDValue ArgStore = storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg,
3366 Offset: SpecialArgOffset);
3367 MemOpChains.push_back(Elt: ArgStore);
3368 }
3369 }
3370
3371  // Pack the workitem IDs into a single register, or pass them as-is if they
3372  // are already packed.
3373 const ArgDescriptor *OutgoingArg;
3374 const TargetRegisterClass *ArgRC;
3375 LLT Ty;
3376
3377 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3378 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3379 if (!OutgoingArg)
3380 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3381 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3382 if (!OutgoingArg)
3383 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3384 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3385 if (!OutgoingArg)
3386 return;
3387
3388 const ArgDescriptor *IncomingArgX = std::get<0>(
3389 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3390 const ArgDescriptor *IncomingArgY = std::get<0>(
3391 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3392 const ArgDescriptor *IncomingArgZ = std::get<0>(
3393 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3394
3395 SDValue InputReg;
3396 SDLoc SL;
3397
3398 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
3399 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
3400 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
3401
3402  // If the incoming IDs are not packed, we need to pack them.
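  // The packed layout produced below puts X in bits [9:0], Y in bits [19:10]
  // and Z in bits [29:20], hence the shifts by 10 and 20.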
3403 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3404 NeedWorkItemIDX) {
3405 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3406 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3407 } else {
3408 InputReg = DAG.getConstant(0, DL, MVT::i32);
3409 }
3410 }
3411
3412 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3413 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3414 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3415 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3416 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3417 InputReg = InputReg.getNode() ?
3418 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3419 }
3420
3421 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3422 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3423 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3424 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3425 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3426 InputReg = InputReg.getNode() ?
3427 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3428 }
3429
3430 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3431 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3432 // We're in a situation where the outgoing function requires the workitem
3433      // ID, but the calling function does not have it (e.g. a graphics function
3434 // calling a C calling convention function). This is illegal, but we need
3435 // to produce something.
3436 InputReg = DAG.getUNDEF(MVT::i32);
3437 } else {
3438      // Workitem IDs are already packed; any of the present incoming arguments
3439 // will carry all required fields.
3440 ArgDescriptor IncomingArg = ArgDescriptor::createArg(
3441 Arg: IncomingArgX ? *IncomingArgX :
3442 IncomingArgY ? *IncomingArgY :
3443 *IncomingArgZ, Mask: ~0u);
3444 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3445 }
3446 }
3447
3448 if (OutgoingArg->isRegister()) {
3449 if (InputReg)
3450 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3451
3452 CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
3453 } else {
3454 unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
3455 if (InputReg) {
3456 SDValue ArgStore = storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg,
3457 Offset: SpecialArgOffset);
3458 MemOpChains.push_back(Elt: ArgStore);
3459 }
3460 }
3461}
3462
3463static bool canGuaranteeTCO(CallingConv::ID CC) {
3464 return CC == CallingConv::Fast;
3465}
3466
3467/// Return true if we might ever do TCO for calls with this calling convention.
3468static bool mayTailCallThisCC(CallingConv::ID CC) {
3469 switch (CC) {
3470 case CallingConv::C:
3471 case CallingConv::AMDGPU_Gfx:
3472 return true;
3473 default:
3474 return canGuaranteeTCO(CC);
3475 }
3476}
3477
3478bool SITargetLowering::isEligibleForTailCallOptimization(
3479 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3480 const SmallVectorImpl<ISD::OutputArg> &Outs,
3481 const SmallVectorImpl<SDValue> &OutVals,
3482 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3483 if (AMDGPU::isChainCC(CC: CalleeCC))
3484 return true;
3485
3486 if (!mayTailCallThisCC(CC: CalleeCC))
3487 return false;
3488
3489 // For a divergent call target, we need to do a waterfall loop over the
3490 // possible callees which precludes us from using a simple jump.
3491 if (Callee->isDivergent())
3492 return false;
3493
3494 MachineFunction &MF = DAG.getMachineFunction();
3495 const Function &CallerF = MF.getFunction();
3496 CallingConv::ID CallerCC = CallerF.getCallingConv();
3497 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3498 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3499
3500  // Kernels aren't callable, and don't have a live-in return address, so it
3501 // doesn't make sense to do a tail call with entry functions.
3502 if (!CallerPreserved)
3503 return false;
3504
3505 bool CCMatch = CallerCC == CalleeCC;
3506
3507 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3508 if (canGuaranteeTCO(CC: CalleeCC) && CCMatch)
3509 return true;
3510 return false;
3511 }
3512
3513 // TODO: Can we handle var args?
3514 if (IsVarArg)
3515 return false;
3516
3517 for (const Argument &Arg : CallerF.args()) {
3518 if (Arg.hasByValAttr())
3519 return false;
3520 }
3521
3522 LLVMContext &Ctx = *DAG.getContext();
3523
3524 // Check that the call results are passed in the same way.
3525 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins,
3526 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
3527 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
3528 return false;
3529
3530 // The callee has to preserve all registers the caller needs to preserve.
3531 if (!CCMatch) {
3532 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3533 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3534 return false;
3535 }
3536
3537 // Nothing more to check if the callee is taking no arguments.
3538 if (Outs.empty())
3539 return true;
3540
3541 SmallVector<CCValAssign, 16> ArgLocs;
3542 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3543
3544 CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg));
3545
3546 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3547 // If the stack arguments for this call do not fit into our own save area then
3548 // the call cannot be made tail.
3549 // TODO: Is this really necessary?
3550 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3551 return false;
3552
3553 const MachineRegisterInfo &MRI = MF.getRegInfo();
3554 return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals);
3555}
3556
3557bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3558 if (!CI->isTailCall())
3559 return false;
3560
3561 const Function *ParentFn = CI->getParent()->getParent();
3562 if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv()))
3563 return false;
3564 return true;
3565}
3566
3567// The wave scratch offset register is used as the global base pointer.
3568SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3569 SmallVectorImpl<SDValue> &InVals) const {
3570 CallingConv::ID CallConv = CLI.CallConv;
3571 bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv);
3572
3573 SelectionDAG &DAG = CLI.DAG;
3574
3575 TargetLowering::ArgListEntry RequestedExec;
3576 if (IsChainCallConv) {
3577 // The last argument should be the value that we need to put in EXEC.
3578 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3579 // don't treat it like the rest of the arguments.
3580 RequestedExec = CLI.Args.back();
3581 assert(RequestedExec.Node && "No node for EXEC");
3582
3583 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3584 return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC");
3585
3586 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3587 CLI.Outs.pop_back();
3588 CLI.OutVals.pop_back();
3589
3590 if (RequestedExec.Ty->isIntegerTy(Bitwidth: 64)) {
3591 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3592 CLI.Outs.pop_back();
3593 CLI.OutVals.pop_back();
3594 }
3595
3596 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3597 "Haven't popped all the pieces of the EXEC mask");
3598 }
3599
3600 const SDLoc &DL = CLI.DL;
3601 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3602 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3603 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3604 SDValue Chain = CLI.Chain;
3605 SDValue Callee = CLI.Callee;
3606 bool &IsTailCall = CLI.IsTailCall;
3607 bool IsVarArg = CLI.IsVarArg;
3608 bool IsSibCall = false;
3609 MachineFunction &MF = DAG.getMachineFunction();
3610
3611 if (Callee.isUndef() || isNullConstant(V: Callee)) {
3612 if (!CLI.IsTailCall) {
3613 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3614 InVals.push_back(Elt: DAG.getUNDEF(VT: CLI.Ins[I].VT));
3615 }
3616
3617 return Chain;
3618 }
3619
3620 if (IsVarArg) {
3621 return lowerUnhandledCall(CLI, InVals,
3622 Reason: "unsupported call to variadic function ");
3623 }
3624
3625 if (!CLI.CB)
3626 report_fatal_error(reason: "unsupported libcall legalization");
3627
3628 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3629 return lowerUnhandledCall(CLI, InVals,
3630 Reason: "unsupported required tail call to function ");
3631 }
3632
3633 if (IsTailCall) {
3634 IsTailCall = isEligibleForTailCallOptimization(
3635 Callee, CalleeCC: CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3636 if (!IsTailCall &&
3637 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3638 report_fatal_error(reason: "failed to perform tail call elimination on a call "
3639 "site marked musttail or on llvm.amdgcn.cs.chain");
3640 }
3641
3642 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3643
3644 // A sibling call is one where we're under the usual C ABI and not planning
3645 // to change that but can still do a tail call:
3646 if (!TailCallOpt && IsTailCall)
3647 IsSibCall = true;
3648
3649 if (IsTailCall)
3650 ++NumTailCalls;
3651 }
3652
3653 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3654 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3655 SmallVector<SDValue, 8> MemOpChains;
3656
3657 // Analyze operands of the call, assigning locations to each operand.
3658 SmallVector<CCValAssign, 16> ArgLocs;
3659 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3660 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg);
3661
3662 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv)) {
3663 // With a fixed ABI, allocate fixed registers before user arguments.
3664 passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain);
3665 }
3666
3667 CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn);
3668
3669 // Get a count of how many bytes are to be pushed on the stack.
3670 unsigned NumBytes = CCInfo.getStackSize();
3671
3672 if (IsSibCall) {
3673 // Since we're not changing the ABI to make this a tail call, the memory
3674 // operands are already available in the caller's incoming argument space.
3675 NumBytes = 0;
3676 }
3677
3678 // FPDiff is the byte offset of the call's argument area from the callee's.
3679 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3680 // by this amount for a tail call. In a sibling call it must be 0 because the
3681 // caller will deallocate the entire stack and the callee still expects its
3682 // arguments to begin at SP+0. Completely unused for non-tail calls.
3683 int32_t FPDiff = 0;
3684 MachineFrameInfo &MFI = MF.getFrameInfo();
3685
3686 // Adjust the stack pointer for the new arguments...
3687 // These operations are automatically eliminated by the prolog/epilog pass
3688 if (!IsSibCall)
3689 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);
3690
3691 if (!IsSibCall || IsChainCallConv) {
3692 if (!Subtarget->enableFlatScratch()) {
3693 SmallVector<SDValue, 4> CopyFromChains;
3694
3695 // In the HSA case, this should be an identity copy.
3696 SDValue ScratchRSrcReg
3697 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3698 RegsToPass.emplace_back(IsChainCallConv
3699 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3700 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3701 ScratchRSrcReg);
3702 CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1));
3703 Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains);
3704 }
3705 }
3706
3707 MVT PtrVT = MVT::i32;
3708
3709 // Walk the register/memloc assignments, inserting copies/loads.
3710 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3711 CCValAssign &VA = ArgLocs[i];
3712 SDValue Arg = OutVals[i];
3713
3714 // Promote the value if needed.
3715 switch (VA.getLocInfo()) {
3716 case CCValAssign::Full:
3717 break;
3718 case CCValAssign::BCvt:
3719 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3720 break;
3721 case CCValAssign::ZExt:
3722 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3723 break;
3724 case CCValAssign::SExt:
3725 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3726 break;
3727 case CCValAssign::AExt:
3728 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3729 break;
3730 case CCValAssign::FPExt:
3731 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3732 break;
3733 default:
3734 llvm_unreachable("Unknown loc info!");
3735 }
3736
3737 if (VA.isRegLoc()) {
3738 RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg));
3739 } else {
3740 assert(VA.isMemLoc());
3741
3742 SDValue DstAddr;
3743 MachinePointerInfo DstInfo;
3744
3745 unsigned LocMemOffset = VA.getLocMemOffset();
3746 int32_t Offset = LocMemOffset;
3747
3748 SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT);
3749 MaybeAlign Alignment;
3750
3751 if (IsTailCall) {
3752 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3753 unsigned OpSize = Flags.isByVal() ?
3754 Flags.getByValSize() : VA.getValVT().getStoreSize();
3755
3756 // FIXME: We can have better than the minimum byval required alignment.
3757 Alignment =
3758 Flags.isByVal()
3759 ? Flags.getNonZeroByValAlign()
3760 : commonAlignment(A: Subtarget->getStackAlignment(), Offset);
3761
3762 Offset = Offset + FPDiff;
3763 int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
3764
3765 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
3766 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3767
3768 // Make sure any stack arguments overlapping with where we're storing
3769 // are loaded before this eventual operation. Otherwise they'll be
3770 // clobbered.
3771
3772 // FIXME: Why is this really necessary? This seems to just result in a
3773 // lot of code to copy the stack and write them back to the same
3774 // locations, which are supposed to be immutable?
3775 Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI);
3776 } else {
3777 // Stores to the argument stack area are relative to the stack pointer.
3778 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3779 MVT::i32);
3780 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3781 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
3782 Alignment =
3783 commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset);
3784 }
3785
3786 if (Outs[i].Flags.isByVal()) {
3787 SDValue SizeNode =
3788 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3789 SDValue Cpy =
3790 DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
3791 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
3792 /*isVol = */ false, /*AlwaysInline = */ true,
3793 /*isTailCall = */ false, DstPtrInfo: DstInfo,
3794 SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3795
3796 MemOpChains.push_back(Elt: Cpy);
3797 } else {
3798 SDValue Store =
3799 DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment);
3800 MemOpChains.push_back(Elt: Store);
3801 }
3802 }
3803 }
3804
3805 if (!MemOpChains.empty())
3806 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3807
3808 // Build a sequence of copy-to-reg nodes chained together with token chain
3809 // and flag operands which copy the outgoing args into the appropriate regs.
3810 SDValue InGlue;
3811 for (auto &RegToPass : RegsToPass) {
3812 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first,
3813 N: RegToPass.second, Glue: InGlue);
3814 InGlue = Chain.getValue(R: 1);
3815 }
3816
3817
3818 // We don't usually want to end the call-sequence here because we would tidy
3819  // the frame up *after* the call. However, in the ABI-changing tail-call case
3820 // we've carefully laid out the parameters so that when sp is reset they'll be
3821 // in the correct location.
3822 if (IsTailCall && !IsSibCall) {
3823 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL);
3824 InGlue = Chain.getValue(R: 1);
3825 }
3826
3827 std::vector<SDValue> Ops;
3828 Ops.push_back(x: Chain);
3829 Ops.push_back(x: Callee);
3830 // Add a redundant copy of the callee global which will not be legalized, as
3831 // we need direct access to the callee later.
3832 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
3833 const GlobalValue *GV = GSD->getGlobal();
3834 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3835 } else {
3836 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3837 }
3838
3839 if (IsTailCall) {
3840 // Each tail call may have to adjust the stack by a different amount, so
3841 // this information must travel along with the operation for eventual
3842 // consumption by emitEpilogue.
3843 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3844 }
3845
3846 if (IsChainCallConv)
3847 Ops.push_back(x: RequestedExec.Node);
3848
3849 // Add argument registers to the end of the list so that they are known live
3850 // into the call.
3851 for (auto &RegToPass : RegsToPass) {
3852 Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first,
3853 VT: RegToPass.second.getValueType()));
3854 }
3855
3856 // Add a register mask operand representing the call-preserved registers.
3857 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3858 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3859 assert(Mask && "Missing call preserved mask for calling convention");
3860 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
3861
3862 if (InGlue.getNode())
3863 Ops.push_back(x: InGlue);
3864
3865 // NOTE: This potentially results in *two* glue operands, and the wrong one
3866 // might possibly show up where the other was intended. In particular,
3867 // Emitter::EmitMachineNode() expects only the glued convergence token if it
3868 // exists. Similarly, the selection of the call expects to match only the
3869 // InGlue operand if it exists.
3870 if (SDValue Token = CLI.ConvergenceControlToken) {
3871 Ops.push_back(SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE,
3872 DL, MVT::Glue, Token),
3873 0));
3874 }
3875
3876 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3877
3878  // If we're doing a tail call, use a TC_RETURN here rather than an
3879 // actual call instruction.
3880 if (IsTailCall) {
3881 MFI.setHasTailCall();
3882 unsigned OPC = AMDGPUISD::TC_RETURN;
3883 switch (CallConv) {
3884 case CallingConv::AMDGPU_Gfx:
3885 OPC = AMDGPUISD::TC_RETURN_GFX;
3886 break;
3887 case CallingConv::AMDGPU_CS_Chain:
3888 case CallingConv::AMDGPU_CS_ChainPreserve:
3889 OPC = AMDGPUISD::TC_RETURN_CHAIN;
3890 break;
3891 }
3892
3893 return DAG.getNode(Opcode: OPC, DL, VTList: NodeTys, Ops);
3894 }
3895
3896 // Returns a chain and a flag for retval copy to use.
3897 SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, VTList: NodeTys, Ops);
3898 Chain = Call.getValue(R: 0);
3899 InGlue = Call.getValue(R: 1);
3900
3901 uint64_t CalleePopBytes = NumBytes;
3902 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL);
3903 if (!Ins.empty())
3904 InGlue = Chain.getValue(R: 1);
3905
3906 // Handle result values, copying them out of physregs into vregs that we
3907 // return.
3908 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3909 InVals, /*IsThisReturn=*/false, ThisVal: SDValue());
3910}
3911
3912// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3913// except for applying the wave size scale to the increment amount.
3914SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3915 SDValue Op, SelectionDAG &DAG) const {
3916 const MachineFunction &MF = DAG.getMachineFunction();
3917 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3918
3919 SDLoc dl(Op);
3920 EVT VT = Op.getValueType();
3921 SDValue Tmp1 = Op;
3922 SDValue Tmp2 = Op.getValue(R: 1);
3923 SDValue Tmp3 = Op.getOperand(i: 2);
3924 SDValue Chain = Tmp1.getOperand(i: 0);
3925
3926 Register SPReg = Info->getStackPtrOffsetReg();
3927
3928 // Chain the dynamic stack allocation so that it doesn't modify the stack
3929 // pointer when other instructions are using the stack.
3930 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);
3931
3932 SDValue Size = Tmp2.getOperand(i: 1);
3933 SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
3934 Chain = SP.getValue(R: 1);
3935 MaybeAlign Alignment = cast<ConstantSDNode>(Val&: Tmp3)->getMaybeAlignValue();
3936 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3937 unsigned Opc =
3938 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3939 ISD::ADD : ISD::SUB;
3940
3941 SDValue ScaledSize = DAG.getNode(
3942 ISD::SHL, dl, VT, Size,
3943 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
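  // For example, on a wave64 subtarget getWavefrontSizeLog2() is 6, so a
  // 16-byte per-lane allocation is scaled to 16 << 6 = 1024 bytes of scratch.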
3944
3945 Align StackAlign = TFL->getStackAlign();
3946 Tmp1 = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: SP, N2: ScaledSize); // Value
3947 if (Alignment && *Alignment > StackAlign) {
3948 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3949 DAG.getConstant(-(uint64_t)Alignment->value()
3950 << Subtarget->getWavefrontSizeLog2(),
3951 dl, VT));
3952 }
3953
3954 Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: Tmp1); // Output chain
3955 Tmp2 = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);
3956
3957 return DAG.getMergeValues(Ops: {Tmp1, Tmp2}, dl);
3958}
3959
3960SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3961 SelectionDAG &DAG) const {
3962 // We only handle constant sizes here to allow non-entry block, static sized
3963 // allocas. A truly dynamic value is more difficult to support because we
3964 // don't know if the size value is uniform or not. If the size isn't uniform,
3965 // we would need to do a wave reduction to get the maximum size to know how
3966 // much to increment the uniform stack pointer.
3967 SDValue Size = Op.getOperand(i: 1);
3968 if (isa<ConstantSDNode>(Val: Size))
3969 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3970
3971 return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3972}
3973
3974SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
3975 if (Op.getValueType() != MVT::i32)
3976 return Op; // Defer to cannot select error.
3977
3978 Register SP = getStackPointerRegisterToSaveRestore();
3979 SDLoc SL(Op);
3980
3981 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3982
3983 // Convert from wave uniform to swizzled vector address. This should protect
3984 // from any edge cases where the stacksave result isn't directly used with
3985 // stackrestore.
3986 SDValue VectorAddress =
3987 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
3988 return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL);
3989}
3990
3991SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
3992 SelectionDAG &DAG) const {
3993 SDLoc SL(Op);
3994 assert(Op.getValueType() == MVT::i32);
3995
3996 uint32_t BothRoundHwReg =
3997 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
3998 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
3999
4000 SDValue IntrinID =
4001 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4002 SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(),
4003 N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm);
4004
4005 // There are two rounding modes, one for f32 and one for f64/f16. We only
4006 // report in the standard value range if both are the same.
4007 //
4008 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4009 // ties away from zero is not supported, and the other values are rotated by
4010 // 1.
4011 //
4012 // If the two rounding modes are not the same, report a target defined value.
4013
4014 // Mode register rounding mode fields:
4015 //
4016 // [1:0] Single-precision round mode.
4017 // [3:2] Double/Half-precision round mode.
4018 //
4019  //  0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4020 //
4021 // Hardware Spec
4022 // Toward-0 3 0
4023 // Nearest Even 0 1
4024 // +Inf 1 2
4025 // -Inf 2 3
4026 // NearestAway0 N/A 4
4027 //
4028 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4029 // table we can index by the raw hardware mode.
4030 //
4031 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
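  // For example, if MODE.fp_round reads as 0 (both fields set to nearest-even),
  // the shift below selects the low nibble of the table, which per the mapping
  // above is the standard FLT_ROUNDS value 1 (nearest-even).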
4032
4033 SDValue BitTable =
4034 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4035
4036 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4037 SDValue RoundModeTimesNumBits =
4038 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4039
4040 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4041 // knew only one mode was demanded.
4042 SDValue TableValue =
4043 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4044 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4045
4046 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4047 SDValue TableEntry =
4048 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4049
4050 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4051 // if it's an extended value.
4052 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4053 SDValue IsStandardValue =
4054 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4055 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4056 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4057 TableEntry, EnumOffset);
4058
4059 return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL);
4060}
4061
4062SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4063 if (Op->isDivergent())
4064 return SDValue();
4065
4066 switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) {
4067 case AMDGPUAS::FLAT_ADDRESS:
4068 case AMDGPUAS::GLOBAL_ADDRESS:
4069 case AMDGPUAS::CONSTANT_ADDRESS:
4070 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4071 break;
4072 default:
4073 return SDValue();
4074 }
4075
4076 return Op;
4077}
4078
4079// Work around DAG legality rules only based on the result type.
4080SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4081 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4082 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
4083 EVT SrcVT = Src.getValueType();
4084
4085 if (SrcVT.getScalarType() != MVT::bf16)
4086 return Op;
4087
4088 SDLoc SL(Op);
4089 SDValue BitCast =
4090 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src);
4091
4092 EVT DstVT = Op.getValueType();
4093 if (IsStrict)
4094 llvm_unreachable("Need STRICT_BF16_TO_FP");
4095
4096 return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast);
4097}
4098
4099SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4100 SDLoc SL(Op);
4101 if (Op.getValueType() != MVT::i64)
4102 return Op;
4103
4104 uint32_t ModeHwReg =
4105 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4106 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4107 uint32_t TrapHwReg =
4108 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4109 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4110
4111 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4112 SDValue IntrinID =
4113 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4114 SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4115 N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm);
4116 SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4117 N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm);
4118 SDValue TokenReg =
4119 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4120 GetTrapReg.getValue(1));
4121
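  // Pack the two reads into a single i64 result. With the element order used
  // here, MODE ends up in the low 32 bits and TRAPSTS in the high 32 bits,
  // mirroring the unpacking done in lowerSET_FPENV below.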
4122 SDValue CvtPtr =
4123 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4124 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4125
4126 return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
4127}
4128
4129SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4130 SDLoc SL(Op);
4131 if (Op.getOperand(1).getValueType() != MVT::i64)
4132 return Op;
4133
4134 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4135 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4136 DAG.getConstant(0, SL, MVT::i32));
4137 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4138 DAG.getConstant(1, SL, MVT::i32));
4139
4140 SDValue ReadFirstLaneID =
4141 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4142 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4143 ReadFirstLaneID, NewModeReg);
4144 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4145 ReadFirstLaneID, NewTrapReg);
4146
4147 unsigned ModeHwReg =
4148 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4149 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4150 unsigned TrapHwReg =
4151 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4152 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4153
4154 SDValue IntrinID =
4155 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4156 SDValue SetModeReg =
4157 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4158 IntrinID, ModeHwRegImm, NewModeReg);
4159 SDValue SetTrapReg =
4160 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4161 IntrinID, TrapHwRegImm, NewTrapReg);
4162 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4163}
4164
4165Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
4166 const MachineFunction &MF) const {
4167 Register Reg = StringSwitch<Register>(RegName)
4168 .Case("m0", AMDGPU::M0)
4169 .Case("exec", AMDGPU::EXEC)
4170 .Case("exec_lo", AMDGPU::EXEC_LO)
4171 .Case("exec_hi", AMDGPU::EXEC_HI)
4172 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4173 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4174 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4175 .Default(Register());
4176
4177 if (Reg == AMDGPU::NoRegister) {
4178 report_fatal_error(reason: Twine("invalid register name \""
4179 + StringRef(RegName) + "\"."));
4180
4181 }
4182
4183 if (!Subtarget->hasFlatScrRegister() &&
4184 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4185 report_fatal_error(reason: Twine("invalid register \""
4186 + StringRef(RegName) + "\" for subtarget."));
4187 }
4188
4189 switch (Reg) {
4190 case AMDGPU::M0:
4191 case AMDGPU::EXEC_LO:
4192 case AMDGPU::EXEC_HI:
4193 case AMDGPU::FLAT_SCR_LO:
4194 case AMDGPU::FLAT_SCR_HI:
4195 if (VT.getSizeInBits() == 32)
4196 return Reg;
4197 break;
4198 case AMDGPU::EXEC:
4199 case AMDGPU::FLAT_SCR:
4200 if (VT.getSizeInBits() == 64)
4201 return Reg;
4202 break;
4203 default:
4204 llvm_unreachable("missing register type checking");
4205 }
4206
4207 report_fatal_error(reason: Twine("invalid type for register \""
4208 + StringRef(RegName) + "\"."));
4209}
4210
4211// If kill is not the last instruction, split the block so kill is always a
4212// proper terminator.
4213MachineBasicBlock *
4214SITargetLowering::splitKillBlock(MachineInstr &MI,
4215 MachineBasicBlock *BB) const {
4216 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
4217 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4218 MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode()));
4219 return SplitBB;
4220}
4221
4222// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4223// \p MI will be the only instruction in the loop body block. Otherwise, it will
4224// be the first instruction in the remainder block.
4225//
4226/// \returns { LoopBody, Remainder }
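///
/// The resulting control flow is roughly:
///   MBB -> LoopBB -> RemainderBB, with LoopBB also branching back to itself
///   and RemainderBB taking over MBB's original successors.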
4227static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4228splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4229 MachineFunction *MF = MBB.getParent();
4230 MachineBasicBlock::iterator I(&MI);
4231
4232 // To insert the loop we need to split the block. Move everything after this
4233 // point to a new block, and insert a new empty block between the two.
4234 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4235 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4236 MachineFunction::iterator MBBI(MBB);
4237 ++MBBI;
4238
4239 MF->insert(MBBI, MBB: LoopBB);
4240 MF->insert(MBBI, MBB: RemainderBB);
4241
4242 LoopBB->addSuccessor(Succ: LoopBB);
4243 LoopBB->addSuccessor(Succ: RemainderBB);
4244
4245 // Move the rest of the block into a new block.
4246 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
4247
4248 if (InstInLoop) {
4249 auto Next = std::next(x: I);
4250
4251 // Move instruction to loop body.
4252 LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);
4253
4254 // Move the rest of the block.
4255 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
4256 } else {
4257 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
4258 }
4259
4260 MBB.addSuccessor(Succ: LoopBB);
4261
4262 return std::pair(LoopBB, RemainderBB);
4263}
4264
4265/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4266void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4267 MachineBasicBlock *MBB = MI.getParent();
4268 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4269 auto I = MI.getIterator();
4270 auto E = std::next(x: I);
4271
4272 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4273 .addImm(0);
4274
4275 MIBundleBuilder Bundler(*MBB, I, E);
4276 finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin());
4277}
4278
4279MachineBasicBlock *
4280SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4281 MachineBasicBlock *BB) const {
4282 const DebugLoc &DL = MI.getDebugLoc();
4283
4284 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4285
4286 MachineBasicBlock *LoopBB;
4287 MachineBasicBlock *RemainderBB;
4288 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4289
4290 // Apparently kill flags are only valid if the def is in the same block?
4291 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4292 Src->setIsKill(false);
4293
4294 std::tie(args&: LoopBB, args&: RemainderBB) = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true);
4295
4296 MachineBasicBlock::iterator I = LoopBB->end();
4297
4298 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4299 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4300
4301 // Clear TRAP_STS.MEM_VIOL
4302 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4303 .addImm(0)
4304 .addImm(EncodedReg);
4305
4306 bundleInstWithWaitcnt(MI);
4307
4308 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4309
4310 // Load and check TRAP_STS.MEM_VIOL
4311 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4312 .addImm(EncodedReg);
4313
4314 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4315 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4316 .addReg(Reg, RegState::Kill)
4317 .addImm(0);
4318 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4319 .addMBB(LoopBB);
4320
4321 return RemainderBB;
4322}
4323
4324// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4325// wavefront. If the value is uniform and just happens to be in a VGPR, this
4326// will only do one iteration. In the worst case, this will loop 64 times.
4327//
4328// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
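//
// Rough shape of the emitted loop (one iteration per distinct index value):
//   loop:
//     CurrentIdx = V_READFIRSTLANE_B32 Idx
//     Cond       = V_CMP_EQ_U32 CurrentIdx, Idx
//     NewExec    = S_AND_SAVEEXEC Cond      ; only matching lanes stay active
//     M0 (or SGPRIdxReg) = CurrentIdx + Offset
//     EXEC       = EXEC ^ NewExec           ; retire the lanes just handled
//     S_CBRANCH_EXECNZ loop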
4329static MachineBasicBlock::iterator
4330emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4331 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4332 const DebugLoc &DL, const MachineOperand &Idx,
4333 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4334 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4335 Register &SGPRIdxReg) {
4336
4337 MachineFunction *MF = OrigBB.getParent();
4338 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4339 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4340 MachineBasicBlock::iterator I = LoopBB.begin();
4341
4342 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4343 Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
4344 Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
4345 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4346 Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);
4347
4348 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4349 .addReg(InitReg)
4350 .addMBB(&OrigBB)
4351 .addReg(ResultReg)
4352 .addMBB(&LoopBB);
4353
4354 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4355 .addReg(InitSaveExecReg)
4356 .addMBB(&OrigBB)
4357 .addReg(NewExec)
4358 .addMBB(&LoopBB);
4359
4360 // Read the next variant <- also loop target.
4361 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4362 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4363
4364 // Compare the just read M0 value to all possible Idx values.
4365 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4366 .addReg(CurrentIdxReg)
4367 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4368
4369 // Update EXEC, save the original EXEC value to VCC.
4370 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4371 : AMDGPU::S_AND_SAVEEXEC_B64),
4372 NewExec)
4373 .addReg(CondReg, RegState::Kill);
4374
4375 MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
4376
4377 if (UseGPRIdxMode) {
4378 if (Offset == 0) {
4379 SGPRIdxReg = CurrentIdxReg;
4380 } else {
4381 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4382 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4383 .addReg(CurrentIdxReg, RegState::Kill)
4384 .addImm(Offset);
4385 }
4386 } else {
4387    // Move the index (plus any constant offset) into M0.
4388 if (Offset == 0) {
4389 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4390 .addReg(CurrentIdxReg, RegState::Kill);
4391 } else {
4392 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4393 .addReg(CurrentIdxReg, RegState::Kill)
4394 .addImm(Offset);
4395 }
4396 }
4397
4398 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4399 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4400 MachineInstr *InsertPt =
4401 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4402 : AMDGPU::S_XOR_B64_term), Exec)
4403 .addReg(Exec)
4404 .addReg(NewExec);
4405
4406 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4407 // s_cbranch_scc0?
4408
4409 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4410 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4411 .addMBB(&LoopBB);
4412
4413 return InsertPt->getIterator();
4414}
4415
4416// This has slightly sub-optimal register allocation when the source vector is
4417// killed by the read. The register allocator does not understand that the kill
4418// is per-workitem, so the source is kept live for the whole loop. We therefore
4419// cannot reuse a subregister from it and use one more VGPR than necessary; that
4420// VGPR was saved when this was expanded after register allocation.
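//
// In outline: save EXEC, split the block, emit the waterfall loop that loads
// M0 (or an SGPR index in GPR-idx mode) for each unique index value, then
// restore EXEC in a landing-pad block before falling through to the remainder
// block. Returns an insertion point inside the loop, just before the EXEC-mask
// update, where the caller places the actual indirect access.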
4421static MachineBasicBlock::iterator
4422loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4423 unsigned InitResultReg, unsigned PhiReg, int Offset,
4424 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4425 MachineFunction *MF = MBB.getParent();
4426 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4427 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4428 MachineRegisterInfo &MRI = MF->getRegInfo();
4429 const DebugLoc &DL = MI.getDebugLoc();
4430 MachineBasicBlock::iterator I(&MI);
4431
4432 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4433 Register DstReg = MI.getOperand(i: 0).getReg();
4434 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4435 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4436 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4437 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4438
4439 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4440
4441 // Save the EXEC mask
4442 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4443 .addReg(Exec);
4444
4445 MachineBasicBlock *LoopBB;
4446 MachineBasicBlock *RemainderBB;
4447 std::tie(args&: LoopBB, args&: RemainderBB) = splitBlockForLoop(MI, MBB, InstInLoop: false);
4448
4449 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4450
4451 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx,
4452 InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec,
4453 Offset, UseGPRIdxMode, SGPRIdxReg);
4454
4455 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4456 MachineFunction::iterator MBBI(LoopBB);
4457 ++MBBI;
4458 MF->insert(MBBI, MBB: LandingPad);
4459 LoopBB->removeSuccessor(Succ: RemainderBB);
4460 LandingPad->addSuccessor(Succ: RemainderBB);
4461 LoopBB->addSuccessor(Succ: LandingPad);
4462 MachineBasicBlock::iterator First = LandingPad->begin();
4463 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4464 .addReg(SaveExec);
4465
4466 return InsPt;
4467}
4468
4469// Returns subreg index, offset
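// For example, with a 128-bit vector register class (4 x 32-bit elements),
// Offset = 2 yields (sub2, 0), while Offset = 5 is out of bounds and yields
// (sub0, 5) so the caller still sees the original offset.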
4470static std::pair<unsigned, int>
4471computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4472 const TargetRegisterClass *SuperRC,
4473 unsigned VecReg,
4474 int Offset) {
4475 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4476
4477 // Skip out of bounds offsets, or else we would end up using an undefined
4478 // register.
4479 if (Offset >= NumElts || Offset < 0)
4480 return std::pair(AMDGPU::sub0, Offset);
4481
4482 return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0);
4483}
4484
4485static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
4486 MachineRegisterInfo &MRI, MachineInstr &MI,
4487 int Offset) {
4488 MachineBasicBlock *MBB = MI.getParent();
4489 const DebugLoc &DL = MI.getDebugLoc();
4490 MachineBasicBlock::iterator I(&MI);
4491
4492 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4493
4494 assert(Idx->getReg() != AMDGPU::NoRegister);
4495
4496 if (Offset == 0) {
4497 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4498 } else {
4499 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4500 .add(*Idx)
4501 .addImm(Offset);
4502 }
4503}
4504
4505static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
4506 MachineRegisterInfo &MRI, MachineInstr &MI,
4507 int Offset) {
4508 MachineBasicBlock *MBB = MI.getParent();
4509 const DebugLoc &DL = MI.getDebugLoc();
4510 MachineBasicBlock::iterator I(&MI);
4511
4512 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4513
4514 if (Offset == 0)
4515 return Idx->getReg();
4516
4517 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4518 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4519 .add(*Idx)
4520 .addImm(Offset);
4521 return Tmp;
4522}
4523
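// Lower SI_INDIRECT_SRC_*: read one 32-bit element of a vector register at a
// dynamic index. A uniform (SGPR) index is handled inline via M0 or GPR-idx
// mode; a divergent (VGPR) index requires the waterfall loop built by
// loadM0FromVGPR.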
4524static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
4525 MachineBasicBlock &MBB,
4526 const GCNSubtarget &ST) {
4527 const SIInstrInfo *TII = ST.getInstrInfo();
4528 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4529 MachineFunction *MF = MBB.getParent();
4530 MachineRegisterInfo &MRI = MF->getRegInfo();
4531
4532 Register Dst = MI.getOperand(i: 0).getReg();
4533 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4534 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4535 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4536
4537 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg);
4538 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
4539
4540 unsigned SubReg;
4541 std::tie(args&: SubReg, args&: Offset)
4542 = computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset);
4543
4544 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4545
4546 // Check for a SGPR index.
4547 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
4548 MachineBasicBlock::iterator I(&MI);
4549 const DebugLoc &DL = MI.getDebugLoc();
4550
4551 if (UseGPRIdxMode) {
4552 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4553 // to avoid interfering with other uses, so probably requires a new
4554 // optimization pass.
4555 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4556
4557 const MCInstrDesc &GPRIDXDesc =
4558 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*VecRC), IsIndirectSrc: true);
4559 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4560 .addReg(RegNo: SrcReg)
4561 .addReg(RegNo: Idx)
4562 .addImm(Val: SubReg);
4563 } else {
4564 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4565
4566 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4567 .addReg(SrcReg, 0, SubReg)
4568 .addReg(SrcReg, RegState::Implicit);
4569 }
4570
4571 MI.eraseFromParent();
4572
4573 return &MBB;
4574 }
4575
4576 // Control flow needs to be inserted if indexing with a VGPR.
4577 const DebugLoc &DL = MI.getDebugLoc();
4578 MachineBasicBlock::iterator I(&MI);
4579
4580 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4581 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4582
4583 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4584
4585 Register SGPRIdxReg;
4586 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset,
4587 UseGPRIdxMode, SGPRIdxReg);
4588
4589 MachineBasicBlock *LoopBB = InsPt->getParent();
4590
4591 if (UseGPRIdxMode) {
4592 const MCInstrDesc &GPRIDXDesc =
4593 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*VecRC), IsIndirectSrc: true);
4594
4595 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4596 .addReg(RegNo: SrcReg)
4597 .addReg(RegNo: SGPRIdxReg)
4598 .addImm(Val: SubReg);
4599 } else {
4600 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4601 .addReg(SrcReg, 0, SubReg)
4602 .addReg(SrcReg, RegState::Implicit);
4603 }
4604
4605 MI.eraseFromParent();
4606
4607 return LoopBB;
4608}
4609
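// Lower SI_INDIRECT_DST_*: write one 32-bit element of a vector register at a
// dynamic index. With no index register this degenerates to an INSERT_SUBREG;
// an SGPR index is handled inline, and a VGPR index uses the same waterfall
// loop as the source case.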
4610static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4611 MachineBasicBlock &MBB,
4612 const GCNSubtarget &ST) {
4613 const SIInstrInfo *TII = ST.getInstrInfo();
4614 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4615 MachineFunction *MF = MBB.getParent();
4616 MachineRegisterInfo &MRI = MF->getRegInfo();
4617
4618 Register Dst = MI.getOperand(i: 0).getReg();
4619 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4620 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4621 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4622 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4623 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg());
4624 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
4625
4626 // This can be an immediate, but will be folded later.
4627 assert(Val->getReg());
4628
4629 unsigned SubReg;
4630 std::tie(args&: SubReg, args&: Offset) = computeIndirectRegAndOffset(TRI, SuperRC: VecRC,
4631 VecReg: SrcVec->getReg(),
4632 Offset);
4633 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4634
4635 if (Idx->getReg() == AMDGPU::NoRegister) {
4636 MachineBasicBlock::iterator I(&MI);
4637 const DebugLoc &DL = MI.getDebugLoc();
4638
4639 assert(Offset == 0);
4640
4641 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4642 .add(*SrcVec)
4643 .add(*Val)
4644 .addImm(SubReg);
4645
4646 MI.eraseFromParent();
4647 return &MBB;
4648 }
4649
4650 // Check for a SGPR index.
4651 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
4652 MachineBasicBlock::iterator I(&MI);
4653 const DebugLoc &DL = MI.getDebugLoc();
4654
4655 if (UseGPRIdxMode) {
4656 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4657
4658 const MCInstrDesc &GPRIDXDesc =
4659 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*VecRC), IsIndirectSrc: false);
4660 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4661 .addReg(RegNo: SrcVec->getReg())
4662 .add(MO: *Val)
4663 .addReg(RegNo: Idx)
4664 .addImm(Val: SubReg);
4665 } else {
4666 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4667
4668 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4669 VecSize: TRI.getRegSizeInBits(*VecRC), EltSize: 32, IsSGPR: false);
4670 BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
4671 .addReg(RegNo: SrcVec->getReg())
4672 .add(MO: *Val)
4673 .addImm(Val: SubReg);
4674 }
4675 MI.eraseFromParent();
4676 return &MBB;
4677 }
4678
4679 // Control flow needs to be inserted if indexing with a VGPR.
4680 if (Val->isReg())
4681 MRI.clearKillFlags(Reg: Val->getReg());
4682
4683 const DebugLoc &DL = MI.getDebugLoc();
4684
4685 Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC);
4686
4687 Register SGPRIdxReg;
4688 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset,
4689 UseGPRIdxMode, SGPRIdxReg);
4690 MachineBasicBlock *LoopBB = InsPt->getParent();
4691
4692 if (UseGPRIdxMode) {
4693 const MCInstrDesc &GPRIDXDesc =
4694 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*VecRC), IsIndirectSrc: false);
4695
4696 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4697 .addReg(PhiReg)
4698 .add(*Val)
4699 .addReg(SGPRIdxReg)
4700 .addImm(AMDGPU::sub0);
4701 } else {
4702 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4703 VecSize: TRI.getRegSizeInBits(*VecRC), EltSize: 32, IsSGPR: false);
4704 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4705 .addReg(PhiReg)
4706 .add(*Val)
4707 .addImm(AMDGPU::sub0);
4708 }
4709
4710 MI.eraseFromParent();
4711 return LoopBB;
4712}
4713
4714static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
4715 MachineBasicBlock &BB,
4716 const GCNSubtarget &ST,
4717 unsigned Opc) {
4718 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
4719 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4720 const DebugLoc &DL = MI.getDebugLoc();
4721 const SIInstrInfo *TII = ST.getInstrInfo();
4722
4723 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4724 Register SrcReg = MI.getOperand(i: 1).getReg();
4725 bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg));
4726 Register DstReg = MI.getOperand(i: 0).getReg();
4727 MachineBasicBlock *RetBB = nullptr;
4728 if (isSGPR) {
4729    // These operations are idempotent for a uniform (SGPR) input: the
4730    // reduced value is the same as the given SGPR.
4731 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4732 RetBB = &BB;
4733 } else {
4734    // TODO: Implement a DPP strategy and switch based on the immediate
4735    // strategy operand. For now, all cases (default, Iterative and DPP)
4736    // use the iterative approach.
4737
4738    // To reduce the VGPR with the iterative approach, we need to iterate
4739    // over all the active lanes. The lowering consists of ComputeLoop,
4740    // which iterates over only the active lanes. A copy of EXEC is used
4741    // as the induction variable; each iteration clears the bit for the
4742    // current lane with bitset0 to obtain the next active lane.
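    //
    // In outline, the generated code looks like:
    //   BB:          iter = EXEC; acc = identity; s_branch ComputeLoop
    //   ComputeLoop: lane = s_ff1(iter)
    //                val  = v_readlane(src, lane)
    //                acc  = op(acc, val)
    //                iter = s_bitset0(iter, lane)
    //                s_cbranch_scc1 ComputeLoop   ; while iter != 0
    //   ComputeEnd:  acc (DstReg) holds the reduced value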
4743 MachineBasicBlock::iterator I = BB.end();
4744 Register SrcReg = MI.getOperand(i: 1).getReg();
4745
4746    // Create the control flow for the loop by splitting MI's basic block
4747    // into the loop body (ComputeLoop) and the continuation (ComputeEnd).
4748 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true);
4749
4750 // Create virtual registers required for lowering.
4751 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4752 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
4753 Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
4754 Register InitalValReg = MRI.createVirtualRegister(RegClass: DstRegClass);
4755
4756 Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass);
4757 Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
4758 Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
4759
4760 Register FF1Reg = MRI.createVirtualRegister(RegClass: DstRegClass);
4761 Register LaneValueReg = MRI.createVirtualRegister(RegClass: DstRegClass);
4762
4763 bool IsWave32 = ST.isWave32();
4764 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4765 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4766
4767    // Create the initial values of the induction variable (from EXEC) and
4768    // the accumulator, and insert a branch to the new ComputeLoop block.
4769 uint32_t InitalValue =
4770 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4771 auto TmpSReg =
4772 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4773 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4774 .addImm(InitalValue);
4775 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4776
4777 // Start constructing ComputeLoop
4778 I = ComputeLoop->end();
4779 auto Accumulator =
4780 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4781 .addReg(InitalValReg)
4782 .addMBB(&BB);
4783 auto ActiveBits =
4784 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4785 .addReg(TmpSReg->getOperand(0).getReg())
4786 .addMBB(&BB);
4787
4788 // Perform the computations
4789 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4790 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4791 .addReg(ActiveBits->getOperand(0).getReg());
4792 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4793 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4794 .addReg(SrcReg)
4795 .addReg(FF1->getOperand(0).getReg());
4796 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4797 .addReg(Accumulator->getOperand(0).getReg())
4798 .addReg(LaneValue->getOperand(0).getReg());
4799
4800 // Manipulate the iterator to get the next active lane
4801 unsigned BITSETOpc =
4802 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4803 auto NewActiveBits =
4804 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4805 .addReg(FF1->getOperand(0).getReg())
4806 .addReg(ActiveBits->getOperand(0).getReg());
4807
4808 // Add phi nodes
4809 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4810 .addMBB(ComputeLoop);
4811 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4812 .addMBB(ComputeLoop);
4813
4814    // Create the branch back to ComputeLoop while active lanes remain.
4815 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4816 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4817 .addReg(NewActiveBits->getOperand(0).getReg())
4818 .addImm(0);
4819 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4820 .addMBB(ComputeLoop);
4821
4822 RetBB = ComputeEnd;
4823 }
4824 MI.eraseFromParent();
4825 return RetBB;
4826}
4827
4828MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
4829 MachineInstr &MI, MachineBasicBlock *BB) const {
4830
4831 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4832 MachineFunction *MF = BB->getParent();
4833 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
4834
4835 switch (MI.getOpcode()) {
4836 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4837 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4838 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4839 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4840 case AMDGPU::S_UADDO_PSEUDO:
4841 case AMDGPU::S_USUBO_PSEUDO: {
4842 const DebugLoc &DL = MI.getDebugLoc();
4843 MachineOperand &Dest0 = MI.getOperand(i: 0);
4844 MachineOperand &Dest1 = MI.getOperand(i: 1);
4845 MachineOperand &Src0 = MI.getOperand(i: 2);
4846 MachineOperand &Src1 = MI.getOperand(i: 3);
4847
4848 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4849 ? AMDGPU::S_ADD_I32
4850 : AMDGPU::S_SUB_I32;
4851 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4852
4853 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4854 .addImm(1)
4855 .addImm(0);
4856
4857 MI.eraseFromParent();
4858 return BB;
4859 }
4860 case AMDGPU::S_ADD_U64_PSEUDO:
4861 case AMDGPU::S_SUB_U64_PSEUDO: {
4862 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4863 // For GFX12, we emit s_add_u64 and s_sub_u64.
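    // For example, on targets without s_add_u64/s_sub_u64 an S_ADD_U64_PSEUDO
    // becomes:
    //   s_add_u32  dst.lo, src0.lo, src1.lo   ; sets SCC with the carry
    //   s_addc_u32 dst.hi, src0.hi, src1.hi   ; consumes SCC
    // followed by a REG_SEQUENCE to form the 64-bit result.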
4864 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4865 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4866 const DebugLoc &DL = MI.getDebugLoc();
4867 MachineOperand &Dest = MI.getOperand(i: 0);
4868 MachineOperand &Src0 = MI.getOperand(i: 1);
4869 MachineOperand &Src1 = MI.getOperand(i: 2);
4870 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4871 if (Subtarget->hasScalarAddSub64()) {
4872 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4873 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4874 .add(Src0)
4875 .add(Src1);
4876 } else {
4877 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4878 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4879
4880 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4881 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4882
4883 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4884 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4885 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4886 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4887
4888 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4889 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4890 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4891 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4892
4893 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4894 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4895 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4896 .add(Src0Sub0)
4897 .add(Src1Sub0);
4898 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4899 .add(Src0Sub1)
4900 .add(Src1Sub1);
4901 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4902 .addReg(DestSub0)
4903 .addImm(AMDGPU::sub0)
4904 .addReg(DestSub1)
4905 .addImm(AMDGPU::sub1);
4906 }
4907 MI.eraseFromParent();
4908 return BB;
4909 }
4910 case AMDGPU::V_ADD_U64_PSEUDO:
4911 case AMDGPU::V_SUB_U64_PSEUDO: {
4912 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4913 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4914 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4915 const DebugLoc &DL = MI.getDebugLoc();
4916
4917 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4918
4919 MachineOperand &Dest = MI.getOperand(i: 0);
4920 MachineOperand &Src0 = MI.getOperand(i: 1);
4921 MachineOperand &Src1 = MI.getOperand(i: 2);
4922
4923 if (IsAdd && ST.hasLshlAddB64()) {
4924 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4925 Dest.getReg())
4926 .add(Src0)
4927 .addImm(0)
4928 .add(Src1);
4929 TII->legalizeOperands(MI&: *Add);
4930 MI.eraseFromParent();
4931 return BB;
4932 }
4933
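    // Otherwise expand to a 32-bit add/sub pair with an explicit carry
    // register: V_ADD_CO_U32 / V_SUB_CO_U32 produces the low half plus the
    // carry, and V_ADDC_U32 / V_SUBB_U32 consumes it for the high half.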
4934 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4935
4936 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4937 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4938
4939 Register CarryReg = MRI.createVirtualRegister(CarryRC);
4940 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4941
4942 const TargetRegisterClass *Src0RC = Src0.isReg()
4943 ? MRI.getRegClass(Src0.getReg())
4944 : &AMDGPU::VReg_64RegClass;
4945 const TargetRegisterClass *Src1RC = Src1.isReg()
4946 ? MRI.getRegClass(Src1.getReg())
4947 : &AMDGPU::VReg_64RegClass;
4948
4949 const TargetRegisterClass *Src0SubRC =
4950 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4951 const TargetRegisterClass *Src1SubRC =
4952 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4953
4954 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4955 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4956 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4957 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4958
4959 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4960 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4961 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4962 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4963
4964 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4965 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4966 .addReg(CarryReg, RegState::Define)
4967 .add(SrcReg0Sub0)
4968 .add(SrcReg1Sub0)
4969 .addImm(0); // clamp bit
4970
4971 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4972 MachineInstr *HiHalf =
4973 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4974 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4975 .add(SrcReg0Sub1)
4976 .add(SrcReg1Sub1)
4977 .addReg(CarryReg, RegState::Kill)
4978 .addImm(0); // clamp bit
4979
4980 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4981 .addReg(DestSub0)
4982 .addImm(AMDGPU::sub0)
4983 .addReg(DestSub1)
4984 .addImm(AMDGPU::sub1);
4985 TII->legalizeOperands(MI&: *LoHalf);
4986 TII->legalizeOperands(MI&: *HiHalf);
4987 MI.eraseFromParent();
4988 return BB;
4989 }
4990 case AMDGPU::S_ADD_CO_PSEUDO:
4991 case AMDGPU::S_SUB_CO_PSEUDO: {
4992    // This pseudo can only be selected from a uniform add/subcarry node,
4993    // so any VGPR operands are therefore assumed to hold uniform
4994    // (wave-wide splat) values.
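    // The lowering therefore copies any VGPR operands to SGPRs with
    // v_readfirstlane, converts the carry-in to SCC with an s_cmp against
    // zero, emits s_addc_u32 / s_subb_u32, and finally materializes the
    // carry-out with s_cselect.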
4995 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4996 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4997 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4998 MachineBasicBlock::iterator MII = MI;
4999 const DebugLoc &DL = MI.getDebugLoc();
5000 MachineOperand &Dest = MI.getOperand(i: 0);
5001 MachineOperand &CarryDest = MI.getOperand(i: 1);
5002 MachineOperand &Src0 = MI.getOperand(i: 2);
5003 MachineOperand &Src1 = MI.getOperand(i: 3);
5004 MachineOperand &Src2 = MI.getOperand(i: 4);
5005 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5006 ? AMDGPU::S_ADDC_U32
5007 : AMDGPU::S_SUBB_U32;
5008 if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) {
5009 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5010 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5011 .addReg(Src0.getReg());
5012 Src0.setReg(RegOp0);
5013 }
5014 if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) {
5015 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5016 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5017 .addReg(Src1.getReg());
5018 Src1.setReg(RegOp1);
5019 }
5020 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5021 if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) {
5022 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5023 .addReg(Src2.getReg());
5024 Src2.setReg(RegOp2);
5025 }
5026
5027 const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg());
5028 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5029 assert(WaveSize == 64 || WaveSize == 32);
5030
5031 if (WaveSize == 64) {
5032 if (ST.hasScalarCompareEq64()) {
5033 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5034 .addReg(Src2.getReg())
5035 .addImm(0);
5036 } else {
5037 const TargetRegisterClass *SubRC =
5038 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5039 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5040 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5041 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5042 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5043 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5044
5045 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5046 .add(Src2Sub0)
5047 .add(Src2Sub1);
5048
5049 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5050 .addReg(Src2_32, RegState::Kill)
5051 .addImm(0);
5052 }
5053 } else {
5054 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5055 .addReg(Src2.getReg())
5056 .addImm(0);
5057 }
5058
5059 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5060
5061 unsigned SelOpc =
5062 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5063
5064 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5065 .addImm(-1)
5066 .addImm(0);
5067
5068 MI.eraseFromParent();
5069 return BB;
5070 }
5071 case AMDGPU::SI_INIT_M0: {
5072 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5073 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5074 .add(MI.getOperand(0));
5075 MI.eraseFromParent();
5076 return BB;
5077 }
5078 case AMDGPU::GET_GROUPSTATICSIZE: {
5079 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5080 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5081 DebugLoc DL = MI.getDebugLoc();
5082 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5083 .add(MI.getOperand(0))
5084 .addImm(MFI->getLDSSize());
5085 MI.eraseFromParent();
5086 return BB;
5087 }
5088 case AMDGPU::GET_SHADERCYCLESHILO: {
5089 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5090 MachineRegisterInfo &MRI = MF->getRegInfo();
5091 const DebugLoc &DL = MI.getDebugLoc();
5092 // The algorithm is:
5093 //
5094 // hi1 = getreg(SHADER_CYCLES_HI)
5095 // lo1 = getreg(SHADER_CYCLES_LO)
5096 // hi2 = getreg(SHADER_CYCLES_HI)
5097 //
5098 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5099 // Otherwise there was overflow and the result is hi2:0. In both cases the
5100 // result should represent the actual time at some point during the sequence
5101 // of three getregs.
5102 using namespace AMDGPU::Hwreg;
5103 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5104 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5105 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5106 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5107 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5108 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5109 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5110 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5111 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5112 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5113 .addReg(RegHi1)
5114 .addReg(RegHi2);
5115 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5116 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5117 .addReg(RegLo1)
5118 .addImm(0);
5119 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5120 .add(MI.getOperand(0))
5121 .addReg(RegLo)
5122 .addImm(AMDGPU::sub0)
5123 .addReg(RegHi2)
5124 .addImm(AMDGPU::sub1);
5125 MI.eraseFromParent();
5126 return BB;
5127 }
5128 case AMDGPU::SI_INDIRECT_SRC_V1:
5129 case AMDGPU::SI_INDIRECT_SRC_V2:
5130 case AMDGPU::SI_INDIRECT_SRC_V4:
5131 case AMDGPU::SI_INDIRECT_SRC_V8:
5132 case AMDGPU::SI_INDIRECT_SRC_V9:
5133 case AMDGPU::SI_INDIRECT_SRC_V10:
5134 case AMDGPU::SI_INDIRECT_SRC_V11:
5135 case AMDGPU::SI_INDIRECT_SRC_V12:
5136 case AMDGPU::SI_INDIRECT_SRC_V16:
5137 case AMDGPU::SI_INDIRECT_SRC_V32:
5138 return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget());
5139 case AMDGPU::SI_INDIRECT_DST_V1:
5140 case AMDGPU::SI_INDIRECT_DST_V2:
5141 case AMDGPU::SI_INDIRECT_DST_V4:
5142 case AMDGPU::SI_INDIRECT_DST_V8:
5143 case AMDGPU::SI_INDIRECT_DST_V9:
5144 case AMDGPU::SI_INDIRECT_DST_V10:
5145 case AMDGPU::SI_INDIRECT_DST_V11:
5146 case AMDGPU::SI_INDIRECT_DST_V12:
5147 case AMDGPU::SI_INDIRECT_DST_V16:
5148 case AMDGPU::SI_INDIRECT_DST_V32:
5149 return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget());
5150 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5151 case AMDGPU::SI_KILL_I1_PSEUDO:
5152 return splitKillBlock(MI, BB);
5153 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5154 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5155 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5156 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5157
5158 Register Dst = MI.getOperand(i: 0).getReg();
5159 const MachineOperand &Src0 = MI.getOperand(i: 1);
5160 const MachineOperand &Src1 = MI.getOperand(i: 2);
5161 const DebugLoc &DL = MI.getDebugLoc();
5162 Register SrcCond = MI.getOperand(i: 3).getReg();
5163
5164 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5165 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5166 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5167 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5168
5169 const TargetRegisterClass *Src0RC = Src0.isReg()
5170 ? MRI.getRegClass(Src0.getReg())
5171 : &AMDGPU::VReg_64RegClass;
5172 const TargetRegisterClass *Src1RC = Src1.isReg()
5173 ? MRI.getRegClass(Src1.getReg())
5174 : &AMDGPU::VReg_64RegClass;
5175
5176 const TargetRegisterClass *Src0SubRC =
5177 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5178 const TargetRegisterClass *Src1SubRC =
5179 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5180
5181 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5182 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5183 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5184 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5185
5186 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5187 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5188 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5189 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5190
5191 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5192 .addReg(SrcCond);
5193 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5194 .addImm(0)
5195 .add(Src0Sub0)
5196 .addImm(0)
5197 .add(Src1Sub0)
5198 .addReg(SrcCondCopy);
5199 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5200 .addImm(0)
5201 .add(Src0Sub1)
5202 .addImm(0)
5203 .add(Src1Sub1)
5204 .addReg(SrcCondCopy);
5205
5206 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5207 .addReg(DstLo)
5208 .addImm(AMDGPU::sub0)
5209 .addReg(DstHi)
5210 .addImm(AMDGPU::sub1);
5211 MI.eraseFromParent();
5212 return BB;
5213 }
5214 case AMDGPU::SI_BR_UNDEF: {
5215 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5216 const DebugLoc &DL = MI.getDebugLoc();
5217 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5218 .add(MI.getOperand(0));
5219 Br->getOperand(i: 1).setIsUndef(); // read undef SCC
5220 MI.eraseFromParent();
5221 return BB;
5222 }
5223 case AMDGPU::ADJCALLSTACKUP:
5224 case AMDGPU::ADJCALLSTACKDOWN: {
5225 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5226 MachineInstrBuilder MIB(*MF, &MI);
5227 MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::ImplicitDefine)
5228 .addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::Implicit);
5229 return BB;
5230 }
5231 case AMDGPU::SI_CALL_ISEL: {
5232 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5233 const DebugLoc &DL = MI.getDebugLoc();
5234
5235 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF);
5236
5237 MachineInstrBuilder MIB;
5238 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5239
5240 for (const MachineOperand &MO : MI.operands())
5241 MIB.add(MO);
5242
5243 MIB.cloneMemRefs(OtherMI: MI);
5244 MI.eraseFromParent();
5245 return BB;
5246 }
5247 case AMDGPU::V_ADD_CO_U32_e32:
5248 case AMDGPU::V_SUB_CO_U32_e32:
5249 case AMDGPU::V_SUBREV_CO_U32_e32: {
5250 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5251 const DebugLoc &DL = MI.getDebugLoc();
5252 unsigned Opc = MI.getOpcode();
5253
5254 bool NeedClampOperand = false;
5255 if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) {
5256 Opc = AMDGPU::getVOPe64(Opcode: Opc);
5257 NeedClampOperand = true;
5258 }
5259
5260 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(i: 0).getReg());
5261 if (TII->isVOP3(*I)) {
5262 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5263 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5264 I.addReg(TRI->getVCC(), RegState::Define);
5265 }
5266 I.add(MI.getOperand(i: 1))
5267 .add(MI.getOperand(i: 2));
5268 if (NeedClampOperand)
5269 I.addImm(0); // clamp bit for e64 encoding
5270
5271 TII->legalizeOperands(MI&: *I);
5272
5273 MI.eraseFromParent();
5274 return BB;
5275 }
5276 case AMDGPU::V_ADDC_U32_e32:
5277 case AMDGPU::V_SUBB_U32_e32:
5278 case AMDGPU::V_SUBBREV_U32_e32:
5279 // These instructions have an implicit use of vcc which counts towards the
5280 // constant bus limit.
5281 TII->legalizeOperands(MI);
5282 return BB;
5283 case AMDGPU::DS_GWS_INIT:
5284 case AMDGPU::DS_GWS_SEMA_BR:
5285 case AMDGPU::DS_GWS_BARRIER:
5286 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5287 [[fallthrough]];
5288 case AMDGPU::DS_GWS_SEMA_V:
5289 case AMDGPU::DS_GWS_SEMA_P:
5290 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5291    // An s_waitcnt 0 is required to be the instruction immediately following.
5292 if (getSubtarget()->hasGWSAutoReplay()) {
5293 bundleInstWithWaitcnt(MI);
5294 return BB;
5295 }
5296
5297 return emitGWSMemViolTestLoop(MI, BB);
5298 case AMDGPU::S_SETREG_B32: {
5299 // Try to optimize cases that only set the denormal mode or rounding mode.
5300 //
5301 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5302 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5303 // instead.
5304 //
5305    // FIXME: This could be predicated on the immediate, but tablegen doesn't
5306    // allow you to have a no-side-effect instruction in the output of a
5307    // side-effecting pattern.
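    //
    // For example, an s_setreg_b32 that writes only the rounding-mode field of
    // HW_REG_MODE with a known constant value becomes s_round_mode, and one
    // that writes only the denormal field becomes s_denorm_mode.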
5308 auto [ID, Offset, Width] =
5309 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5310 if (ID != AMDGPU::Hwreg::ID_MODE)
5311 return BB;
5312
5313 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5314 const unsigned SetMask = WidthMask << Offset;
5315
5316 if (getSubtarget()->hasDenormModeInst()) {
5317 unsigned SetDenormOp = 0;
5318 unsigned SetRoundOp = 0;
5319
5320 // The dedicated instructions can only set the whole denorm or round mode
5321 // at once, not a subset of bits in either.
5322 if (SetMask ==
5323 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5324 // If this fully sets both the round and denorm mode, emit the two
5325 // dedicated instructions for these.
5326 SetRoundOp = AMDGPU::S_ROUND_MODE;
5327 SetDenormOp = AMDGPU::S_DENORM_MODE;
5328 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5329 SetRoundOp = AMDGPU::S_ROUND_MODE;
5330 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5331 SetDenormOp = AMDGPU::S_DENORM_MODE;
5332 }
5333
5334 if (SetRoundOp || SetDenormOp) {
5335 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5336 MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg());
5337 if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) {
5338 unsigned ImmVal = Def->getOperand(i: 1).getImm();
5339 if (SetRoundOp) {
5340 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5341 .addImm(ImmVal & 0xf);
5342
5343 // If we also have the denorm mode, get just the denorm mode bits.
5344 ImmVal >>= 4;
5345 }
5346
5347 if (SetDenormOp) {
5348 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5349 .addImm(ImmVal & 0xf);
5350 }
5351
5352 MI.eraseFromParent();
5353 return BB;
5354 }
5355 }
5356 }
5357
5358    // If only FP bits are touched, use the no-side-effects pseudo.
5359 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5360 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5361 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5362
5363 return BB;
5364 }
5365 case AMDGPU::S_INVERSE_BALLOT_U32:
5366 case AMDGPU::S_INVERSE_BALLOT_U64: {
5367 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5368 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5369 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5370 const DebugLoc &DL = MI.getDebugLoc();
5371 const Register DstReg = MI.getOperand(i: 0).getReg();
5372 Register MaskReg = MI.getOperand(i: 1).getReg();
5373
5374 const bool IsVALU = TRI->isVectorRegister(MRI, Reg: MaskReg);
5375
5376 if (IsVALU) {
5377 MaskReg = TII->readlaneVGPRToSGPR(SrcReg: MaskReg, UseMI&: MI, MRI);
5378 }
5379
5380 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
5381 MI.eraseFromParent();
5382 return BB;
5383 }
5384 case AMDGPU::ENDPGM_TRAP: {
5385 const DebugLoc &DL = MI.getDebugLoc();
5386 if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) {
5387 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5388 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
5389 return BB;
5390 }
5391
5392 // We need a block split to make the real endpgm a terminator. We also don't
5393 // want to break phis in successor blocks, so we can't just delete to the
5394 // end of the block.
5395
5396 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
5397 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5398 MF->push_back(MBB: TrapBB);
5399 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5400 .addImm(0);
5401 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5402 .addMBB(TrapBB);
5403
5404 BB->addSuccessor(Succ: TrapBB);
5405 MI.eraseFromParent();
5406 return SplitBB;
5407 }
5408 case AMDGPU::SIMULATED_TRAP: {
5409 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5410 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5411 MachineBasicBlock *SplitBB =
5412 TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc());
5413 MI.eraseFromParent();
5414 return SplitBB;
5415 }
5416 default:
5417 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5418 if (!MI.mayStore())
5419 AddMemOpInit(MI);
5420 return BB;
5421 }
5422 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB);
5423 }
5424}
5425
5426bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5427 // This currently forces unfolding various combinations of fsub into fma with
5428 // free fneg'd operands. As long as we have fast FMA (controlled by
5429 // isFMAFasterThanFMulAndFAdd), we should perform these.
5430
5431  // When fma is quarter rate, as for f64 where add / sub are at best half
5432  // rate, most of these combines appear to be cycle neutral but save on
5433  // instruction count / code size.
5434 return true;
5435}
5436
5437bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5438
5439EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5440 EVT VT) const {
5441 if (!VT.isVector()) {
5442 return MVT::i1;
5443 }
5444 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5445}
5446
5447MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5448  // TODO: Should i16 always be used when legal? For now it would force VALU
5449  // shifts.
5450 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5451}
5452
5453LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5454 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5455 ? Ty.changeElementSize(NewEltSize: 16)
5456 : Ty.changeElementSize(NewEltSize: 32);
5457}
5458
5459// Answering this is somewhat tricky and depends on the specific device, since
5460// different devices have different rates for fma and f64 operations.
5461//
5462// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5463// regardless of which device (although the number of cycles differs between
5464// devices), so fma is always profitable for f64.
5465//
5466// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5467// only on full-rate devices. Normally, we should prefer selecting v_mad_f32,
5468// which we can always do even without fused FP ops since it returns the same
5469// result as the separate operations and since it is always full
5470// rate. Therefore, we lie and report that fma is not faster for f32. However,
5471// v_mad_f32 does not support denormals, so we do report fma as faster if we
5472// have a fast fma device and denormals are required.
5473//
5474bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5475 EVT VT) const {
5476 VT = VT.getScalarType();
5477
5478 switch (VT.getSimpleVT().SimpleTy) {
5479 case MVT::f32: {
5480    // If mad is not available, this depends only on whether f32 fma is full rate.
5481 if (!Subtarget->hasMadMacF32Insts())
5482 return Subtarget->hasFastFMAF32();
5483
5484    // Otherwise f32 mad is always full rate and returns the same result as
5485    // the separate operations, so it should be preferred over fma.
5486    // However, it does not support denormals.
5487 if (!denormalModeIsFlushAllF32(MF))
5488 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5489
5490 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5491 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5492 }
5493 case MVT::f64:
5494 return true;
5495 case MVT::f16:
5496 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5497 default:
5498 break;
5499 }
5500
5501 return false;
5502}
5503
5504bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5505 LLT Ty) const {
5506 switch (Ty.getScalarSizeInBits()) {
5507 case 16:
5508 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5509 case 32:
5510 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5511 case 64:
5512 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5513 default:
5514 break;
5515 }
5516
5517 return false;
5518}
5519
5520bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5521 if (!Ty.isScalar())
5522 return false;
5523
5524 if (Ty.getScalarSizeInBits() == 16)
5525 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF());
5526 if (Ty.getScalarSizeInBits() == 32)
5527 return Subtarget->hasMadMacF32Insts() &&
5528 denormalModeIsFlushAllF32(MF: *MI.getMF());
5529
5530 return false;
5531}
5532
5533bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5534 const SDNode *N) const {
5535 // TODO: Check future ftz flag
5536 // v_mad_f32/v_mac_f32 do not support denormals.
5537 EVT VT = N->getValueType(ResNo: 0);
5538 if (VT == MVT::f32)
5539 return Subtarget->hasMadMacF32Insts() &&
5540 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
5541 if (VT == MVT::f16) {
5542 return Subtarget->hasMadF16() &&
5543 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
5544 }
5545
5546 return false;
5547}
5548
5549//===----------------------------------------------------------------------===//
5550// Custom DAG Lowering Operations
5551//===----------------------------------------------------------------------===//
5552
5553// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5554// wider vector type is legal.
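//
// For example, an FNEG of v8f16 is lowered as FNEG of the low v4f16 half, FNEG
// of the high v4f16 half, and a CONCAT_VECTORS of the two results.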
5555SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
5556 SelectionDAG &DAG) const {
5557 unsigned Opc = Op.getOpcode();
5558 EVT VT = Op.getValueType();
5559 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5560 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5561 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5562 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5563
5564 SDValue Lo, Hi;
5565 std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
5566
5567 SDLoc SL(Op);
5568 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo,
5569 Flags: Op->getFlags());
5570 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi,
5571 Flags: Op->getFlags());
5572
5573 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
5574}
5575
5576// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5577// wider vector type is legal.
5578SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
5579 SelectionDAG &DAG) const {
5580 unsigned Opc = Op.getOpcode();
5581 EVT VT = Op.getValueType();
5582 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5583 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5584 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5585 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5586
5587 SDValue Lo0, Hi0;
5588 std::tie(args&: Lo0, args&: Hi0) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
5589 SDValue Lo1, Hi1;
5590 std::tie(args&: Lo1, args&: Hi1) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
5591
5592 SDLoc SL(Op);
5593
5594 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1,
5595 Flags: Op->getFlags());
5596 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1,
5597 Flags: Op->getFlags());
5598
5599 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
5600}
5601
5602SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
5603 SelectionDAG &DAG) const {
5604 unsigned Opc = Op.getOpcode();
5605 EVT VT = Op.getValueType();
5606 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5607 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5608 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5609 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5610 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5611 VT == MVT::v32bf16);
5612
5613 SDValue Lo0, Hi0;
5614 SDValue Op0 = Op.getOperand(i: 0);
5615 std::tie(args&: Lo0, args&: Hi0) = Op0.getValueType().isVector()
5616 ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0)
5617 : std::pair(Op0, Op0);
5618 SDValue Lo1, Hi1;
5619 std::tie(args&: Lo1, args&: Hi1) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
5620 SDValue Lo2, Hi2;
5621 std::tie(args&: Lo2, args&: Hi2) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2);
5622
5623 SDLoc SL(Op);
5624 auto ResVT = DAG.GetSplitDestVTs(VT);
5625
5626 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2,
5627 Flags: Op->getFlags());
5628 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2,
5629 Flags: Op->getFlags());
5630
5631 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
5632}
5633
5634
5635SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5636 switch (Op.getOpcode()) {
5637 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5638 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5639 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5640 case ISD::LOAD: {
5641 SDValue Result = LowerLOAD(Op, DAG);
5642 assert((!Result.getNode() ||
5643 Result.getNode()->getNumValues() == 2) &&
5644 "Load should return a value and a chain");
5645 return Result;
5646 }
5647 case ISD::FSQRT: {
5648 EVT VT = Op.getValueType();
5649 if (VT == MVT::f32)
5650 return lowerFSQRTF32(Op, DAG);
5651 if (VT == MVT::f64)
5652 return lowerFSQRTF64(Op, DAG);
5653 return SDValue();
5654 }
5655 case ISD::FSIN:
5656 case ISD::FCOS:
5657 return LowerTrig(Op, DAG);
5658 case ISD::SELECT: return LowerSELECT(Op, DAG);
5659 case ISD::FDIV: return LowerFDIV(Op, DAG);
5660 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5661 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5662 case ISD::STORE: return LowerSTORE(Op, DAG);
5663 case ISD::GlobalAddress: {
5664 MachineFunction &MF = DAG.getMachineFunction();
5665 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5666 return LowerGlobalAddress(MFI, Op, DAG);
5667 }
5668 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5669 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5670 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5671 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5672 case ISD::INSERT_SUBVECTOR:
5673 return lowerINSERT_SUBVECTOR(Op, DAG);
5674 case ISD::INSERT_VECTOR_ELT:
5675 return lowerINSERT_VECTOR_ELT(Op, DAG);
5676 case ISD::EXTRACT_VECTOR_ELT:
5677 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5678 case ISD::VECTOR_SHUFFLE:
5679 return lowerVECTOR_SHUFFLE(Op, DAG);
5680 case ISD::SCALAR_TO_VECTOR:
5681 return lowerSCALAR_TO_VECTOR(Op, DAG);
5682 case ISD::BUILD_VECTOR:
5683 return lowerBUILD_VECTOR(Op, DAG);
5684 case ISD::FP_ROUND:
5685 case ISD::STRICT_FP_ROUND:
5686 return lowerFP_ROUND(Op, DAG);
5687 case ISD::FPTRUNC_ROUND: {
5688 unsigned Opc;
5689 SDLoc DL(Op);
5690
5691 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5692 return SDValue();
5693
5694 // Get the rounding mode from the last operand
5695 int RoundMode = Op.getConstantOperandVal(i: 1);
5696 if (RoundMode == (int)RoundingMode::TowardPositive)
5697 Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
5698 else if (RoundMode == (int)RoundingMode::TowardNegative)
5699 Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
5700 else
5701 return SDValue();
5702
5703 return DAG.getNode(Opcode: Opc, DL, VTList: Op.getNode()->getVTList(), N: Op->getOperand(Num: 0));
5704 }
5705 case ISD::TRAP:
5706 return lowerTRAP(Op, DAG);
5707 case ISD::DEBUGTRAP:
5708 return lowerDEBUGTRAP(Op, DAG);
5709 case ISD::FABS:
5710 case ISD::FNEG:
5711 case ISD::FCANONICALIZE:
5712 case ISD::BSWAP:
5713 return splitUnaryVectorOp(Op, DAG);
5714 case ISD::FMINNUM:
5715 case ISD::FMAXNUM:
5716 return lowerFMINNUM_FMAXNUM(Op, DAG);
5717 case ISD::FLDEXP:
5718 case ISD::STRICT_FLDEXP:
5719 return lowerFLDEXP(Op, DAG);
5720 case ISD::FMA:
5721 return splitTernaryVectorOp(Op, DAG);
5722 case ISD::FP_TO_SINT:
5723 case ISD::FP_TO_UINT:
5724 return LowerFP_TO_INT(Op, DAG);
5725 case ISD::SHL:
5726 case ISD::SRA:
5727 case ISD::SRL:
5728 case ISD::ADD:
5729 case ISD::SUB:
5730 case ISD::SMIN:
5731 case ISD::SMAX:
5732 case ISD::UMIN:
5733 case ISD::UMAX:
5734 case ISD::FADD:
5735 case ISD::FMUL:
5736 case ISD::FMINNUM_IEEE:
5737 case ISD::FMAXNUM_IEEE:
5738 case ISD::UADDSAT:
5739 case ISD::USUBSAT:
5740 case ISD::SADDSAT:
5741 case ISD::SSUBSAT:
5742 return splitBinaryVectorOp(Op, DAG);
5743 case ISD::MUL:
5744 return lowerMUL(Op, DAG);
5745 case ISD::SMULO:
5746 case ISD::UMULO:
5747 return lowerXMULO(Op, DAG);
5748 case ISD::SMUL_LOHI:
5749 case ISD::UMUL_LOHI:
5750 return lowerXMUL_LOHI(Op, DAG);
5751 case ISD::DYNAMIC_STACKALLOC:
5752 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5753 case ISD::STACKSAVE:
5754 return LowerSTACKSAVE(Op, DAG);
5755 case ISD::GET_ROUNDING:
5756 return lowerGET_ROUNDING(Op, DAG);
5757 case ISD::PREFETCH:
5758 return lowerPREFETCH(Op, DAG);
5759 case ISD::FP_EXTEND:
5760 case ISD::STRICT_FP_EXTEND:
5761 return lowerFP_EXTEND(Op, DAG);
5762 case ISD::GET_FPENV:
5763 return lowerGET_FPENV(Op, DAG);
5764 case ISD::SET_FPENV:
5765 return lowerSET_FPENV(Op, DAG);
5766 }
5767 return SDValue();
5768}
5769
5770// Used for D16: casts the result of an instruction into the right vector
5771// type, packing the values if the load returned them unpacked.
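//
// For example, with unpacked D16 a v4f16 load is returned as v4i32, each
// 16-bit value widened into its own 32-bit element; the elements are truncated
// to i16, rebuilt into v4i16, and bitcast back to v4f16.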
5772static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
5773 const SDLoc &DL,
5774 SelectionDAG &DAG, bool Unpacked) {
5775 if (!LoadVT.isVector())
5776 return Result;
5777
5778  // Cast back to the original packed type, or to a larger type that is a
5779  // multiple of 32 bits for D16. Widening the return type is required for
5780  // legalization.
5781 EVT FittingLoadVT = LoadVT;
5782 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5783 FittingLoadVT =
5784 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
5785 NumElements: LoadVT.getVectorNumElements() + 1);
5786 }
5787
5788 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5789 // Truncate to v2i16/v4i16.
5790 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5791
5792    // Work around the legalizer not scalarizing the truncate after vector op
5793    // legalization and not creating the intermediate vector trunc itself.
5794 SmallVector<SDValue, 4> Elts;
5795 DAG.ExtractVectorElements(Op: Result, Args&: Elts);
5796 for (SDValue &Elt : Elts)
5797 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5798
5799    // Pad illegal v1i16/v3f16 to v4i16.
5800 if ((LoadVT.getVectorNumElements() % 2) == 1)
5801 Elts.push_back(DAG.getUNDEF(MVT::i16));
5802
5803 Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);
5804
5805 // Bitcast to original type (v2f16/v4f16).
5806 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
5807 }
5808
5809 // Cast back to the original packed type.
5810 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
5811}
5812
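// Widen or repack the result type of a D16 memory intrinsic to a legal type,
// emit the memory node, and then convert the loaded value back to the type
// expected by the original node.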
5813SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5814 MemSDNode *M,
5815 SelectionDAG &DAG,
5816 ArrayRef<SDValue> Ops,
5817 bool IsIntrinsic) const {
5818 SDLoc DL(M);
5819
5820 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5821 EVT LoadVT = M->getValueType(ResNo: 0);
5822
5823 EVT EquivLoadVT = LoadVT;
5824 if (LoadVT.isVector()) {
5825 if (Unpacked) {
5826 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5827 LoadVT.getVectorNumElements());
5828 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5829 // Widen v3f16 to legal type
5830 EquivLoadVT =
5831 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
5832 NumElements: LoadVT.getVectorNumElements() + 1);
5833 }
5834 }
5835
5836 // Change from v4f16/v2f16 to EquivLoadVT.
5837 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5838
5839 SDValue Load
5840 = DAG.getMemIntrinsicNode(
5841 Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL,
5842 VTList, Ops, MemVT: M->getMemoryVT(),
5843 MMO: M->getMemOperand());
5844
5845 SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);
5846
5847 return DAG.getMergeValues(Ops: { Adjusted, Load.getValue(R: 1) }, dl: DL);
5848}
5849
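// Lower a buffer load intrinsic to the matching AMDGPUISD::BUFFER_LOAD* node,
// handling TFE, D16, sub-dword and otherwise illegal result types.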
5850SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5851 SelectionDAG &DAG,
5852 ArrayRef<SDValue> Ops) const {
5853 SDLoc DL(M);
5854 EVT LoadVT = M->getValueType(ResNo: 0);
5855 EVT EltType = LoadVT.getScalarType();
5856 EVT IntVT = LoadVT.changeTypeToInteger();
5857
5858 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5859
5860 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5861 bool IsTFE = M->getNumValues() == 3;
5862
5863 unsigned Opc;
5864 if (IsFormat) {
5865 Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5866 : AMDGPUISD::BUFFER_LOAD_FORMAT;
5867 } else {
5868 // TODO: Support non-format TFE loads.
5869 if (IsTFE)
5870 return SDValue();
5871 Opc = AMDGPUISD::BUFFER_LOAD;
5872 }
5873
5874 if (IsD16) {
5875 return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5876 }
5877
5878 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5879 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5880 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand());
5881
5882 if (isTypeLegal(VT: LoadVT)) {
5883 return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
5884 MMO: M->getMemOperand(), DAG);
5885 }
5886
5887 EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
5888 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5889 SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
5890 MMO: M->getMemOperand(), DAG);
5891 return DAG.getMergeValues(
5892 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)},
5893 dl: DL);
5894}
5895
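// Lower the amdgcn.icmp intrinsic to an AMDGPUISD::SETCC producing a
// wave-sized lane mask, promoting illegal i16 operands to i32 first.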
5896static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
5897 SDNode *N, SelectionDAG &DAG) {
5898 EVT VT = N->getValueType(ResNo: 0);
5899 unsigned CondCode = N->getConstantOperandVal(Num: 3);
5900 if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode)))
5901 return DAG.getUNDEF(VT);
5902
5903 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
5904
5905 SDValue LHS = N->getOperand(Num: 1);
5906 SDValue RHS = N->getOperand(Num: 2);
5907
5908 SDLoc DL(N);
5909
5910 EVT CmpVT = LHS.getValueType();
5911 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
5912 unsigned PromoteOp = ICmpInst::isSigned(predicate: IcInput) ?
5913 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5914 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
5915 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
5916 }
5917
5918 ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput);
5919
5920 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5921 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
5922
5923 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS,
5924 N3: DAG.getCondCode(Cond: CCOpcode));
5925 if (VT.bitsEq(VT: CCVT))
5926 return SetCC;
5927 return DAG.getZExtOrTrunc(Op: SetCC, DL, VT);
5928}
5929
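// Lower the amdgcn.fcmp intrinsic to an AMDGPUISD::SETCC producing a
// wave-sized lane mask, extending illegal f16 operands to f32 first.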
5930static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
5931 SDNode *N, SelectionDAG &DAG) {
5932 EVT VT = N->getValueType(ResNo: 0);
5933
5934 unsigned CondCode = N->getConstantOperandVal(Num: 3);
5935 if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode)))
5936 return DAG.getUNDEF(VT);
5937
5938 SDValue Src0 = N->getOperand(Num: 1);
5939 SDValue Src1 = N->getOperand(Num: 2);
5940 EVT CmpVT = Src0.getValueType();
5941 SDLoc SL(N);
5942
5943 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
5944 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
5945 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
5946 }
5947
5948 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
5949 ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput);
5950 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5951 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
5952 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0,
5953 N2: Src1, N3: DAG.getCondCode(Cond: CCOpcode));
5954 if (VT.bitsEq(VT: CCVT))
5955 return SetCC;
5956 return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT);
5957}
5958
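// Lower the amdgcn.ballot intrinsic to a wave-sized lane mask, folding the
// trivial constant and setcc cases.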
5959static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
5960 SelectionDAG &DAG) {
5961 EVT VT = N->getValueType(ResNo: 0);
5962 SDValue Src = N->getOperand(Num: 1);
5963 SDLoc SL(N);
5964
5965 if (Src.getOpcode() == ISD::SETCC) {
5966 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
5967 return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Src.getOperand(i: 0),
5968 N2: Src.getOperand(i: 1), N3: Src.getOperand(i: 2));
5969 }
5970 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) {
5971 // (ballot 0) -> 0
5972 if (Arg->isZero())
5973 return DAG.getConstant(Val: 0, DL: SL, VT);
5974
5975 // (ballot 1) -> EXEC/EXEC_LO
5976 if (Arg->isOne()) {
5977 Register Exec;
5978 if (VT.getScalarSizeInBits() == 32)
5979 Exec = AMDGPU::EXEC_LO;
5980 else if (VT.getScalarSizeInBits() == 64)
5981 Exec = AMDGPU::EXEC;
5982 else
5983 return SDValue();
5984
5985 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT);
5986 }
5987 }
5988
  // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
  //                        ISD::SETNE)
5991 return DAG.getNode(
5992 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
5993 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
5994}
5995
5996void SITargetLowering::ReplaceNodeResults(SDNode *N,
5997 SmallVectorImpl<SDValue> &Results,
5998 SelectionDAG &DAG) const {
5999 switch (N->getOpcode()) {
6000 case ISD::INSERT_VECTOR_ELT: {
6001 if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
6002 Results.push_back(Elt: Res);
6003 return;
6004 }
6005 case ISD::EXTRACT_VECTOR_ELT: {
6006 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
6007 Results.push_back(Elt: Res);
6008 return;
6009 }
6010 case ISD::INTRINSIC_WO_CHAIN: {
6011 unsigned IID = N->getConstantOperandVal(Num: 0);
6012 switch (IID) {
6013 case Intrinsic::amdgcn_make_buffer_rsrc:
6014 Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
6015 return;
6016 case Intrinsic::amdgcn_cvt_pkrtz: {
6017 SDValue Src0 = N->getOperand(Num: 1);
6018 SDValue Src1 = N->getOperand(Num: 2);
6019 SDLoc SL(N);
6020 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6021 Src0, Src1);
6022 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6023 return;
6024 }
6025 case Intrinsic::amdgcn_cvt_pknorm_i16:
6026 case Intrinsic::amdgcn_cvt_pknorm_u16:
6027 case Intrinsic::amdgcn_cvt_pk_i16:
6028 case Intrinsic::amdgcn_cvt_pk_u16: {
6029 SDValue Src0 = N->getOperand(Num: 1);
6030 SDValue Src1 = N->getOperand(Num: 2);
6031 SDLoc SL(N);
6032 unsigned Opcode;
6033
6034 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6035 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6036 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6037 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6038 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6039 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6040 else
6041 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6042
6043 EVT VT = N->getValueType(ResNo: 0);
6044 if (isTypeLegal(VT))
6045 Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
6046 else {
6047 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6048 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6049 }
6050 return;
6051 }
6052 case Intrinsic::amdgcn_s_buffer_load: {
      // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
      // s_buffer_load_u8 for both signed and unsigned load instructions. Next,
      // the DAG combiner tries to merge the s_buffer_load_u8 with a sext
      // instruction (performSignExtendInRegCombine()) and replaces
      // s_buffer_load_u8 with s_buffer_load_i8.
6058 if (!Subtarget->hasScalarSubwordLoads())
6059 return;
6060 SDValue Op = SDValue(N, 0);
6061 SDValue Rsrc = Op.getOperand(i: 1);
6062 SDValue Offset = Op.getOperand(i: 2);
6063 SDValue CachePolicy = Op.getOperand(i: 3);
6064 EVT VT = Op.getValueType();
6065 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6066 SDLoc DL(Op);
6067 MachineFunction &MF = DAG.getMachineFunction();
6068 const DataLayout &DataLayout = DAG.getDataLayout();
6069 Align Alignment =
6070 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
6071 MachineMemOperand *MMO = MF.getMachineMemOperand(
6072 PtrInfo: MachinePointerInfo(),
6073 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6074 MachineMemOperand::MOInvariant,
6075 Size: VT.getStoreSize(), BaseAlignment: Alignment);
6076 SDValue LoadVal;
6077 if (!Offset->isDivergent()) {
6078 SDValue Ops[] = {Rsrc, // source register
6079 Offset, CachePolicy};
6080 SDValue BufferLoad =
6081 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
6082 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6083 LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
6084 } else {
6085 SDValue Ops[] = {
6086 DAG.getEntryNode(), // Chain
6087 Rsrc, // rsrc
6088 DAG.getConstant(0, DL, MVT::i32), // vindex
6089 {}, // voffset
6090 {}, // soffset
6091 {}, // offset
6092 CachePolicy, // cachepolicy
6093 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6094 };
6095 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
6096 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6097 }
6098 Results.push_back(Elt: LoadVal);
6099 return;
6100 }
6101 }
6102 break;
6103 }
6104 case ISD::INTRINSIC_W_CHAIN: {
6105 if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) {
6106 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6107 // FIXME: Hacky
6108 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6109 Results.push_back(Elt: Res.getOperand(i: I));
6110 }
6111 } else {
6112 Results.push_back(Elt: Res);
6113 Results.push_back(Elt: Res.getValue(R: 1));
6114 }
6115 return;
6116 }
6117
6118 break;
6119 }
6120 case ISD::SELECT: {
6121 SDLoc SL(N);
6122 EVT VT = N->getValueType(ResNo: 0);
6123 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
6124 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1));
6125 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2));
6126
6127 EVT SelectVT = NewVT;
6128 if (NewVT.bitsLT(MVT::i32)) {
6129 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6130 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6131 SelectVT = MVT::i32;
6132 }
6133
6134 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT,
6135 N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
6136
6137 if (NewVT != SelectVT)
6138 NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
6139 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
6140 return;
6141 }
6142 case ISD::FNEG: {
6143 if (N->getValueType(0) != MVT::v2f16)
6144 break;
6145
6146 SDLoc SL(N);
6147 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6148
6149 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6150 BC,
6151 DAG.getConstant(0x80008000, SL, MVT::i32));
6152 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6153 return;
6154 }
6155 case ISD::FABS: {
6156 if (N->getValueType(0) != MVT::v2f16)
6157 break;
6158
6159 SDLoc SL(N);
6160 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6161
6162 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6163 BC,
6164 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6165 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6166 return;
6167 }
6168 case ISD::FSQRT: {
6169 if (N->getValueType(0) != MVT::f16)
6170 break;
6171 Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG));
6172 break;
6173 }
6174 default:
6175 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6176 break;
6177 }
6178}
6179
6180/// Helper function for LowerBRCOND
6181static SDNode *findUser(SDValue Value, unsigned Opcode) {
6182
6183 SDNode *Parent = Value.getNode();
6184 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6185 I != E; ++I) {
6186
6187 if (I.getUse().get() != Value)
6188 continue;
6189
6190 if (I->getOpcode() == Opcode)
6191 return *I;
6192 }
6193 return nullptr;
6194}
6195
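// Map a structured control flow intrinsic to the corresponding AMDGPUISD
// opcode, or return 0 if this is not a control flow intrinsic.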
6196unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6197 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6198 switch (Intr->getConstantOperandVal(Num: 1)) {
6199 case Intrinsic::amdgcn_if:
6200 return AMDGPUISD::IF;
6201 case Intrinsic::amdgcn_else:
6202 return AMDGPUISD::ELSE;
6203 case Intrinsic::amdgcn_loop:
6204 return AMDGPUISD::LOOP;
6205 case Intrinsic::amdgcn_end_cf:
6206 llvm_unreachable("should not occur");
6207 default:
6208 return 0;
6209 }
6210 }
6211
  // break, if_break, else_break are all only used as inputs to loop, not
  // directly as branch conditions.
6214 return 0;
6215}
6216
6217bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6218 const Triple &TT = getTargetMachine().getTargetTriple();
6219 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6220 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6221 AMDGPU::shouldEmitConstantsToTextSection(TT);
6222}
6223
6224bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6225 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6226 return false;
6227
  // FIXME: Either avoid relying on address space here or change the default
  // address space for functions to avoid the explicit check.
6230 return (GV->getValueType()->isFunctionTy() ||
6231 !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) &&
6232 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6233}
6234
6235bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6236 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6237}
6238
6239bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6240 if (!GV->hasExternalLinkage())
6241 return true;
6242
6243 const auto OS = getTargetMachine().getTargetTriple().getOS();
6244 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6245}
6246
/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter, and also switches the branch target with BR if the
/// need arises.
6249SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6250 SelectionDAG &DAG) const {
6251 SDLoc DL(BRCOND);
6252
6253 SDNode *Intr = BRCOND.getOperand(i: 1).getNode();
6254 SDValue Target = BRCOND.getOperand(i: 2);
6255 SDNode *BR = nullptr;
6256 SDNode *SetCC = nullptr;
6257
6258 if (Intr->getOpcode() == ISD::SETCC) {
6259 // As long as we negate the condition everything is fine
6260 SetCC = Intr;
6261 Intr = SetCC->getOperand(Num: 0).getNode();
6262
6263 } else {
6264 // Get the target from BR if we don't negate the condition
6265 BR = findUser(Value: BRCOND, Opcode: ISD::BR);
6266 assert(BR && "brcond missing unconditional branch user");
6267 Target = BR->getOperand(Num: 1);
6268 }
6269
6270 unsigned CFNode = isCFIntrinsic(Intr);
6271 if (CFNode == 0) {
6272 // This is a uniform branch so we don't need to legalize.
6273 return BRCOND;
6274 }
6275
6276 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6277 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6278
6279 assert(!SetCC ||
6280 (SetCC->getConstantOperandVal(1) == 1 &&
6281 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6282 ISD::SETNE));
6283
6284 // operands of the new intrinsic call
6285 SmallVector<SDValue, 4> Ops;
6286 if (HaveChain)
6287 Ops.push_back(Elt: BRCOND.getOperand(i: 0));
6288
6289 Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end());
6290 Ops.push_back(Elt: Target);
6291
6292 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6293
6294 // build the new intrinsic call
6295 SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();
6296
6297 if (!HaveChain) {
6298 SDValue Ops[] = {
6299 SDValue(Result, 0),
6300 BRCOND.getOperand(i: 0)
6301 };
6302
6303 Result = DAG.getMergeValues(Ops, dl: DL).getNode();
6304 }
6305
6306 if (BR) {
6307 // Give the branch instruction our target
6308 SDValue Ops[] = {
6309 BR->getOperand(Num: 0),
6310 BRCOND.getOperand(i: 2)
6311 };
6312 SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
6313 DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
6314 }
6315
6316 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6317
6318 // Copy the intrinsic results to registers
6319 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6320 SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg);
6321 if (!CopyToReg)
6322 continue;
6323
6324 Chain = DAG.getCopyToReg(
6325 Chain, dl: DL,
6326 Reg: CopyToReg->getOperand(Num: 1),
6327 N: SDValue(Result, i - 1),
6328 Glue: SDValue());
6329
6330 DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0));
6331 }
6332
6333 // Remove the old intrinsic from the chain
6334 DAG.ReplaceAllUsesOfValueWith(
6335 From: SDValue(Intr, Intr->getNumValues() - 1),
6336 To: Intr->getOperand(Num: 0));
6337
6338 return Chain;
6339}
6340
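// Lower @llvm.returnaddress: non-zero depths and entry functions fold to a
// zero constant; otherwise read the return address register as a live-in.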
6341SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6342 SelectionDAG &DAG) const {
6343 MVT VT = Op.getSimpleValueType();
6344 SDLoc DL(Op);
6345 // Checking the depth
6346 if (Op.getConstantOperandVal(i: 0) != 0)
6347 return DAG.getConstant(Val: 0, DL, VT);
6348
6349 MachineFunction &MF = DAG.getMachineFunction();
6350 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6351 // Check for kernel and shader functions
6352 if (Info->isEntryFunction())
6353 return DAG.getConstant(Val: 0, DL, VT);
6354
6355 MachineFrameInfo &MFI = MF.getFrameInfo();
6356 // There is a call to @llvm.returnaddress in this function
6357 MFI.setReturnAddressIsTaken(true);
6358
6359 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6360 // Get the return address reg and mark it as an implicit live-in
  Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF),
                              RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));
6362
6363 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
6364}
6365
6366SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6367 SDValue Op,
6368 const SDLoc &DL,
6369 EVT VT) const {
6370 return Op.getValueType().bitsLE(VT) ?
6371 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6372 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6373 DAG.getTargetConstant(0, DL, MVT::i32));
6374}
6375
6376SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6377 assert(Op.getValueType() == MVT::f16 &&
6378 "Do not know how to custom lower FP_ROUND for non-f16 type");
6379
6380 SDValue Src = Op.getOperand(i: 0);
6381 EVT SrcVT = Src.getValueType();
6382 if (SrcVT != MVT::f64)
6383 return Op;
6384
6385 // TODO: Handle strictfp
6386 if (Op.getOpcode() != ISD::FP_ROUND)
6387 return Op;
6388
6389 SDLoc DL(Op);
6390
6391 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6392 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6393 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6394}
6395
6396SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6397 SelectionDAG &DAG) const {
6398 EVT VT = Op.getValueType();
6399 const MachineFunction &MF = DAG.getMachineFunction();
6400 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6401 bool IsIEEEMode = Info->getMode().IEEE;
6402
  // FIXME: Assert during selection that this is only selected for
  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
  // mode functions, but this happens to be OK since it's only done in cases
  // where there is known to be no sNaN.
6407 if (IsIEEEMode)
6408 return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG);
6409
6410 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6411 VT == MVT::v16bf16)
6412 return splitBinaryVectorOp(Op, DAG);
6413 return Op;
6414}
6415
6416SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6417 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6418 EVT VT = Op.getValueType();
6419 assert(VT == MVT::f16);
6420
6421 SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1);
6422 EVT ExpVT = Exp.getValueType();
6423 if (ExpVT == MVT::i16)
6424 return Op;
6425
6426 SDLoc DL(Op);
6427
6428 // Correct the exponent type for f16 to i16.
6429 // Clamp the range of the exponent to the instruction's range.
6430
  // TODO: This should be a generic narrowing legalization, and can easily be
  // done for GlobalISel.
6433
6434 SDValue MinExp = DAG.getConstant(Val: minIntN(N: 16), DL, VT: ExpVT);
6435 SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp);
6436
6437 SDValue MaxExp = DAG.getConstant(Val: maxIntN(N: 16), DL, VT: ExpVT);
6438 SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp);
6439
6440 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6441
6442 if (IsStrict) {
6443 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6444 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6445 }
6446
6447 return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp);
6448}
6449
6450// Custom lowering for vector multiplications and s_mul_u64.
6451SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6452 EVT VT = Op.getValueType();
6453
6454 // Split vector operands.
6455 if (VT.isVector())
6456 return splitBinaryVectorOp(Op, DAG);
6457
6458 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6459
  // There are four ways to lower s_mul_u64:
  //
  // 1. If all the operands are uniform, then we lower it as it is.
  //
  // 2. If the operands are divergent, then we have to split s_mul_u64 into
  // 32-bit multiplications because there is not a vector equivalent of
  // s_mul_u64.
  //
  // 3. If the cost model decides that it is more efficient to use vector
  // registers, then we have to split s_mul_u64 into 32-bit multiplications.
  // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
  //
  // 4. If the cost model decides to use vector registers and both of the
  // operands are zero-extended/sign-extended from 32 bits, then we split the
  // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
  // possible to check if the operands are zero-extended or sign-extended in
  // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
  // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
  // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
  // If the cost model decides that we have to use vector registers, then
  // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32_pseudo /
  // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
  // decides that we should use scalar registers, then s_mul_u64_u32_pseudo /
  // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
  // SIInstrInfo.cpp.
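  //
  // For example, when both operands are known to be zero-extended from 32 bits:
  //   (mul i64 (zext i32 %a), (zext i32 %b)) --> S_MUL_U64_U32_PSEUDO
  // and when both are known to be sign-extended from 32 bits:
  //   (mul i64 (sext i32 %a), (sext i32 %b)) --> S_MUL_I64_I32_PSEUDO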
6484
6485 if (Op->isDivergent())
6486 return SDValue();
6487
6488 SDValue Op0 = Op.getOperand(i: 0);
6489 SDValue Op1 = Op.getOperand(i: 1);
  // If all the operands are zero-extended from 32 bits, then we replace
  // s_mul_u64 with s_mul_u64_u32_pseudo. If all the operands are sign-extended
  // from 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6493 KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0);
6494 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6495 KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1);
6496 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6497 SDLoc SL(Op);
6498 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6499 return SDValue(
6500 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6501 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0);
6502 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1);
6503 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6504 return SDValue(
6505 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6506 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6507 return Op;
6508}
6509
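// Lower [SU]MULO: compute the low half with a plain multiply and derive the
// overflow bit by comparing the high half against the expected sign or zero
// value, with a shift-based fast path for power-of-two constants.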
6510SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6511 EVT VT = Op.getValueType();
6512 SDLoc SL(Op);
6513 SDValue LHS = Op.getOperand(i: 0);
6514 SDValue RHS = Op.getOperand(i: 1);
6515 bool isSigned = Op.getOpcode() == ISD::SMULO;
6516
6517 if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) {
6518 const APInt &C = RHSC->getAPIntValue();
6519 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6520 if (C.isPowerOf2()) {
6521 // smulo(x, signed_min) is same as umulo(x, signed_min).
6522 bool UseArithShift = isSigned && !C.isMinSignedValue();
6523 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6524 SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt);
6525 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6526 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6527 SL, VT, Result, ShiftAmt),
6528 LHS, ISD::SETNE);
6529 return DAG.getMergeValues(Ops: { Result, Overflow }, dl: SL);
6530 }
6531 }
6532
6533 SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS);
6534 SDValue Top = DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU,
6535 DL: SL, VT, N1: LHS, N2: RHS);
6536
6537 SDValue Sign = isSigned
6538 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6539 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6540 : DAG.getConstant(0, SL, VT);
6541 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6542
6543 return DAG.getMergeValues(Ops: { Result, Overflow }, dl: SL);
6544}
6545
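// Decide how [SU]MUL_LOHI is handled: divergent multiplies are selected to
// V_MAD_[IU]64_[IU]32, uniform ones are expanded into separate low/high
// scalar multiplies when S_MUL_HI is available, and otherwise the whole
// operation is also done with V_MAD_[IU]64_[IU]32.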
6546SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6547 if (Op->isDivergent()) {
6548 // Select to V_MAD_[IU]64_[IU]32.
6549 return Op;
6550 }
6551 if (Subtarget->hasSMulHi()) {
6552 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6553 return SDValue();
6554 }
  // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
  // calculate the high part, so we might as well do the whole thing with
  // V_MAD_[IU]64_[IU]32.
6558 return Op;
6559}
6560
6561SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6562 if (!Subtarget->isTrapHandlerEnabled() ||
6563 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6564 return lowerTrapEndpgm(Op, DAG);
6565
6566 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6567 lowerTrapHsaQueuePtr(Op, DAG);
6568}
6569
6570SDValue SITargetLowering::lowerTrapEndpgm(
6571 SDValue Op, SelectionDAG &DAG) const {
6572 SDLoc SL(Op);
6573 SDValue Chain = Op.getOperand(i: 0);
6574 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6575}
6576
6577SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6578 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6579 MachineFunction &MF = DAG.getMachineFunction();
6580 uint64_t Offset = getImplicitParameterOffset(MF, Param);
6581 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset);
6582 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6583 return DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6584 MMOFlags: MachineMemOperand::MODereferenceable |
6585 MachineMemOperand::MOInvariant);
6586}
6587
6588SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6589 SDValue Op, SelectionDAG &DAG) const {
6590 SDLoc SL(Op);
6591 SDValue Chain = Op.getOperand(i: 0);
6592
6593 SDValue QueuePtr;
6594 // For code object version 5, QueuePtr is passed through implicit kernarg.
6595 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6596 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
6597 QueuePtr =
6598 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6599 } else {
6600 MachineFunction &MF = DAG.getMachineFunction();
6601 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6602 Register UserSGPR = Info->getQueuePtrUserSGPR();
6603
6604 if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
6608 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6609 } else {
6610 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6611 MVT::i64);
6612 }
6613 }
6614
6615 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6616 SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01,
6617 N: QueuePtr, Glue: SDValue());
6618
6619 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6620 SDValue Ops[] = {
6621 ToReg,
6622 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6623 SGPR01,
6624 ToReg.getValue(1)
6625 };
6626 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6627}
6628
6629SDValue SITargetLowering::lowerTrapHsa(
6630 SDValue Op, SelectionDAG &DAG) const {
6631 SDLoc SL(Op);
6632 SDValue Chain = Op.getOperand(i: 0);
6633
  // We need to simulate the 's_trap 2' instruction on targets that run in
  // PRIV=1 (where it is treated as a nop).
6636 if (Subtarget->hasPrivEnabledTrap2NopBug())
6637 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6638
6639 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6640 SDValue Ops[] = {
6641 Chain,
6642 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6643 };
6644 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6645}
6646
6647SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6648 SDLoc SL(Op);
6649 SDValue Chain = Op.getOperand(i: 0);
6650 MachineFunction &MF = DAG.getMachineFunction();
6651
6652 if (!Subtarget->isTrapHandlerEnabled() ||
6653 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6654 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
6655 "debugtrap handler not supported",
6656 Op.getDebugLoc(),
6657 DS_Warning);
6658 LLVMContext &Ctx = MF.getFunction().getContext();
6659 Ctx.diagnose(DI: NoTrap);
6660 return Chain;
6661 }
6662
6663 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
6664 SDValue Ops[] = {
6665 Chain,
6666 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6667 };
6668 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6669}
6670
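// Return the high 32 bits (the aperture) for the given segment address space,
// read from the aperture registers, the implicit kernel arguments, or the
// queue pointer depending on the subtarget and code object version.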
6671SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6672 SelectionDAG &DAG) const {
6673 if (Subtarget->hasApertureRegs()) {
6674 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6675 ? AMDGPU::SRC_SHARED_BASE
6676 : AMDGPU::SRC_PRIVATE_BASE;
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, directly emit a 64 bit mov from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    //    s_mov_b64 s[6:7], src_shared_base
    //    v_mov_b32_e32 v1, s7
    //
    // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister directly (instead of extracting the HI 32 bits) which is an
    // artificial (unusable) register.
    // Register TableGen definitions would need an overhaul to get rid of the
    // artificial "HI" aperture registers and prevent this kind of issue from
    // happening.
6694 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6695 DAG.getRegister(ApertureRegNo, MVT::i64));
6696 return DAG.getNode(
6697 ISD::TRUNCATE, DL, MVT::i32,
6698 DAG.getNode(ISD::SRL, DL, MVT::i64,
6699 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6700 }
6701
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
6704 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6705 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
6706 ImplicitParameter Param =
6707 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
6708 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
6709 }
6710
6711 MachineFunction &MF = DAG.getMachineFunction();
6712 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6713 Register UserSGPR = Info->getQueuePtrUserSGPR();
6714 if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
6717 return DAG.getUNDEF(MVT::i32);
6718 }
6719
6720 SDValue QueuePtr = CreateLiveInRegister(
6721 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
6722
  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
6725 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
6726
6727 SDValue Ptr =
6728 DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));
6729
6730 // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
6733 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6734 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
6735 commonAlignment(Align(64), StructOffset),
6736 MachineMemOperand::MODereferenceable |
6737 MachineMemOperand::MOInvariant);
6738}
6739
/// Return true if the value is a known valid address, such that a null check
/// is not necessary.
6742static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
6743 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
6744 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
6745 isa<BasicBlockSDNode>(Val))
6746 return true;
6747
6748 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
6749 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
6750
  // TODO: Search through arithmetic, handle arguments and loads
  // marked nonnull.
6753 return false;
6754}
6755
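// Lower addrspacecast (and the amdgcn.addrspacecast.nonnull intrinsic), mainly
// flat <-> local/private conversions, which may need the segment aperture and
// a null-pointer check unless the source is known to be non-null.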
6756SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
6757 SelectionDAG &DAG) const {
6758 SDLoc SL(Op);
6759
6760 const AMDGPUTargetMachine &TM =
6761 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
6762
6763 unsigned DestAS, SrcAS;
6764 SDValue Src;
6765 bool IsNonNull = false;
6766 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
6767 SrcAS = ASC->getSrcAddressSpace();
6768 Src = ASC->getOperand(Num: 0);
6769 DestAS = ASC->getDestAddressSpace();
6770 } else {
6771 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6772 Op.getConstantOperandVal(0) ==
6773 Intrinsic::amdgcn_addrspacecast_nonnull);
6774 Src = Op->getOperand(Num: 1);
6775 SrcAS = Op->getConstantOperandVal(Num: 2);
6776 DestAS = Op->getConstantOperandVal(Num: 3);
6777 IsNonNull = true;
6778 }
6779
6780 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
6781
6782 // flat -> local/private
6783 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
6784 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
6785 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
6786 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6787
6788 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
6789 return Ptr;
6790
6791 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
6792 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6793 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
6794
6795 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
6796 SegmentNullPtr);
6797 }
6798 }
6799
6800 // local/private -> flat
6801 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
6802 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
6803 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
6804
6805 SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
6806 SDValue CvtPtr =
6807 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
6808 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
6809
6810 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
6811 return CvtPtr;
6812
6813 unsigned NullVal = TM.getNullPointerValue(AddrSpace: SrcAS);
6814 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6815
6816 SDValue NonNull
6817 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
6818
6819 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
6820 FlatNullPtr);
6821 }
6822 }
6823
6824 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6825 Op.getValueType() == MVT::i64) {
6826 const SIMachineFunctionInfo *Info =
6827 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
6828 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
6829 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
6830 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
6831 }
6832
6833 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6834 Src.getValueType() == MVT::i64)
6835 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6836
6837 // global <-> flat are no-ops and never emitted.
6838
6839 const MachineFunction &MF = DAG.getMachineFunction();
6840 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
6841 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
6842 DAG.getContext()->diagnose(DI: InvalidAddrSpaceCast);
6843
6844 return DAG.getUNDEF(VT: Op->getValueType(ResNo: 0));
6845}
6846
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
6852SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
6853 SelectionDAG &DAG) const {
6854 SDValue Vec = Op.getOperand(i: 0);
6855 SDValue Ins = Op.getOperand(i: 1);
6856 SDValue Idx = Op.getOperand(i: 2);
6857 EVT VecVT = Vec.getValueType();
6858 EVT InsVT = Ins.getValueType();
6859 EVT EltVT = VecVT.getVectorElementType();
6860 unsigned InsNumElts = InsVT.getVectorNumElements();
6861 unsigned IdxVal = Idx->getAsZExtVal();
6862 SDLoc SL(Op);
6863
6864 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
6865 // Insert 32-bit registers at a time.
6866 assert(InsNumElts % 2 == 0 && "expect legal vector types");
6867
6868 unsigned VecNumElts = VecVT.getVectorNumElements();
6869 EVT NewVecVT =
6870 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
6871 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6872 : EVT::getVectorVT(*DAG.getContext(),
6873 MVT::i32, InsNumElts / 2);
6874
6875 Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
6876 Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);
6877
6878 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
6879 SDValue Elt;
6880 if (InsNumElts == 2) {
6881 Elt = Ins;
6882 } else {
6883 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
6884 DAG.getConstant(I, SL, MVT::i32));
6885 }
6886 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
6887 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
6888 }
6889
6890 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
6891 }
6892
6893 for (unsigned I = 0; I != InsNumElts; ++I) {
6894 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
6895 DAG.getConstant(I, SL, MVT::i32));
6896 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
6897 DAG.getConstant(IdxVal + I, SL, MVT::i32));
6898 }
6899 return Vec;
6900}
6901
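// Custom lower INSERT_VECTOR_ELT for short 16-bit element vectors: constant
// indices use packed 32-bit halves or the default lowering, while dynamic
// indices are lowered to a bitwise select (v_bfi pattern) to avoid going
// through a stack slot.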
6902SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
6903 SelectionDAG &DAG) const {
6904 SDValue Vec = Op.getOperand(i: 0);
6905 SDValue InsVal = Op.getOperand(i: 1);
6906 SDValue Idx = Op.getOperand(i: 2);
6907 EVT VecVT = Vec.getValueType();
6908 EVT EltVT = VecVT.getVectorElementType();
6909 unsigned VecSize = VecVT.getSizeInBits();
6910 unsigned EltSize = EltVT.getSizeInBits();
6911 SDLoc SL(Op);
6912
6913 // Specially handle the case of v4i16 with static indexing.
6914 unsigned NumElts = VecVT.getVectorNumElements();
6915 auto KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
6916 if (NumElts == 4 && EltSize == 16 && KIdx) {
6917 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
6918
6919 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6920 DAG.getConstant(0, SL, MVT::i32));
6921 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6922 DAG.getConstant(1, SL, MVT::i32));
6923
6924 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
6925 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
6926
6927 unsigned Idx = KIdx->getZExtValue();
6928 bool InsertLo = Idx < 2;
6929 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
6930 InsertLo ? LoVec : HiVec,
6931 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
6932 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
6933
6934 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
6935
6936 SDValue Concat = InsertLo ?
6937 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
6938 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
6939
6940 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
6941 }
6942
  // Static indexing does not lower to stack access, and hence there is no need
  // for special custom lowering to avoid stack access.
6945 if (isa<ConstantSDNode>(Val: Idx))
6946 return SDValue();
6947
  // Avoid stack access for dynamic indexing by custom lowering to
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
6950
6951 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
6952
6953 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
6954
6955 // Convert vector index to bit-index and get the required bit mask.
6956 assert(isPowerOf2_32(EltSize));
6957 const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
6958 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
6959 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
6960 SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
6961 N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);
6962
6963 // 1. Create a congruent vector with the target value in each element.
6964 SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
6965 Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));
6966
  // 2. Mask off all other indices except the required index within (1).
6968 SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);
6969
6970 // 3. Mask off the required index within the target vector.
6971 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
6972 SDValue RHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT,
6973 N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);
6974
6975 // 4. Get (2) and (3) ORed into the target vector.
6976 SDValue BFI = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS);
6977
6978 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
6979}
6980
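// Custom lower EXTRACT_VECTOR_ELT: wide vectors are split in half and the
// relevant half selected by the index; small vectors are bitcast to an integer
// and the element is extracted with a variable shift.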
6981SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
6982 SelectionDAG &DAG) const {
6983 SDLoc SL(Op);
6984
6985 EVT ResultVT = Op.getValueType();
6986 SDValue Vec = Op.getOperand(i: 0);
6987 SDValue Idx = Op.getOperand(i: 1);
6988 EVT VecVT = Vec.getValueType();
6989 unsigned VecSize = VecVT.getSizeInBits();
6990 EVT EltVT = VecVT.getVectorElementType();
6991
6992 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
6993
  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.
6996
6997 // XXX - Why doesn't this get called when vector_shuffle is expanded?
6998 if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI))
6999 return Combined;
7000
7001 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7002 SDValue Lo, Hi;
7003 EVT LoVT, HiVT;
7004 std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: VecVT);
7005
7006 if (VecSize == 128) {
7007 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7008 Lo = DAG.getBitcast(LoVT,
7009 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7010 DAG.getConstant(0, SL, MVT::i32)));
7011 Hi = DAG.getBitcast(HiVT,
7012 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7013 DAG.getConstant(1, SL, MVT::i32)));
7014 } else if (VecSize == 256) {
7015 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7016 SDValue Parts[4];
7017 for (unsigned P = 0; P < 4; ++P) {
7018 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7019 DAG.getConstant(P, SL, MVT::i32));
7020 }
7021
7022 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7023 Parts[0], Parts[1]));
7024 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7025 Parts[2], Parts[3]));
7026 } else {
7027 assert(VecSize == 512);
7028
7029 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7030 SDValue Parts[8];
7031 for (unsigned P = 0; P < 8; ++P) {
7032 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7033 DAG.getConstant(P, SL, MVT::i32));
7034 }
7035
7036 Lo = DAG.getBitcast(LoVT,
7037 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7038 Parts[0], Parts[1], Parts[2], Parts[3]));
7039 Hi = DAG.getBitcast(HiVT,
7040 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                               Parts[4], Parts[5], Parts[6], Parts[7]));
7042 }
7043
7044 EVT IdxVT = Idx.getValueType();
7045 unsigned NElem = VecVT.getVectorNumElements();
7046 assert(isPowerOf2_32(NElem));
7047 SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT);
7048 SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask);
7049 SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT);
7050 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx);
7051 }
7052
7053 assert(VecSize <= 64);
7054
7055 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
7056
7057 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7058 SDValue VecBC = peekThroughBitcasts(V: Vec);
7059 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7060 SDValue Src = VecBC.getOperand(i: 0);
7061 Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src);
7062 Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT);
7063 }
7064
7065 unsigned EltSize = EltVT.getSizeInBits();
7066 assert(isPowerOf2_32(EltSize));
7067
7068 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7069
7070 // Convert vector index to bit-index (* EltSize)
7071 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7072
7073 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
7074 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx);
7075
7076 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7077 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7078 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result);
7079 }
7080
7081 return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT);
7082}
7083
7084static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7085 assert(Elt % 2 == 0);
7086 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7087}
7088
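// Lower a 16-bit element VECTOR_SHUFFLE into a CONCAT_VECTORS of packed
// two-element pieces, extracting whole subvectors where a mask pair reads
// contiguous, aligned elements and building packed pairs otherwise.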
7089SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7090 SelectionDAG &DAG) const {
7091 SDLoc SL(Op);
7092 EVT ResultVT = Op.getValueType();
7093 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);
7094
7095 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7096 EVT EltVT = PackVT.getVectorElementType();
7097 int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements();
7098
  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7107
7108 // Avoid scalarizing when both halves are reading from consecutive elements.
7109 SmallVector<SDValue, 4> Pieces;
7110 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7111 if (elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
7112 const int Idx = SVN->getMaskElt(Idx: I);
7113 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7114 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7115 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7116 PackVT, SVN->getOperand(VecIdx),
7117 DAG.getConstant(EltIdx, SL, MVT::i32));
7118 Pieces.push_back(Elt: SubVec);
7119 } else {
7120 const int Idx0 = SVN->getMaskElt(Idx: I);
7121 const int Idx1 = SVN->getMaskElt(Idx: I + 1);
7122 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7123 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7124 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7125 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7126
7127 SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
7128 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7129 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7130
7131 SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
7132 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7133 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7134 Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: { Elt0, Elt1 }));
7135 }
7136 }
7137
7138 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
7139}
7140
7141SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7142 SelectionDAG &DAG) const {
7143 SDValue SVal = Op.getOperand(i: 0);
7144 EVT ResultVT = Op.getValueType();
7145 EVT SValVT = SVal.getValueType();
7146 SDValue UndefVal = DAG.getUNDEF(VT: SValVT);
7147 SDLoc SL(Op);
7148
7149 SmallVector<SDValue, 8> VElts;
7150 VElts.push_back(Elt: SVal);
7151 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7152 VElts.push_back(Elt: UndefVal);
7153
7154 return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts);
7155}
7156
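// Lower BUILD_VECTOR of 16-bit element vectors by building packed 32-bit
// pieces and bitcasting the result, or, for the two-element case without
// VOP3P instructions, by packing the halves manually with shift/or.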
7157SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7158 SelectionDAG &DAG) const {
7159 SDLoc SL(Op);
7160 EVT VT = Op.getValueType();
7161
7162 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7163 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7164 EVT HalfVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(),
7165 NumElements: VT.getVectorNumElements() / 2);
7166 MVT HalfIntVT = MVT::getIntegerVT(BitWidth: HalfVT.getSizeInBits());
7167
7168 // Turn into pair of packed build_vectors.
7169 // TODO: Special case for constants that can be materialized with s_mov_b64.
7170 SmallVector<SDValue, 4> LoOps, HiOps;
7171 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7172 LoOps.push_back(Elt: Op.getOperand(i: I));
7173 HiOps.push_back(Elt: Op.getOperand(i: I + E));
7174 }
7175 SDValue Lo = DAG.getBuildVector(VT: HalfVT, DL: SL, Ops: LoOps);
7176 SDValue Hi = DAG.getBuildVector(VT: HalfVT, DL: SL, Ops: HiOps);
7177
7178 SDValue CastLo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HalfIntVT, Operand: Lo);
7179 SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HalfIntVT, Operand: Hi);
7180
7181 SDValue Blend = DAG.getBuildVector(VT: MVT::getVectorVT(VT: HalfIntVT, NumElements: 2), DL: SL,
7182 Ops: { CastLo, CastHi });
7183 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
7184 }
7185
7186 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7187 EVT QuarterVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(),
7188 NumElements: VT.getVectorNumElements() / 4);
7189 MVT QuarterIntVT = MVT::getIntegerVT(BitWidth: QuarterVT.getSizeInBits());
7190
7191 SmallVector<SDValue, 4> Parts[4];
7192 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7193 for (unsigned P = 0; P < 4; ++P)
7194 Parts[P].push_back(Elt: Op.getOperand(i: I + P * E));
7195 }
7196 SDValue Casts[4];
7197 for (unsigned P = 0; P < 4; ++P) {
7198 SDValue Vec = DAG.getBuildVector(VT: QuarterVT, DL: SL, Ops: Parts[P]);
7199 Casts[P] = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: QuarterIntVT, Operand: Vec);
7200 }
7201
7202 SDValue Blend =
7203 DAG.getBuildVector(VT: MVT::getVectorVT(VT: QuarterIntVT, NumElements: 4), DL: SL, Ops: Casts);
7204 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
7205 }
7206
7207 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7208 EVT QuarterVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(),
7209 NumElements: VT.getVectorNumElements() / 8);
7210 MVT QuarterIntVT = MVT::getIntegerVT(BitWidth: QuarterVT.getSizeInBits());
7211
7212 SmallVector<SDValue, 8> Parts[8];
7213 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7214 for (unsigned P = 0; P < 8; ++P)
7215 Parts[P].push_back(Elt: Op.getOperand(i: I + P * E));
7216 }
7217 SDValue Casts[8];
7218 for (unsigned P = 0; P < 8; ++P) {
7219 SDValue Vec = DAG.getBuildVector(VT: QuarterVT, DL: SL, Ops: Parts[P]);
7220 Casts[P] = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: QuarterIntVT, Operand: Vec);
7221 }
7222
7223 SDValue Blend =
7224 DAG.getBuildVector(VT: MVT::getVectorVT(VT: QuarterIntVT, NumElements: 8), DL: SL, Ops: Casts);
7225 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
7226 }
7227
7228 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7229 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7230
7231 SDValue Lo = Op.getOperand(i: 0);
7232 SDValue Hi = Op.getOperand(i: 1);
7233
7234 // Avoid adding defined bits with the zero_extend.
7235 if (Hi.isUndef()) {
7236 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7237 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7238 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
7239 }
7240
7241 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7242 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7243
7244 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7245 DAG.getConstant(16, SL, MVT::i32));
7246 if (Lo.isUndef())
7247 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);
7248
7249 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7250 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7251
7252 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7253 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
7254}
7255
7256bool
7257SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7258 // OSes that use ELF REL relocations (instead of RELA) can only store a
7259 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7260 // which can create arbitrary 64-bit addends. (This is only a problem for
7261 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7262 // the high 32 bits of the addend.)
7263 //
7264 // This should be kept in sync with how HasRelocationAddend is initialized in
7265 // the constructor of ELFAMDGPUAsmBackend.
7266 if (!Subtarget->isAmdHsaOS())
7267 return false;
7268
7269 // We can fold offsets for anything that doesn't require a GOT relocation.
7270 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7271 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7272 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7273 !shouldEmitGOTReloc(GV: GA->getGlobal());
7274}
7275
7276static SDValue
7277buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7278 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7279 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7280 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7281 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7282 // lowered to the following code sequence:
7283 //
7284 // For constant address space:
7285 // s_getpc_b64 s[0:1]
7286 // s_add_u32 s0, s0, $symbol
7287 // s_addc_u32 s1, s1, 0
7288 //
7289 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7290 // a fixup or relocation is emitted to replace $symbol with a literal
7291 // constant, which is a pc-relative offset from the encoding of the $symbol
7292 // operand to the global variable.
7293 //
7294 // For global address space:
7295 // s_getpc_b64 s[0:1]
7296 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7297 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7298 //
7299 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7300 // fixups or relocations are emitted to replace $symbol@*@lo and
7301 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7302 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7303 // operand to the global variable.
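 // Note: the isInt<32>(Offset + 4) assertion above presumably accounts for
 // the $symbol literal being encoded 4 bytes past the address s_getpc_b64
 // returns, so the effective pc-relative addend is Offset + 4.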
7304 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7305 SDValue PtrHi;
7306 if (GAFlags == SIInstrInfo::MO_NONE)
7307 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7308 else
7309 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7310 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi);
7311}
7312
7313SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7314 SDValue Op,
7315 SelectionDAG &DAG) const {
7316 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
7317 SDLoc DL(GSD);
7318 EVT PtrVT = Op.getValueType();
7319
7320 const GlobalValue *GV = GSD->getGlobal();
7321 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7322 shouldUseLDSConstAddress(GV)) ||
7323 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7324 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7325 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7326 GV->hasExternalLinkage()) {
7327 Type *Ty = GV->getValueType();
7328 // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
7329 // zero-sized type in other languages, to declare dynamic shared memory
7330 // whose size is not known at compile time. It is allocated by the
7331 // runtime and placed directly after the statically allocated LDS, so
7332 // all such declarations share the same offset.
7333 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7334 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7335 // Adjust alignment for that dynamic shared memory array.
7336 Function &F = DAG.getMachineFunction().getFunction();
7337 MFI->setDynLDSAlign(F, GV: *cast<GlobalVariable>(Val: GV));
7338 MFI->setUsesDynamicLDS(true);
7339 return SDValue(
7340 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7341 }
7342 }
7343 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7344 }
7345
7346 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7347 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7348 SIInstrInfo::MO_ABS32_LO);
7349 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7350 }
7351
7352 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7353 SDValue AddrLo = DAG.getTargetGlobalAddress(
7354 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7355 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7356
7357 SDValue AddrHi = DAG.getTargetGlobalAddress(
7358 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7359 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7360
7361 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7362 }
7363
7364 if (shouldEmitFixup(GV))
7365 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);
7366
7367 if (shouldEmitPCReloc(GV))
7368 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
7369 GAFlags: SIInstrInfo::MO_REL32);
7370
7371 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT,
7372 GAFlags: SIInstrInfo::MO_GOTPCREL32);
7373
7374 Type *Ty = PtrVT.getTypeForEVT(Context&: *DAG.getContext());
7375 PointerType *PtrTy = PointerType::get(ElementType: Ty, AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
7376 const DataLayout &DataLayout = DAG.getDataLayout();
7377 Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
7378 MachinePointerInfo PtrInfo
7379 = MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());
7380
7381 return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
7382 MMOFlags: MachineMemOperand::MODereferenceable |
7383 MachineMemOperand::MOInvariant);
7384}
7385
7386SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7387 const SDLoc &DL, SDValue V) const {
7388 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7389 // the destination register.
7390 //
7391 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7392 // so we will end up with redundant moves to m0.
7393 //
7394 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7395
7396 // A Null SDValue creates a glue result.
7397 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7398 V, Chain);
7399 return SDValue(M0, 0);
7400}
7401
7402SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7403 SDValue Op,
7404 MVT VT,
7405 unsigned Offset) const {
7406 SDLoc SL(Op);
7407 SDValue Param = lowerKernargMemParameter(
7408 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7409 // The local size values will have the high 16 bits as zero.
7410 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7411 DAG.getValueType(VT));
7412}
7413
7414static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7415 EVT VT) {
7416 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7417 "non-hsa intrinsic with hsa target",
7418 DL.getDebugLoc());
7419 DAG.getContext()->diagnose(DI: BadIntrin);
7420 return DAG.getUNDEF(VT);
7421}
7422
7423static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7424 EVT VT) {
7425 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7426 "intrinsic not supported on subtarget",
7427 DL.getDebugLoc());
7428 DAG.getContext()->diagnose(DI: BadIntrin);
7429 return DAG.getUNDEF(VT);
7430}
7431
7432static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
7433 ArrayRef<SDValue> Elts) {
7434 assert(!Elts.empty());
7435 MVT Type;
7436 unsigned NumElts = Elts.size();
7437
7438 if (NumElts <= 12) {
7439 Type = MVT::getVectorVT(MVT::f32, NumElts);
7440 } else {
7441 assert(Elts.size() <= 16);
7442 Type = MVT::v16f32;
7443 NumElts = 16;
7444 }
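 // For example, a single dword is returned as a plain f32, up to 12 dwords
 // build an exactly sized vector, and 13-16 dwords are padded with undef up
 // to a v16f32.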
7445
7446 SmallVector<SDValue, 16> VecElts(NumElts);
7447 for (unsigned i = 0; i < Elts.size(); ++i) {
7448 SDValue Elt = Elts[i];
7449 if (Elt.getValueType() != MVT::f32)
7450 Elt = DAG.getBitcast(MVT::f32, Elt);
7451 VecElts[i] = Elt;
7452 }
7453 for (unsigned i = Elts.size(); i < NumElts; ++i)
7454 VecElts[i] = DAG.getUNDEF(MVT::f32);
7455
7456 if (NumElts == 1)
7457 return VecElts[0];
7458 return DAG.getBuildVector(VT: Type, DL, Ops: VecElts);
7459}
7460
7461static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7462 SDValue Src, int ExtraElts) {
7463 EVT SrcVT = Src.getValueType();
7464
7465 SmallVector<SDValue, 8> Elts;
7466
7467 if (SrcVT.isVector())
7468 DAG.ExtractVectorElements(Op: Src, Args&: Elts);
7469 else
7470 Elts.push_back(Elt: Src);
7471
7472 SDValue Undef = DAG.getUNDEF(VT: SrcVT.getScalarType());
7473 while (ExtraElts--)
7474 Elts.push_back(Elt: Undef);
7475
7476 return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts);
7477}
7478
7479// Reconstruct the required return value for an image load intrinsic.
7480// This is more complicated due to the optional use of TexFailCtrl, which
7481// means the required return type is an aggregate.
7482static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7483 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7484 bool Unpacked, bool IsD16, int DMaskPop,
7485 int NumVDataDwords, bool IsAtomicPacked16Bit,
7486 const SDLoc &DL) {
7487 // Determine the required return type. This is the same regardless of the IsTexFail flag.
7488 EVT ReqRetVT = ResultTypes[0];
7489 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7490 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7491 ? (ReqRetNumElts + 1) / 2
7492 : ReqRetNumElts;
7493
7494 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
7495 DMaskPop : (DMaskPop + 1) / 2;
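 // For example, an unpacked (or non-d16) result with DMaskPop = 3 pops three
 // dwords, while a packed d16 result pops (3 + 1) / 2 = 2 dwords.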
7496
7497 MVT DataDwordVT = NumDataDwords == 1 ?
7498 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7499
7500 MVT MaskPopVT = MaskPopDwords == 1 ?
7501 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7502
7503 SDValue Data(Result, 0);
7504 SDValue TexFail;
7505
7506 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7507 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7508 if (MaskPopVT.isVector()) {
7509 Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
7510 N1: SDValue(Result, 0), N2: ZeroIdx);
7511 } else {
7512 Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
7513 N1: SDValue(Result, 0), N2: ZeroIdx);
7514 }
7515 }
7516
7517 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7518 Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
7519 ExtraElts: NumDataDwords - MaskPopDwords);
7520
7521 if (IsD16)
7522 Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);
7523
7524 EVT LegalReqRetVT = ReqRetVT;
7525 if (!ReqRetVT.isVector()) {
7526 if (!Data.getValueType().isInteger())
7527 Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
7528 VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
7529 Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
7530 } else {
7531 // We need to widen the return vector to a legal type
7532 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7533 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7534 LegalReqRetVT =
7535 EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
7536 NumElements: ReqRetVT.getVectorNumElements() + 1);
7537 }
7538 }
7539 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);
7540
7541 if (IsTexFail) {
7542 TexFail =
7543 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7544 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7545
7546 return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL);
7547 }
7548
7549 if (Result->getNumValues() == 1)
7550 return Data;
7551
7552 return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL);
7553}
7554
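// TexFailCtrl packs two bits: bit 0 enables TFE and bit 1 enables LWE. Any
// other set bit is invalid and makes this return false, in which case the
// caller gives up on lowering the intrinsic.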
7555static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7556 SDValue *LWE, bool &IsTexFail) {
7557 auto TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode());
7558
7559 uint64_t Value = TexFailCtrlConst->getZExtValue();
7560 if (Value) {
7561 IsTexFail = true;
7562 }
7563
7564 SDLoc DL(TexFailCtrlConst);
7565 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7566 Value &= ~(uint64_t)0x1;
7567 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7568 Value &= ~(uint64_t)0x2;
7569
7570 return Value == 0;
7571}
7572
7573static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
7574 MVT PackVectorVT,
7575 SmallVectorImpl<SDValue> &PackedAddrs,
7576 unsigned DimIdx, unsigned EndIdx,
7577 unsigned NumGradients) {
7578 SDLoc DL(Op);
7579 for (unsigned I = DimIdx; I < EndIdx; I++) {
7580 SDValue Addr = Op.getOperand(i: I);
7581
7582 // Gradients are packed with undef for each coordinate.
7583 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7584 // 1D: undef,dx/dh; undef,dx/dv
7585 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7586 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7587 if (((I + 1) >= EndIdx) ||
7588 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7589 I == DimIdx + NumGradients - 1))) {
7590 if (Addr.getValueType() != MVT::i16)
7591 Addr = DAG.getBitcast(MVT::i16, Addr);
7592 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7593 } else {
7594 Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)});
7595 I++;
7596 }
7597 Addr = DAG.getBitcast(MVT::f32, Addr);
7598 PackedAddrs.push_back(Elt: Addr);
7599 }
7600}
7601
7602SDValue SITargetLowering::lowerImage(SDValue Op,
7603 const AMDGPU::ImageDimIntrinsicInfo *Intr,
7604 SelectionDAG &DAG, bool WithChain) const {
7605 SDLoc DL(Op);
7606 MachineFunction &MF = DAG.getMachineFunction();
7607 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7608 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7609 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
7610 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
7611 unsigned IntrOpcode = Intr->BaseOpcode;
7612 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7613 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7614 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7615
7616 SmallVector<EVT, 3> ResultTypes(Op->values());
7617 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7618 bool IsD16 = false;
7619 bool IsG16 = false;
7620 bool IsA16 = false;
7621 SDValue VData;
7622 int NumVDataDwords;
7623 bool AdjustRetType = false;
7624 bool IsAtomicPacked16Bit = false;
7625
7626 // Offset of intrinsic arguments
7627 const unsigned ArgOffset = WithChain ? 2 : 1;
7628
7629 unsigned DMask;
7630 unsigned DMaskLanes = 0;
7631
7632 if (BaseOpcode->Atomic) {
7633 VData = Op.getOperand(i: 2);
7634
7635 IsAtomicPacked16Bit =
7636 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7637 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7638
7639 bool Is64Bit = VData.getValueSizeInBits() == 64;
7640 if (BaseOpcode->AtomicX2) {
7641 SDValue VData2 = Op.getOperand(i: 3);
7642 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7643 {VData, VData2});
7644 if (Is64Bit)
7645 VData = DAG.getBitcast(MVT::v4i32, VData);
7646
7647 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7648 DMask = Is64Bit ? 0xf : 0x3;
7649 NumVDataDwords = Is64Bit ? 4 : 2;
7650 } else {
7651 DMask = Is64Bit ? 0x3 : 0x1;
7652 NumVDataDwords = Is64Bit ? 2 : 1;
7653 }
7654 } else {
7655 DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
7656 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
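 // For example, DMask = 0xb has three bits set, so DMaskLanes = 3 (gather4
 // always uses four lanes).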
7657
7658 if (BaseOpcode->Store) {
7659 VData = Op.getOperand(i: 2);
7660
7661 MVT StoreVT = VData.getSimpleValueType();
7662 if (StoreVT.getScalarType() == MVT::f16) {
7663 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7664 return Op; // D16 is unsupported for this instruction
7665
7666 IsD16 = true;
7667 VData = handleD16VData(VData, DAG, ImageStore: true);
7668 }
7669
7670 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7671 } else {
7672 // Work out the number of dwords based on the dmask popcount, the
7673 // underlying type, and whether packing is supported.
7674 MVT LoadVT = ResultTypes[0].getSimpleVT();
7675 if (LoadVT.getScalarType() == MVT::f16) {
7676 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7677 return Op; // D16 is unsupported for this instruction
7678
7679 IsD16 = true;
7680 }
7681
7682 // Confirm that the return type is large enough for the dmask specified
7683 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7684 (!LoadVT.isVector() && DMaskLanes > 1))
7685 return Op;
7686
7687 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
7688 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7689 // instructions.
7690 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7691 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7692 NumVDataDwords = (DMaskLanes + 1) / 2;
7693 else
7694 NumVDataDwords = DMaskLanes;
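 // For example, a packed d16 load with three dmask lanes needs
 // (3 + 1) / 2 = 2 dwords, while an unpacked one needs one dword per lane.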
7695
7696 AdjustRetType = true;
7697 }
7698 }
7699
7700 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7701 SmallVector<SDValue, 4> VAddrs;
7702
7703 // Check for 16-bit addresses or derivatives and pack them if so.
7704 MVT VAddrVT =
7705 Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
7706 MVT VAddrScalarVT = VAddrVT.getScalarType();
7707 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7708 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7709
7710 VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
7711 VAddrScalarVT = VAddrVT.getScalarType();
7712 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7713 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7714
7715 // Push back extra arguments.
7716 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
7717 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
7718 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7719 // Special handling of bias when A16 is on: bias is of type half but
7720 // occupies a full 32-bit dword.
7721 SDValue Bias = DAG.getBuildVector(
7722 MVT::v2f16, DL,
7723 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
7724 VAddrs.push_back(Elt: Bias);
7725 } else {
7726 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7727 "Bias needs to be converted to 16 bit in A16 mode");
7728 VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
7729 }
7730 }
7731
7732 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
7733 // 16 bit gradients are supported, but are tied to the A16 control
7734 // so both gradients and addresses must be 16 bit
7735 LLVM_DEBUG(
7736 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
7737 "require 16 bit args for both gradients and addresses");
7738 return Op;
7739 }
7740
7741 if (IsA16) {
7742 if (!ST->hasA16()) {
7743 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
7744 "support 16 bit addresses\n");
7745 return Op;
7746 }
7747 }
7748
7749 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
7750 // is set then we have to compress/pack operands (either address,
7751 // gradient, or both).
7752 // In the case where A16 and gradients are tied (no G16 support), we have
7753 // already verified that both IsA16 and IsG16 are true.
7754 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
7755 // Activate g16
7756 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
7757 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
7758 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
7759 }
7760
7761 // Add gradients (packed or unpacked)
7762 if (IsG16) {
7763 // Pack the gradients
7764 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
7765 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
7766 DimIdx: ArgOffset + Intr->GradientStart,
7767 EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
7768 } else {
7769 for (unsigned I = ArgOffset + Intr->GradientStart;
7770 I < ArgOffset + Intr->CoordStart; I++)
7771 VAddrs.push_back(Elt: Op.getOperand(i: I));
7772 }
7773
7774 // Add addresses (packed or unpacked)
7775 if (IsA16) {
7776 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
7777 DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
7778 NumGradients: 0 /* No gradients */);
7779 } else {
7780 // Add uncompressed address
7781 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
7782 VAddrs.push_back(Elt: Op.getOperand(i: I));
7783 }
7784
7785 // If the register allocator cannot place the address registers contiguously
7786 // without introducing moves, then using the non-sequential address encoding
7787 // is always preferable, since it saves VALU instructions and is usually a
7788 // wash in terms of code size or even better.
7789 //
7790 // However, we currently have no way of hinting to the register allocator that
7791 // MIMG addresses should be placed contiguously when it is possible to do so,
7792 // so force non-NSA for the common 2-address case as a heuristic.
7793 //
7794 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7795 // allocation when possible.
7796 //
7797 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7798 // set of the remaining addresses.
7799 const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
7800 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
7801 const bool UseNSA = ST->hasNSAEncoding() &&
7802 VAddrs.size() >= ST->getNSAThreshold(MF) &&
7803 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
7804 const bool UsePartialNSA =
7805 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
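 // For example, with NSAMaxSize = 5 and seven address dwords on a target with
 // partial NSA, the first four stay as separate operands and the remaining
 // three are packed into a single contiguous register tuple below.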
7806
7807 SDValue VAddr;
7808 if (UsePartialNSA) {
7809 VAddr = getBuildDwordsVector(DAG, DL,
7810 Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1));
7811 }
7812 else if (!UseNSA) {
7813 VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
7814 }
7815
7816 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
7817 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
7818 SDValue Unorm;
7819 if (!BaseOpcode->Sampler) {
7820 Unorm = True;
7821 } else {
7822 uint64_t UnormConst =
7823 Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);
7824
7825 Unorm = UnormConst ? True : False;
7826 }
7827
7828 SDValue TFE;
7829 SDValue LWE;
7830 SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
7831 bool IsTexFail = false;
7832 if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
7833 return Op;
7834
7835 if (IsTexFail) {
7836 if (!DMaskLanes) {
7837 // Expecting to get an error flag since TFC is on and dmask is 0.
7838 // Force dmask to be at least 1, otherwise the instruction will fail.
7839 DMask = 0x1;
7840 DMaskLanes = 1;
7841 NumVDataDwords = 1;
7842 }
7843 NumVDataDwords += 1;
7844 AdjustRetType = true;
7845 }
7846
7847 // Something earlier tagged that the return type needs adjusting.
7848 // This happens if the instruction is a load or has TexFailCtrl flags set.
7849 if (AdjustRetType) {
7850 // NumVDataDwords reflects the true number of dwords required in the return type
7851 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7852 // This is a no-op load; it can be eliminated.
7853 SDValue Undef = DAG.getUNDEF(VT: Op.getValueType());
7854 if (isa<MemSDNode>(Val: Op))
7855 return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL);
7856 return Undef;
7857 }
7858
7859 EVT NewVT = NumVDataDwords > 1 ?
7860 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
7861 : MVT::i32;
7862
7863 ResultTypes[0] = NewVT;
7864 if (ResultTypes.size() == 3) {
7865 // The original result was an aggregate type used for the TexFailCtrl
7866 // result. The actual instruction returns a vector type, which has now
7867 // been created. Remove the aggregate result.
7868 ResultTypes.erase(CI: &ResultTypes[1]);
7869 }
7870 }
7871
7872 unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
7873 if (BaseOpcode->Atomic)
7874 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
7875 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
7876 AMDGPU::CPol::VOLATILE))
7877 return Op;
7878
7879 SmallVector<SDValue, 26> Ops;
7880 if (BaseOpcode->Store || BaseOpcode->Atomic)
7881 Ops.push_back(Elt: VData); // vdata
7882 if (UsePartialNSA) {
7883 append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1));
7884 Ops.push_back(Elt: VAddr);
7885 }
7886 else if (UseNSA)
7887 append_range(C&: Ops, R&: VAddrs);
7888 else
7889 Ops.push_back(Elt: VAddr);
7890 Ops.push_back(Elt: Op.getOperand(i: ArgOffset + Intr->RsrcIndex));
7891 if (BaseOpcode->Sampler)
7892 Ops.push_back(Elt: Op.getOperand(i: ArgOffset + Intr->SampIndex));
7893 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
7894 if (IsGFX10Plus)
7895 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
7896 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7897 Ops.push_back(Elt: Unorm);
7898 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
7899 Ops.push_back(IsA16 && // r128, a16 for gfx9
7900 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
7901 if (IsGFX10Plus)
7902 Ops.push_back(Elt: IsA16 ? True : False);
7903 if (!Subtarget->hasGFX90AInsts()) {
7904 Ops.push_back(Elt: TFE); //tfe
7905 } else if (TFE->getAsZExtVal()) {
7906 report_fatal_error(reason: "TFE is not supported on this GPU");
7907 }
7908 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7909 Ops.push_back(Elt: LWE); // lwe
7910 if (!IsGFX10Plus)
7911 Ops.push_back(Elt: DimInfo->DA ? True : False);
7912 if (BaseOpcode->HasD16)
7913 Ops.push_back(Elt: IsD16 ? True : False);
7914 if (isa<MemSDNode>(Val: Op))
7915 Ops.push_back(Elt: Op.getOperand(i: 0)); // chain
7916
7917 int NumVAddrDwords =
7918 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
7919 int Opcode = -1;
7920
7921 if (IsGFX12Plus) {
7922 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
7923 NumVDataDwords, NumVAddrDwords);
7924 } else if (IsGFX11Plus) {
7925 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7926 UseNSA ? AMDGPU::MIMGEncGfx11NSA
7927 : AMDGPU::MIMGEncGfx11Default,
7928 NumVDataDwords, NumVAddrDwords);
7929 } else if (IsGFX10Plus) {
7930 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7931 UseNSA ? AMDGPU::MIMGEncGfx10NSA
7932 : AMDGPU::MIMGEncGfx10Default,
7933 NumVDataDwords, NumVAddrDwords);
7934 } else {
7935 if (Subtarget->hasGFX90AInsts()) {
7936 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
7937 NumVDataDwords, NumVAddrDwords);
7938 if (Opcode == -1)
7939 report_fatal_error(
7940 reason: "requested image instruction is not supported on this GPU");
7941 }
7942 if (Opcode == -1 &&
7943 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
7944 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
7945 NumVDataDwords, NumVAddrDwords);
7946 if (Opcode == -1)
7947 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
7948 NumVDataDwords, NumVAddrDwords);
7949 }
7950 if (Opcode == -1)
7951 return Op;
7952
7953 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
7954 if (auto MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
7955 MachineMemOperand *MemRef = MemOp->getMemOperand();
7956 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
7957 }
7958
7959 if (BaseOpcode->AtomicX2) {
7960 SmallVector<SDValue, 1> Elt;
7961 DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1);
7962 return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL);
7963 }
7964 if (BaseOpcode->Store)
7965 return SDValue(NewNode, 0);
7966 return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
7967 Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
7968 NumVDataDwords, IsAtomicPacked16Bit, DL);
7969}
7970
7971SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
7972 SDValue Offset, SDValue CachePolicy,
7973 SelectionDAG &DAG) const {
7974 MachineFunction &MF = DAG.getMachineFunction();
7975
7976 const DataLayout &DataLayout = DAG.getDataLayout();
7977 Align Alignment =
7978 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
7979
7980 MachineMemOperand *MMO = MF.getMachineMemOperand(
7981 PtrInfo: MachinePointerInfo(),
7982 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7983 MachineMemOperand::MOInvariant,
7984 Size: VT.getStoreSize(), BaseAlignment: Alignment);
7985
7986 if (!Offset->isDivergent()) {
7987 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
7988
7989 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
7990 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
7991 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
7992 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
7993 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
7994 SDValue BufferLoad =
7995 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
7996 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7997 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
7998 }
7999
8000 // Widen vec3 load to vec4.
8001 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8002 !Subtarget->hasScalarDwordx3Loads()) {
8003 EVT WidenedVT =
8004 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
8005 auto WidenedOp = DAG.getMemIntrinsicNode(
8006 Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
8007 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize()));
8008 auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
8009 N2: DAG.getVectorIdxConstant(Val: 0, DL));
8010 return Subvector;
8011 }
8012
8013 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
8014 VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
8015 }
8016
8017 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8018 // assume that the buffer is unswizzled.
8019 SDValue Ops[] = {
8020 DAG.getEntryNode(), // Chain
8021 Rsrc, // rsrc
8022 DAG.getConstant(0, DL, MVT::i32), // vindex
8023 {}, // voffset
8024 {}, // soffset
8025 {}, // offset
8026 CachePolicy, // cachepolicy
8027 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8028 };
8029 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8030 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
8031 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8032 }
8033
8034 SmallVector<SDValue, 4> Loads;
8035 unsigned NumLoads = 1;
8036 MVT LoadVT = VT.getSimpleVT();
8037 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8038 assert((LoadVT.getScalarType() == MVT::i32 ||
8039 LoadVT.getScalarType() == MVT::f32));
8040
8041 if (NumElts == 8 || NumElts == 16) {
8042 NumLoads = NumElts / 4;
8043 LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4);
8044 }
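 // For example, an 8-element result is fetched as two 4-element loads at
 // offsets +0 and +16 and reassembled with CONCAT_VECTORS below.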
8045
8046 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8047
8048 // Use the alignment to ensure that the required offsets will fit into the
8049 // immediate offsets.
8050 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3],
8051 Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8052
8053 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8054 for (unsigned i = 0; i < NumLoads; ++i) {
8055 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8056 Loads.push_back(Elt: getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8057 LoadVT, MMO, DAG));
8058 }
8059
8060 if (NumElts == 8 || NumElts == 16)
8061 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);
8062
8063 return Loads[0];
8064}
8065
8066SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8067 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
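 // That field is extracted below as BFE_U32(TTMP8, offset = 25, width = 5).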
8068 if (!Subtarget->hasArchitectedSGPRs())
8069 return {};
8070 SDLoc SL(Op);
8071 MVT VT = MVT::i32;
8072 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8073 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8,
8074 N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT));
8075}
8076
8077SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8078 unsigned Dim,
8079 const ArgDescriptor &Arg) const {
8080 SDLoc SL(Op);
8081 MachineFunction &MF = DAG.getMachineFunction();
8082 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8083 if (MaxID == 0)
8084 return DAG.getConstant(0, SL, MVT::i32);
8085
8086 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8087 SDLoc(DAG.getEntryNode()), Arg);
8088
8089 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8090 // masking operations anyway.
8091 //
8092 // TODO: We could assert the top bit is 0 for the source copy.
8093 if (Arg.isMasked())
8094 return Val;
8095
8096 // Preserve the known bits after expansion to a copy.
8097 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID));
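 // For example, MaxID = 1023 gives bit_width(MaxID) = 10, so the AssertZext
 // advertises that only the low 10 bits of the ID can be set.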
8098 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8099 DAG.getValueType(SmallVT));
8100}
8101
8102SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8103 SelectionDAG &DAG) const {
8104 MachineFunction &MF = DAG.getMachineFunction();
8105 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8106
8107 EVT VT = Op.getValueType();
8108 SDLoc DL(Op);
8109 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
8110
8111 // TODO: Should this propagate fast-math-flags?
8112
8113 switch (IntrinsicID) {
8114 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8115 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8116 return emitNonHSAIntrinsicError(DAG, DL, VT);
8117 return getPreloadedValue(DAG, MFI: *MFI, VT,
8118 PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8119 }
8120 case Intrinsic::amdgcn_dispatch_ptr:
8121 case Intrinsic::amdgcn_queue_ptr: {
8122 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8123 DiagnosticInfoUnsupported BadIntrin(
8124 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8125 DL.getDebugLoc());
8126 DAG.getContext()->diagnose(DI: BadIntrin);
8127 return DAG.getUNDEF(VT);
8128 }
8129
8130 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8131 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
8132 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
8133 }
8134 case Intrinsic::amdgcn_implicitarg_ptr: {
8135 if (MFI->isEntryFunction())
8136 return getImplicitArgPtr(DAG, SL: DL);
8137 return getPreloadedValue(DAG, MFI: *MFI, VT,
8138 PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
8139 }
8140 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8141 if (!AMDGPU::isKernel(CC: MF.getFunction().getCallingConv())) {
8142 // This only makes sense to call in a kernel, so just lower to null.
8143 return DAG.getConstant(Val: 0, DL, VT);
8144 }
8145
8146 return getPreloadedValue(DAG, MFI: *MFI, VT,
8147 PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8148 }
8149 case Intrinsic::amdgcn_dispatch_id: {
8150 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
8151 }
8152 case Intrinsic::amdgcn_rcp:
8153 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1));
8154 case Intrinsic::amdgcn_rsq:
8155 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
8156 case Intrinsic::amdgcn_rsq_legacy:
8157 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8158 return emitRemovedIntrinsicError(DAG, DL, VT);
8159 return SDValue();
8160 case Intrinsic::amdgcn_rcp_legacy:
8161 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8162 return emitRemovedIntrinsicError(DAG, DL, VT);
8163 return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1));
8164 case Intrinsic::amdgcn_rsq_clamp: {
8165 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8166 return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1));
8167
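 // Targets reaching this point lack RSQ_CLAMP, so emulate it by clamping
 // rsq(x) between the negative and positive largest finite values of the
 // type.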
8168 Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext());
8169 APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
8170 APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);
8171
8172 SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
8173 SDValue Tmp = DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq,
8174 N2: DAG.getConstantFP(Val: Max, DL, VT));
8175 return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
8176 N2: DAG.getConstantFP(Val: Min, DL, VT));
8177 }
8178 case Intrinsic::r600_read_ngroups_x:
8179 if (Subtarget->isAmdHsaOS())
8180 return emitNonHSAIntrinsicError(DAG, DL, VT);
8181
8182 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8183 Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4),
8184 Signed: false);
8185 case Intrinsic::r600_read_ngroups_y:
8186 if (Subtarget->isAmdHsaOS())
8187 return emitNonHSAIntrinsicError(DAG, DL, VT);
8188
8189 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8190 Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4),
8191 Signed: false);
8192 case Intrinsic::r600_read_ngroups_z:
8193 if (Subtarget->isAmdHsaOS())
8194 return emitNonHSAIntrinsicError(DAG, DL, VT);
8195
8196 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8197 Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4),
8198 Signed: false);
8199 case Intrinsic::r600_read_global_size_x:
8200 if (Subtarget->isAmdHsaOS())
8201 return emitNonHSAIntrinsicError(DAG, DL, VT);
8202
8203 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8204 Offset: SI::KernelInputOffsets::GLOBAL_SIZE_X,
8205 Alignment: Align(4), Signed: false);
8206 case Intrinsic::r600_read_global_size_y:
8207 if (Subtarget->isAmdHsaOS())
8208 return emitNonHSAIntrinsicError(DAG, DL, VT);
8209
8210 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8211 Offset: SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8212 Alignment: Align(4), Signed: false);
8213 case Intrinsic::r600_read_global_size_z:
8214 if (Subtarget->isAmdHsaOS())
8215 return emitNonHSAIntrinsicError(DAG, DL, VT);
8216
8217 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8218 Offset: SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8219 Alignment: Align(4), Signed: false);
8220 case Intrinsic::r600_read_local_size_x:
8221 if (Subtarget->isAmdHsaOS())
8222 return emitNonHSAIntrinsicError(DAG, DL, VT);
8223
8224 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8225 SI::KernelInputOffsets::LOCAL_SIZE_X);
8226 case Intrinsic::r600_read_local_size_y:
8227 if (Subtarget->isAmdHsaOS())
8228 return emitNonHSAIntrinsicError(DAG, DL, VT);
8229
8230 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8231 SI::KernelInputOffsets::LOCAL_SIZE_Y);
8232 case Intrinsic::r600_read_local_size_z:
8233 if (Subtarget->isAmdHsaOS())
8234 return emitNonHSAIntrinsicError(DAG, DL, VT);
8235
8236 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8237 SI::KernelInputOffsets::LOCAL_SIZE_Z);
8238 case Intrinsic::amdgcn_workgroup_id_x:
8239 return getPreloadedValue(DAG, MFI: *MFI, VT,
8240 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8241 case Intrinsic::amdgcn_workgroup_id_y:
8242 return getPreloadedValue(DAG, MFI: *MFI, VT,
8243 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8244 case Intrinsic::amdgcn_workgroup_id_z:
8245 return getPreloadedValue(DAG, MFI: *MFI, VT,
8246 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8247 case Intrinsic::amdgcn_wave_id:
8248 return lowerWaveID(DAG, Op);
8249 case Intrinsic::amdgcn_lds_kernel_id: {
8250 if (MFI->isEntryFunction())
8251 return getLDSKernelId(DAG, SL: DL);
8252 return getPreloadedValue(DAG, MFI: *MFI, VT,
8253 PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8254 }
8255 case Intrinsic::amdgcn_workitem_id_x:
8256 return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX);
8257 case Intrinsic::amdgcn_workitem_id_y:
8258 return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY);
8259 case Intrinsic::amdgcn_workitem_id_z:
8260 return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ);
8261 case Intrinsic::amdgcn_wavefrontsize:
8262 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8263 SDLoc(Op), MVT::i32);
8264 case Intrinsic::amdgcn_s_buffer_load: {
8265 unsigned CPol = Op.getConstantOperandVal(i: 3);
8266 // s_buffer_load, because of how it's optimized, can't be volatile,
8267 // so reject ones with the volatile bit set.
8268 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8269 ? AMDGPU::CPol::ALL
8270 : AMDGPU::CPol::ALL_pregfx12))
8271 return Op;
8272 return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2), CachePolicy: Op.getOperand(i: 3),
8273 DAG);
8274 }
8275 case Intrinsic::amdgcn_fdiv_fast:
8276 return lowerFDIV_FAST(Op, DAG);
8277 case Intrinsic::amdgcn_sin:
8278 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1));
8279
8280 case Intrinsic::amdgcn_cos:
8281 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1));
8282
8283 case Intrinsic::amdgcn_mul_u24:
8284 return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8285 case Intrinsic::amdgcn_mul_i24:
8286 return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8287
8288 case Intrinsic::amdgcn_log_clamp: {
8289 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8290 return SDValue();
8291
8292 return emitRemovedIntrinsicError(DAG, DL, VT);
8293 }
8294 case Intrinsic::amdgcn_fract:
8295 return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1));
8296
8297 case Intrinsic::amdgcn_class:
8298 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT,
8299 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8300 case Intrinsic::amdgcn_div_fmas:
8301 return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT,
8302 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3),
8303 N4: Op.getOperand(i: 4));
8304
8305 case Intrinsic::amdgcn_div_fixup:
8306 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT,
8307 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8308
8309 case Intrinsic::amdgcn_div_scale: {
8310 const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3));
8311
8312 // Translate to the operands expected by the machine instruction. The first
8313 // operand must match either the numerator or the denominator, as selected by the constant parameter.
8314 SDValue Numerator = Op.getOperand(i: 1);
8315 SDValue Denominator = Op.getOperand(i: 2);
8316
8317 // Note this order is opposite of the machine instruction's operations,
8318 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8319 // intrinsic has the numerator as the first operand to match a normal
8320 // division operation.
8321
8322 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8323
8324 return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0,
8325 N2: Denominator, N3: Numerator);
8326 }
8327 case Intrinsic::amdgcn_icmp: {
8328 // There is a Pat that handles this variant, so return it as-is.
8329 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8330 Op.getConstantOperandVal(2) == 0 &&
8331 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8332 return Op;
8333 return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
8334 }
8335 case Intrinsic::amdgcn_fcmp: {
8336 return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
8337 }
8338 case Intrinsic::amdgcn_ballot:
8339 return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
8340 case Intrinsic::amdgcn_fmed3:
8341 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT,
8342 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8343 case Intrinsic::amdgcn_fdot2:
8344 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT,
8345 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3),
8346 N4: Op.getOperand(i: 4));
8347 case Intrinsic::amdgcn_fmul_legacy:
8348 return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT,
8349 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8350 case Intrinsic::amdgcn_sffbh:
8351 return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: 1));
8352 case Intrinsic::amdgcn_sbfe:
8353 return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT,
8354 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8355 case Intrinsic::amdgcn_ubfe:
8356 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT,
8357 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8358 case Intrinsic::amdgcn_cvt_pkrtz:
8359 case Intrinsic::amdgcn_cvt_pknorm_i16:
8360 case Intrinsic::amdgcn_cvt_pknorm_u16:
8361 case Intrinsic::amdgcn_cvt_pk_i16:
8362 case Intrinsic::amdgcn_cvt_pk_u16: {
8363 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8364 EVT VT = Op.getValueType();
8365 unsigned Opcode;
8366
8367 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8368 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8369 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8370 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8371 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8372 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8373 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8374 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8375 else
8376 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8377
8378 if (isTypeLegal(VT))
8379 return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8380
8381 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8382 Op.getOperand(1), Op.getOperand(2));
8383 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
8384 }
8385 case Intrinsic::amdgcn_fmad_ftz:
8386 return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1),
8387 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8388
8389 case Intrinsic::amdgcn_if_break:
8390 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8391 Op->getOperand(1), Op->getOperand(2)), 0);
8392
8393 case Intrinsic::amdgcn_groupstaticsize: {
8394 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8395 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8396 return Op;
8397
8398 const Module *M = MF.getFunction().getParent();
8399 const GlobalValue *GV =
8400 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8401 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8402 SIInstrInfo::MO_ABS32_LO);
8403 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8404 }
8405 case Intrinsic::amdgcn_is_shared:
8406 case Intrinsic::amdgcn_is_private: {
8407 SDLoc SL(Op);
8408 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8409 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8410 SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
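 // A flat pointer lies in the LOCAL/PRIVATE aperture iff the high 32 bits of
 // its 64-bit value equal the aperture base, so compare the high dword.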
8411 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8412 Op.getOperand(1));
8413
8414 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8415 DAG.getConstant(1, SL, MVT::i32));
8416 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8417 }
8418 case Intrinsic::amdgcn_perm:
8419 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8420 Op.getOperand(2), Op.getOperand(3));
8421 case Intrinsic::amdgcn_reloc_constant: {
8422 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8423 const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD();
8424 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
8425 auto RelocSymbol = cast<GlobalVariable>(
8426 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
8427 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8428 SIInstrInfo::MO_ABS32_LO);
8429 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8430 }
8431 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8432 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8433 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8434 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8435 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8436 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8437 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8438 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8439 if (Op.getOperand(4).getValueType() == MVT::i32)
8440 return SDValue();
8441
8442 SDLoc SL(Op);
8443 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8444 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8445 Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
8446 Op.getOperand(i: 3), IndexKeyi32);
8447 }
8448 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8449 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8450 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8451 if (Op.getOperand(6).getValueType() == MVT::i32)
8452 return SDValue();
8453
8454 SDLoc SL(Op);
8455 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8456 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8457 {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
8458 Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
8459 IndexKeyi32, Op.getOperand(i: 7)});
8460 }
8461 case Intrinsic::amdgcn_addrspacecast_nonnull:
8462 return lowerADDRSPACECAST(Op, DAG);
8463 default:
8464 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8465 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
8466 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);
8467
8468 return Op;
8469 }
8470}
8471
8472// On targets that do not support a constant in the soffset field, turn a zero
8473// offset into SGPR_NULL to avoid generating an extra s_mov with zero.
8474static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8475 const GCNSubtarget *Subtarget) {
8476 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8477 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8478 return SOffset;
8479}
8480
8481SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8482 SelectionDAG &DAG,
8483 unsigned NewOpcode) const {
8484 SDLoc DL(Op);
8485
8486 SDValue VData = Op.getOperand(i: 2);
8487 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
8488 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
8489 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
8490 SDValue Ops[] = {
8491 Op.getOperand(0), // Chain
8492 VData, // vdata
8493 Rsrc, // rsrc
8494 DAG.getConstant(0, DL, MVT::i32), // vindex
8495 Offsets.first, // voffset
8496 SOffset, // soffset
8497 Offsets.second, // offset
8498 Op.getOperand(6), // cachepolicy
8499 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8500 };
8501
8502 auto *M = cast<MemSDNode>(Val&: Op);
8503
8504 EVT MemVT = VData.getValueType();
8505 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8506 M->getMemOperand());
8507}
8508
8509// Return a value to use for the idxen operand by examining the vindex operand.
8510static unsigned getIdxEn(SDValue VIndex) {
8511 // No need to set idxen if vindex is known to be zero.
8512 return isNullConstant(V: VIndex) ? 0 : 1;
8513}
8514
8515SDValue
8516SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8517 unsigned NewOpcode) const {
8518 SDLoc DL(Op);
8519
8520 SDValue VData = Op.getOperand(i: 2);
8521 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
8522 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
8523 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
8524 SDValue Ops[] = {
8525 Op.getOperand(0), // Chain
8526 VData, // vdata
8527 Rsrc, // rsrc
8528 Op.getOperand(4), // vindex
8529 Offsets.first, // voffset
8530 SOffset, // soffset
8531 Offsets.second, // offset
8532 Op.getOperand(7), // cachepolicy
8533 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8534 };
8535
8536 auto *M = cast<MemSDNode>(Val&: Op);
8537
8538 EVT MemVT = VData.getValueType();
8539 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8540 M->getMemOperand());
8541}
8542
8543SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8544 SelectionDAG &DAG) const {
8545 unsigned IntrID = Op.getConstantOperandVal(i: 1);
8546 SDLoc DL(Op);
8547
8548 switch (IntrID) {
8549 case Intrinsic::amdgcn_ds_ordered_add:
8550 case Intrinsic::amdgcn_ds_ordered_swap: {
8551 MemSDNode *M = cast<MemSDNode>(Val&: Op);
8552 SDValue Chain = M->getOperand(Num: 0);
8553 SDValue M0 = M->getOperand(Num: 2);
8554 SDValue Value = M->getOperand(Num: 3);
8555 unsigned IndexOperand = M->getConstantOperandVal(Num: 7);
8556 unsigned WaveRelease = M->getConstantOperandVal(Num: 8);
8557 unsigned WaveDone = M->getConstantOperandVal(Num: 9);
8558
8559 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8560 IndexOperand &= ~0x3f;
8561 unsigned CountDw = 0;
8562
8563 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8564 CountDw = (IndexOperand >> 24) & 0xf;
8565 IndexOperand &= ~(0xf << 24);
8566
8567 if (CountDw < 1 || CountDw > 4) {
8568 report_fatal_error(
8569 reason: "ds_ordered_count: dword count must be between 1 and 4");
8570 }
8571 }
8572
8573 if (IndexOperand)
8574 report_fatal_error(reason: "ds_ordered_count: bad index operand");
8575
8576 if (WaveDone && !WaveRelease)
8577 report_fatal_error(reason: "ds_ordered_count: wave_done requires wave_release");
8578
8579 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8580 unsigned ShaderType =
8581 SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction());
8582 unsigned Offset0 = OrderedCountIndex << 2;
8583 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8584
8585 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8586 Offset1 |= (CountDw - 1) << 6;
8587
8588 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8589 Offset1 |= ShaderType << 2;
8590
8591 unsigned Offset = Offset0 | (Offset1 << 8);
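 // The resulting 16-bit offset is laid out as: [7:2] = ordered count index,
 // [8] = wave_release, [9] = wave_done, [11:10] = shader type (pre-GFX11),
 // [12] = instruction (0 = add, 1 = swap), [15:14] = dword count - 1 (GFX10+).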
8592
8593 SDValue Ops[] = {
8594 Chain,
8595 Value,
8596 DAG.getTargetConstant(Offset, DL, MVT::i16),
8597 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8598 };
8599 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8600 M->getVTList(), Ops, M->getMemoryVT(),
8601 M->getMemOperand());
8602 }
8603 case Intrinsic::amdgcn_ds_fadd: {
8604 MemSDNode *M = cast<MemSDNode>(Val&: Op);
8605 unsigned Opc;
8606 switch (IntrID) {
8607 case Intrinsic::amdgcn_ds_fadd:
8608 Opc = ISD::ATOMIC_LOAD_FADD;
8609 break;
8610 }
8611
8612 return DAG.getAtomic(Opcode: Opc, dl: SDLoc(Op), MemVT: M->getMemoryVT(),
8613 Chain: M->getOperand(Num: 0), Ptr: M->getOperand(Num: 2), Val: M->getOperand(Num: 3),
8614 MMO: M->getMemOperand());
8615 }
8616 case Intrinsic::amdgcn_ds_fmin:
8617 case Intrinsic::amdgcn_ds_fmax: {
8618 MemSDNode *M = cast<MemSDNode>(Val&: Op);
8619 unsigned Opc;
8620 switch (IntrID) {
8621 case Intrinsic::amdgcn_ds_fmin:
8622 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
8623 break;
8624 case Intrinsic::amdgcn_ds_fmax:
8625 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
8626 break;
8627 default:
8628 llvm_unreachable("Unknown intrinsic!");
8629 }
8630 SDValue Ops[] = {
8631 M->getOperand(Num: 0), // Chain
8632 M->getOperand(Num: 2), // Ptr
8633 M->getOperand(Num: 3) // Value
8634 };
8635
8636 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: SDLoc(Op), VTList: M->getVTList(), Ops,
8637 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
8638 }
8639 case Intrinsic::amdgcn_buffer_load:
8640 case Intrinsic::amdgcn_buffer_load_format: {
8641 unsigned Glc = Op.getConstantOperandVal(i: 5);
8642 unsigned Slc = Op.getConstantOperandVal(i: 6);
8643 unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 3));
8644 SDValue Ops[] = {
8645 Op.getOperand(0), // Chain
8646 Op.getOperand(2), // rsrc
8647 Op.getOperand(3), // vindex
8648 SDValue(), // voffset -- will be set by setBufferOffsets
8649 SDValue(), // soffset -- will be set by setBufferOffsets
8650 SDValue(), // offset -- will be set by setBufferOffsets
8651 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8652 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8653 };
8654 setBufferOffsets(CombinedOffset: Op.getOperand(i: 4), DAG, Offsets: &Ops[3]);
8655
8656 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
8657 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
8658
8659 EVT VT = Op.getValueType();
8660 EVT IntVT = VT.changeTypeToInteger();
8661 auto *M = cast<MemSDNode>(Val&: Op);
8662 EVT LoadVT = Op.getValueType();
8663
8664 if (LoadVT.getScalarType() == MVT::f16)
8665 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
8666 M, DAG, Ops);
8667
8668 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
8669 if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
8670 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
8671 M->getMemOperand());
8672
8673 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
8674 M->getMemOperand(), DAG);
8675 }
8676 case Intrinsic::amdgcn_raw_buffer_load:
8677 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8678 case Intrinsic::amdgcn_raw_buffer_load_format:
8679 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8680 const bool IsFormat =
8681 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8682 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8683
8684 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
8685 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
8686 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
8687 SDValue Ops[] = {
8688 Op.getOperand(0), // Chain
8689 Rsrc, // rsrc
8690 DAG.getConstant(0, DL, MVT::i32), // vindex
8691 Offsets.first, // voffset
8692 SOffset, // soffset
8693 Offsets.second, // offset
8694 Op.getOperand(5), // cachepolicy, swizzled buffer
8695 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8696 };
8697
8698 auto *M = cast<MemSDNode>(Val&: Op);
8699 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8700 }
8701 case Intrinsic::amdgcn_struct_buffer_load:
8702 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8703 case Intrinsic::amdgcn_struct_buffer_load_format:
8704 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8705 const bool IsFormat =
8706 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8707 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8708
8709 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
8710 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
8711 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
8712 SDValue Ops[] = {
8713 Op.getOperand(0), // Chain
8714 Rsrc, // rsrc
8715 Op.getOperand(3), // vindex
8716 Offsets.first, // voffset
8717 SOffset, // soffset
8718 Offsets.second, // offset
8719 Op.getOperand(6), // cachepolicy, swizzled buffer
8720 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8721 };
8722
8723 return lowerIntrinsicLoad(cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops);
8724 }
8725 case Intrinsic::amdgcn_tbuffer_load: {
8726 MemSDNode *M = cast<MemSDNode>(Val&: Op);
8727 EVT LoadVT = Op.getValueType();
8728
8729 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
8730 unsigned Dfmt = Op.getConstantOperandVal(i: 7);
8731 unsigned Nfmt = Op.getConstantOperandVal(i: 8);
8732 unsigned Glc = Op.getConstantOperandVal(i: 9);
8733 unsigned Slc = Op.getConstantOperandVal(i: 10);
8734 unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 3));
8735 SDValue Ops[] = {
8736 Op.getOperand(0), // Chain
8737 Op.getOperand(2), // rsrc
8738 Op.getOperand(3), // vindex
8739 Op.getOperand(4), // voffset
8740 SOffset, // soffset
8741 Op.getOperand(6), // offset
8742 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8743 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8744 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
8745 };
8746
8747 if (LoadVT.getScalarType() == MVT::f16)
8748 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8749 M, DAG, Ops);
8750 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8751 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8752 DAG);
8753 }
8754 case Intrinsic::amdgcn_raw_tbuffer_load:
8755 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8756 MemSDNode *M = cast<MemSDNode>(Val&: Op);
8757 EVT LoadVT = Op.getValueType();
8758 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
8759 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
8760 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
8761
8762 SDValue Ops[] = {
8763 Op.getOperand(0), // Chain
8764 Rsrc, // rsrc
8765 DAG.getConstant(0, DL, MVT::i32), // vindex
8766 Offsets.first, // voffset
8767 SOffset, // soffset
8768 Offsets.second, // offset
8769 Op.getOperand(5), // format
8770 Op.getOperand(6), // cachepolicy, swizzled buffer
8771 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8772 };
8773
8774 if (LoadVT.getScalarType() == MVT::f16)
8775 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8776 M, DAG, Ops);
8777 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8778 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8779 DAG);
8780 }
8781 case Intrinsic::amdgcn_struct_tbuffer_load:
8782 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8783 MemSDNode *M = cast<MemSDNode>(Val&: Op);
8784 EVT LoadVT = Op.getValueType();
8785 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
8786 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
8787 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
8788
8789 SDValue Ops[] = {
8790 Op.getOperand(0), // Chain
8791 Rsrc, // rsrc
8792 Op.getOperand(3), // vindex
8793 Offsets.first, // voffset
8794 SOffset, // soffset
8795 Offsets.second, // offset
8796 Op.getOperand(6), // format
8797 Op.getOperand(7), // cachepolicy, swizzled buffer
8798 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8799 };
8800
8801 if (LoadVT.getScalarType() == MVT::f16)
8802 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8803 M, DAG, Ops);
8804 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8805 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8806 DAG);
8807 }
8808 case Intrinsic::amdgcn_buffer_atomic_swap:
8809 case Intrinsic::amdgcn_buffer_atomic_add:
8810 case Intrinsic::amdgcn_buffer_atomic_sub:
8811 case Intrinsic::amdgcn_buffer_atomic_csub:
8812 case Intrinsic::amdgcn_buffer_atomic_smin:
8813 case Intrinsic::amdgcn_buffer_atomic_umin:
8814 case Intrinsic::amdgcn_buffer_atomic_smax:
8815 case Intrinsic::amdgcn_buffer_atomic_umax:
8816 case Intrinsic::amdgcn_buffer_atomic_and:
8817 case Intrinsic::amdgcn_buffer_atomic_or:
8818 case Intrinsic::amdgcn_buffer_atomic_xor:
8819 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8820 unsigned Slc = Op.getConstantOperandVal(i: 6);
8821 unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 4));
8822 SDValue Ops[] = {
8823 Op.getOperand(0), // Chain
8824 Op.getOperand(2), // vdata
8825 Op.getOperand(3), // rsrc
8826 Op.getOperand(4), // vindex
8827 SDValue(), // voffset -- will be set by setBufferOffsets
8828 SDValue(), // soffset -- will be set by setBufferOffsets
8829 SDValue(), // offset -- will be set by setBufferOffsets
8830 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8831 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8832 };
8833 setBufferOffsets(CombinedOffset: Op.getOperand(i: 5), DAG, Offsets: &Ops[4]);
8834
8835 EVT VT = Op.getValueType();
8836
8837 auto *M = cast<MemSDNode>(Val&: Op);
8838 unsigned Opcode = 0;
8839
8840 switch (IntrID) {
8841 case Intrinsic::amdgcn_buffer_atomic_swap:
8842 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
8843 break;
8844 case Intrinsic::amdgcn_buffer_atomic_add:
8845 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
8846 break;
8847 case Intrinsic::amdgcn_buffer_atomic_sub:
8848 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
8849 break;
8850 case Intrinsic::amdgcn_buffer_atomic_csub:
8851 Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
8852 break;
8853 case Intrinsic::amdgcn_buffer_atomic_smin:
8854 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
8855 break;
8856 case Intrinsic::amdgcn_buffer_atomic_umin:
8857 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
8858 break;
8859 case Intrinsic::amdgcn_buffer_atomic_smax:
8860 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
8861 break;
8862 case Intrinsic::amdgcn_buffer_atomic_umax:
8863 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
8864 break;
8865 case Intrinsic::amdgcn_buffer_atomic_and:
8866 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
8867 break;
8868 case Intrinsic::amdgcn_buffer_atomic_or:
8869 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
8870 break;
8871 case Intrinsic::amdgcn_buffer_atomic_xor:
8872 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
8873 break;
8874 case Intrinsic::amdgcn_buffer_atomic_fadd:
8875 Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
8876 break;
8877 default:
8878 llvm_unreachable("unhandled atomic opcode");
8879 }
8880
8881 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
8882 M->getMemOperand());
8883 }
8884 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8885 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8886 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
8887 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8888 return lowerRawBufferAtomicIntrin(Op, DAG,
8889 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8890 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8891 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8892 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
8893 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8894 return lowerStructBufferAtomicIntrin(Op, DAG,
8895 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8896 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8898 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
8899 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8900 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8901 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
8902 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8904 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
8905 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8906 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8907 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
8908 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8910 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
8911 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8913 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
8914 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8916 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
8917 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8918 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8919 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
8920 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8921 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8922 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
8923 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8925 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
8926 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8928 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
8929 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8930 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8931 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
8932 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8933 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8934 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
8935 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8936 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8937 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
8938 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8939 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8940 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
8941 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8943 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
8944 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8945 return lowerRawBufferAtomicIntrin(Op, DAG,
8946 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8947 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8949 return lowerStructBufferAtomicIntrin(Op, DAG,
8950 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
8951 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8952 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8953 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
8954 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8955 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8956 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
8957 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8959 return lowerStructBufferAtomicIntrin(Op, DAG,
8960 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
8961 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8963 return lowerStructBufferAtomicIntrin(Op, DAG,
8964 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
8965 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8966 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8967 return lowerStructBufferAtomicIntrin(Op, DAG,
8968 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
8969 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8970 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8971 return lowerStructBufferAtomicIntrin(Op, DAG,
8972 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
8973 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8975 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
8976 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8977 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8978 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
8979 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8980 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8981 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
8982 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8983 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8984 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
8985 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8986 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8987 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
8988 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8989 return lowerStructBufferAtomicIntrin(Op, DAG,
8990 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8991
8992 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
8993 unsigned Slc = Op.getConstantOperandVal(i: 7);
8994 unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 5));
8995 SDValue Ops[] = {
8996 Op.getOperand(0), // Chain
8997 Op.getOperand(2), // src
8998 Op.getOperand(3), // cmp
8999 Op.getOperand(4), // rsrc
9000 Op.getOperand(5), // vindex
9001 SDValue(), // voffset -- will be set by setBufferOffsets
9002 SDValue(), // soffset -- will be set by setBufferOffsets
9003 SDValue(), // offset -- will be set by setBufferOffsets
9004 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
9005 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9006 };
9007 setBufferOffsets(CombinedOffset: Op.getOperand(i: 6), DAG, Offsets: &Ops[5]);
9008
9009 EVT VT = Op.getValueType();
9010 auto *M = cast<MemSDNode>(Val&: Op);
9011
9012 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9013 Op->getVTList(), Ops, VT, M->getMemOperand());
9014 }
9015 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9016 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9017 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG);
9018 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9019 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9020 SDValue Ops[] = {
9021 Op.getOperand(0), // Chain
9022 Op.getOperand(2), // src
9023 Op.getOperand(3), // cmp
9024 Rsrc, // rsrc
9025 DAG.getConstant(0, DL, MVT::i32), // vindex
9026 Offsets.first, // voffset
9027 SOffset, // soffset
9028 Offsets.second, // offset
9029 Op.getOperand(7), // cachepolicy
9030 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9031 };
9032 EVT VT = Op.getValueType();
9033 auto *M = cast<MemSDNode>(Val&: Op);
9034
9035 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9036 Op->getVTList(), Ops, VT, M->getMemOperand());
9037 }
9038 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9039 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9040 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG);
9041 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG);
9042 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget);
9043 SDValue Ops[] = {
9044 Op.getOperand(0), // Chain
9045 Op.getOperand(2), // src
9046 Op.getOperand(3), // cmp
9047 Rsrc, // rsrc
9048 Op.getOperand(5), // vindex
9049 Offsets.first, // voffset
9050 SOffset, // soffset
9051 Offsets.second, // offset
9052 Op.getOperand(8), // cachepolicy
9053 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9054 };
9055 EVT VT = Op.getValueType();
9056 auto *M = cast<MemSDNode>(Val&: Op);
9057
9058 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9059 Op->getVTList(), Ops, VT, M->getMemOperand());
9060 }
9061 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9062 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9063 SDValue NodePtr = M->getOperand(Num: 2);
9064 SDValue RayExtent = M->getOperand(Num: 3);
9065 SDValue RayOrigin = M->getOperand(Num: 4);
9066 SDValue RayDir = M->getOperand(Num: 5);
9067 SDValue RayInvDir = M->getOperand(Num: 6);
9068 SDValue TDescr = M->getOperand(Num: 7);
9069
9070 assert(NodePtr.getValueType() == MVT::i32 ||
9071 NodePtr.getValueType() == MVT::i64);
9072 assert(RayDir.getValueType() == MVT::v3f16 ||
9073 RayDir.getValueType() == MVT::v3f32);
9074
9075 if (!Subtarget->hasGFX10_AEncoding()) {
9076 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
9077 return SDValue();
9078 }
9079
9080 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9081 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9082 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9083 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9084 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9085 const unsigned NumVDataDwords = 4;
9086 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9087 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9088 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9089 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9090 IsGFX12Plus;
9091 const unsigned BaseOpcodes[2][2] = {
9092 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9093 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9094 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9095 int Opcode;
9096 if (UseNSA) {
9097 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9098 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9099 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9100 : AMDGPU::MIMGEncGfx10NSA,
9101 NumVDataDwords, NumVAddrDwords);
9102 } else {
9103 assert(!IsGFX12Plus);
9104 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9105 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9106 : AMDGPU::MIMGEncGfx10Default,
9107 NumVDataDwords, NumVAddrDwords);
9108 }
9109 assert(Opcode != -1);
9110
9111 SmallVector<SDValue, 16> Ops;
9112
9113 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9114 SmallVector<SDValue, 3> Lanes;
9115 DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3);
9116 if (Lanes[0].getValueSizeInBits() == 32) {
9117 for (unsigned I = 0; I < 3; ++I)
9118 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9119 } else {
9120 if (IsAligned) {
9121 Ops.push_back(
9122 DAG.getBitcast(MVT::i32,
9123 DAG.getBuildVector(MVT::v2f16, DL,
9124 { Lanes[0], Lanes[1] })));
9125 Ops.push_back(Elt: Lanes[2]);
9126 } else {
9127 SDValue Elt0 = Ops.pop_back_val();
9128 Ops.push_back(
9129 DAG.getBitcast(MVT::i32,
9130 DAG.getBuildVector(MVT::v2f16, DL,
9131 { Elt0, Lanes[0] })));
9132 Ops.push_back(
9133 DAG.getBitcast(MVT::i32,
9134 DAG.getBuildVector(MVT::v2f16, DL,
9135 { Lanes[1], Lanes[2] })));
9136 }
9137 }
9138 };
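// Note on the packing above (illustrative, A16 case with a v3f32 origin and
// v3f16 dir/inv_dir): the aligned call for RayDir emits one dword {d0,d1} and
// pushes d2 unpaired; the unaligned call for RayInvDir then pops d2 and emits
// {d2,i0} and {i1,i2}, so the six half lanes occupy three dwords with no
// padding.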
9139
9140 if (UseNSA && IsGFX11Plus) {
9141 Ops.push_back(Elt: NodePtr);
9142 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9143 Ops.push_back(Elt: RayOrigin);
9144 if (IsA16) {
9145 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9146 DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3);
9147 DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3);
9148 for (unsigned I = 0; I < 3; ++I) {
9149 MergedLanes.push_back(DAG.getBitcast(
9150 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9151 {DirLanes[I], InvDirLanes[I]})));
9152 }
9153 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9154 } else {
9155 Ops.push_back(Elt: RayDir);
9156 Ops.push_back(Elt: RayInvDir);
9157 }
9158 } else {
9159 if (Is64)
9160 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9161 2);
9162 else
9163 Ops.push_back(Elt: NodePtr);
9164
9165 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9166 packLanes(RayOrigin, true);
9167 packLanes(RayDir, true);
9168 packLanes(RayInvDir, false);
9169 }
9170
9171 if (!UseNSA) {
9172 // Build a single vector containing all the operands so far prepared.
9173 if (NumVAddrDwords > 12) {
9174 SDValue Undef = DAG.getUNDEF(MVT::i32);
9175 Ops.append(NumInputs: 16 - Ops.size(), Elt: Undef);
9176 }
9177 assert(Ops.size() >= 8 && Ops.size() <= 12);
9178 SDValue MergedOps = DAG.getBuildVector(
9179 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9180 Ops.clear();
9181 Ops.push_back(Elt: MergedOps);
9182 }
9183
9184 Ops.push_back(Elt: TDescr);
9185 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9186 Ops.push_back(Elt: M->getChain());
9187
9188 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
9189 MachineMemOperand *MemRef = M->getMemOperand();
9190 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9191 return SDValue(NewNode, 0);
9192 }
9193 case Intrinsic::amdgcn_global_atomic_fmin:
9194 case Intrinsic::amdgcn_global_atomic_fmax:
9195 case Intrinsic::amdgcn_global_atomic_fmin_num:
9196 case Intrinsic::amdgcn_global_atomic_fmax_num:
9197 case Intrinsic::amdgcn_flat_atomic_fmin:
9198 case Intrinsic::amdgcn_flat_atomic_fmax:
9199 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9200 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9201 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9202 SDValue Ops[] = {
9203 M->getOperand(Num: 0), // Chain
9204 M->getOperand(Num: 2), // Ptr
9205 M->getOperand(Num: 3) // Value
9206 };
9207 unsigned Opcode = 0;
9208 switch (IntrID) {
9209 case Intrinsic::amdgcn_global_atomic_fmin:
9210 case Intrinsic::amdgcn_global_atomic_fmin_num:
9211 case Intrinsic::amdgcn_flat_atomic_fmin:
9212 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9213 Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
9214 break;
9215 }
9216 case Intrinsic::amdgcn_global_atomic_fmax:
9217 case Intrinsic::amdgcn_global_atomic_fmax_num:
9218 case Intrinsic::amdgcn_flat_atomic_fmax:
9219 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9220 Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
9221 break;
9222 }
9223 default:
9224 llvm_unreachable("unhandled atomic opcode");
9225 }
9226 return DAG.getMemIntrinsicNode(Opcode, dl: SDLoc(Op),
9227 VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(),
9228 MMO: M->getMemOperand());
9229 }
9230 case Intrinsic::amdgcn_s_get_barrier_state: {
9231 SDValue Chain = Op->getOperand(Num: 0);
9232 SmallVector<SDValue, 2> Ops;
9233 unsigned Opc;
9234 bool IsInlinableBarID = false;
9235 int64_t BarID;
9236
9237 if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) {
9238 BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getSExtValue();
9239 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(Literal: BarID);
9240 }
9241
9242 if (IsInlinableBarID) {
9243 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9244 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9245 Ops.push_back(Elt: K);
9246 } else {
9247 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9248 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 2));
9249 Ops.push_back(Elt: M0Val.getValue(R: 0));
9250 }
9251
9252 auto NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
9253 return SDValue(NewMI, 0);
9254 }
9255 default:
9256
9257 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9258 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
9259 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
9260
9261 return SDValue();
9262 }
9263}
9264
9265// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type
9266// to dwordx4 on subtargets that lack dwordx3 load/stores, and handle TFE loads.
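// For example (illustrative), a TFE load of v2f32 is performed as a v3i32
// load: dwords 0-1 are extracted and bitcast back to v2f32, and dword 2 is
// returned separately as the status value alongside the chain.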
9267SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9268 SDVTList VTList,
9269 ArrayRef<SDValue> Ops, EVT MemVT,
9270 MachineMemOperand *MMO,
9271 SelectionDAG &DAG) const {
9272 LLVMContext &C = *DAG.getContext();
9273 MachineFunction &MF = DAG.getMachineFunction();
9274 EVT VT = VTList.VTs[0];
9275
9276 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9277 bool IsTFE = VTList.NumVTs == 3;
9278 if (IsTFE) {
9279 unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32);
9280 unsigned NumOpDWords = NumValueDWords + 1;
9281 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9282 SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]);
9283 MachineMemOperand *OpDWordsMMO =
9284 MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4);
9285 SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
9286 MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
9287 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9288 DAG.getVectorIdxConstant(NumValueDWords, DL));
9289 SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
9290 SDValue ValueDWords =
9291 NumValueDWords == 1
9292 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9293 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9294 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9295 ZeroIdx);
9296 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
9297 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
9298 }
9299
9300 if (!Subtarget->hasDwordx3LoadStores() &&
9301 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9302 EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4);
9303 EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4);
9304 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16);
9305 SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]);
9306 SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
9307 MemVT: WidenedMemVT, MMO: WidenedMMO);
9308 SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
9309 N2: DAG.getVectorIdxConstant(Val: 0, DL));
9310 return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL);
9311 }
9312
9313 return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
9314}
9315
9316SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9317 bool ImageStore) const {
9318 EVT StoreVT = VData.getValueType();
9319
9320 // No change for f16 and legal vector D16 types.
9321 if (!StoreVT.isVector())
9322 return VData;
9323
9324 SDLoc DL(VData);
9325 unsigned NumElements = StoreVT.getVectorNumElements();
9326
9327 if (Subtarget->hasUnpackedD16VMem()) {
9328 // We need to unpack the packed data to store.
9329 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9330 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
9331
9332 EVT EquivStoreVT =
9333 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9334 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData);
9335 return DAG.UnrollVectorOp(N: ZExt.getNode());
9336 }
9337
9338 // The sq block of gfx8.1 does not estimate register use correctly for d16
9339 // image store instructions. The data operand is computed as if it were not a
9340 // d16 image instruction.
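// Illustrative example: a v4f16 store is rewritten as two packed i32 dwords
// ({e0,e1} and {e2,e3}) padded with undef up to four dwords (v4i32), i.e. one
// dword per element, matching the register count the sq block expects for a
// non-d16 store.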
9341 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9342 // Bitcast to i16
9343 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9344 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
9345
9346 // Decompose into scalars
9347 SmallVector<SDValue, 4> Elts;
9348 DAG.ExtractVectorElements(Op: IntVData, Args&: Elts);
9349
9350 // Group pairs of i16 into v2i16 and bitcast to i32
9351 SmallVector<SDValue, 4> PackedElts;
9352 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9353 SDValue Pair =
9354 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9355 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9356 PackedElts.push_back(Elt: IntPair);
9357 }
9358 if ((NumElements % 2) == 1) {
9359 // Handle v3i16
9360 unsigned I = Elts.size() / 2;
9361 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9362 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9363 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9364 PackedElts.push_back(Elt: IntPair);
9365 }
9366
9367 // Pad using UNDEF
9368 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9369
9370 // Build final vector
9371 EVT VecVT =
9372 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9373 return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts);
9374 }
9375
9376 if (NumElements == 3) {
9377 EVT IntStoreVT =
9378 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits());
9379 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
9380
9381 EVT WidenedStoreVT = EVT::getVectorVT(
9382 Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + 1);
9383 EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(),
9384 BitWidth: WidenedStoreVT.getStoreSizeInBits());
9385 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData);
9386 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt);
9387 }
9388
9389 assert(isTypeLegal(StoreVT));
9390 return VData;
9391}
9392
9393SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9394 SelectionDAG &DAG) const {
9395 SDLoc DL(Op);
9396 SDValue Chain = Op.getOperand(i: 0);
9397 unsigned IntrinsicID = Op.getConstantOperandVal(i: 1);
9398 MachineFunction &MF = DAG.getMachineFunction();
9399
9400 switch (IntrinsicID) {
9401 case Intrinsic::amdgcn_exp_compr: {
9402 if (!Subtarget->hasCompressedExport()) {
9403 DiagnosticInfoUnsupported BadIntrin(
9404 DAG.getMachineFunction().getFunction(),
9405 "intrinsic not supported on subtarget", DL.getDebugLoc());
9406 DAG.getContext()->diagnose(DI: BadIntrin);
9407 }
9408 SDValue Src0 = Op.getOperand(i: 4);
9409 SDValue Src1 = Op.getOperand(i: 5);
9410 // Hack around illegal type on SI by directly selecting it.
9411 if (isTypeLegal(VT: Src0.getValueType()))
9412 return SDValue();
9413
9414 const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: 6));
9415 SDValue Undef = DAG.getUNDEF(MVT::f32);
9416 const SDValue Ops[] = {
9417 Op.getOperand(2), // tgt
9418 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9419 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9420 Undef, // src2
9421 Undef, // src3
9422 Op.getOperand(7), // vm
9423 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9424 Op.getOperand(3), // en
9425 Op.getOperand(0) // Chain
9426 };
9427
9428 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9429 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9430 }
9431 case Intrinsic::amdgcn_s_barrier: {
9432 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9433 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9434 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9435 if (WGSize <= ST.getWavefrontSize())
9436 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9437 Op.getOperand(0)), 0);
9438 }
9439
9440 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9441 if (ST.hasSplitBarriers()) {
9442 SDValue K =
9443 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9444 SDValue BarSignal =
9445 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9446 MVT::Other, K, Op.getOperand(0)),
9447 0);
9448 SDValue BarWait =
9449 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9450 BarSignal.getValue(0)),
9451 0);
9452 return BarWait;
9453 }
9454
9455 return SDValue();
9456 }
9457 case Intrinsic::amdgcn_tbuffer_store: {
9458 SDValue VData = Op.getOperand(i: 2);
9459 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9460 if (IsD16)
9461 VData = handleD16VData(VData, DAG);
9462 unsigned Dfmt = Op.getConstantOperandVal(i: 8);
9463 unsigned Nfmt = Op.getConstantOperandVal(i: 9);
9464 unsigned Glc = Op.getConstantOperandVal(i: 10);
9465 unsigned Slc = Op.getConstantOperandVal(i: 11);
9466 unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 4));
9467 SDValue Ops[] = {
9468 Chain,
9469 VData, // vdata
9470 Op.getOperand(3), // rsrc
9471 Op.getOperand(4), // vindex
9472 Op.getOperand(5), // voffset
9473 Op.getOperand(6), // soffset
9474 Op.getOperand(7), // offset
9475 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
9476 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9477 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9478 };
9479 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9480 AMDGPUISD::TBUFFER_STORE_FORMAT;
9481 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9482 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9483 M->getMemoryVT(), M->getMemOperand());
9484 }
9485
9486 case Intrinsic::amdgcn_struct_tbuffer_store:
9487 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9488 SDValue VData = Op.getOperand(i: 2);
9489 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9490 if (IsD16)
9491 VData = handleD16VData(VData, DAG);
9492 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9493 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9494 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9495 SDValue Ops[] = {
9496 Chain,
9497 VData, // vdata
9498 Rsrc, // rsrc
9499 Op.getOperand(4), // vindex
9500 Offsets.first, // voffset
9501 SOffset, // soffset
9502 Offsets.second, // offset
9503 Op.getOperand(7), // format
9504 Op.getOperand(8), // cachepolicy, swizzled buffer
9505 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9506 };
9507 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9508 AMDGPUISD::TBUFFER_STORE_FORMAT;
9509 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9510 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9511 M->getMemoryVT(), M->getMemOperand());
9512 }
9513
9514 case Intrinsic::amdgcn_raw_tbuffer_store:
9515 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9516 SDValue VData = Op.getOperand(i: 2);
9517 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9518 if (IsD16)
9519 VData = handleD16VData(VData, DAG);
9520 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9521 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9522 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9523 SDValue Ops[] = {
9524 Chain,
9525 VData, // vdata
9526 Rsrc, // rsrc
9527 DAG.getConstant(0, DL, MVT::i32), // vindex
9528 Offsets.first, // voffset
9529 SOffset, // soffset
9530 Offsets.second, // offset
9531 Op.getOperand(6), // format
9532 Op.getOperand(7), // cachepolicy, swizzled buffer
9533 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9534 };
9535 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9536 AMDGPUISD::TBUFFER_STORE_FORMAT;
9537 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9538 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9539 M->getMemoryVT(), M->getMemOperand());
9540 }
9541
9542 case Intrinsic::amdgcn_buffer_store:
9543 case Intrinsic::amdgcn_buffer_store_format: {
9544 SDValue VData = Op.getOperand(i: 2);
9545 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9546 if (IsD16)
9547 VData = handleD16VData(VData, DAG);
9548 unsigned Glc = Op.getConstantOperandVal(i: 6);
9549 unsigned Slc = Op.getConstantOperandVal(i: 7);
9550 unsigned IdxEn = getIdxEn(VIndex: Op.getOperand(i: 4));
9551 SDValue Ops[] = {
9552 Chain,
9553 VData,
9554 Op.getOperand(3), // rsrc
9555 Op.getOperand(4), // vindex
9556 SDValue(), // voffset -- will be set by setBufferOffsets
9557 SDValue(), // soffset -- will be set by setBufferOffsets
9558 SDValue(), // offset -- will be set by setBufferOffsets
9559 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9560 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9561 };
9562 setBufferOffsets(CombinedOffset: Op.getOperand(i: 5), DAG, Offsets: &Ops[4]);
9563
9564 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9565 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9566 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9567 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9568
9569 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9570 EVT VDataType = VData.getValueType().getScalarType();
9571 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9572 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9573
9574 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9575 M->getMemoryVT(), M->getMemOperand());
9576 }
9577
9578 case Intrinsic::amdgcn_raw_buffer_store:
9579 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9580 case Intrinsic::amdgcn_raw_buffer_store_format:
9581 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9582 const bool IsFormat =
9583 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9584 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9585
9586 SDValue VData = Op.getOperand(i: 2);
9587 EVT VDataVT = VData.getValueType();
9588 EVT EltType = VDataVT.getScalarType();
9589 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9590 if (IsD16) {
9591 VData = handleD16VData(VData, DAG);
9592 VDataVT = VData.getValueType();
9593 }
9594
9595 if (!isTypeLegal(VT: VDataVT)) {
9596 VData =
9597 DAG.getNode(Opcode: ISD::BITCAST, DL,
9598 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
9599 }
9600
9601 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9602 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9603 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9604 SDValue Ops[] = {
9605 Chain,
9606 VData,
9607 Rsrc,
9608 DAG.getConstant(0, DL, MVT::i32), // vindex
9609 Offsets.first, // voffset
9610 SOffset, // soffset
9611 Offsets.second, // offset
9612 Op.getOperand(6), // cachepolicy, swizzled buffer
9613 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9614 };
9615 unsigned Opc =
9616 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9617 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9618 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9619
9620 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9621 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9622 return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M);
9623
9624 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9625 M->getMemoryVT(), M->getMemOperand());
9626 }
9627
9628 case Intrinsic::amdgcn_struct_buffer_store:
9629 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9630 case Intrinsic::amdgcn_struct_buffer_store_format:
9631 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9632 const bool IsFormat =
9633 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9634 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9635
9636 SDValue VData = Op.getOperand(i: 2);
9637 EVT VDataVT = VData.getValueType();
9638 EVT EltType = VDataVT.getScalarType();
9639 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9640
9641 if (IsD16) {
9642 VData = handleD16VData(VData, DAG);
9643 VDataVT = VData.getValueType();
9644 }
9645
9646 if (!isTypeLegal(VT: VDataVT)) {
9647 VData =
9648 DAG.getNode(Opcode: ISD::BITCAST, DL,
9649 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
9650 }
9651
9652 auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9653 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9654 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9655 SDValue Ops[] = {
9656 Chain,
9657 VData,
9658 Rsrc,
9659 Op.getOperand(4), // vindex
9660 Offsets.first, // voffset
9661 SOffset, // soffset
9662 Offsets.second, // offset
9663 Op.getOperand(7), // cachepolicy, swizzled buffer
9664 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9665 };
9666 unsigned Opc =
9667 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9668 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9669 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9670
9671 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9672 EVT VDataType = VData.getValueType().getScalarType();
9673 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9674 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9675
9676 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9677 M->getMemoryVT(), M->getMemOperand());
9678 }
9679 case Intrinsic::amdgcn_raw_buffer_load_lds:
9680 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9681 case Intrinsic::amdgcn_struct_buffer_load_lds:
9682 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9683 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9684 unsigned Opc;
9685 bool HasVIndex =
9686 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9687 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9688 unsigned OpOffset = HasVIndex ? 1 : 0;
9689 SDValue VOffset = Op.getOperand(i: 5 + OpOffset);
9690 bool HasVOffset = !isNullConstant(V: VOffset);
9691 unsigned Size = Op->getConstantOperandVal(Num: 4);
9692
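// The opcode suffix encodes the addressing mode: BOTHEN uses both vindex and
// voffset, IDXEN only vindex, OFFEN only voffset, and OFFSET neither.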
9693 switch (Size) {
9694 default:
9695 return SDValue();
9696 case 1:
9697 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9698 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9699 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9700 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9701 break;
9702 case 2:
9703 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9704 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9705 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9706 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9707 break;
9708 case 4:
9709 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9710 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9711 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9712 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9713 break;
9714 }
9715
9716 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
9717
9718 SmallVector<SDValue, 8> Ops;
9719
9720 if (HasVIndex && HasVOffset)
9721 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9722 { Op.getOperand(5), // VIndex
9723 VOffset }));
9724 else if (HasVIndex)
9725 Ops.push_back(Elt: Op.getOperand(i: 5));
9726 else if (HasVOffset)
9727 Ops.push_back(Elt: VOffset);
9728
9729 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9730 Ops.push_back(Elt: Rsrc);
9731 Ops.push_back(Elt: Op.getOperand(i: 6 + OpOffset)); // soffset
9732 Ops.push_back(Elt: Op.getOperand(i: 7 + OpOffset)); // imm offset
9733 unsigned Aux = Op.getConstantOperandVal(i: 8 + OpOffset);
9734 Ops.push_back(
9735 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9736 Ops.push_back(DAG.getTargetConstant(
9737 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9738 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
9739 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
9740
9741 auto *M = cast<MemSDNode>(Val&: Op);
9742 MachineMemOperand *LoadMMO = M->getMemOperand();
9743 // Don't set the offset value here because the pointer points to the base of
9744 // the buffer.
9745 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9746
9747 MachinePointerInfo StorePtrI = LoadPtrI;
9748 LoadPtrI.V = PoisonValue::get(
9749 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
9750 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9751 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9752
9753 auto F = LoadMMO->getFlags() &
9754 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9755 LoadMMO =
9756 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
9757 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
9758
9759 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9760 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t),
9761 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
9762
9763 auto Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops);
9764 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
9765
9766 return SDValue(Load, 0);
9767 }
9768 case Intrinsic::amdgcn_global_load_lds: {
9769 unsigned Opc;
9770 unsigned Size = Op->getConstantOperandVal(Num: 4);
9771 switch (Size) {
9772 default:
9773 return SDValue();
9774 case 1:
9775 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9776 break;
9777 case 2:
9778 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9779 break;
9780 case 4:
9781 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9782 break;
9783 }
9784
9785 auto *M = cast<MemSDNode>(Val&: Op);
9786 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
9787
9788 SmallVector<SDValue, 6> Ops;
9789
9790 SDValue Addr = Op.getOperand(i: 2); // Global ptr
9791 SDValue VOffset;
9792 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9793 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9794 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9795 SDValue LHS = Addr.getOperand(i: 0);
9796 SDValue RHS = Addr.getOperand(i: 1);
9797
9798 if (LHS->isDivergent())
9799 std::swap(a&: LHS, b&: RHS);
9800
9801 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9802 RHS.getOperand(0).getValueType() == MVT::i32) {
9803 // add (i64 sgpr), (zero_extend (i32 vgpr))
9804 Addr = LHS;
9805 VOffset = RHS.getOperand(i: 0);
9806 }
9807 }
9808
9809 Ops.push_back(Elt: Addr);
9810 if (!Addr->isDivergent()) {
9811 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
9812 if (!VOffset)
9813 VOffset = SDValue(
9814 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9815 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9816 Ops.push_back(Elt: VOffset);
9817 }
9818
9819 Ops.push_back(Elt: Op.getOperand(i: 5)); // Offset
9820 Ops.push_back(Elt: Op.getOperand(i: 6)); // CPol
9821 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
9822 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
9823
9824 MachineMemOperand *LoadMMO = M->getMemOperand();
9825 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9826 LoadPtrI.Offset = Op->getConstantOperandVal(Num: 5);
9827 MachinePointerInfo StorePtrI = LoadPtrI;
9828 LoadPtrI.V = PoisonValue::get(
9829 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
9830 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9831 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9832 auto F = LoadMMO->getFlags() &
9833 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9834 LoadMMO =
9835 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
9836 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
9837 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9838 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t), BaseAlignment: Align(4),
9839 AAInfo: LoadMMO->getAAInfo());
9840
9841 auto Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
9842 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
9843
9844 return SDValue(Load, 0);
9845 }
9846 case Intrinsic::amdgcn_end_cf:
9847 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9848 Op->getOperand(2), Chain), 0);
9849 case Intrinsic::amdgcn_s_barrier_init:
9850 case Intrinsic::amdgcn_s_barrier_join:
9851 case Intrinsic::amdgcn_s_wakeup_barrier: {
9852 SDValue Chain = Op->getOperand(Num: 0);
9853 SmallVector<SDValue, 2> Ops;
9854 SDValue BarOp = Op->getOperand(Num: 2);
9855 unsigned Opc;
9856 bool IsInlinableBarID = false;
9857 int64_t BarVal;
9858
9859 if (isa<ConstantSDNode>(Val: BarOp)) {
9860 BarVal = cast<ConstantSDNode>(Val&: BarOp)->getSExtValue();
9861 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(Literal: BarVal);
9862 }
9863
9864 if (IsInlinableBarID) {
9865 switch (IntrinsicID) {
9866 default:
9867 return SDValue();
9868 case Intrinsic::amdgcn_s_barrier_init:
9869 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9870 break;
9871 case Intrinsic::amdgcn_s_barrier_join:
9872 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9873 break;
9874 case Intrinsic::amdgcn_s_wakeup_barrier:
9875 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9876 break;
9877 }
9878
9879 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9880 Ops.push_back(Elt: K);
9881 } else {
9882 switch (IntrinsicID) {
9883 default:
9884 return SDValue();
9885 case Intrinsic::amdgcn_s_barrier_init:
9886 Opc = AMDGPU::S_BARRIER_INIT_M0;
9887 break;
9888 case Intrinsic::amdgcn_s_barrier_join:
9889 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9890 break;
9891 case Intrinsic::amdgcn_s_wakeup_barrier:
9892 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9893 break;
9894 }
9895 }
9896
9897 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9898 SDValue M0Val;
9899 // Member count will be read from M0[16:22]
9900 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9901 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9902
9903 if (!IsInlinableBarID) {
9904 // If the barrier id is not an inline constant, it must be passed in
9905 // M0[4:0]. OR it with the member count so that both fields end up
9906 // in M0.
9907 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9908 Op.getOperand(2), M0Val),
9909 0);
9910 }
9911 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
9912 } else if (!IsInlinableBarID) {
9913 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: BarOp).getValue(R: 0));
9914 }
9915
9916 auto NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
9917 return SDValue(NewMI, 0);
9918 }
9919 default: {
9920 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9921 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
9922 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
9923
9924 return Op;
9925 }
9926 }
9927}
9928
9929// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9930// offset (the offset that is included in bounds checking and swizzling, to be
9931// split between the instruction's voffset and immoffset fields) and soffset
9932// (the offset that is excluded from bounds checking and swizzling, to go in
9933// the instruction's soffset field). This function takes the first kind of
9934// offset and figures out how to split it between voffset and immoffset.
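// For example (illustrative, assuming a maximum immediate offset of 4095): a
// combined offset of 4100 is split into an immoffset of 4 and a voffset of
// 4096, since the large power of 2 stands a better chance of being CSEd with
// the copy/add emitted for a neighbouring access.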
9935std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9936 SDValue Offset, SelectionDAG &DAG) const {
9937 SDLoc DL(Offset);
9938 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
9939 SDValue N0 = Offset;
9940 ConstantSDNode *C1 = nullptr;
9941
9942 if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0)))
9943 N0 = SDValue();
9944 else if (DAG.isBaseWithConstantOffset(Op: N0)) {
9945 C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
9946 N0 = N0.getOperand(i: 0);
9947 }
9948
9949 if (C1) {
9950 unsigned ImmOffset = C1->getZExtValue();
9951 // If the immediate value is too big for the immoffset field, put only bits
9952 // that would normally fit in the immoffset field. The remaining value that
9953 // is copied/added for the voffset field is a large power of 2, and it
9954 // stands more chance of being CSEd with the copy/add for another similar
9955 // load/store.
9956 // However, do not round down if the remaining (overflow) value would be
9957 // negative, as it appears to be illegal to have a negative offset in the
9958 // vgpr, even if adding the immediate offset makes it positive.
9959 unsigned Overflow = ImmOffset & ~MaxImm;
9960 ImmOffset -= Overflow;
9961 if ((int32_t)Overflow < 0) {
9962 Overflow += ImmOffset;
9963 ImmOffset = 0;
9964 }
9965 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9966 if (Overflow) {
9967 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9968 if (!N0)
9969 N0 = OverflowVal;
9970 else {
9971 SDValue Ops[] = { N0, OverflowVal };
9972 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
9973 }
9974 }
9975 }
9976 if (!N0)
9977 N0 = DAG.getConstant(0, DL, MVT::i32);
9978 if (!C1)
9979 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
9980 return {N0, SDValue(C1, 0)};
9981}
9982
9983// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
9984// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
9985// pointed to by Offsets.
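// If no SGPR-friendly split can be found, the whole combined offset is placed
// in voffset, soffset is set to zero (or SGPR_NULL on subtargets with a
// restricted soffset) and the instruction offset is 0.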
9986void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
9987 SelectionDAG &DAG, SDValue *Offsets,
9988 Align Alignment) const {
9989 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9990 SDLoc DL(CombinedOffset);
9991 if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) {
9992 uint32_t Imm = C->getZExtValue();
9993 uint32_t SOffset, ImmOffset;
9994 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
9995 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
9996 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
9997 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
9998 return;
9999 }
10000 }
10001 if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) {
10002 SDValue N0 = CombinedOffset.getOperand(i: 0);
10003 SDValue N1 = CombinedOffset.getOperand(i: 1);
10004 uint32_t SOffset, ImmOffset;
10005 int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
10006 if (Offset >= 0 &&
10007 TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
10008 Offsets[0] = N0;
10009 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10010 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10011 return;
10012 }
10013 }
10014
10015 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10016 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10017 : DAG.getConstant(0, DL, MVT::i32);
10018
10019 Offsets[0] = CombinedOffset;
10020 Offsets[1] = SOffsetZero;
10021 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10022}
10023
10024SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10025 SelectionDAG &DAG) const {
10026 if (!MaybePointer.getValueType().isScalarInteger())
10027 return MaybePointer;
10028
10029 SDLoc DL(MaybePointer);
10030
10031 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10032 return Rsrc;
10033}
10034
// Wrap a global or flat pointer into a buffer resource descriptor, using the
// stride, number of records, and flags specified in the intrinsic.
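// The resulting v4i32 descriptor holds the low 32 bits of the pointer in
// word 0; the low 16 bits of the pointer's high half combined with the stride
// (shifted into the top 16 bits) in word 1; the number of records in word 2;
// and the flags in word 3.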
10037SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10038 SelectionDAG &DAG) const {
10039 SDLoc Loc(Op);
10040
10041 SDValue Pointer = Op->getOperand(Num: 1);
10042 SDValue Stride = Op->getOperand(Num: 2);
10043 SDValue NumRecords = Op->getOperand(Num: 3);
10044 SDValue Flags = Op->getOperand(Num: 4);
10045
10046 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10047 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10048 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10049 std::optional<uint32_t> ConstStride = std::nullopt;
10050 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Val&: Stride))
10051 ConstStride = ConstNode->getZExtValue();
10052
10053 SDValue NewHighHalf = Masked;
10054 if (!ConstStride || *ConstStride != 0) {
10055 SDValue ShiftedStride;
10056 if (ConstStride) {
10057 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10058 } else {
10059 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10060 ShiftedStride =
10061 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10062 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10063 }
10064 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10065 }
10066
10067 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10068 NewHighHalf, NumRecords, Flags);
10069 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10070 return RsrcPtr;
10071}
10072
10073// Handle 8 bit and 16 bit buffer loads
10074SDValue
10075SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
10076 SDLoc DL, ArrayRef<SDValue> Ops,
10077 MachineMemOperand *MMO) const {
10078 EVT IntVT = LoadVT.changeTypeToInteger();
10079 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10080 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10081
10082 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10083 SDValue BufferLoad =
10084 DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO);
10085 SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad);
10086 LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal);
10087
10088 return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL);
10089}
10090
10091// Handle 8 bit and 16 bit buffer stores
10092SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10093 EVT VDataType, SDLoc DL,
10094 SDValue Ops[],
10095 MemSDNode *M) const {
10096 if (VDataType == MVT::f16)
10097 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10098
10099 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10100 Ops[1] = BufferStoreExt;
10101 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10102 AMDGPUISD::BUFFER_STORE_SHORT;
10103 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10104 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType,
10105 MMO: M->getMemOperand());
10106}
10107
10108static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10109 ISD::LoadExtType ExtType, SDValue Op,
10110 const SDLoc &SL, EVT VT) {
10111 if (VT.bitsLT(VT: Op.getValueType()))
10112 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op);
10113
10114 switch (ExtType) {
10115 case ISD::SEXTLOAD:
10116 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op);
10117 case ISD::ZEXTLOAD:
10118 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op);
10119 case ISD::EXTLOAD:
10120 return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op);
10121 case ISD::NON_EXTLOAD:
10122 return Op;
10123 }
10124
10125 llvm_unreachable("invalid ext type");
10126}
10127
10128// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10129// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
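// The sub-dword load is widened to an aligned i32 load; the original zero- or
// sign-extension semantics are then reapplied to the low bits of the 32-bit
// result, and the value is truncated/extended and bitcast back to the original
// type.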
10130SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10131 SelectionDAG &DAG = DCI.DAG;
10132 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10133 return SDValue();
10134
10135 // FIXME: Constant loads should all be marked invariant.
10136 unsigned AS = Ld->getAddressSpace();
10137 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10138 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10139 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10140 return SDValue();
10141
10142 // Don't do this early, since it may interfere with adjacent load merging for
10143 // illegal types. We can avoid losing alignment information for exotic types
10144 // pre-legalize.
10145 EVT MemVT = Ld->getMemoryVT();
10146 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10147 MemVT.getSizeInBits() >= 32)
10148 return SDValue();
10149
10150 SDLoc SL(Ld);
10151
10152 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10153 "unexpected vector extload");
10154
10155 // TODO: Drop only high part of range.
10156 SDValue Ptr = Ld->getBasePtr();
10157 SDValue NewLoad = DAG.getLoad(
10158 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10159 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10160 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10161 nullptr); // Drop ranges
10162
10163 EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
10164 if (MemVT.isFloatingPoint()) {
10165 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10166 "unexpected fp extload");
10167 TruncVT = MemVT.changeTypeToInteger();
10168 }
10169
10170 SDValue Cvt = NewLoad;
10171 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10172 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10173 DAG.getValueType(TruncVT));
10174 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10175 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10176 Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
10177 } else {
10178 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10179 }
10180
10181 EVT VT = Ld->getValueType(ResNo: 0);
10182 EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());
10183
10184 DCI.AddToWorklist(N: Cvt.getNode());
10185
10186 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10187 // the appropriate extension from the 32-bit load.
10188 Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
10189 DCI.AddToWorklist(N: Cvt.getNode());
10190
10191 // Handle conversion back to floating point if necessary.
10192 Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);
10193
10194 return DAG.getMergeValues(Ops: { Cvt, NewLoad.getValue(R: 1) }, dl: SL);
10195}
10196
10197static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10198 const SIMachineFunctionInfo &Info) {
10199 // TODO: Should check if the address can definitely not access stack.
10200 if (Info.isEntryFunction())
10201 return Info.getUserSGPRInfo().hasFlatScratchInit();
10202 return true;
10203}
10204
10205SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10206 SDLoc DL(Op);
10207 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
10208 ISD::LoadExtType ExtType = Load->getExtensionType();
10209 EVT MemVT = Load->getMemoryVT();
10210
10211 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10212 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10213 return SDValue();
10214
10215 // FIXME: Copied from PPC
10216 // First, load into 32 bits, then truncate to 1 bit.
10217
10218 SDValue Chain = Load->getChain();
10219 SDValue BasePtr = Load->getBasePtr();
10220 MachineMemOperand *MMO = Load->getMemOperand();
10221
10222 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10223
10224 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10225 BasePtr, RealMemVT, MMO);
10226
10227 if (!MemVT.isVector()) {
10228 SDValue Ops[] = {
10229 DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
10230 NewLD.getValue(R: 1)
10231 };
10232
10233 return DAG.getMergeValues(Ops, dl: DL);
10234 }
10235
10236 SmallVector<SDValue, 3> Elts;
10237 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10238 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10239 DAG.getConstant(I, DL, MVT::i32));
10240
10241 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10242 }
10243
10244 SDValue Ops[] = {
10245 DAG.getBuildVector(VT: MemVT, DL, Ops: Elts),
10246 NewLD.getValue(R: 1)
10247 };
10248
10249 return DAG.getMergeValues(Ops, dl: DL);
10250 }
10251
10252 if (!MemVT.isVector())
10253 return SDValue();
10254
10255 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10256 "Custom lowering for non-i32 vectors hasn't been implemented.");
10257
10258 Align Alignment = Load->getAlign();
10259 unsigned AS = Load->getAddressSpace();
10260 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10261 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10262 return SplitVectorLoad(Op, DAG);
10263 }
10264
10265 MachineFunction &MF = DAG.getMachineFunction();
10266 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instructions access scratch memory
  // then we need to use the same legalization rules we use for private.
10269 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10270 !Subtarget->hasMultiDwordFlatScratchAddressing())
10271 AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI) ?
10272 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10273
10274 unsigned NumElements = MemVT.getVectorNumElements();
10275
10276 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10277 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10278 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10279 if (MemVT.isPow2VectorType() ||
10280 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10281 return SDValue();
10282 return WidenOrSplitVectorLoad(Op, DAG);
10283 }
10284 // Non-uniform loads will be selected to MUBUF instructions, so they
10285 // have the same legalization requirements as global and private
10286 // loads.
10287 //
10288 }
10289
10290 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10291 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10292 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10293 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10294 Load->isSimple() && isMemOpHasNoClobberedMemOperand(N: Load) &&
10295 Alignment >= Align(4) && NumElements < 32) {
10296 if (MemVT.isPow2VectorType() ||
10297 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10298 return SDValue();
10299 return WidenOrSplitVectorLoad(Op, DAG);
10300 }
10301 // Non-uniform loads will be selected to MUBUF instructions, so they
10302 // have the same legalization requirements as global and private
10303 // loads.
10304 //
10305 }
10306 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10307 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10308 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10309 AS == AMDGPUAS::FLAT_ADDRESS) {
10310 if (NumElements > 4)
10311 return SplitVectorLoad(Op, DAG);
10312 // v3 loads not supported on SI.
10313 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10314 return WidenOrSplitVectorLoad(Op, DAG);
10315
10316 // v3 and v4 loads are supported for private and global memory.
10317 return SDValue();
10318 }
10319 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10320 // Depending on the setting of the private_element_size field in the
10321 // resource descriptor, we can only make private accesses up to a certain
10322 // size.
10323 switch (Subtarget->getMaxPrivateElementSize()) {
10324 case 4: {
10325 SDValue Ops[2];
10326 std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
10327 return DAG.getMergeValues(Ops, dl: DL);
10328 }
10329 case 8:
10330 if (NumElements > 2)
10331 return SplitVectorLoad(Op, DAG);
10332 return SDValue();
10333 case 16:
10334 // Same as global/flat
10335 if (NumElements > 4)
10336 return SplitVectorLoad(Op, DAG);
10337 // v3 loads not supported on SI.
10338 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10339 return WidenOrSplitVectorLoad(Op, DAG);
10340
10341 return SDValue();
10342 default:
10343 llvm_unreachable("unsupported private_element_size");
10344 }
10345 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10346 unsigned Fast = 0;
10347 auto Flags = Load->getMemOperand()->getFlags();
10348 if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
10349 Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
10350 Fast > 1)
10351 return SDValue();
10352
10353 if (MemVT.isVector())
10354 return SplitVectorLoad(Op, DAG);
10355 }
10356
10357 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
10358 VT: MemVT, MMO: *Load->getMemOperand())) {
10359 SDValue Ops[2];
10360 std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: Load, DAG);
10361 return DAG.getMergeValues(Ops, dl: DL);
10362 }
10363
10364 return SDValue();
10365}
10366
10367SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10368 EVT VT = Op.getValueType();
10369 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10370 VT.getSizeInBits() == 512)
10371 return splitTernaryVectorOp(Op, DAG);
10372
10373 assert(VT.getSizeInBits() == 64);
10374
10375 SDLoc DL(Op);
10376 SDValue Cond = Op.getOperand(i: 0);
10377
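  // A 64-bit select is decomposed by bitcasting both operands to v2i32,
  // selecting the low and high dwords separately with the same condition, and
  // bitcasting the rebuilt vector back to the original type.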
10378 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10379 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10380
10381 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10382 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10383
10384 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10385 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10386
10387 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10388
10389 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10390 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10391
10392 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10393
10394 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10395 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res);
10396}
10397
10398// Catch division cases where we can use shortcuts with rcp and rsq
10399// instructions.
10400SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10401 SelectionDAG &DAG) const {
10402 SDLoc SL(Op);
10403 SDValue LHS = Op.getOperand(i: 0);
10404 SDValue RHS = Op.getOperand(i: 1);
10405 EVT VT = Op.getValueType();
10406 const SDNodeFlags Flags = Op->getFlags();
10407
10408 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10409 DAG.getTarget().Options.UnsafeFPMath;
10410
10411 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
    // Without !fpmath accuracy information, we can't do more because we don't
    // know exactly whether rcp is accurate enough to meet the !fpmath
    // requirement. f16 is always accurate enough.
10415 if (!AllowInaccurateRcp && VT != MVT::f16)
10416 return SDValue();
10417
10418 if (CLHS->isExactlyValue(V: 1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation have a worst-case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
      // use it as long as we aren't trying to use denormals.
      //
      // v_rcp_f16 and v_rsq_f16 DO support denormals and have a worst-case
      // error of 0.51 ulp.
10425
10426 // 1.0 / sqrt(x) -> rsq(x)
10427
10428 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10429 // error seems really high at 2^29 ULP.
10430 // 1.0 / x -> rcp(x)
10431 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
10432 }
10433
10434 // Same as for 1.0, but expand the sign out of the constant.
10435 if (CLHS->isExactlyValue(V: -1.0)) {
10436 // -1.0 / x -> rcp (fneg x)
10437 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
10438 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
10439 }
10440 }
10441
10442 // For f16 require afn or arcp.
10443 // For f32 require afn.
10444 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10445 return SDValue();
10446
10447 // Turn into multiply by the reciprocal.
10448 // x / y -> x * (1.0 / y)
10449 SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
10450 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
10451}
10452
10453SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10454 SelectionDAG &DAG) const {
10455 SDLoc SL(Op);
10456 SDValue X = Op.getOperand(i: 0);
10457 SDValue Y = Op.getOperand(i: 1);
10458 EVT VT = Op.getValueType();
10459 const SDNodeFlags Flags = Op->getFlags();
10460
10461 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10462 DAG.getTarget().Options.UnsafeFPMath;
10463 if (!AllowInaccurateDiv)
10464 return SDValue();
10465
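  // Approximate x / y with a reciprocal refined by Newton-Raphson iterations:
  // r = rcp(y) is refined twice via r = r + r * (1 - y * r), then q = x * r is
  // corrected once more with q = q + r * (x - y * q), all using fma.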
10466 SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
10467 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
10468
10469 SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
10470 SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
10471
10472 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
10473 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
10474 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
10475 SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
10476 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
10477 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
10478}
10479
10480static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10481 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10482 SDNodeFlags Flags) {
10483 if (GlueChain->getNumValues() <= 1) {
10484 return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags);
10485 }
10486
10487 assert(GlueChain->getNumValues() == 3);
10488
10489 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10490 switch (Opcode) {
10491 default: llvm_unreachable("no chain equivalent for opcode");
10492 case ISD::FMUL:
10493 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10494 break;
10495 }
10496
10497 return DAG.getNode(Opcode, DL: SL, VTList,
10498 Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)},
10499 Flags);
10500}
10501
10502static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10503 EVT VT, SDValue A, SDValue B, SDValue C,
10504 SDValue GlueChain, SDNodeFlags Flags) {
10505 if (GlueChain->getNumValues() <= 1) {
10506 return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags);
10507 }
10508
10509 assert(GlueChain->getNumValues() == 3);
10510
10511 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10512 switch (Opcode) {
10513 default: llvm_unreachable("no chain equivalent for opcode");
10514 case ISD::FMA:
10515 Opcode = AMDGPUISD::FMA_W_CHAIN;
10516 break;
10517 }
10518
10519 return DAG.getNode(Opcode, DL: SL, VTList,
10520 Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)},
10521 Flags);
10522}
10523
10524SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10525 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10526 return FastLowered;
10527
10528 SDLoc SL(Op);
10529 SDValue Src0 = Op.getOperand(i: 0);
10530 SDValue Src1 = Op.getOperand(i: 1);
10531
10532 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10533 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10534
10535 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10536 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10537
10538 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10539 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10540
10541 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10542}
10543
10544// Faster 2.5 ULP division that does not support denormals.
10545SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10546 SDNodeFlags Flags = Op->getFlags();
10547 SDLoc SL(Op);
10548 SDValue LHS = Op.getOperand(i: 1);
10549 SDValue RHS = Op.getOperand(i: 2);
10550
10551 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10552
10553 const APFloat K0Val(0x1p+96f);
10554 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10555
10556 const APFloat K1Val(0x1p-32f);
10557 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10558
10559 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
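  // If |RHS| exceeds 2^96, pre-scale it by 2^-32 so that the rcp below stays
  // in range, and multiply the quotient by the same factor afterwards so the
  // scaling cancels out.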
10560
10561 EVT SetCCVT =
10562 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10563
10564 SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);
10565
10566 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10567
10568 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10569
10570 // rcp does not support denormals.
10571 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10572
10573 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10574
10575 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10576}
10577
10578// Returns immediate value for setting the F32 denorm mode when using the
10579// S_DENORM_MODE instruction.
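// Bits [1:0] of the returned immediate carry the requested FP32 denormal mode
// and bits [3:2] preserve the function's current FP64/FP16 denormal mode.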
10580static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10581 const SIMachineFunctionInfo *Info,
10582 const GCNSubtarget *ST) {
10583 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10584 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10585 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10586 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10587}
10588
10589SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10590 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10591 return FastLowered;
10592
  // The selection matcher assumes that anything with a chain selects to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
10597 SDNodeFlags Flags = Op->getFlags();
10598 Flags.setNoFPExcept(true);
10599
10600 SDLoc SL(Op);
10601 SDValue LHS = Op.getOperand(i: 0);
10602 SDValue RHS = Op.getOperand(i: 1);
10603
10604 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10605
10606 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10607
10608 SDValue DenominatorScaled = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT,
10609 Ops: {RHS, RHS, LHS}, Flags);
10610 SDValue NumeratorScaled = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT,
10611 Ops: {LHS, RHS, LHS}, Flags);
10612
10613 // Denominator is scaled to not be denormal, so using rcp is ok.
10614 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10615 DenominatorScaled, Flags);
10616 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10617 DenominatorScaled, Flags);
10618
10619 using namespace AMDGPU::Hwreg;
10620 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10621 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10622
10623 const MachineFunction &MF = DAG.getMachineFunction();
10624 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10625 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10626
10627 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10628 const bool HasDynamicDenormals =
10629 (DenormMode.Input == DenormalMode::Dynamic) ||
10630 (DenormMode.Output == DenormalMode::Dynamic);
10631
10632 SDValue SavedDenormMode;
10633
10634 if (!PreservesDenormals) {
10635 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10636 // lowering. The chain dependence is insufficient, and we need glue. We do
10637 // not need the glue variants in a strictfp function.
10638
10639 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10640
10641 SDValue Glue = DAG.getEntryNode();
10642 if (HasDynamicDenormals) {
10643 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10644 DAG.getVTList(MVT::i32, MVT::Glue),
10645 {BitField, Glue});
10646 SavedDenormMode = SDValue(GetReg, 0);
10647
10648 Glue = DAG.getMergeValues(
10649 Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL);
10650 }
10651
10652 SDNode *EnableDenorm;
10653 if (Subtarget->hasDenormModeInst()) {
10654 const SDValue EnableDenormValue =
10655 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);
10656
10657 EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
10658 N2: EnableDenormValue)
10659 .getNode();
10660 } else {
10661 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10662 SL, MVT::i32);
10663 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10664 {EnableDenormValue, BitField, Glue});
10665 }
10666
10667 SDValue Ops[3] = {
10668 NegDivScale0,
10669 SDValue(EnableDenorm, 0),
10670 SDValue(EnableDenorm, 1)
10671 };
10672
10673 NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
10674 }
10675
10676 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10677 ApproxRcp, One, NegDivScale0, Flags);
10678
10679 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10680 ApproxRcp, Fma0, Flags);
10681
10682 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10683 Fma1, Fma1, Flags);
10684
10685 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10686 NumeratorScaled, Mul, Flags);
10687
10688 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10689 Fma2, Fma1, Mul, Fma2, Flags);
10690
10691 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10692 NumeratorScaled, Fma3, Flags);
10693
10694 if (!PreservesDenormals) {
10695 SDNode *DisableDenorm;
10696 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10697 const SDValue DisableDenormValue = getSPDenormModeValue(
10698 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);
10699
10700 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10701 Fma4.getValue(1), DisableDenormValue,
10702 Fma4.getValue(2)).getNode();
10703 } else {
10704 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10705 const SDValue DisableDenormValue =
10706 HasDynamicDenormals
10707 ? SavedDenormMode
10708 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10709
10710 DisableDenorm = DAG.getMachineNode(
10711 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10712 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10713 }
10714
10715 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10716 SDValue(DisableDenorm, 0), DAG.getRoot());
10717 DAG.setRoot(OutputChain);
10718 }
10719
10720 SDValue Scale = NumeratorScaled.getValue(R: 1);
10721 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10722 {Fma4, Fma1, Fma3, Scale}, Flags);
10723
10724 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10725}
10726
10727SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10728 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10729 return FastLowered;
10730
10731 SDLoc SL(Op);
10732 SDValue X = Op.getOperand(i: 0);
10733 SDValue Y = Op.getOperand(i: 1);
10734
10735 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10736
10737 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10738
10739 SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);
10740
10741 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10742
10743 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10744
10745 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10746
10747 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10748
10749 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10750
10751 SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);
10752
10753 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10754 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10755
10756 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10757 NegDivScale0, Mul, DivScale1);
10758
10759 SDValue Scale;
10760
10761 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10762 // Workaround a hardware bug on SI where the condition output from div_scale
10763 // is not usable.
10764
10765 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10766
    // Figure out which scale to use for div_fmas.
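    // The high dwords of the inputs are compared with the high dwords of the
    // div_scale results; the xor of the two comparisons reconstructs the
    // condition bit that div_fmas would normally take from div_scale.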
10768 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10769 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10770 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10771 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10772
10773 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10774 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10775
10776 SDValue Scale0Hi
10777 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10778 SDValue Scale1Hi
10779 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10780
10781 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10782 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10783 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10784 } else {
10785 Scale = DivScale1.getValue(R: 1);
10786 }
10787
10788 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10789 Fma4, Fma3, Mul, Scale);
10790
10791 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10792}
10793
10794SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10795 EVT VT = Op.getValueType();
10796
10797 if (VT == MVT::f32)
10798 return LowerFDIV32(Op, DAG);
10799
10800 if (VT == MVT::f64)
10801 return LowerFDIV64(Op, DAG);
10802
10803 if (VT == MVT::f16)
10804 return LowerFDIV16(Op, DAG);
10805
10806 llvm_unreachable("Unexpected type for fdiv");
10807}
10808
10809SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10810 SDLoc dl(Op);
10811 SDValue Val = Op.getOperand(i: 0);
10812 EVT VT = Val.getValueType();
10813 EVT ResultExpVT = Op->getValueType(ResNo: 1);
10814 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10815
10816 SDValue Mant = DAG.getNode(
10817 ISD::INTRINSIC_WO_CHAIN, dl, VT,
10818 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10819
10820 SDValue Exp = DAG.getNode(
10821 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10822 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10823
10824 if (Subtarget->hasFractBug()) {
10825 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
10826 SDValue Inf = DAG.getConstantFP(
10827 Val: APFloat::getInf(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)), DL: dl, VT);
10828
10829 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10830 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT);
10831 Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
10832 Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
10833 }
10834
10835 SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
10836 return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
10837}
10838
10839SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10840 SDLoc DL(Op);
10841 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
10842 EVT VT = Store->getMemoryVT();
10843
10844 if (VT == MVT::i1) {
10845 return DAG.getTruncStore(Store->getChain(), DL,
10846 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10847 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10848 }
10849
10850 assert(VT.isVector() &&
10851 Store->getValue().getValueType().getScalarType() == MVT::i32);
10852
10853 unsigned AS = Store->getAddressSpace();
10854 if (Subtarget->hasLDSMisalignedBug() &&
10855 AS == AMDGPUAS::FLAT_ADDRESS &&
10856 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10857 return SplitVectorStore(Op, DAG);
10858 }
10859
10860 MachineFunction &MF = DAG.getMachineFunction();
10861 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instructions access scratch memory
  // then we need to use the same legalization rules we use for private.
10864 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10865 !Subtarget->hasMultiDwordFlatScratchAddressing())
10866 AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI) ?
10867 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10868
10869 unsigned NumElements = VT.getVectorNumElements();
10870 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10871 AS == AMDGPUAS::FLAT_ADDRESS) {
10872 if (NumElements > 4)
10873 return SplitVectorStore(Op, DAG);
10874 // v3 stores not supported on SI.
10875 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10876 return SplitVectorStore(Op, DAG);
10877
10878 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
10879 VT, MMO: *Store->getMemOperand()))
10880 return expandUnalignedStore(ST: Store, DAG);
10881
10882 return SDValue();
10883 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10884 switch (Subtarget->getMaxPrivateElementSize()) {
10885 case 4:
10886 return scalarizeVectorStore(ST: Store, DAG);
10887 case 8:
10888 if (NumElements > 2)
10889 return SplitVectorStore(Op, DAG);
10890 return SDValue();
10891 case 16:
10892 if (NumElements > 4 ||
10893 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10894 return SplitVectorStore(Op, DAG);
10895 return SDValue();
10896 default:
10897 llvm_unreachable("unsupported private_element_size");
10898 }
10899 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10900 unsigned Fast = 0;
10901 auto Flags = Store->getMemOperand()->getFlags();
10902 if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
10903 Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
10904 Fast > 1)
10905 return SDValue();
10906
10907 if (VT.isVector())
10908 return SplitVectorStore(Op, DAG);
10909
10910 return expandUnalignedStore(ST: Store, DAG);
10911 }
10912
10913 // Probably an invalid store. If so we'll end up emitting a selection error.
10914 return SDValue();
10915}
10916
10917// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10918SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10919 SDLoc SL(Op);
10920 assert(!Subtarget->has16BitInsts());
10921 SDNodeFlags Flags = Op->getFlags();
10922 SDValue Ext =
10923 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10924
10925 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10926 SDValue Sqrt =
10927 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10928
10929 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10930 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10931}
10932
10933SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10934 SDLoc DL(Op);
10935 SDNodeFlags Flags = Op->getFlags();
10936 MVT VT = Op.getValueType().getSimpleVT();
10937 const SDValue X = Op.getOperand(i: 0);
10938
10939 if (allowApproxFunc(DAG, Flags)) {
10940 // Instruction is 1ulp but ignores denormals.
10941 return DAG.getNode(
10942 ISD::INTRINSIC_WO_CHAIN, DL, VT,
10943 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10944 }
10945
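  // Inputs below 2^-96 are scaled up by 2^32 before the square root is
  // computed, and the result is scaled back down by 2^-16 (the square root of
  // 2^-32) afterwards. Zero and +inf inputs bypass the expansion at the end.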
10946 SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT);
10947 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10948
10949 SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT);
10950
10951 SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);
10952
10953 SDValue SqrtX =
10954 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);
10955
10956 SDValue SqrtS;
10957 if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
10958 SDValue SqrtID =
10959 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
10960 SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);
10961
10962 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
10963 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10964 DAG.getConstant(-1, DL, MVT::i32));
10965 SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);
10966
10967 SDValue NegSqrtSNextDown =
10968 DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);
10969
10970 SDValue SqrtVP =
10971 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);
10972
10973 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10974 DAG.getConstant(1, DL, MVT::i32));
10975 SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);
10976
10977 SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
10978 SDValue SqrtVS =
10979 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);
10980
10981 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
10982 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
10983
10984 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
10985 Flags);
10986
10987 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
10988 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
10989 Flags);
10990 } else {
10991 SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);
10992
10993 SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);
10994
10995 SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT);
10996 SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
10997 SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);
10998
10999 SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
11000 SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
11001 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);
11002
11003 SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
11004 SDValue SqrtD =
11005 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
11006 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
11007 }
11008
11009 SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT);
11010
11011 SDValue ScaledDown =
11012 DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);
11013
11014 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
11015 SDValue IsZeroOrInf =
11016 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11017 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11018
11019 return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
11020}
11021
11022SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11023 // For double type, the SQRT and RSQ instructions don't have required
11024 // precision, we apply Goldschmidt's algorithm to improve the result:
11025 //
11026 // y0 = rsq(x)
11027 // g0 = x * y0
11028 // h0 = 0.5 * y0
11029 //
11030 // r0 = 0.5 - h0 * g0
11031 // g1 = g0 * r0 + g0
11032 // h1 = h0 * r0 + h0
11033 //
11034 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11035 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11036 // h2 = h1 * r1 + h1
11037 //
11038 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11039 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11040 //
11041 // sqrt(x) = g3
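  //
  // Inputs below 2^-767 are first scaled up by 2^256 (via ldexp), and the
  // result is scaled back down by 2^-128 afterwards so the refinement operates
  // on a value safely away from the denormal range.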
11042
11043 SDNodeFlags Flags = Op->getFlags();
11044
11045 SDLoc DL(Op);
11046
11047 SDValue X = Op.getOperand(i: 0);
11048 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11049
11050 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11051
11052 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11053
11054 // Scale up input if it is too small.
11055 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11056 SDValue ScaleUp =
11057 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11058 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11059
11060 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11061
11062 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11063
11064 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11065 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11066
11067 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11068 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11069
11070 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11071
11072 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11073
11074 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11075 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11076
11077 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11078
11079 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11080 SDValue SqrtD1 =
11081 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11082
11083 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11084
11085 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11086 SDValue ScaleDown =
11087 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11088 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11089
11090 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11091 // with finite only or nsz because rsq(+/-0) = +/-inf
11092
11093 // TODO: Check for DAZ and expand to subnormals
11094 SDValue IsZeroOrInf =
11095 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11096 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11097
11098 // If x is +INF, +0, or -0, use its original value
11099 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11100 Flags);
11101}
11102
11103SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11104 SDLoc DL(Op);
11105 EVT VT = Op.getValueType();
11106 SDValue Arg = Op.getOperand(i: 0);
11107 SDValue TrigVal;
11108
11109 // Propagate fast-math flags so that the multiply we introduce can be folded
11110 // if Arg is already the result of a multiply by constant.
11111 auto Flags = Op->getFlags();
11112
11113 SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT);
11114
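  // The hardware sin/cos instructions take their input in units of full
  // revolutions, so the argument is pre-multiplied by 1/(2*pi). Subtargets
  // with a reduced trig input range additionally reduce the product into
  // [0, 1) with FRACT first.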
11115 if (Subtarget->hasTrigReducedRange()) {
11116 SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
11117 TrigVal = DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags);
11118 } else {
11119 TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
11120 }
11121
11122 switch (Op.getOpcode()) {
11123 case ISD::FCOS:
11124 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
11125 case ISD::FSIN:
11126 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
11127 default:
11128 llvm_unreachable("Wrong trig opcode");
11129 }
11130}
11131
11132SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11133 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op);
11134 assert(AtomicNode->isCompareAndSwap());
11135 unsigned AS = AtomicNode->getAddressSpace();
11136
11137 // No custom lowering required for local address space
11138 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11139 return Op;
11140
  // Non-local address space requires custom lowering for atomic compare and
  // swap; the compare and swap values are packed into a v2i32 (or v2i64 for
  // the _X2 variants).
11143 SDLoc DL(Op);
11144 SDValue ChainIn = Op.getOperand(i: 0);
11145 SDValue Addr = Op.getOperand(i: 1);
11146 SDValue Old = Op.getOperand(i: 2);
11147 SDValue New = Op.getOperand(i: 3);
11148 EVT VT = Op.getValueType();
11149 MVT SimpleVT = VT.getSimpleVT();
11150 MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2);
11151
11152 SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old});
11153 SDValue Ops[] = { ChainIn, Addr, NewOld };
11154
11155 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL, VTList: Op->getVTList(),
11156 Ops, MemVT: VT, MMO: AtomicNode->getMemOperand());
11157}
11158
11159//===----------------------------------------------------------------------===//
11160// Custom DAG optimizations
11161//===----------------------------------------------------------------------===//
11162
11163SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11164 DAGCombinerInfo &DCI) const {
11165 EVT VT = N->getValueType(ResNo: 0);
11166 EVT ScalarVT = VT.getScalarType();
11167 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11168 return SDValue();
11169
11170 SelectionDAG &DAG = DCI.DAG;
11171 SDLoc DL(N);
11172
11173 SDValue Src = N->getOperand(Num: 0);
11174 EVT SrcVT = Src.getValueType();
11175
11176 // TODO: We could try to match extracting the higher bytes, which would be
11177 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11178 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11179 // about in practice.
11180 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11181 if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) {
11182 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11183 DCI.AddToWorklist(N: Cvt.getNode());
11184
11185 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11186 if (ScalarVT != MVT::f32) {
11187 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11188 DAG.getTargetConstant(0, DL, MVT::i32));
11189 }
11190 return Cvt;
11191 }
11192 }
11193
11194 return SDValue();
11195}
11196
11197SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11198 DAGCombinerInfo &DCI) const {
11199 SDValue MagnitudeOp = N->getOperand(Num: 0);
11200 SDValue SignOp = N->getOperand(Num: 1);
11201 SelectionDAG &DAG = DCI.DAG;
11202 SDLoc DL(N);
11203
11204 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11205 // lower half with a copy.
11206 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11207 if (MagnitudeOp.getValueType() == MVT::f64) {
11208 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11209 SDValue MagLo =
11210 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11211 DAG.getConstant(0, DL, MVT::i32));
11212 SDValue MagHi =
11213 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11214 DAG.getConstant(1, DL, MVT::i32));
11215
11216 SDValue HiOp =
11217 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11218
11219 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11220
11221 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11222 }
11223
11224 if (SignOp.getValueType() != MVT::f64)
11225 return SDValue();
11226
11227 // Reduce width of sign operand, we only need the highest bit.
11228 //
11229 // fcopysign f64:x, f64:y ->
11230 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11231 // TODO: In some cases it might make sense to go all the way to f16.
11232 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11233 SDValue SignAsF32 =
11234 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11235 DAG.getConstant(1, DL, MVT::i32));
11236
11237 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
11238 N2: SignAsF32);
11239}
11240
11241// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11242// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11243// bits
11244
11245// This is a variant of
11246// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11247//
// The normal DAG combiner will do this, but only if the add has one use, since
// doing it with multiple uses would increase the number of instructions.
11250//
11251// This prevents us from seeing a constant offset that can be folded into a
11252// memory instruction's addressing mode. If we know the resulting add offset of
11253// a pointer can be folded into an addressing offset, we can replace the pointer
11254// operand with the add of new constant offset. This eliminates one of the uses,
11255// and may allow the remaining use to also be simplified.
11256//
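// For example, with multiple uses of the add, (shl (add x, 1024), 2) becomes
// (add (shl x, 2), 4096), and the 4096 can then be folded into the memory
// instruction's addressing mode when that offset is legal for the address
// space.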
11257SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11258 unsigned AddrSpace,
11259 EVT MemVT,
11260 DAGCombinerInfo &DCI) const {
11261 SDValue N0 = N->getOperand(Num: 0);
11262 SDValue N1 = N->getOperand(Num: 1);
11263
11264 // We only do this to handle cases where it's profitable when there are
11265 // multiple uses of the add, so defer to the standard combine.
11266 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11267 N0->hasOneUse())
11268 return SDValue();
11269
11270 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
11271 if (!CN1)
11272 return SDValue();
11273
11274 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
11275 if (!CAdd)
11276 return SDValue();
11277
11278 SelectionDAG &DAG = DCI.DAG;
11279
11280 if (N0->getOpcode() == ISD::OR &&
11281 !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1)))
11282 return SDValue();
11283
11284 // If the resulting offset is too large, we can't fold it into the
11285 // addressing mode offset.
11286 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11287 Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext());
11288
11289 AddrMode AM;
11290 AM.HasBaseReg = true;
11291 AM.BaseOffs = Offset.getSExtValue();
11292 if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
11293 return SDValue();
11294
11295 SDLoc SL(N);
11296 EVT VT = N->getValueType(ResNo: 0);
11297
11298 SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1);
11299 SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);
11300
11301 SDNodeFlags Flags;
11302 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11303 (N0.getOpcode() == ISD::OR ||
11304 N0->getFlags().hasNoUnsignedWrap()));
11305
11306 return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
11307}
11308
/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
/// index must be offset past the chain and the intrinsic ID. Theoretically we
/// would also need to check the specific intrinsic, but they all place the
/// pointer operand first.
11312static unsigned getBasePtrIndex(const MemSDNode *N) {
11313 switch (N->getOpcode()) {
11314 case ISD::STORE:
11315 case ISD::INTRINSIC_W_CHAIN:
11316 case ISD::INTRINSIC_VOID:
11317 return 2;
11318 default:
11319 return 1;
11320 }
11321}
11322
11323SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11324 DAGCombinerInfo &DCI) const {
11325 SelectionDAG &DAG = DCI.DAG;
11326 SDLoc SL(N);
11327
11328 unsigned PtrIdx = getBasePtrIndex(N);
11329 SDValue Ptr = N->getOperand(Num: PtrIdx);
11330
11331 // TODO: We could also do this for multiplies.
11332 if (Ptr.getOpcode() == ISD::SHL) {
11333 SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(),
11334 MemVT: N->getMemoryVT(), DCI);
11335 if (NewPtr) {
11336 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11337
11338 NewOps[PtrIdx] = NewPtr;
11339 return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0);
11340 }
11341 }
11342
11343 return SDValue();
11344}
11345
11346static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11347 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11348 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11349 (Opc == ISD::XOR && Val == 0);
11350}
11351
// Break up a 64-bit bitwise operation with a constant into two 32-bit
// and/or/xor operations. This will typically happen anyway for a VALU 64-bit
// and, and it exposes other 32-bit integer combine opportunities since most
// 64-bit operations are decomposed this way.
// TODO: We won't want this for SALU, especially if the constant is an inline
// immediate.
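// For example, (and i64:x, 0x00000000ffff0000) splits so that the high 32 bits
// reduce to zero and only a 32-bit and of the low half with 0xffff0000 remains.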
11357SDValue SITargetLowering::splitBinaryBitConstantOp(
11358 DAGCombinerInfo &DCI,
11359 const SDLoc &SL,
11360 unsigned Opc, SDValue LHS,
11361 const ConstantSDNode *CRHS) const {
11362 uint64_t Val = CRHS->getZExtValue();
11363 uint32_t ValLo = Lo_32(Value: Val);
11364 uint32_t ValHi = Hi_32(Value: Val);
11365 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11366
11367 if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) ||
11368 bitOpWithConstantIsReducible(Opc, Val: ValHi)) ||
11369 (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
11370 // If we need to materialize a 64-bit immediate, it will be split up later
11371 // anyway. Avoid creating the harder to understand 64-bit immediate
11372 // materialization.
11373 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11374 }
11375
11376 return SDValue();
11377}
11378
11379bool llvm::isBoolSGPR(SDValue V) {
11380 if (V.getValueType() != MVT::i1)
11381 return false;
11382 switch (V.getOpcode()) {
11383 default:
11384 break;
11385 case ISD::SETCC:
11386 case AMDGPUISD::FP_CLASS:
11387 return true;
11388 case ISD::AND:
11389 case ISD::OR:
11390 case ISD::XOR:
11391 return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1));
11392 }
11393 return false;
11394}
11395
// If a constant has all zeroes or all ones within each byte, return it.
// Otherwise return 0.
11398static uint32_t getConstantPermuteMask(uint32_t C) {
11399 // 0xff for any zero byte in the mask
11400 uint32_t ZeroByteMask = 0;
11401 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11402 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11403 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11404 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11405 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11406 if ((NonZeroByteMask & C) != NonZeroByteMask)
11407 return 0; // Partial bytes selected.
11408 return C;
11409}
11410
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns the select mask as used by
// v_perm_b32, or all-ones (~0) if it does not.
11414// Note byte select encoding:
11415// value 0-3 selects corresponding source byte;
11416// value 0xc selects zero;
11417// value 0xff selects 0xff.
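// For example, (and x, 0x0000ffff) yields the mask 0x0c0c0100: bytes 0 and 1
// are taken from the source and bytes 2 and 3 are zeroed.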
11418static uint32_t getPermuteMask(SDValue V) {
11419 assert(V.getValueSizeInBits() == 32);
11420
11421 if (V.getNumOperands() != 2)
11422 return ~0;
11423
11424 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1));
11425 if (!N1)
11426 return ~0;
11427
11428 uint32_t C = N1->getZExtValue();
11429
11430 switch (V.getOpcode()) {
11431 default:
11432 break;
11433 case ISD::AND:
11434 if (uint32_t ConstMask = getConstantPermuteMask(C))
11435 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11436 break;
11437
11438 case ISD::OR:
11439 if (uint32_t ConstMask = getConstantPermuteMask(C))
11440 return (0x03020100 & ~ConstMask) | ConstMask;
11441 break;
11442
11443 case ISD::SHL:
11444 if (C % 8)
11445 return ~0;
11446
11447 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11448
11449 case ISD::SRL:
11450 if (C % 8)
11451 return ~0;
11452
11453 return uint32_t(0x0c0c0c0c03020100ull >> C);
11454 }
11455
11456 return ~0;
11457}
11458
11459SDValue SITargetLowering::performAndCombine(SDNode *N,
11460 DAGCombinerInfo &DCI) const {
11461 if (DCI.isBeforeLegalize())
11462 return SDValue();
11463
11464 SelectionDAG &DAG = DCI.DAG;
11465 EVT VT = N->getValueType(ResNo: 0);
11466 SDValue LHS = N->getOperand(Num: 0);
11467 SDValue RHS = N->getOperand(Num: 1);
11468
11470 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
11471 if (VT == MVT::i64 && CRHS) {
11472 if (SDValue Split
11473 = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS))
11474 return Split;
11475 }
11476
11477 if (CRHS && VT == MVT::i32) {
11478 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11479 // nb = number of trailing zeroes in mask
11480 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11481 // given that we are selecting 8- or 16-bit fields starting at a byte boundary.
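// For example, (and (srl x, 8), 0xff00) becomes (shl (bfe x, 16, 8), 8).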
11482 uint64_t Mask = CRHS->getZExtValue();
11483 unsigned Bits = llvm::popcount(Value: Mask);
11484 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11485 (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) {
11486 if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) {
11487 unsigned Shift = CShift->getZExtValue();
11488 unsigned NB = CRHS->getAPIntValue().countr_zero();
11489 unsigned Offset = NB + Shift;
11490 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11491 SDLoc SL(N);
11492 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11493 LHS->getOperand(0),
11494 DAG.getConstant(Offset, SL, MVT::i32),
11495 DAG.getConstant(Bits, SL, MVT::i32));
11496 EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
11497 SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
11498 N2: DAG.getValueType(NarrowVT));
11499 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11500 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11501 return Shl;
11502 }
11503 }
11504 }
11505
11506 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11507 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11508 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
11509 uint32_t Sel = getConstantPermuteMask(C: Mask);
11510 if (!Sel)
11511 return SDValue();
11512
11513 // Select 0xc for all zero bytes
11514 Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c);
11515 SDLoc DL(N);
11516 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11517 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11518 }
11519 }
11520
11521 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11522 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11523 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11524 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
11525 ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get();
11526
11527 SDValue X = LHS.getOperand(i: 0);
11528 SDValue Y = RHS.getOperand(i: 0);
11529 if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X ||
11530 !isTypeLegal(VT: X.getValueType()))
11531 return SDValue();
11532
11533 if (LCC == ISD::SETO) {
11534 if (X != LHS.getOperand(i: 1))
11535 return SDValue();
11536
11537 if (RCC == ISD::SETUNE) {
11538 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1));
11539 if (!C1 || !C1->isInfinity() || C1->isNegative())
11540 return SDValue();
11541
11542 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11543 SIInstrFlags::N_SUBNORMAL |
11544 SIInstrFlags::N_ZERO |
11545 SIInstrFlags::P_ZERO |
11546 SIInstrFlags::P_SUBNORMAL |
11547 SIInstrFlags::P_NORMAL;
11548
11549 static_assert(((~(SIInstrFlags::S_NAN |
11550 SIInstrFlags::Q_NAN |
11551 SIInstrFlags::N_INFINITY |
11552 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11553 "mask not equal");
11554
11555 SDLoc DL(N);
11556 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11557 X, DAG.getConstant(Mask, DL, MVT::i32));
11558 }
11559 }
11560 }
11561
11562 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11563 std::swap(a&: LHS, b&: RHS);
11564
11565 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11566 RHS.hasOneUse()) {
11567 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
11568 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11569 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11570 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
11571 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11572 (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) &&
11573 LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) {
11574 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11575 unsigned NewMask = LCC == ISD::SETO ?
11576 Mask->getZExtValue() & ~OrdMask :
11577 Mask->getZExtValue() & OrdMask;
11578
11579 SDLoc DL(N);
11580 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11581 DAG.getConstant(NewMask, DL, MVT::i32));
11582 }
11583 }
11584
11585 if (VT == MVT::i32 &&
11586 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11587 // and x, (sext cc from i1) => select cc, x, 0
11588 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11589 std::swap(a&: LHS, b&: RHS);
11590 if (isBoolSGPR(RHS.getOperand(0)))
11591 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11592 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11593 }
11594
11595 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
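// For example, (and (or x, 0x00ffff00), (or y, 0xff0000ff)) becomes
// (perm x, y, 0x07020104): bytes 0 and 3 are taken from x, bytes 1 and 2
// from y.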
11596 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11597 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11598 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11599 uint32_t LHSMask = getPermuteMask(V: LHS);
11600 uint32_t RHSMask = getPermuteMask(V: RHS);
11601 if (LHSMask != ~0u && RHSMask != ~0u) {
11602 // Canonicalize the expression in an attempt to have fewer unique masks
11603 // and therefore fewer registers used to hold the masks.
11604 if (LHSMask > RHSMask) {
11605 std::swap(a&: LHSMask, b&: RHSMask);
11606 std::swap(a&: LHS, b&: RHS);
11607 }
11608
11609 // Select 0xc for each lane used from the source operand: zero bytes have the
11610 // 0xc bits set, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
11611 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11612 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11613
11614 // Check if we need to combine values from two sources within a byte.
11615 if (!(LHSUsedLanes & RHSUsedLanes) &&
11616 // If we select high and lower word keep it for SDWA.
11617 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11618 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11619 // Each byte of each mask is either a lane selector (0-3), 0xff (select
11620 // 0xff), or 0x0c (select zero). If either mask has 0x0c in a byte, the
11621 // result byte must be 0x0c; otherwise the byte that is not 0xff wins.
11622 // Anding the two masks therefore gives the correct result, except that a
11623 // 0x0c byte may need to be restored to exactly 0x0c afterwards.
11624 uint32_t Mask = LHSMask & RHSMask;
11625 for (unsigned I = 0; I < 32; I += 8) {
11626 uint32_t ByteSel = 0xff << I;
11627 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11628 Mask &= (0x0c << I) & 0xffffffff;
11629 }
11630
11631 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11632 // or 0x0c.
11633 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11634 SDLoc DL(N);
11635
11636 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11637 LHS.getOperand(0), RHS.getOperand(0),
11638 DAG.getConstant(Sel, DL, MVT::i32));
11639 }
11640 }
11641 }
11642
11643 return SDValue();
11644}
11645
11646// A key component of v_perm is a mapping between byte position of the src
11647// operands, and the byte position of the dest. To provide such, we need: 1. the
11648// node that provides x byte of the dest of the OR, and 2. the byte of the node
11649// used to provide that x byte. calculateByteProvider finds which node provides
11650// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
11651 // and finds an ultimate src and byte position. For example, the supported
11652 // LoadCombine pattern for vector loads is as follows:
11653// t1
11654// or
11655// / \
11656// t2 t3
11657// zext shl
11658// | | \
11659// t4 t5 16
11660// or anyext
11661// / \ |
11662// t6 t7 t8
11663// srl shl or
11664// / | / \ / \
11665// t9 t10 t11 t12 t13 t14
11666// trunc* 8 trunc* 8 and and
11667// | | / | | \
11668// t15 t16 t17 t18 t19 t20
11669// trunc* 255 srl -256
11670// | / \
11671// t15 t15 16
11672//
11673// *In this example, the truncs are from i32->i16
11674//
11675// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11676// respectively. calculateSrcByte would find (given node) -> ultimate src &
11677 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11678// After finding the mapping, we can combine the tree into vperm t15, t16,
11679// 0x05000407
11680
11681// Find the source and byte position from a node.
11682 // \p DestByte is the byte position of the dest of the or that the src
11683 // ultimately provides. \p SrcIndex is the byte of the src that maps to that
11684 // byte of the dest of the or. \p Depth tracks how many recursive iterations
11685 // we have performed.
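// For example, calculateSrcByte on (srl t, 16) with SrcIndex = 0 resolves to
// byte 2 of t.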
11686static const std::optional<ByteProvider<SDValue>>
11687calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11688 unsigned Depth = 0) {
11689 // We may need to recursively traverse a series of SRLs
11690 if (Depth >= 6)
11691 return std::nullopt;
11692
11693 if (Op.getValueSizeInBits() < 8)
11694 return std::nullopt;
11695
11696 if (Op.getValueType().isVector())
11697 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
11698
11699 switch (Op->getOpcode()) {
11700 case ISD::TRUNCATE: {
11701 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
11702 }
11703
11704 case ISD::SIGN_EXTEND:
11705 case ISD::ZERO_EXTEND:
11706 case ISD::SIGN_EXTEND_INREG: {
11707 SDValue NarrowOp = Op->getOperand(Num: 0);
11708 auto NarrowVT = NarrowOp.getValueType();
11709 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11710 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
11711 NarrowVT = VTSign->getVT();
11712 }
11713 if (!NarrowVT.isByteSized())
11714 return std::nullopt;
11715 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11716
11717 if (SrcIndex >= NarrowByteWidth)
11718 return std::nullopt;
11719 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
11720 }
11721
11722 case ISD::SRA:
11723 case ISD::SRL: {
11724 auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
11725 if (!ShiftOp)
11726 return std::nullopt;
11727
11728 uint64_t BitShift = ShiftOp->getZExtValue();
11729
11730 if (BitShift % 8 != 0)
11731 return std::nullopt;
11732
11733 SrcIndex += BitShift / 8;
11734
11735 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
11736 }
11737
11738 default: {
11739 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
11740 }
11741 }
11742 llvm_unreachable("fully handled switch");
11743}
11744
11745// For a byte position in the result of an Or, traverse the tree and find the
11746// node (and the byte of the node) which ultimately provides this {Or,
11747// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11748// the byte position of the Op that corresponds with the originally requested
11749 // byte of the Or. \p Depth tracks how many recursive iterations we have
11750 // performed. \p StartingIndex is the originally requested byte of the Or.
11751static const std::optional<ByteProvider<SDValue>>
11752calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11753 unsigned StartingIndex = 0) {
11754 // Finding Src tree of RHS of or typically requires at least 1 additional
11755 // depth
11756 if (Depth > 6)
11757 return std::nullopt;
11758
11759 unsigned BitWidth = Op.getScalarValueSizeInBits();
11760 if (BitWidth % 8 != 0)
11761 return std::nullopt;
11762 if (Index > BitWidth / 8 - 1)
11763 return std::nullopt;
11764
11765 bool IsVec = Op.getValueType().isVector();
11766 switch (Op.getOpcode()) {
11767 case ISD::OR: {
11768 if (IsVec)
11769 return std::nullopt;
11770
11771 auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1,
11772 StartingIndex);
11773 if (!RHS)
11774 return std::nullopt;
11775 auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
11776 StartingIndex);
11777 if (!LHS)
11778 return std::nullopt;
11779 // A well formed Or will have two ByteProviders for each byte, one of which
11780 // is constant zero
11781 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11782 return std::nullopt;
11783 if (!LHS || LHS->isConstantZero())
11784 return RHS;
11785 if (!RHS || RHS->isConstantZero())
11786 return LHS;
11787 return std::nullopt;
11788 }
11789
11790 case ISD::AND: {
11791 if (IsVec)
11792 return std::nullopt;
11793
11794 auto BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
11795 if (!BitMaskOp)
11796 return std::nullopt;
11797
11798 uint32_t BitMask = BitMaskOp->getZExtValue();
11799 // Bits we expect to be set for the byte at Index
11800 uint32_t IndexMask = 0xFF << (Index * 8);
11801
11802 if ((IndexMask & BitMask) != IndexMask) {
11803 // If the result of the and partially provides the byte, then it
11804 // is not well formatted
11805 if (IndexMask & BitMask)
11806 return std::nullopt;
11807 return ByteProvider<SDValue>::getConstantZero();
11808 }
11809
11810 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index);
11811 }
11812
11813 case ISD::FSHR: {
11814 if (IsVec)
11815 return std::nullopt;
11816
11817 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
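// For example, for i32 fshr(X, Y, 8) byte 3 of the result is byte 0 of X and
// bytes 0-2 are bytes 1-3 of Y; the index is routed through the X:Y
// concatenation below.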
11818 auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
11819 if (!ShiftOp || Op.getValueType().isVector())
11820 return std::nullopt;
11821
11822 uint64_t BitsProvided = Op.getValueSizeInBits();
11823 if (BitsProvided % 8 != 0)
11824 return std::nullopt;
11825
11826 uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
11827 if (BitShift % 8)
11828 return std::nullopt;
11829
11830 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11831 uint64_t ByteShift = BitShift / 8;
11832
11833 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11834 uint64_t BytesProvided = BitsProvided / 8;
11835 SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1);
11836 NewIndex %= BytesProvided;
11837 return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex);
11838 }
11839
11840 case ISD::SRA:
11841 case ISD::SRL: {
11842 if (IsVec)
11843 return std::nullopt;
11844
11845 auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
11846 if (!ShiftOp)
11847 return std::nullopt;
11848
11849 uint64_t BitShift = ShiftOp->getZExtValue();
11850 if (BitShift % 8)
11851 return std::nullopt;
11852
11853 auto BitsProvided = Op.getScalarValueSizeInBits();
11854 if (BitsProvided % 8 != 0)
11855 return std::nullopt;
11856
11857 uint64_t BytesProvided = BitsProvided / 8;
11858 uint64_t ByteShift = BitShift / 8;
11859 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11860 // If the byte we are trying to provide (as tracked by index) falls in this
11861 // range, then the SRL provides the byte. The byte of interest of the src of
11862 // the SRL is Index + ByteShift
11863 return BytesProvided - ByteShift > Index
11864 ? calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex,
11865 SrcIndex: Index + ByteShift)
11866 : ByteProvider<SDValue>::getConstantZero();
11867 }
11868
11869 case ISD::SHL: {
11870 if (IsVec)
11871 return std::nullopt;
11872
11873 auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
11874 if (!ShiftOp)
11875 return std::nullopt;
11876
11877 uint64_t BitShift = ShiftOp->getZExtValue();
11878 if (BitShift % 8 != 0)
11879 return std::nullopt;
11880 uint64_t ByteShift = BitShift / 8;
11881
11882 // If we are shifting by an amount greater than the index we are trying
11883 // to provide, then this byte is a 0. If not, the byte is not definitively
11884 // 0, and the corresponding byte of interest is byte Index - ByteShift of
11885 // the src.
11886 return Index < ByteShift
11887 ? ByteProvider<SDValue>::getConstantZero()
11888 : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift,
11889 Depth: Depth + 1, StartingIndex);
11890 }
11891 case ISD::ANY_EXTEND:
11892 case ISD::SIGN_EXTEND:
11893 case ISD::ZERO_EXTEND:
11894 case ISD::SIGN_EXTEND_INREG:
11895 case ISD::AssertZext:
11896 case ISD::AssertSext: {
11897 if (IsVec)
11898 return std::nullopt;
11899
11900 SDValue NarrowOp = Op->getOperand(Num: 0);
11901 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11902 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11903 Op->getOpcode() == ISD::AssertZext ||
11904 Op->getOpcode() == ISD::AssertSext) {
11905 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
11906 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11907 }
11908 if (NarrowBitWidth % 8 != 0)
11909 return std::nullopt;
11910 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11911
11912 if (Index >= NarrowByteWidth)
11913 return Op.getOpcode() == ISD::ZERO_EXTEND
11914 ? std::optional<ByteProvider<SDValue>>(
11915 ByteProvider<SDValue>::getConstantZero())
11916 : std::nullopt;
11917 return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex);
11918 }
11919
11920 case ISD::TRUNCATE: {
11921 if (IsVec)
11922 return std::nullopt;
11923
11924 uint64_t NarrowByteWidth = BitWidth / 8;
11925
11926 if (NarrowByteWidth >= Index) {
11927 return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
11928 StartingIndex);
11929 }
11930
11931 return std::nullopt;
11932 }
11933
11934 case ISD::CopyFromReg: {
11935 if (BitWidth / 8 > Index)
11936 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
11937
11938 return std::nullopt;
11939 }
11940
11941 case ISD::LOAD: {
11942 auto L = cast<LoadSDNode>(Val: Op.getNode());
11943
11944 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11945 if (NarrowBitWidth % 8 != 0)
11946 return std::nullopt;
11947 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11948
11949 // If the width of the load does not reach the byte we are trying to provide
11950 // for and it is not a ZEXTLOAD, then the load does not provide for the byte
11951 // in question.
11952 if (Index >= NarrowByteWidth) {
11953 return L->getExtensionType() == ISD::ZEXTLOAD
11954 ? std::optional<ByteProvider<SDValue>>(
11955 ByteProvider<SDValue>::getConstantZero())
11956 : std::nullopt;
11957 }
11958
11959 if (NarrowByteWidth > Index) {
11960 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
11961 }
11962
11963 return std::nullopt;
11964 }
11965
11966 case ISD::BSWAP: {
11967 if (IsVec)
11968 return std::nullopt;
11969
11970 return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1,
11971 Depth: Depth + 1, StartingIndex);
11972 }
11973
11974 case ISD::EXTRACT_VECTOR_ELT: {
11975 auto IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
11976 if (!IdxOp)
11977 return std::nullopt;
11978 auto VecIdx = IdxOp->getZExtValue();
11979 auto ScalarSize = Op.getScalarValueSizeInBits();
11980 if (ScalarSize != 32) {
11981 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
11982 }
11983
11984 return calculateSrcByte(Op: ScalarSize == 32 ? Op : Op.getOperand(i: 0),
11985 DestByte: StartingIndex, SrcIndex: Index);
11986 }
11987
11988 case AMDGPUISD::PERM: {
11989 if (IsVec)
11990 return std::nullopt;
11991
11992 auto PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
11993 if (!PermMask)
11994 return std::nullopt;
11995
11996 auto IdxMask =
11997 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
11998 if (IdxMask > 0x07 && IdxMask != 0x0c)
11999 return std::nullopt;
12000
12001 auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1);
12002 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12003
12004 return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
12005 : ByteProvider<SDValue>(
12006 ByteProvider<SDValue>::getConstantZero());
12007 }
12008
12009 default: {
12010 return std::nullopt;
12011 }
12012 }
12013
12014 llvm_unreachable("fully handled switch");
12015}
12016
12017 // Returns true if the Operand is extended or loaded from a scalar 16-bit value.
12018static bool isExtendedFrom16Bits(SDValue &Operand) {
12019
12020 switch (Operand.getOpcode()) {
12021 case ISD::ANY_EXTEND:
12022 case ISD::SIGN_EXTEND:
12023 case ISD::ZERO_EXTEND: {
12024 auto OpVT = Operand.getOperand(i: 0).getValueType();
12025 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12026 }
12027 case ISD::LOAD: {
12028 LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode());
12029 auto ExtType = L->getExtensionType();
12030 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12031 ExtType == ISD::EXTLOAD) {
12032 auto MemVT = L->getMemoryVT();
12033 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12034 }
12035 return L->getMemoryVT().getSizeInBits() == 16;
12036 }
12037 default:
12038 return false;
12039 }
12040}
12041
12042 // Returns true if the mask matches consecutive bytes and the first byte
12043 // begins at an even (16-bit aligned) byte offset from the 0th byte.
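// For example, masks 0x0504 and 0x0100 return true, while 0x0201 (starts at
// byte 1) and 0x0500 (bytes are not consecutive) return false.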
12044static bool addresses16Bits(int Mask) {
12045 int Low8 = Mask & 0xff;
12046 int Hi8 = (Mask & 0xff00) >> 8;
12047
12048 assert(Low8 < 8 && Hi8 < 8);
12049 // Are the bytes contiguous in the order of increasing addresses.
12050 bool IsConsecutive = (Hi8 - Low8 == 1);
12051 // Is the first byte at location that is aligned for 16 bit instructions.
12052 // A counterexample is taking 2 consecutive bytes starting at bit 8 (byte 1).
12053 // In this case, we still need code to extract the 16 bit operand, so it
12054 // is better to use i8 v_perm
12055 bool Is16Aligned = !(Low8 % 2);
12056
12057 return IsConsecutive && Is16Aligned;
12058}
12059
12060// Do not lower into v_perm if the operands are actually 16 bit
12061// and the selected bits (based on PermMask) correspond with two
12062// easily addressable 16 bit operands.
12063static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12064 SDValue &OtherOp) {
12065 int Low16 = PermMask & 0xffff;
12066 int Hi16 = (PermMask & 0xffff0000) >> 16;
12067
12068 auto TempOp = peekThroughBitcasts(V: Op);
12069 auto TempOtherOp = peekThroughBitcasts(V: OtherOp);
12070
12071 auto OpIs16Bit =
12072 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp);
12073 if (!OpIs16Bit)
12074 return true;
12075
12076 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12077 isExtendedFrom16Bits(Operand&: TempOtherOp);
12078 if (!OtherOpIs16Bit)
12079 return true;
12080
12081 // Do we cleanly address both
12082 return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16);
12083}
12084
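// Return the 32-bit dword at dword offset \p DWordOffset of \p Src, bitcast to
// i32. Sources of at most 32 bits are used directly; wider scalars are shifted
// right and truncated, and vectors have the covering elements extracted.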
12085static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12086 unsigned DWordOffset) {
12087 SDValue Ret;
12088
12089 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12090 // ByteProvider must be at least 8 bits
12091 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12092
12093 if (TypeSize <= 32)
12094 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12095
12096 if (Src.getValueType().isVector()) {
12097 auto ScalarTySize = Src.getScalarValueSizeInBits();
12098 auto ScalarTy = Src.getValueType().getScalarType();
12099 if (ScalarTySize == 32) {
12100 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12101 DAG.getConstant(DWordOffset, SL, MVT::i32));
12102 }
12103 if (ScalarTySize > 32) {
12104 Ret = DAG.getNode(
12105 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12106 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12107 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12108 if (ShiftVal)
12109 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12110 DAG.getConstant(ShiftVal, SL, MVT::i32));
12111 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12112 }
12113
12114 assert(ScalarTySize < 32);
12115 auto NumElements = TypeSize / ScalarTySize;
12116 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12117 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12118 auto NumElementsIn32 = 32 / ScalarTySize;
12119 auto NumAvailElements = DWordOffset < Trunc32Elements
12120 ? NumElementsIn32
12121 : NumElements - NormalizedTrunc;
12122
12123 SmallVector<SDValue, 4> VecSrcs;
12124 DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
12125 Count: NumAvailElements);
12126
12127 Ret = DAG.getBuildVector(
12128 VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
12129 Ops: VecSrcs);
12130 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12131 }
12132
12133 // Scalar source: shift the requested dword down into the low 32 bits.
12134 auto ShiftVal = 32 * DWordOffset;
12135 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12136 DAG.getConstant(ShiftVal, SL, MVT::i32));
12137 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12138}
12139
12140static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12141 SelectionDAG &DAG = DCI.DAG;
12142 [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0);
12143 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12144
12145 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12146 assert(VT == MVT::i32);
12147 for (int i = 0; i < 4; i++) {
12148 // Find the ByteProvider that provides the ith byte of the result of OR
12149 std::optional<ByteProvider<SDValue>> P =
12150 calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i);
12151 // TODO support constantZero
12152 if (!P || P->isConstantZero())
12153 return SDValue();
12154
12155 PermNodes.push_back(Elt: *P);
12156 }
12157 if (PermNodes.size() != 4)
12158 return SDValue();
12159
12160 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12161 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12162 uint64_t PermMask = 0x00000000;
12163 for (size_t i = 0; i < PermNodes.size(); i++) {
12164 auto PermOp = PermNodes[i];
12165 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12166 // by sizeof(Src2) = 4
12167 int SrcByteAdjust = 4;
12168
12169 // If the Src uses a byte from a different DWORD, then it corresponds
12170 // with a different source.
12171 if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) ||
12172 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12173 if (SecondSrc)
12174 if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) ||
12175 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12176 return SDValue();
12177
12178 // Set the index of the second distinct Src node
12179 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12180 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12181 SrcByteAdjust = 0;
12182 }
12183 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12184 assert(!DAG.getDataLayout().isBigEndian());
12185 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12186 }
12187 SDLoc DL(N);
12188 SDValue Op = *PermNodes[FirstSrc.first].Src;
12189 Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
12190 assert(Op.getValueSizeInBits() == 32);
12191
12192 // Check that we are not just extracting the bytes in order from an op
12193 if (!SecondSrc) {
12194 int Low16 = PermMask & 0xffff;
12195 int Hi16 = (PermMask & 0xffff0000) >> 16;
12196
12197 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12198 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12199
12200 // The perm op would really just produce Op. So combine into Op
12201 if (WellFormedLow && WellFormedHi)
12202 return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op);
12203 }
12204
12205 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12206
12207 if (SecondSrc) {
12208 OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second);
12209 assert(OtherOp.getValueSizeInBits() == 32);
12210 }
12211
12212 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12213
12214 assert(Op.getValueType().isByteSized() &&
12215 OtherOp.getValueType().isByteSized());
12216
12217 // If the ultimate src is less than 32 bits, then we will only be
12218 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12219 // CalculateByteProvider would not have returned Op as source if we
12220 // used a byte that is outside its ValueType. Thus, we are free to
12221 // ANY_EXTEND as the extended bits are don't-cares.
12222 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12223 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12224
12225 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12226 DAG.getConstant(PermMask, DL, MVT::i32));
12227 }
12228 return SDValue();
12229}
12230
12231SDValue SITargetLowering::performOrCombine(SDNode *N,
12232 DAGCombinerInfo &DCI) const {
12233 SelectionDAG &DAG = DCI.DAG;
12234 SDValue LHS = N->getOperand(Num: 0);
12235 SDValue RHS = N->getOperand(Num: 1);
12236
12237 EVT VT = N->getValueType(ResNo: 0);
12238 if (VT == MVT::i1) {
12239 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12240 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12241 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12242 SDValue Src = LHS.getOperand(i: 0);
12243 if (Src != RHS.getOperand(i: 0))
12244 return SDValue();
12245
12246 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
12247 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
12248 if (!CLHS || !CRHS)
12249 return SDValue();
12250
12251 // Only 10 bits are used.
12252 static const uint32_t MaxMask = 0x3ff;
12253
12254 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12255 SDLoc DL(N);
12256 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12257 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12258 }
12259
12260 return SDValue();
12261 }
12262
12263 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12264 if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
12265 LHS.getOpcode() == AMDGPUISD::PERM &&
12266 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
12267 uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1));
12268 if (!Sel)
12269 return SDValue();
12270
12271 Sel |= LHS.getConstantOperandVal(i: 2);
12272 SDLoc DL(N);
12273 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12274 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12275 }
12276
12277 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
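// For example, (or (and x, 0x000000ff), (and y, 0xffffff00)) becomes
// (perm y, x, 0x07060500): byte 0 is taken from x and bytes 1-3 from y
// (the operands are swapped by the mask canonicalization below).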
12278 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12279 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12280 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12281
12282 // If all the uses of an or need to extract the individual elements, do not
12283 // attempt to lower into v_perm
12284 auto usesCombinedOperand = [](SDNode *OrUse) {
12285 // If we have any non-vectorized use, then it is a candidate for v_perm
12286 if (OrUse->getOpcode() != ISD::BITCAST ||
12287 !OrUse->getValueType(ResNo: 0).isVector())
12288 return true;
12289
12290 // If the bitcast has any non-vectorized use, then it is a candidate for v_perm
12291 for (auto VUse : OrUse->uses()) {
12292 if (!VUse->getValueType(ResNo: 0).isVector())
12293 return true;
12294
12295 // If the use of a vector is a store, then combining via a v_perm
12296 // is beneficial.
12297 // TODO -- whitelist more uses
12298 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12299 if (VUse->getOpcode() == VectorwiseOp)
12300 return true;
12301 }
12302 return false;
12303 };
12304
12305 if (!any_of(Range: N->uses(), P: usesCombinedOperand))
12306 return SDValue();
12307
12308 uint32_t LHSMask = getPermuteMask(V: LHS);
12309 uint32_t RHSMask = getPermuteMask(V: RHS);
12310
12311 if (LHSMask != ~0u && RHSMask != ~0u) {
12312 // Canonicalize the expression in an attempt to have fewer unique masks
12313 // and therefore fewer registers used to hold the masks.
12314 if (LHSMask > RHSMask) {
12315 std::swap(a&: LHSMask, b&: RHSMask);
12316 std::swap(a&: LHS, b&: RHS);
12317 }
12318
12319 // Select 0xc for each lane used from the source operand: zero bytes have the
12320 // 0xc bits set, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
12321 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12322 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12323
12324 // Check if we need to combine values from two sources within a byte.
12325 if (!(LHSUsedLanes & RHSUsedLanes) &&
12326 // If we select high and lower word keep it for SDWA.
12327 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12328 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12329 // Kill zero bytes selected by other mask. Zero value is 0xc.
12330 LHSMask &= ~RHSUsedLanes;
12331 RHSMask &= ~LHSUsedLanes;
12332 // Add 4 to each active LHS lane
12333 LHSMask |= LHSUsedLanes & 0x04040404;
12334 // Combine masks
12335 uint32_t Sel = LHSMask | RHSMask;
12336 SDLoc DL(N);
12337
12338 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12339 LHS.getOperand(0), RHS.getOperand(0),
12340 DAG.getConstant(Sel, DL, MVT::i32));
12341 }
12342 }
12343 if (LHSMask == ~0u || RHSMask == ~0u) {
12344 if (SDValue Perm = matchPERM(N, DCI))
12345 return Perm;
12346 }
12347 }
12348
12349 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12350 return SDValue();
12351
12352 // TODO: This could be a generic combine with a predicate for extracting the
12353 // high half of an integer being free.
12354
12355 // (or i64:x, (zero_extend i32:y)) ->
12356 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12357 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12358 RHS.getOpcode() != ISD::ZERO_EXTEND)
12359 std::swap(a&: LHS, b&: RHS);
12360
12361 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12362 SDValue ExtSrc = RHS.getOperand(i: 0);
12363 EVT SrcVT = ExtSrc.getValueType();
12364 if (SrcVT == MVT::i32) {
12365 SDLoc SL(N);
12366 SDValue LowLHS, HiBits;
12367 std::tie(args&: LowLHS, args&: HiBits) = split64BitValue(Op: LHS, DAG);
12368 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12369
12370 DCI.AddToWorklist(N: LowOr.getNode());
12371 DCI.AddToWorklist(N: HiBits.getNode());
12372
12373 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12374 LowOr, HiBits);
12375 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12376 }
12377 }
12378
12379 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
12380 if (CRHS) {
12381 if (SDValue Split
12382 = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR,
12383 LHS: N->getOperand(Num: 0), CRHS))
12384 return Split;
12385 }
12386
12387 return SDValue();
12388}
12389
12390SDValue SITargetLowering::performXorCombine(SDNode *N,
12391 DAGCombinerInfo &DCI) const {
12392 if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG))
12393 return RV;
12394
12395 SDValue LHS = N->getOperand(Num: 0);
12396 SDValue RHS = N->getOperand(Num: 1);
12397
12398 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
12399 SelectionDAG &DAG = DCI.DAG;
12400
12401 EVT VT = N->getValueType(ResNo: 0);
12402 if (CRHS && VT == MVT::i64) {
12403 if (SDValue Split
12404 = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS))
12405 return Split;
12406 }
12407
12408 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12409 // fneg-like xors into 64-bit select.
12410 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12411 // This looks like an fneg, try to fold as a source modifier.
12412 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12413 shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) {
12414 // xor (select c, a, b), 0x80000000 ->
12415 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12416 SDLoc DL(N);
12417 SDValue CastLHS =
12418 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12419 SDValue CastRHS =
12420 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12421 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12422 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12423 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12424 LHS->getOperand(0), FNegLHS, FNegRHS);
12425 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect);
12426 }
12427 }
12428
12429 return SDValue();
12430}
12431
12432SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12433 DAGCombinerInfo &DCI) const {
12434 if (!Subtarget->has16BitInsts() ||
12435 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12436 return SDValue();
12437
12438 EVT VT = N->getValueType(ResNo: 0);
12439 if (VT != MVT::i32)
12440 return SDValue();
12441
12442 SDValue Src = N->getOperand(Num: 0);
12443 if (Src.getValueType() != MVT::i16)
12444 return SDValue();
12445
12446 return SDValue();
12447}
12448
12449SDValue
12450SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12451 DAGCombinerInfo &DCI) const {
12452 SDValue Src = N->getOperand(Num: 0);
12453 auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1));
12454
12455 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12456 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12457 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12458 VTSign->getVT() == MVT::i8) ||
12459 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12460 VTSign->getVT() == MVT::i16))) {
12461 assert(Subtarget->hasScalarSubwordLoads() &&
12462 "s_buffer_load_{u8, i8} are supported "
12463 "in GFX12 (or newer) architectures.");
12464 EVT VT = Src.getValueType();
12465 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12466 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12467 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12468 SDLoc DL(N);
12469 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12470 SDValue Ops[] = {
12471 Src.getOperand(i: 0), // source register
12472 Src.getOperand(i: 1), // offset
12473 Src.getOperand(i: 2) // cachePolicy
12474 };
12475 auto *M = cast<MemSDNode>(Val&: Src);
12476 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12477 Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
12478 SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
12479 return LoadVal;
12480 } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12481 VTSign->getVT() == MVT::i8) ||
12482 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12483 VTSign->getVT() == MVT::i16)) &&
12484 Src.hasOneUse()) {
12485 auto *M = cast<MemSDNode>(Val&: Src);
12486 SDValue Ops[] = {
12487 Src.getOperand(i: 0), // Chain
12488 Src.getOperand(i: 1), // rsrc
12489 Src.getOperand(i: 2), // vindex
12490 Src.getOperand(i: 3), // voffset
12491 Src.getOperand(i: 4), // soffset
12492 Src.getOperand(i: 5), // offset
12493 Src.getOperand(i: 6),
12494 Src.getOperand(i: 7)
12495 };
12496 // replace with BUFFER_LOAD_BYTE/SHORT
12497 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12498 Src.getOperand(0).getValueType());
12499 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12500 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12501 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opcode: Opc, dl: SDLoc(N),
12502 VTList: ResList,
12503 Ops, MemVT: M->getMemoryVT(),
12504 MMO: M->getMemOperand());
12505 return DCI.DAG.getMergeValues(Ops: {BufferLoadSignExt,
12506 BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N));
12507 }
12508 return SDValue();
12509}
12510
12511SDValue SITargetLowering::performClassCombine(SDNode *N,
12512 DAGCombinerInfo &DCI) const {
12513 SelectionDAG &DAG = DCI.DAG;
12514 SDValue Mask = N->getOperand(Num: 1);
12515
12516 // fp_class x, 0 -> false
12517 if (isNullConstant(Mask))
12518 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12519
12520 if (N->getOperand(0).isUndef())
12521 return DAG.getUNDEF(MVT::i1);
12522
12523 return SDValue();
12524}
12525
12526SDValue SITargetLowering::performRcpCombine(SDNode *N,
12527 DAGCombinerInfo &DCI) const {
12528 EVT VT = N->getValueType(ResNo: 0);
12529 SDValue N0 = N->getOperand(Num: 0);
12530
12531 if (N0.isUndef()) {
12532 return DCI.DAG.getConstantFP(
12533 Val: APFloat::getQNaN(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)), DL: SDLoc(N),
12534 VT);
12535 }
12536
12537 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12538 N0.getOpcode() == ISD::SINT_TO_FP)) {
12539 return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc(N), VT, Operand: N0,
12540 Flags: N->getFlags());
12541 }
12542
12543 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12544 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12545 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12546 return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT,
12547 Operand: N0.getOperand(i: 0), Flags: N->getFlags());
12548 }
12549
12550 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12551}
12552
12553bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12554 unsigned MaxDepth) const {
12555 unsigned Opcode = Op.getOpcode();
12556 if (Opcode == ISD::FCANONICALIZE)
12557 return true;
12558
12559 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
12560 const auto &F = CFP->getValueAPF();
12561 if (F.isNaN() && F.isSignaling())
12562 return false;
12563 if (!F.isDenormal())
12564 return true;
12565
12566 DenormalMode Mode =
12567 DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
12568 return Mode == DenormalMode::getIEEE();
12569 }
12570
12571 // If source is a result of another standard FP operation it is already in
12572 // canonical form.
12573 if (MaxDepth == 0)
12574 return false;
12575
12576 switch (Opcode) {
12577 // These will flush denorms if required.
12578 case ISD::FADD:
12579 case ISD::FSUB:
12580 case ISD::FMUL:
12581 case ISD::FCEIL:
12582 case ISD::FFLOOR:
12583 case ISD::FMA:
12584 case ISD::FMAD:
12585 case ISD::FSQRT:
12586 case ISD::FDIV:
12587 case ISD::FREM:
12588 case ISD::FP_ROUND:
12589 case ISD::FP_EXTEND:
12590 case ISD::FP16_TO_FP:
12591 case ISD::FP_TO_FP16:
12592 case ISD::BF16_TO_FP:
12593 case ISD::FP_TO_BF16:
12594 case ISD::FLDEXP:
12595 case AMDGPUISD::FMUL_LEGACY:
12596 case AMDGPUISD::FMAD_FTZ:
12597 case AMDGPUISD::RCP:
12598 case AMDGPUISD::RSQ:
12599 case AMDGPUISD::RSQ_CLAMP:
12600 case AMDGPUISD::RCP_LEGACY:
12601 case AMDGPUISD::RCP_IFLAG:
12602 case AMDGPUISD::LOG:
12603 case AMDGPUISD::EXP:
12604 case AMDGPUISD::DIV_SCALE:
12605 case AMDGPUISD::DIV_FMAS:
12606 case AMDGPUISD::DIV_FIXUP:
12607 case AMDGPUISD::FRACT:
12608 case AMDGPUISD::CVT_PKRTZ_F16_F32:
12609 case AMDGPUISD::CVT_F32_UBYTE0:
12610 case AMDGPUISD::CVT_F32_UBYTE1:
12611 case AMDGPUISD::CVT_F32_UBYTE2:
12612 case AMDGPUISD::CVT_F32_UBYTE3:
12613 case AMDGPUISD::FP_TO_FP16:
12614 case AMDGPUISD::SIN_HW:
12615 case AMDGPUISD::COS_HW:
12616 return true;
12617
12618 // It can/will be lowered or combined as a bit operation.
12619 // Need to check their input recursively to handle.
12620 case ISD::FNEG:
12621 case ISD::FABS:
12622 case ISD::FCOPYSIGN:
12623 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12624
12625 case ISD::AND:
12626 if (Op.getValueType() == MVT::i32) {
12627 // Be careful as we only know it is a bitcast floating point type. It
12628 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12629 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12630 // is valid to optimize for all types.
12631 if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
12632 if (RHS->getZExtValue() == 0xffff0000) {
12633 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12634 }
12635 }
12636 }
12637 break;
12638
12639 case ISD::FSIN:
12640 case ISD::FCOS:
12641 case ISD::FSINCOS:
12642 return Op.getValueType().getScalarType() != MVT::f16;
12643
12644 case ISD::FMINNUM:
12645 case ISD::FMAXNUM:
12646 case ISD::FMINNUM_IEEE:
12647 case ISD::FMAXNUM_IEEE:
12648 case ISD::FMINIMUM:
12649 case ISD::FMAXIMUM:
12650 case AMDGPUISD::CLAMP:
12651 case AMDGPUISD::FMED3:
12652 case AMDGPUISD::FMAX3:
12653 case AMDGPUISD::FMIN3:
12654 case AMDGPUISD::FMAXIMUM3:
12655 case AMDGPUISD::FMINIMUM3: {
12656 // FIXME: Shouldn't treat the generic operations differently based on these.
12657 // However, we aren't really required to flush the result from
12658 // minnum/maxnum.
12659
12660 // snans will be quieted, so we only need to worry about denormals.
12661 if (Subtarget->supportsMinMaxDenormModes() ||
12662 // FIXME: denormalsEnabledForType is broken for dynamic
12663 denormalsEnabledForType(DAG, VT: Op.getValueType()))
12664 return true;
12665
12666 // Flushing may be required.
12667 // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for
12668 // such targets we need to check their inputs recursively.
12669
12670 // FIXME: Does this apply with clamp? It's implemented with max.
12671 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12672 if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), MaxDepth: MaxDepth - 1))
12673 return false;
12674 }
12675
12676 return true;
12677 }
12678 case ISD::SELECT: {
12679 return isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1) &&
12680 isCanonicalized(DAG, Op: Op.getOperand(i: 2), MaxDepth: MaxDepth - 1);
12681 }
12682 case ISD::BUILD_VECTOR: {
12683 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12684 SDValue SrcOp = Op.getOperand(i);
12685 if (!isCanonicalized(DAG, Op: SrcOp, MaxDepth: MaxDepth - 1))
12686 return false;
12687 }
12688
12689 return true;
12690 }
12691 case ISD::EXTRACT_VECTOR_ELT:
12692 case ISD::EXTRACT_SUBVECTOR: {
12693 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12694 }
12695 case ISD::INSERT_VECTOR_ELT: {
12696 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1) &&
12697 isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1);
12698 }
12699 case ISD::UNDEF:
12700 // Could be anything.
12701 return false;
12702
12703 case ISD::BITCAST:
12704 // TODO: This is incorrect as it loses track of the operand's type. We may
12705 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12706 // same bits that are canonicalized in one type need not be in the other.
12707 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12708 case ISD::TRUNCATE: {
12709 // Hack around the mess we make when legalizing extract_vector_elt
12710 if (Op.getValueType() == MVT::i16) {
12711 SDValue TruncSrc = Op.getOperand(i: 0);
12712 if (TruncSrc.getValueType() == MVT::i32 &&
12713 TruncSrc.getOpcode() == ISD::BITCAST &&
12714 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12715 return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12716 }
12717 }
12718 return false;
12719 }
12720 case ISD::INTRINSIC_WO_CHAIN: {
12721 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
12722 // TODO: Handle more intrinsics
12723 switch (IntrinsicID) {
12724 case Intrinsic::amdgcn_cvt_pkrtz:
12725 case Intrinsic::amdgcn_cubeid:
12726 case Intrinsic::amdgcn_frexp_mant:
12727 case Intrinsic::amdgcn_fdot2:
12728 case Intrinsic::amdgcn_rcp:
12729 case Intrinsic::amdgcn_rsq:
12730 case Intrinsic::amdgcn_rsq_clamp:
12731 case Intrinsic::amdgcn_rcp_legacy:
12732 case Intrinsic::amdgcn_rsq_legacy:
12733 case Intrinsic::amdgcn_trig_preop:
12734 case Intrinsic::amdgcn_log:
12735 case Intrinsic::amdgcn_exp2:
12736 case Intrinsic::amdgcn_sqrt:
12737 return true;
12738 default:
12739 break;
12740 }
12741
12742 break;
12743 }
12744 default:
12745 break;
12746 }
12747
12748 // FIXME: denormalsEnabledForType is broken for dynamic
12749 return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
12750 DAG.isKnownNeverSNaN(Op);
12751}
12752
12753bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12754 unsigned MaxDepth) const {
12755 const MachineRegisterInfo &MRI = MF.getRegInfo();
12756 MachineInstr *MI = MRI.getVRegDef(Reg);
12757 unsigned Opcode = MI->getOpcode();
12758
12759 if (Opcode == AMDGPU::G_FCANONICALIZE)
12760 return true;
12761
12762 std::optional<FPValueAndVReg> FCR;
12763 // Constant splat (can be padded with undef) or scalar constant.
12764 if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
12765 if (FCR->Value.isSignaling())
12766 return false;
12767 if (!FCR->Value.isDenormal())
12768 return true;
12769
12770 DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics());
12771 return Mode == DenormalMode::getIEEE();
12772 }
12773
12774 if (MaxDepth == 0)
12775 return false;
12776
12777 switch (Opcode) {
12778 case AMDGPU::G_FADD:
12779 case AMDGPU::G_FSUB:
12780 case AMDGPU::G_FMUL:
12781 case AMDGPU::G_FCEIL:
12782 case AMDGPU::G_FFLOOR:
12783 case AMDGPU::G_FRINT:
12784 case AMDGPU::G_FNEARBYINT:
12785 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12786 case AMDGPU::G_INTRINSIC_TRUNC:
12787 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12788 case AMDGPU::G_FMA:
12789 case AMDGPU::G_FMAD:
12790 case AMDGPU::G_FSQRT:
12791 case AMDGPU::G_FDIV:
12792 case AMDGPU::G_FREM:
12793 case AMDGPU::G_FPOW:
12794 case AMDGPU::G_FPEXT:
12795 case AMDGPU::G_FLOG:
12796 case AMDGPU::G_FLOG2:
12797 case AMDGPU::G_FLOG10:
12798 case AMDGPU::G_FPTRUNC:
12799 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12800 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12801 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12802 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12803 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12804 return true;
12805 case AMDGPU::G_FNEG:
12806 case AMDGPU::G_FABS:
12807 case AMDGPU::G_FCOPYSIGN:
12808 return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1);
12809 case AMDGPU::G_FMINNUM:
12810 case AMDGPU::G_FMAXNUM:
12811 case AMDGPU::G_FMINNUM_IEEE:
12812 case AMDGPU::G_FMAXNUM_IEEE:
12813 case AMDGPU::G_FMINIMUM:
12814 case AMDGPU::G_FMAXIMUM: {
12815 if (Subtarget->supportsMinMaxDenormModes() ||
12816 // FIXME: denormalsEnabledForType is broken for dynamic
12817 denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
12818 return true;
12819
12820 [[fallthrough]];
12821 }
12822 case AMDGPU::G_BUILD_VECTOR:
12823 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
12824 if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1))
12825 return false;
12826 return true;
12827 case AMDGPU::G_INTRINSIC:
12828 case AMDGPU::G_INTRINSIC_CONVERGENT:
12829 switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
12830 case Intrinsic::amdgcn_fmul_legacy:
12831 case Intrinsic::amdgcn_fmad_ftz:
12832 case Intrinsic::amdgcn_sqrt:
12833 case Intrinsic::amdgcn_fmed3:
12834 case Intrinsic::amdgcn_sin:
12835 case Intrinsic::amdgcn_cos:
12836 case Intrinsic::amdgcn_log:
12837 case Intrinsic::amdgcn_exp2:
12838 case Intrinsic::amdgcn_log_clamp:
12839 case Intrinsic::amdgcn_rcp:
12840 case Intrinsic::amdgcn_rcp_legacy:
12841 case Intrinsic::amdgcn_rsq:
12842 case Intrinsic::amdgcn_rsq_clamp:
12843 case Intrinsic::amdgcn_rsq_legacy:
12844 case Intrinsic::amdgcn_div_scale:
12845 case Intrinsic::amdgcn_div_fmas:
12846 case Intrinsic::amdgcn_div_fixup:
12847 case Intrinsic::amdgcn_fract:
12848 case Intrinsic::amdgcn_cvt_pkrtz:
12849 case Intrinsic::amdgcn_cubeid:
12850 case Intrinsic::amdgcn_cubema:
12851 case Intrinsic::amdgcn_cubesc:
12852 case Intrinsic::amdgcn_cubetc:
12853 case Intrinsic::amdgcn_frexp_mant:
12854 case Intrinsic::amdgcn_fdot2:
12855 case Intrinsic::amdgcn_trig_preop:
12856 return true;
12857 default:
12858 break;
12859 }
12860
12861 [[fallthrough]];
12862 default:
12863 return false;
12864 }
12865
12866 llvm_unreachable("invalid operation");
12867}
12868
12869// Constant fold canonicalize.
12870SDValue SITargetLowering::getCanonicalConstantFP(
12871 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12872 // Flush denormals to 0 if not enabled.
12873 if (C.isDenormal()) {
12874 DenormalMode Mode =
12875 DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics());
12876 if (Mode == DenormalMode::getPreserveSign()) {
12877 return DAG.getConstantFP(
12878 Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT);
12879 }
12880
12881 if (Mode != DenormalMode::getIEEE())
12882 return SDValue();
12883 }
12884
12885 if (C.isNaN()) {
12886 APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics());
12887 if (C.isSignaling()) {
12888 // Quiet a signaling NaN.
12889 // FIXME: Is this supposed to preserve payload bits?
12890 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
12891 }
12892
12893 // Make sure it is the canonical NaN bitpattern.
12894 //
12895 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12896 // immediate?
12897 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12898 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
12899 }
12900
12901 // Already canonical.
12902 return DAG.getConstantFP(Val: C, DL: SL, VT);
12903}
12904
12905static bool vectorEltWillFoldAway(SDValue Op) {
12906 return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op);
12907}
12908
12909SDValue SITargetLowering::performFCanonicalizeCombine(
12910 SDNode *N,
12911 DAGCombinerInfo &DCI) const {
12912 SelectionDAG &DAG = DCI.DAG;
12913 SDValue N0 = N->getOperand(Num: 0);
12914 EVT VT = N->getValueType(ResNo: 0);
12915
12916 // fcanonicalize undef -> qnan
12917 if (N0.isUndef()) {
12918 APFloat QNaN = APFloat::getQNaN(Sem: SelectionDAG::EVTToAPFloatSemantics(VT));
12919 return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT);
12920 }
12921
12922 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) {
12923 EVT VT = N->getValueType(ResNo: 0);
12924 return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF());
12925 }
12926
12927 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12928 // (fcanonicalize k)
12929 //
12930 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12931
12932 // TODO: This could be better with wider vectors that will be split to v2f16,
12933 // and to consider uses since there aren't that many packed operations.
12934 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12935 isTypeLegal(MVT::v2f16)) {
12936 SDLoc SL(N);
12937 SDValue NewElts[2];
12938 SDValue Lo = N0.getOperand(i: 0);
12939 SDValue Hi = N0.getOperand(i: 1);
12940 EVT EltVT = Lo.getValueType();
12941
12942 if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) {
12943 for (unsigned I = 0; I != 2; ++I) {
12944 SDValue Op = N0.getOperand(i: I);
12945 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
12946 NewElts[I] = getCanonicalConstantFP(DAG, SL, VT: EltVT,
12947 C: CFP->getValueAPF());
12948 } else if (Op.isUndef()) {
12949 // Handled below based on what the other operand is.
12950 NewElts[I] = Op;
12951 } else {
12952 NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op);
12953 }
12954 }
12955
12956 // If one half is undef, and one is constant, prefer a splat vector rather
12957 // than the normal qNaN. If it's a register, prefer 0.0 since that's
12958 // cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        NewElts[0] = isa<ConstantFPSDNode>(Val: NewElts[1]) ?
          NewElts[1] : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
      }
12964
12965 if (NewElts[1].isUndef()) {
12966 NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0]) ?
12967 NewElts[0] : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
12968 }
12969
12970 return DAG.getBuildVector(VT, DL: SL, Ops: NewElts);
12971 }
12972 }
12973
12974 return SDValue();
12975}
12976
12977static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
12978 switch (Opc) {
12979 case ISD::FMAXNUM:
12980 case ISD::FMAXNUM_IEEE:
12981 return AMDGPUISD::FMAX3;
12982 case ISD::FMAXIMUM:
12983 return AMDGPUISD::FMAXIMUM3;
12984 case ISD::SMAX:
12985 return AMDGPUISD::SMAX3;
12986 case ISD::UMAX:
12987 return AMDGPUISD::UMAX3;
12988 case ISD::FMINNUM:
12989 case ISD::FMINNUM_IEEE:
12990 return AMDGPUISD::FMIN3;
12991 case ISD::FMINIMUM:
12992 return AMDGPUISD::FMINIMUM3;
12993 case ISD::SMIN:
12994 return AMDGPUISD::SMIN3;
12995 case ISD::UMIN:
12996 return AMDGPUISD::UMIN3;
12997 default:
12998 llvm_unreachable("Not a min/max opcode");
12999 }
13000}
13001
13002SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13003 const SDLoc &SL, SDValue Src,
13004 SDValue MinVal,
13005 SDValue MaxVal,
13006 bool Signed) const {
13007
13008 // med3 comes from
13009 // min(max(x, K0), K1), K0 < K1
13010 // max(min(x, K0), K1), K1 < K0
13011 //
13012 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13013 // min/max op.
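  //
  // For example, smin(smax(x, 5), 10) becomes smed3(x, 5, 10), i.e. a clamp
  // of x to [5, 10]; if the constants are not strictly ordered the pattern is
  // rejected below.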
13014 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal);
13015 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal);
13016
13017 if (!MinK || !MaxK)
13018 return SDValue();
13019
13020 if (Signed) {
13021 if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue()))
13022 return SDValue();
13023 } else {
13024 if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue()))
13025 return SDValue();
13026 }
13027
13028 EVT VT = MinK->getValueType(ResNo: 0);
13029 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13030 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13031 return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal);
13032
13033 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13034 // not available, but this is unlikely to be profitable as constants
13035 // will often need to be materialized & extended, especially on
13036 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13037 return SDValue();
13038}
13039
13040static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13041 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op))
13042 return C;
13043
13044 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
13045 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13046 return C;
13047 }
13048
13049 return nullptr;
13050}
13051
13052SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13053 const SDLoc &SL,
13054 SDValue Op0,
13055 SDValue Op1) const {
13056 ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1);
13057 if (!K1)
13058 return SDValue();
13059
13060 ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1));
13061 if (!K0)
13062 return SDValue();
13063
13064 // Ordered >= (although NaN inputs should have folded away by now).
13065 if (K0->getValueAPF() > K1->getValueAPF())
13066 return SDValue();
13067
13068 const MachineFunction &MF = DAG.getMachineFunction();
13069 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13070
13071 // TODO: Check IEEE bit enabled?
13072 EVT VT = Op0.getValueType();
13073 if (Info->getMode().DX10Clamp) {
13074 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13075 // hardware fmed3 behavior converting to a min.
13076 // FIXME: Should this be allowing -0.0?
13077 if (K1->isExactlyValue(V: 1.0) && K0->isExactlyValue(V: 0.0))
13078 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0));
13079 }
13080
13081 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13082 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13083 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13084 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13085 // then give the other result, which is different from med3 with a NaN
13086 // input.
13087 SDValue Var = Op0.getOperand(i: 0);
13088 if (!DAG.isKnownNeverSNaN(Op: Var))
13089 return SDValue();
13090
13091 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13092
13093 if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) &&
13094 (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) {
13095 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0),
13096 N1: Var, N2: SDValue(K0, 0), N3: SDValue(K1, 0));
13097 }
13098 }
13099
13100 return SDValue();
13101}
13102
13103SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13104 DAGCombinerInfo &DCI) const {
13105 SelectionDAG &DAG = DCI.DAG;
13106
13107 EVT VT = N->getValueType(ResNo: 0);
13108 unsigned Opc = N->getOpcode();
13109 SDValue Op0 = N->getOperand(Num: 0);
13110 SDValue Op1 = N->getOperand(Num: 1);
13111
  // Only do this if the inner op has one use, since otherwise this just
  // increases register pressure for no benefit.
13114
13115 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
13116 !VT.isVector() &&
13117 (VT == MVT::i32 || VT == MVT::f32 ||
13118 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
13119 // max(max(a, b), c) -> max3(a, b, c)
13120 // min(min(a, b), c) -> min3(a, b, c)
13121 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13122 SDLoc DL(N);
13123 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc),
13124 DL,
13125 VT: N->getValueType(ResNo: 0),
13126 N1: Op0.getOperand(i: 0),
13127 N2: Op0.getOperand(i: 1),
13128 N3: Op1);
13129 }
13130
13131 // Try commuted.
13132 // max(a, max(b, c)) -> max3(a, b, c)
13133 // min(a, min(b, c)) -> min3(a, b, c)
13134 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13135 SDLoc DL(N);
13136 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc),
13137 DL,
13138 VT: N->getValueType(ResNo: 0),
13139 N1: Op0,
13140 N2: Op1.getOperand(i: 0),
13141 N3: Op1.getOperand(i: 1));
13142 }
13143 }
13144
13145 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13146 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13147 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13148 if (SDValue Med3 = performIntMed3ImmCombine(
13149 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true))
13150 return Med3;
13151 }
13152 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13153 if (SDValue Med3 = performIntMed3ImmCombine(
13154 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true))
13155 return Med3;
13156 }
13157
13158 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13159 if (SDValue Med3 = performIntMed3ImmCombine(
13160 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false))
13161 return Med3;
13162 }
13163 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13164 if (SDValue Med3 = performIntMed3ImmCombine(
13165 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false))
13166 return Med3;
13167 }
13168
13169 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13170 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13171 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13172 (Opc == AMDGPUISD::FMIN_LEGACY &&
13173 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13174 (VT == MVT::f32 || VT == MVT::f64 ||
13175 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13176 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13177 Op0.hasOneUse()) {
13178 if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1))
13179 return Res;
13180 }
13181
13182 return SDValue();
13183}
13184
13185static bool isClampZeroToOne(SDValue A, SDValue B) {
13186 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) {
13187 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) {
13188 // FIXME: Should this be allowing -0.0?
13189 return (CA->isExactlyValue(V: 0.0) && CB->isExactlyValue(V: 1.0)) ||
13190 (CA->isExactlyValue(V: 1.0) && CB->isExactlyValue(V: 0.0));
13191 }
13192 }
13193
13194 return false;
13195}
13196
13197// FIXME: Should only worry about snans for version with chain.
13198SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13199 DAGCombinerInfo &DCI) const {
13200 EVT VT = N->getValueType(ResNo: 0);
13201 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13202 // NaNs. With a NaN input, the order of the operands may change the result.
13203
13204 SelectionDAG &DAG = DCI.DAG;
13205 SDLoc SL(N);
13206
13207 SDValue Src0 = N->getOperand(Num: 0);
13208 SDValue Src1 = N->getOperand(Num: 1);
13209 SDValue Src2 = N->getOperand(Num: 2);
13210
13211 if (isClampZeroToOne(A: Src0, B: Src1)) {
13212 // const_a, const_b, x -> clamp is safe in all cases including signaling
13213 // nans.
13214 // FIXME: Should this be allowing -0.0?
13215 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
13216 }
13217
13218 const MachineFunction &MF = DAG.getMachineFunction();
13219 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13220
13221 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13222 // handling no dx10-clamp?
13223 if (Info->getMode().DX10Clamp) {
    // If NaNs are clamped to 0, we are free to reorder the inputs.
13225
13226 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
13227 std::swap(a&: Src0, b&: Src1);
13228
13229 if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
13230 std::swap(a&: Src1, b&: Src2);
13231
13232 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
13233 std::swap(a&: Src0, b&: Src1);
13234
13235 if (isClampZeroToOne(A: Src1, B: Src2))
13236 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
13237 }
13238
13239 return SDValue();
13240}
13241
13242SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13243 DAGCombinerInfo &DCI) const {
13244 SDValue Src0 = N->getOperand(Num: 0);
13245 SDValue Src1 = N->getOperand(Num: 1);
13246 if (Src0.isUndef() && Src1.isUndef())
13247 return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
13248 return SDValue();
13249}
13250
13251// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13252// expanded into a set of cmp/select instructions.
13253bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13254 unsigned NumElem,
13255 bool IsDivergentIdx,
13256 const GCNSubtarget *Subtarget) {
13257 if (UseDivergentRegisterIndexing)
13258 return false;
13259
13260 unsigned VecSize = EltSize * NumElem;
13261
  // Sub-dword vectors of two dwords or less have a better implementation.
13263 if (VecSize <= 64 && EltSize < 32)
13264 return false;
13265
  // Always expand the remaining sub-dword cases, otherwise they will be
  // lowered via memory.
13268 if (EltSize < 32)
13269 return true;
13270
13271 // Always do this if var-idx is divergent, otherwise it will become a loop.
13272 if (IsDivergentIdx)
13273 return true;
13274
13275 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13276 unsigned NumInsts = NumElem /* Number of compares */ +
13277 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
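  // For example, a dynamic extract from v8i32 costs 8 compares + 8 cndmasks =
  // 16 instructions, while v4i64 costs 4 compares + 8 cndmasks = 12.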
13278
13279 // On some architectures (GFX9) movrel is not available and it's better
13280 // to expand.
13281 if (!Subtarget->hasMovrel())
13282 return NumInsts <= 16;
13283
13284 // If movrel is available, use it instead of expanding for vector of 8
13285 // elements.
13286 return NumInsts <= 15;
13287}
13288
13289bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13290 SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1);
13291 if (isa<ConstantSDNode>(Val: Idx))
13292 return false;
13293
13294 SDValue Vec = N->getOperand(Num: 0);
13295 EVT VecVT = Vec.getValueType();
13296 EVT EltVT = VecVT.getVectorElementType();
13297 unsigned EltSize = EltVT.getSizeInBits();
13298 unsigned NumElem = VecVT.getVectorNumElements();
13299
13300 return SITargetLowering::shouldExpandVectorDynExt(
13301 EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget());
13302}
13303
13304SDValue SITargetLowering::performExtractVectorEltCombine(
13305 SDNode *N, DAGCombinerInfo &DCI) const {
13306 SDValue Vec = N->getOperand(Num: 0);
13307 SelectionDAG &DAG = DCI.DAG;
13308
13309 EVT VecVT = Vec.getValueType();
13310 EVT VecEltVT = VecVT.getVectorElementType();
13311 EVT ResVT = N->getValueType(ResNo: 0);
13312
13313 unsigned VecSize = VecVT.getSizeInBits();
13314 unsigned VecEltSize = VecEltVT.getSizeInBits();
13315
13316 if ((Vec.getOpcode() == ISD::FNEG ||
13317 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13318 SDLoc SL(N);
13319 SDValue Idx = N->getOperand(Num: 1);
13320 SDValue Elt =
13321 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx);
13322 return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt);
13323 }
13324
13325 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13326 // =>
13327 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13328 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13329 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13330 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13331 SDLoc SL(N);
13332 SDValue Idx = N->getOperand(Num: 1);
13333 unsigned Opc = Vec.getOpcode();
13334
13335 switch(Opc) {
13336 default:
13337 break;
13338 // TODO: Support other binary operations.
13339 case ISD::FADD:
13340 case ISD::FSUB:
13341 case ISD::FMUL:
13342 case ISD::ADD:
13343 case ISD::UMIN:
13344 case ISD::UMAX:
13345 case ISD::SMIN:
13346 case ISD::SMAX:
13347 case ISD::FMAXNUM:
13348 case ISD::FMINNUM:
13349 case ISD::FMAXNUM_IEEE:
13350 case ISD::FMINNUM_IEEE:
13351 case ISD::FMAXIMUM:
13352 case ISD::FMINIMUM: {
13353 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
13354 N1: Vec.getOperand(i: 0), N2: Idx);
13355 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
13356 N1: Vec.getOperand(i: 1), N2: Idx);
13357
13358 DCI.AddToWorklist(N: Elt0.getNode());
13359 DCI.AddToWorklist(N: Elt1.getNode());
13360 return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags());
13361 }
13362 }
13363 }
13364
13365 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
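  //
  // For a 4-element vector this produces the select chain
  //   (Idx==3) ? e3 : ((Idx==2) ? e2 : ((Idx==1) ? e1 : e0))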
13366 if (shouldExpandVectorDynExt(N)) {
13367 SDLoc SL(N);
13368 SDValue Idx = N->getOperand(Num: 1);
13369 SDValue V;
13370 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13371 SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL);
13372 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC);
13373 if (I == 0)
13374 V = Elt;
13375 else
13376 V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ);
13377 }
13378 return V;
13379 }
13380
13381 if (!DCI.isBeforeLegalize())
13382 return SDValue();
13383
13384 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13385 // elements. This exposes more load reduction opportunities by replacing
13386 // multiple small extract_vector_elements with a single 32-bit extract.
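  //
  // For example, extracting element 5 of a loaded v8i8 becomes: bitcast the
  // load to v2i32, extract dword 1, shift right by 8, and truncate to i8.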
13387 auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
13388 if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13389 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13390 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT);
13391
13392 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13393 unsigned EltIdx = BitIndex / 32;
13394 unsigned LeftoverBitIdx = BitIndex % 32;
13395 SDLoc SL(N);
13396
13397 SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec);
13398 DCI.AddToWorklist(N: Cast.getNode());
13399
13400 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13401 DAG.getConstant(EltIdx, SL, MVT::i32));
13402 DCI.AddToWorklist(N: Elt.getNode());
13403 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13404 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13405 DCI.AddToWorklist(N: Srl.getNode());
13406
13407 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13408 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl);
13409 DCI.AddToWorklist(N: Trunc.getNode());
13410
13411 if (VecEltVT == ResVT) {
13412 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc);
13413 }
13414
13415 assert(ResVT.isScalarInteger());
13416 return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT);
13417 }
13418
13419 return SDValue();
13420}
13421
13422SDValue
13423SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13424 DAGCombinerInfo &DCI) const {
13425 SDValue Vec = N->getOperand(Num: 0);
13426 SDValue Idx = N->getOperand(Num: 2);
13427 EVT VecVT = Vec.getValueType();
13428 EVT EltVT = VecVT.getVectorElementType();
13429
13430 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13431 // => BUILD_VECTOR n x select (e, const-idx)
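  //
  // For example, inserting Ins at a variable Idx into <v0, v1, v2, v3> builds
  //   <Idx==0 ? Ins : v0, Idx==1 ? Ins : v1, Idx==2 ? Ins : v2, Idx==3 ? Ins : v3>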
13432 if (!shouldExpandVectorDynExt(N))
13433 return SDValue();
13434
13435 SelectionDAG &DAG = DCI.DAG;
13436 SDLoc SL(N);
13437 SDValue Ins = N->getOperand(Num: 1);
13438 EVT IdxVT = Idx.getValueType();
13439
13440 SmallVector<SDValue, 16> Ops;
13441 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13442 SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT);
13443 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC);
13444 SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ);
13445 Ops.push_back(Elt: V);
13446 }
13447
13448 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops);
13449}
13450
13451/// Return the source of an fp_extend from f16 to f32, or a converted FP
13452/// constant.
13453static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13454 if (Src.getOpcode() == ISD::FP_EXTEND &&
13455 Src.getOperand(0).getValueType() == MVT::f16) {
13456 return Src.getOperand(i: 0);
13457 }
13458
13459 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
13460 APFloat Val = CFP->getValueAPF();
13461 bool LosesInfo = true;
13462 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
13463 if (!LosesInfo)
13464 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13465 }
13466
13467 return SDValue();
13468}
13469
13470SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13471 DAGCombinerInfo &DCI) const {
13472 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13473 "combine only useful on gfx8");
13474
13475 SDValue TruncSrc = N->getOperand(Num: 0);
13476 EVT VT = N->getValueType(ResNo: 0);
13477 if (VT != MVT::f16)
13478 return SDValue();
13479
13480 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13481 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13482 return SDValue();
13483
13484 SelectionDAG &DAG = DCI.DAG;
13485 SDLoc SL(N);
13486
13487 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13488 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13489 // casting back.
13490
13491 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13492 // fmin(fmax(a, b), fmax(fmin(a, b), c))
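  //
  // min(max(a, b), max(min(a, b), c)) is the standard 3-input median network:
  // whichever of a and b is larger is paired against max(min(a, b), c), and
  // the smaller of those two values is the median of a, b and c.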
13493 SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0));
13494 if (!A)
13495 return SDValue();
13496
13497 SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1));
13498 if (!B)
13499 return SDValue();
13500
13501 SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2));
13502 if (!C)
13503 return SDValue();
13504
13505 // This changes signaling nan behavior. If an input is a signaling nan, it
13506 // would have been quieted by the fpext originally. We don't care because
13507 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13508 // we would be worse off than just doing the promotion.
13509 SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B);
13510 SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B);
13511 SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C);
13512 return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1);
13513}
13514
13515unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13516 const SDNode *N0,
13517 const SDNode *N1) const {
13518 EVT VT = N0->getValueType(ResNo: 0);
13519
13520 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13521 // support denormals ever.
13522 if (((VT == MVT::f32 &&
13523 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13524 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13525 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13526 isOperationLegal(ISD::FMAD, VT))
13527 return ISD::FMAD;
13528
13529 const TargetOptions &Options = DAG.getTarget().Options;
13530 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13531 (N0->getFlags().hasAllowContract() &&
13532 N1->getFlags().hasAllowContract())) &&
13533 isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) {
13534 return ISD::FMA;
13535 }
13536
13537 return 0;
13538}
13539
13540// For a reassociatable opcode perform:
13541// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13542SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13543 SelectionDAG &DAG) const {
13544 EVT VT = N->getValueType(ResNo: 0);
13545 if (VT != MVT::i32 && VT != MVT::i64)
13546 return SDValue();
13547
13548 if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0)))
13549 return SDValue();
13550
13551 unsigned Opc = N->getOpcode();
13552 SDValue Op0 = N->getOperand(Num: 0);
13553 SDValue Op1 = N->getOperand(Num: 1);
13554
13555 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13556 return SDValue();
13557
13558 if (Op0->isDivergent())
13559 std::swap(a&: Op0, b&: Op1);
13560
13561 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13562 return SDValue();
13563
13564 SDValue Op2 = Op1.getOperand(i: 1);
13565 Op1 = Op1.getOperand(i: 0);
13566 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13567 return SDValue();
13568
13569 if (Op1->isDivergent())
13570 std::swap(a&: Op1, b&: Op2);
13571
13572 SDLoc SL(N);
13573 SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1);
13574 return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2);
13575}
13576
13577static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13578 EVT VT,
13579 SDValue N0, SDValue N1, SDValue N2,
13580 bool Signed) {
13581 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13582 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13583 SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2);
13584 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad);
13585}
13586
13587// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13588// multiplies, if any.
13589//
13590// Full 64-bit multiplies that feed into an addition are lowered here instead
13591// of using the generic expansion. The generic expansion ends up with
13592// a tree of ADD nodes that prevents us from using the "add" part of the
13593// MAD instruction. The expansion produced here results in a chain of ADDs
13594// instead of a tree.
13595SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13596 DAGCombinerInfo &DCI) const {
13597 assert(N->getOpcode() == ISD::ADD);
13598
13599 SelectionDAG &DAG = DCI.DAG;
13600 EVT VT = N->getValueType(ResNo: 0);
13601 SDLoc SL(N);
13602 SDValue LHS = N->getOperand(Num: 0);
13603 SDValue RHS = N->getOperand(Num: 1);
13604
13605 if (VT.isVector())
13606 return SDValue();
13607
13608 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13609 // result in scalar registers for uniform values.
13610 if (!N->isDivergent() && Subtarget->hasSMulHi())
13611 return SDValue();
13612
13613 unsigned NumBits = VT.getScalarSizeInBits();
13614 if (NumBits <= 32 || NumBits > 64)
13615 return SDValue();
13616
13617 if (LHS.getOpcode() != ISD::MUL) {
13618 assert(RHS.getOpcode() == ISD::MUL);
13619 std::swap(a&: LHS, b&: RHS);
13620 }
13621
13622 // Avoid the fold if it would unduly increase the number of multiplies due to
13623 // multiple uses, except on hardware with full-rate multiply-add (which is
13624 // part of full-rate 64-bit ops).
13625 if (!Subtarget->hasFullRate64Ops()) {
13626 unsigned NumUsers = 0;
13627 for (SDNode *Use : LHS->uses()) {
13628 // There is a use that does not feed into addition, so the multiply can't
13629 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13630 if (Use->getOpcode() != ISD::ADD)
13631 return SDValue();
13632
13633 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13634 // MUL + 3xADD + 3xADDC over 3xMAD.
13635 ++NumUsers;
13636 if (NumUsers >= 3)
13637 return SDValue();
13638 }
13639 }
13640
13641 SDValue MulLHS = LHS.getOperand(i: 0);
13642 SDValue MulRHS = LHS.getOperand(i: 1);
13643 SDValue AddRHS = RHS;
13644
13645 // Always check whether operands are small unsigned values, since that
13646 // knowledge is useful in more cases. Check for small signed values only if
13647 // doing so can unlock a shorter code sequence.
13648 bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32;
13649 bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32;
13650
13651 bool MulSignedLo = false;
13652 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13653 MulSignedLo = numBitsSigned(Op: MulLHS, DAG) <= 32 &&
13654 numBitsSigned(Op: MulRHS, DAG) <= 32;
13655 }
13656
13657 // The operands and final result all have the same number of bits. If
13658 // operands need to be extended, they can be extended with garbage. The
13659 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13660 // truncated away in the end.
13661 if (VT != MVT::i64) {
13662 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13663 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13664 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13665 }
13666
13667 // The basic code generated is conceptually straightforward. Pseudo code:
13668 //
13669 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13670 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13671 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13672 //
13673 // The second and third lines are optional, depending on whether the factors
13674 // are {sign,zero}-extended or not.
13675 //
13676 // The actual DAG is noisier than the pseudo code, but only due to
13677 // instructions that disassemble values into low and high parts, and
13678 // assemble the final result.
13679 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13680
13681 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13682 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13683 SDValue Accum =
13684 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13685
13686 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13687 SDValue AccumLo, AccumHi;
13688 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13689
13690 if (!MulLHSUnsigned32) {
13691 auto MulLHSHi =
13692 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13693 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13694 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13695 }
13696
13697 if (!MulRHSUnsigned32) {
13698 auto MulRHSHi =
13699 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13700 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13701 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13702 }
13703
13704 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13705 Accum = DAG.getBitcast(MVT::i64, Accum);
13706 }
13707
13708 if (VT != MVT::i64)
13709 Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
13710 return Accum;
13711}
13712
// Collect the ultimate src of each of the mul node's operands, and confirm
// each operand is effectively only 8 bits wide (only byte 0 carries data).
13715static std::optional<ByteProvider<SDValue>>
13716handleMulOperand(const SDValue &MulOperand) {
13717 auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0);
13718 if (!Byte0 || Byte0->isConstantZero()) {
13719 return std::nullopt;
13720 }
13721 auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0);
13722 if (Byte1 && !Byte1->isConstantZero()) {
13723 return std::nullopt;
13724 }
13725 return Byte0;
13726}
13727
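// Combine two v_perm byte-select masks whose "live" byte lanes are disjoint.
// A selector byte of 0x0c (used throughout this combine to mean "produce a
// constant zero byte") in one mask yields that lane's selector from the other
// mask. For example, combining 0x0c0c0c00 with 0x0c0c010c gives 0x0c0c0100.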
13728static unsigned addPermMasks(unsigned First, unsigned Second) {
13729 unsigned FirstCs = First & 0x0c0c0c0c;
13730 unsigned SecondCs = Second & 0x0c0c0c0c;
13731 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13732 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13733
13734 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13735 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13736 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13737 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13738
13739 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13740}
13741
13742struct DotSrc {
13743 SDValue SrcOp;
13744 int64_t PermMask;
13745 int64_t DWordOffset;
13746};
13747
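// Each DotSrc describes one dword-sized slice of an ultimate source value
// (SrcOp plus DWordOffset) together with a v_perm byte-select mask (PermMask)
// recording which byte of that slice feeds each lane of the packed dot4
// operand. placeSources assigns the byte providers found at step `Step` of
// the multiply chain to matching entries in Src0s/Src1s, merging perm masks
// when the same slice is referenced again, or starts new entries otherwise.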
13748static void placeSources(ByteProvider<SDValue> &Src0,
13749 ByteProvider<SDValue> &Src1,
13750 SmallVectorImpl<DotSrc> &Src0s,
13751 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13752
13753 assert(Src0.Src.has_value() && Src1.Src.has_value());
13754 // Src0s and Src1s are empty, just place arbitrarily.
13755 if (Step == 0) {
13756 Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13757 .DWordOffset: Src0.SrcOffset / 4});
13758 Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13759 .DWordOffset: Src1.SrcOffset / 4});
13760 return;
13761 }
13762
13763 for (int BPI = 0; BPI < 2; BPI++) {
13764 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13765 if (BPI == 1) {
13766 BPP = {Src1, Src0};
13767 }
13768 unsigned ZeroMask = 0x0c0c0c0c;
13769 unsigned FMask = 0xFF << (8 * (3 - Step));
13770
13771 unsigned FirstMask =
13772 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13773 unsigned SecondMask =
13774 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    // Attempt to find a Src vector which contains our SDValue; if found, add
    // our perm mask to the existing one. If we are unable to find a match for
    // the first SDValue, attempt to find a match for the second.
13778 int FirstGroup = -1;
13779 for (int I = 0; I < 2; I++) {
13780 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13781 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13782 return IterElt.SrcOp == *BPP.first.Src &&
13783 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13784 };
13785
13786 auto Match = llvm::find_if(Range&: Srcs, P: MatchesFirst);
13787 if (Match != Srcs.end()) {
13788 Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask);
13789 FirstGroup = I;
13790 break;
13791 }
13792 }
13793 if (FirstGroup != -1) {
13794 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13795 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13796 return IterElt.SrcOp == *BPP.second.Src &&
13797 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13798 };
13799 auto Match = llvm::find_if(Range&: Srcs, P: MatchesSecond);
13800 if (Match != Srcs.end()) {
13801 Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask);
13802 } else
13803 Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4});
13804 return;
13805 }
13806 }
13807
13808 // If we have made it here, then we could not find a match in Src0s or Src1s
13809 // for either Src0 or Src1, so just place them arbitrarily.
13810
13811 unsigned ZeroMask = 0x0c0c0c0c;
13812 unsigned FMask = 0xFF << (8 * (3 - Step));
13813
13814 Src0s.push_back(
13815 Elt: {.SrcOp: *Src0.Src,
13816 .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       .DWordOffset: Src0.SrcOffset / 4});
13818 Src1s.push_back(
13819 Elt: {.SrcOp: *Src1.Src,
13820 .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13821 .DWordOffset: Src1.SrcOffset / 4});
13822
13823 return;
13824}
13825
13826static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13827 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13828 bool IsAny) {
13829
  // If we have just one source, permute it accordingly.
13831 if (Srcs.size() == 1) {
13832 auto Elt = Srcs.begin();
13833 auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);
13834
13835 // v_perm will produce the original value
13836 if (Elt->PermMask == 0x3020100)
13837 return EltOp;
13838
13839 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13840 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13841 }
13842
13843 auto FirstElt = Srcs.begin();
13844 auto SecondElt = std::next(x: FirstElt);
13845
13846 SmallVector<SDValue, 2> Perms;
13847
13848 // If we have multiple sources in the chain, combine them via perms (using
13849 // calculated perm mask) and Ors.
13850 while (true) {
13851 auto FirstMask = FirstElt->PermMask;
13852 auto SecondMask = SecondElt->PermMask;
13853
13854 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13855 unsigned FirstPlusFour = FirstMask | 0x04040404;
    // Shift FirstElt's byte selectors from the 0-3 range into the 4-7 range.
    // OR'ing with 0x04 adds 4 to those selectors (their bit 2 is clear) while
    // leaving 0x0c "zero" selectors unchanged; the AND with 0x0F0F0F0F and the
    // OR with FirstCs keep those 0x0c bytes intact.
13858 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13859
13860 auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
13861 auto FirstVal =
13862 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
13863 auto SecondVal =
13864 getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);
13865
13866 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13867 SecondVal,
13868 DAG.getConstant(PermMask, SL, MVT::i32)));
13869
13870 FirstElt = std::next(x: SecondElt);
13871 if (FirstElt == Srcs.end())
13872 break;
13873
13874 SecondElt = std::next(x: FirstElt);
13875 // If we only have a FirstElt, then just combine that into the cumulative
13876 // source node.
13877 if (SecondElt == Srcs.end()) {
13878 auto EltOp =
13879 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
13880
13881 Perms.push_back(
13882 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13883 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13884 break;
13885 }
13886 }
13887
13888 assert(Perms.size() == 1 || Perms.size() == 2);
13889 return Perms.size() == 2
13890 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13891 : Perms[0];
13892}
13893
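// The masks built above assume all four byte lanes of the dot4 operands get
// filled. For a shorter chain, shift the populated selectors down to the low
// lanes and mark the unused high lanes as constant zero (0x0c). For example,
// with ChainLength == 2 a mask of 0x01000c0c becomes 0x0c0c0100.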
13894static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13895 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13896 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13897 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13898 EntryMask += ZeroMask;
13899 }
13900}
13901
13902static bool isMul(const SDValue Op) {
13903 auto Opcode = Op.getOpcode();
13904
13905 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13906 Opcode == AMDGPUISD::MUL_I24);
13907}
13908
13909static std::optional<bool>
13910checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13911 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13912 const SDValue &S1Op, const SelectionDAG &DAG) {
  // If both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 are irrelevant.
13915 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13916 return false;
13917
13918 auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0);
13919 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13920 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13921 auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0);
13922 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13923 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13924
13925 assert(!(S0IsUnsigned && S0IsSigned));
13926 assert(!(S1IsUnsigned && S1IsSigned));
13927
13928 // There are 9 possible permutations of
13929 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13930
13931 // In two permutations, the sign bits are known to be the same for both Ops,
13932 // so simply return Signed / Unsigned corresponding to the MSB
13933
13934 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
13935 return S0IsSigned;
13936
13937 // In another two permutations, the sign bits are known to be opposite. In
13938 // this case return std::nullopt to indicate a bad match.
13939
13940 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
13941 return std::nullopt;
13942
  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown are if it was sign extended from an unknown value or if
  // it was any extended. In either case, it is correct to use the signed
  // version of the dot4 signedness semantics.
13949
  // In two such permutations, we know the sign bit is set for one op and
  // unknown for the other. It is okay to use the signed version of dot4.
13953 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
13954 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
13955 return true;
13956
  // In one such permutation, we don't know either of the sign bits. It is okay
  // to use the signed version of dot4.
13959 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
13960 return true;
13961
  // In two such permutations, we know the sign bit is unset for one op and
  // unknown for the other. Return std::nullopt to indicate a bad match.
13965 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
13966 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
13967 return std::nullopt;
13968
13969 llvm_unreachable("Fully covered condition");
13970}
13971
13972SDValue SITargetLowering::performAddCombine(SDNode *N,
13973 DAGCombinerInfo &DCI) const {
13974 SelectionDAG &DAG = DCI.DAG;
13975 EVT VT = N->getValueType(ResNo: 0);
13976 SDLoc SL(N);
13977 SDValue LHS = N->getOperand(Num: 0);
13978 SDValue RHS = N->getOperand(Num: 1);
13979
13980 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
13981 if (Subtarget->hasMad64_32()) {
13982 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
13983 return Folded;
13984 }
13985 }
13986
13987 if (SDValue V = reassociateScalarOps(N, DAG)) {
13988 return V;
13989 }
13990
13991 if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
13992 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
13993 SDValue TempNode(N, 0);
13994 std::optional<bool> IsSigned;
13995 SmallVector<DotSrc, 4> Src0s;
13996 SmallVector<DotSrc, 4> Src1s;
13997 SmallVector<SDValue, 4> Src2s;
13998
13999 // Match the v_dot4 tree, while collecting src nodes.
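    // The expected shape is a chain of adds of (extended) i8 multiplies,
    // e.g. add (mul a0, b0), (add (mul a1, b1), (add (mul a2, b2), acc)).
    // Each iteration peels one multiply off the chain, and Src2s records the
    // remaining addend seen at each step as a candidate accumulator.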
14000 int ChainLength = 0;
14001 for (int I = 0; I < 4; I++) {
14002 auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1;
14003 if (MulIdx == -1)
14004 break;
14005 auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0));
14006 if (!Src0)
14007 break;
14008 auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1));
14009 if (!Src1)
14010 break;
14011
14012 auto IterIsSigned = checkDot4MulSignedness(
14013 N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1,
14014 S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0),
14015 S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG);
14016 if (!IterIsSigned)
14017 break;
14018 if (!IsSigned)
14019 IsSigned = *IterIsSigned;
14020 if (*IterIsSigned != *IsSigned)
14021 break;
14022 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I);
14023 auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) has already
      // been folded into add (mul24, mul24).
14026 if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) {
14027 Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx));
14028 auto Src0 =
14029 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0));
14030 if (!Src0)
14031 break;
14032 auto Src1 =
14033 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1));
14034 if (!Src1)
14035 break;
14036 auto IterIsSigned = checkDot4MulSignedness(
14037 N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1,
14038 S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0),
14039 S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG);
14040 if (!IterIsSigned)
14041 break;
14042 assert(IsSigned);
14043 if (*IterIsSigned != *IsSigned)
14044 break;
14045 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1);
14046 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14047 ChainLength = I + 2;
14048 break;
14049 }
14050
14051 TempNode = TempNode->getOperand(Num: AddIdx);
14052 Src2s.push_back(Elt: TempNode);
14053 ChainLength = I + 1;
14054 if (TempNode->getNumOperands() < 2)
14055 break;
14056 LHS = TempNode->getOperand(Num: 0);
14057 RHS = TempNode->getOperand(Num: 1);
14058 }
14059
14060 if (ChainLength < 2)
14061 return SDValue();
14062
    // Masks were constructed with the assumption that we would find a chain of
    // length 4. If not, then we need to zero out the unused most significant
    // bytes (via the 0x0c perm selector) so they do not affect the dot
    // calculation.
14066 if (ChainLength < 4) {
14067 fixMasks(Srcs&: Src0s, ChainLength);
14068 fixMasks(Srcs&: Src1s, ChainLength);
14069 }
14070
14071 SDValue Src0, Src1;
14072
14073 // If we are just using a single source for both, and have permuted the
14074 // bytes consistently, we can just use the sources without permuting
14075 // (commutation).
14076 bool UseOriginalSrc = false;
14077 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14078 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14079 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14080 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14081 SmallVector<unsigned, 4> SrcBytes;
14082 auto Src0Mask = Src0s.begin()->PermMask;
14083 SrcBytes.push_back(Elt: Src0Mask & 0xFF000000);
14084 bool UniqueEntries = true;
14085 for (auto I = 1; I < 4; I++) {
14086 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14087
14088 if (is_contained(Range&: SrcBytes, Element: NextByte)) {
14089 UniqueEntries = false;
14090 break;
14091 }
14092 SrcBytes.push_back(Elt: NextByte);
14093 }
14094
14095 if (UniqueEntries) {
14096 UseOriginalSrc = true;
14097
14098 auto FirstElt = Src0s.begin();
14099 auto FirstEltOp =
14100 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
14101
14102 auto SecondElt = Src1s.begin();
14103 auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
14104 DWordOffset: SecondElt->DWordOffset);
14105
14106 Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
14107 VT: MVT::getIntegerVT(BitWidth: 32));
14108 Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
14109 VT: MVT::getIntegerVT(BitWidth: 32));
14110 }
14111 }
14112
14113 if (!UseOriginalSrc) {
14114 Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
14115 Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
14116 }
14117
14118 assert(IsSigned);
14119 SDValue Src2 =
14120 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14121
14122 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14123 : Intrinsic::amdgcn_udot4,
14124 SL, MVT::i64);
14125
14126 assert(!VT.isVector());
14127 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14128 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14129
14130 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14131 }
14132
14133 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14134 return SDValue();
14135
14136 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14137 // add x, sext (setcc) => usubo_carry x, 0, setcc
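  // (sext of an i1 setcc is 0 or -1, so adding it is the same as subtracting
  // the carry bit, hence usubo_carry for the sign-extended case.)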
14138 unsigned Opc = LHS.getOpcode();
14139 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14140 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14141 std::swap(a&: RHS, b&: LHS);
14142
14143 Opc = RHS.getOpcode();
14144 switch (Opc) {
14145 default: break;
14146 case ISD::ZERO_EXTEND:
14147 case ISD::SIGN_EXTEND:
14148 case ISD::ANY_EXTEND: {
14149 auto Cond = RHS.getOperand(i: 0);
14150 // If this won't be a real VOPC output, we would still need to insert an
14151 // extra instruction anyway.
14152 if (!isBoolSGPR(V: Cond))
14153 break;
14154 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14155 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14156 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14157 return DAG.getNode(Opc, SL, VTList, Args);
14158 }
14159 case ISD::UADDO_CARRY: {
14160 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14161 if (!isNullConstant(V: RHS.getOperand(i: 1)))
14162 break;
14163 SDValue Args[] = { LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2) };
14164 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args);
14165 }
14166 }
14167 return SDValue();
14168}
14169
14170SDValue SITargetLowering::performSubCombine(SDNode *N,
14171 DAGCombinerInfo &DCI) const {
14172 SelectionDAG &DAG = DCI.DAG;
14173 EVT VT = N->getValueType(ResNo: 0);
14174
14175 if (VT != MVT::i32)
14176 return SDValue();
14177
14178 SDLoc SL(N);
14179 SDValue LHS = N->getOperand(Num: 0);
14180 SDValue RHS = N->getOperand(Num: 1);
14181
14182 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14183 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14184 unsigned Opc = RHS.getOpcode();
14185 switch (Opc) {
14186 default: break;
14187 case ISD::ZERO_EXTEND:
14188 case ISD::SIGN_EXTEND:
14189 case ISD::ANY_EXTEND: {
14190 auto Cond = RHS.getOperand(i: 0);
14191 // If this won't be a real VOPC output, we would still need to insert an
14192 // extra instruction anyway.
14193 if (!isBoolSGPR(V: Cond))
14194 break;
14195 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14196 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14197 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14198 return DAG.getNode(Opc, SL, VTList, Args);
14199 }
14200 }
14201
14202 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14203 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14204 if (!isNullConstant(V: LHS.getOperand(i: 1)))
14205 return SDValue();
14206 SDValue Args[] = { LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2) };
14207 return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args);
14208 }
14209 return SDValue();
14210}
14211
14212SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14213 DAGCombinerInfo &DCI) const {
14214
14215 if (N->getValueType(0) != MVT::i32)
14216 return SDValue();
14217
14218 if (!isNullConstant(V: N->getOperand(Num: 1)))
14219 return SDValue();
14220
14221 SelectionDAG &DAG = DCI.DAG;
14222 SDValue LHS = N->getOperand(Num: 0);
14223
14224 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14225 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14226 unsigned LHSOpc = LHS.getOpcode();
14227 unsigned Opc = N->getOpcode();
14228 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14229 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14230 SDValue Args[] = { LHS.getOperand(i: 0), LHS.getOperand(i: 1), N->getOperand(Num: 2) };
14231 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), Ops: Args);
14232 }
14233 return SDValue();
14234}
14235
14236SDValue SITargetLowering::performFAddCombine(SDNode *N,
14237 DAGCombinerInfo &DCI) const {
14238 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14239 return SDValue();
14240
14241 SelectionDAG &DAG = DCI.DAG;
14242 EVT VT = N->getValueType(ResNo: 0);
14243
14244 SDLoc SL(N);
14245 SDValue LHS = N->getOperand(Num: 0);
14246 SDValue RHS = N->getOperand(Num: 1);
14247
14248 // These should really be instruction patterns, but writing patterns with
14249 // source modifiers is a pain.
14250
14251 // fadd (fadd (a, a), b) -> mad 2.0, a, b
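  // (a + a is a * 2.0, so the whole expression is a * 2.0 + b, which maps
  // directly onto mad/fma with an inline-immediate 2.0.)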
14252 if (LHS.getOpcode() == ISD::FADD) {
14253 SDValue A = LHS.getOperand(i: 0);
14254 if (A == LHS.getOperand(i: 1)) {
14255 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
14256 if (FusedOp != 0) {
14257 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
14258 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS);
14259 }
14260 }
14261 }
14262
14263 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14264 if (RHS.getOpcode() == ISD::FADD) {
14265 SDValue A = RHS.getOperand(i: 0);
14266 if (A == RHS.getOperand(i: 1)) {
14267 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
14268 if (FusedOp != 0) {
14269 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
14270 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS);
14271 }
14272 }
14273 }
14274
14275 return SDValue();
14276}
14277
14278SDValue SITargetLowering::performFSubCombine(SDNode *N,
14279 DAGCombinerInfo &DCI) const {
14280 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14281 return SDValue();
14282
14283 SelectionDAG &DAG = DCI.DAG;
14284 SDLoc SL(N);
14285 EVT VT = N->getValueType(ResNo: 0);
14286 assert(!VT.isVector());
14287
14288 // Try to get the fneg to fold into the source modifier. This undoes generic
14289 // DAG combines and folds them into the mad.
14290 //
14291 // Only do this if we are not trying to support denormals. v_mad_f32 does
14292 // not support denormals ever.
14293 SDValue LHS = N->getOperand(Num: 0);
14294 SDValue RHS = N->getOperand(Num: 1);
14295 if (LHS.getOpcode() == ISD::FADD) {
14296 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14297 SDValue A = LHS.getOperand(i: 0);
14298 if (A == LHS.getOperand(i: 1)) {
14299 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
14300 if (FusedOp != 0){
14301 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
14302 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
14303
14304 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS);
14305 }
14306 }
14307 }
14308
14309 if (RHS.getOpcode() == ISD::FADD) {
14310 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14311
14312 SDValue A = RHS.getOperand(i: 0);
14313 if (A == RHS.getOperand(i: 1)) {
14314 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
14315 if (FusedOp != 0){
14316 const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT);
14317 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS);
14318 }
14319 }
14320 }
14321
14322 return SDValue();
14323}
14324
14325SDValue SITargetLowering::performFDivCombine(SDNode *N,
14326 DAGCombinerInfo &DCI) const {
14327 SelectionDAG &DAG = DCI.DAG;
14328 SDLoc SL(N);
14329 EVT VT = N->getValueType(ResNo: 0);
14330 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14331 return SDValue();
14332
14333 SDValue LHS = N->getOperand(Num: 0);
14334 SDValue RHS = N->getOperand(Num: 1);
14335
14336 SDNodeFlags Flags = N->getFlags();
14337 SDNodeFlags RHSFlags = RHS->getFlags();
14338 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14339 !RHS->hasOneUse())
14340 return SDValue();
14341
14342 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
14343 bool IsNegative = false;
14344 if (CLHS->isExactlyValue(V: 1.0) ||
14345 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
14346 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14347 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14348 if (RHS.getOpcode() == ISD::FSQRT) {
14349 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14350 SDValue Rsq =
14351 DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: 0), Flags);
14352 return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq;
14353 }
14354 }
14355 }
14356
14357 return SDValue();
14358}
14359
14360SDValue SITargetLowering::performFMACombine(SDNode *N,
14361 DAGCombinerInfo &DCI) const {
14362 SelectionDAG &DAG = DCI.DAG;
14363 EVT VT = N->getValueType(ResNo: 0);
14364 SDLoc SL(N);
14365
14366 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14367 return SDValue();
14368
  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
14371 SDValue Op1 = N->getOperand(Num: 0);
14372 SDValue Op2 = N->getOperand(Num: 1);
14373 SDValue FMA = N->getOperand(Num: 2);
14374
14375 if (FMA.getOpcode() != ISD::FMA ||
14376 Op1.getOpcode() != ISD::FP_EXTEND ||
14377 Op2.getOpcode() != ISD::FP_EXTEND)
14378 return SDValue();
14379
  // fdot2_f32_f16 always flushes fp32 denormal operands and the output to
  // zero, regardless of the denorm mode setting. Therefore,
  // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14383 const TargetOptions &Options = DAG.getTarget().Options;
14384 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14385 (N->getFlags().hasAllowContract() &&
14386 FMA->getFlags().hasAllowContract())) {
14387 Op1 = Op1.getOperand(i: 0);
14388 Op2 = Op2.getOperand(i: 0);
14389 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14390 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14391 return SDValue();
14392
14393 SDValue Vec1 = Op1.getOperand(i: 0);
14394 SDValue Idx1 = Op1.getOperand(i: 1);
14395 SDValue Vec2 = Op2.getOperand(i: 0);
14396
14397 SDValue FMAOp1 = FMA.getOperand(i: 0);
14398 SDValue FMAOp2 = FMA.getOperand(i: 1);
14399 SDValue FMAAcc = FMA.getOperand(i: 2);
14400
14401 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14402 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14403 return SDValue();
14404
14405 FMAOp1 = FMAOp1.getOperand(i: 0);
14406 FMAOp2 = FMAOp2.getOperand(i: 0);
14407 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14408 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14409 return SDValue();
14410
14411 SDValue Vec3 = FMAOp1.getOperand(i: 0);
14412 SDValue Vec4 = FMAOp2.getOperand(i: 0);
14413 SDValue Idx2 = FMAOp1.getOperand(i: 1);
14414
14415 if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) ||
14416 // Idx1 and Idx2 cannot be the same.
14417 Idx1 == Idx2)
14418 return SDValue();
14419
14420 if (Vec1 == Vec2 || Vec3 == Vec4)
14421 return SDValue();
14422
14423 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14424 return SDValue();
14425
14426 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14427 (Vec1 == Vec4 && Vec2 == Vec3)) {
14428 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14429 DAG.getTargetConstant(0, SL, MVT::i1));
14430 }
14431 }
14432 return SDValue();
14433}
14434
14435SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14436 DAGCombinerInfo &DCI) const {
14437 SelectionDAG &DAG = DCI.DAG;
14438 SDLoc SL(N);
14439
14440 SDValue LHS = N->getOperand(Num: 0);
14441 SDValue RHS = N->getOperand(Num: 1);
14442 EVT VT = LHS.getValueType();
14443 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
14444
14445 auto CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
14446 if (!CRHS) {
14447 CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
14448 if (CRHS) {
14449 std::swap(a&: LHS, b&: RHS);
14450 CC = getSetCCSwappedOperands(Operation: CC);
14451 }
14452 }
14453
14454 if (CRHS) {
14455 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14456 isBoolSGPR(LHS.getOperand(0))) {
14457 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14458 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14459 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14460 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14461 if ((CRHS->isAllOnes() &&
14462 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14463 (CRHS->isZero() &&
14464 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14465 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14466 DAG.getConstant(-1, SL, MVT::i1));
14467 if ((CRHS->isAllOnes() &&
14468 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14469 (CRHS->isZero() &&
14470 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14471 return LHS.getOperand(i: 0);
14472 }
14473
14474 const APInt &CRHSVal = CRHS->getAPIntValue();
14475 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14476 LHS.getOpcode() == ISD::SELECT &&
14477 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
14478 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) &&
14479 LHS.getConstantOperandVal(i: 1) != LHS.getConstantOperandVal(i: 2) &&
14480 isBoolSGPR(V: LHS.getOperand(i: 0))) {
14481 // Given CT != FT:
14482 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14483 // setcc (select cc, CT, CF), CF, ne => cc
14484 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14485 // setcc (select cc, CT, CF), CT, eq => cc
14486 const APInt &CT = LHS.getConstantOperandAPInt(i: 1);
14487 const APInt &CF = LHS.getConstantOperandAPInt(i: 2);
14488
14489 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14490 (CT == CRHSVal && CC == ISD::SETNE))
14491 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14492 DAG.getConstant(-1, SL, MVT::i1));
14493 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14494 (CT == CRHSVal && CC == ISD::SETEQ))
14495 return LHS.getOperand(i: 0);
14496 }
14497 }
14498
14499 if (VT != MVT::f32 && VT != MVT::f64 &&
14500 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14501 return SDValue();
14502
14503 // Match isinf/isfinite pattern
14504 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14505 // (fcmp one (fabs x), inf) -> (fp_class x,
  //   (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
14507 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14508 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
14509 if (!CRHS)
14510 return SDValue();
14511
14512 const APFloat &APF = CRHS->getValueAPF();
14513 if (APF.isInfinity() && !APF.isNegative()) {
14514 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14515 SIInstrFlags::N_INFINITY;
14516 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14517 SIInstrFlags::P_ZERO |
14518 SIInstrFlags::N_NORMAL |
14519 SIInstrFlags::P_NORMAL |
14520 SIInstrFlags::N_SUBNORMAL |
14521 SIInstrFlags::P_SUBNORMAL;
14522 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14523 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14524 DAG.getConstant(Mask, SL, MVT::i32));
14525 }
14526 }
14527
14528 return SDValue();
14529}
14530
14531SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14532 DAGCombinerInfo &DCI) const {
14533 SelectionDAG &DAG = DCI.DAG;
14534 SDLoc SL(N);
14535 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14536
14537 SDValue Src = N->getOperand(Num: 0);
14538 SDValue Shift = N->getOperand(Num: 0);
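  // Note: Shift starts out identical to Src; below we look through an
  // optional zero_extend and a constant shift to find the byte actually
  // being converted, while Src is kept for the demanded-bits folds below.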
14539
14540 // TODO: Extend type shouldn't matter (assuming legal types).
14541 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14542 Shift = Shift.getOperand(i: 0);
14543
14544 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14545 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14546 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14547 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14548 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14549 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
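    // Worked example: for cvt_f32_ubyte0 (srl x, 8), Offset is 0, so
    // ShiftOffset = 8 * 0 + 8 = 8, selecting cvt_f32_ubyte1 x as listed
    // above.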
14550 if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) {
14551 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14552 SDLoc(Shift.getOperand(0)), MVT::i32);
14553
14554 unsigned ShiftOffset = 8 * Offset;
14555 if (Shift.getOpcode() == ISD::SHL)
14556 ShiftOffset -= C->getZExtValue();
14557 else
14558 ShiftOffset += C->getZExtValue();
14559
14560 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14561 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14562 MVT::f32, Shifted);
14563 }
14564 }
14565 }
14566
14567 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14568 APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8);
14569 if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
14570 // We simplified Src. If this node is not dead, visit it again so it is
14571 // folded properly.
14572 if (N->getOpcode() != ISD::DELETED_NODE)
14573 DCI.AddToWorklist(N);
14574 return SDValue(N, 0);
14575 }
14576
14577 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14578 if (SDValue DemandedSrc =
14579 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14580 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14581
14582 return SDValue();
14583}
14584
14585SDValue SITargetLowering::performClampCombine(SDNode *N,
14586 DAGCombinerInfo &DCI) const {
14587 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
14588 if (!CSrc)
14589 return SDValue();
14590
14591 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14592 const APFloat &F = CSrc->getValueAPF();
14593 APFloat Zero = APFloat::getZero(Sem: F.getSemantics());
14594 if (F < Zero ||
14595 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14596 return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
14597 }
14598
14599 APFloat One(F.getSemantics(), "1.0");
14600 if (F > One)
14601 return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
14602
14603 return SDValue(CSrc, 0);
14604}
14605
14606
14607SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14608 DAGCombinerInfo &DCI) const {
14609 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14610 return SDValue();
14611 switch (N->getOpcode()) {
14612 case ISD::ADD:
14613 return performAddCombine(N, DCI);
14614 case ISD::SUB:
14615 return performSubCombine(N, DCI);
14616 case ISD::UADDO_CARRY:
14617 case ISD::USUBO_CARRY:
14618 return performAddCarrySubCarryCombine(N, DCI);
14619 case ISD::FADD:
14620 return performFAddCombine(N, DCI);
14621 case ISD::FSUB:
14622 return performFSubCombine(N, DCI);
14623 case ISD::FDIV:
14624 return performFDivCombine(N, DCI);
14625 case ISD::SETCC:
14626 return performSetCCCombine(N, DCI);
14627 case ISD::FMAXNUM:
14628 case ISD::FMINNUM:
14629 case ISD::FMAXNUM_IEEE:
14630 case ISD::FMINNUM_IEEE:
14631 case ISD::FMAXIMUM:
14632 case ISD::FMINIMUM:
14633 case ISD::SMAX:
14634 case ISD::SMIN:
14635 case ISD::UMAX:
14636 case ISD::UMIN:
14637 case AMDGPUISD::FMIN_LEGACY:
14638 case AMDGPUISD::FMAX_LEGACY:
14639 return performMinMaxCombine(N, DCI);
14640 case ISD::FMA:
14641 return performFMACombine(N, DCI);
14642 case ISD::AND:
14643 return performAndCombine(N, DCI);
14644 case ISD::OR:
14645 return performOrCombine(N, DCI);
14646 case ISD::FSHR: {
14647 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14648 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14649 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14650 return matchPERM(N, DCI);
14651 }
14652 break;
14653 }
14654 case ISD::XOR:
14655 return performXorCombine(N, DCI);
14656 case ISD::ZERO_EXTEND:
14657 return performZeroExtendCombine(N, DCI);
14658 case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
14660 case AMDGPUISD::FP_CLASS:
14661 return performClassCombine(N, DCI);
14662 case ISD::FCANONICALIZE:
14663 return performFCanonicalizeCombine(N, DCI);
14664 case AMDGPUISD::RCP:
14665 return performRcpCombine(N, DCI);
14666 case ISD::FLDEXP:
14667 case AMDGPUISD::FRACT:
14668 case AMDGPUISD::RSQ:
14669 case AMDGPUISD::RCP_LEGACY:
14670 case AMDGPUISD::RCP_IFLAG:
14671 case AMDGPUISD::RSQ_CLAMP: {
14672 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14673 SDValue Src = N->getOperand(Num: 0);
14674 if (Src.isUndef())
14675 return Src;
14676 break;
14677 }
14678 case ISD::SINT_TO_FP:
14679 case ISD::UINT_TO_FP:
14680 return performUCharToFloatCombine(N, DCI);
14681 case ISD::FCOPYSIGN:
14682 return performFCopySignCombine(N, DCI);
14683 case AMDGPUISD::CVT_F32_UBYTE0:
14684 case AMDGPUISD::CVT_F32_UBYTE1:
14685 case AMDGPUISD::CVT_F32_UBYTE2:
14686 case AMDGPUISD::CVT_F32_UBYTE3:
14687 return performCvtF32UByteNCombine(N, DCI);
14688 case AMDGPUISD::FMED3:
14689 return performFMed3Combine(N, DCI);
14690 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14691 return performCvtPkRTZCombine(N, DCI);
14692 case AMDGPUISD::CLAMP:
14693 return performClampCombine(N, DCI);
14694 case ISD::SCALAR_TO_VECTOR: {
14695 SelectionDAG &DAG = DCI.DAG;
14696 EVT VT = N->getValueType(ResNo: 0);
14697
14698 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14699 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14700 SDLoc SL(N);
14701 SDValue Src = N->getOperand(Num: 0);
14702 EVT EltVT = Src.getValueType();
14703 if (EltVT != MVT::i16)
14704 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14705
14706 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14707 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
14708 }
14709
14710 break;
14711 }
14712 case ISD::EXTRACT_VECTOR_ELT:
14713 return performExtractVectorEltCombine(N, DCI);
14714 case ISD::INSERT_VECTOR_ELT:
14715 return performInsertVectorEltCombine(N, DCI);
14716 case ISD::FP_ROUND:
14717 return performFPRoundCombine(N, DCI);
14718 case ISD::LOAD: {
14719 if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
14720 return Widened;
14721 [[fallthrough]];
14722 }
14723 default: {
14724 if (!DCI.isBeforeLegalize()) {
14725 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
14726 return performMemSDNodeCombine(N: MemNode, DCI);
14727 }
14728
14729 break;
14730 }
14731 }
14732
14733 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14734}
14735
14736/// Helper function for adjustWritemask
14737static unsigned SubIdx2Lane(unsigned Idx) {
14738 switch (Idx) {
14739 default: return ~0u;
14740 case AMDGPU::sub0: return 0;
14741 case AMDGPU::sub1: return 1;
14742 case AMDGPU::sub2: return 2;
14743 case AMDGPU::sub3: return 3;
14744 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14745 }
14746}
14747
14748/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14749SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14750 SelectionDAG &DAG) const {
14751 unsigned Opcode = Node->getMachineOpcode();
14752
14753 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14754 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14755 if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx))
14756 return Node; // not implemented for D16
14757
14758 SDNode *Users[5] = { nullptr };
14759 unsigned Lane = 0;
14760 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14761 unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
14762 unsigned NewDmask = 0;
14763 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14764 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
14769 unsigned TFCLane = 0;
14770 bool HasChain = Node->getNumValues() > 1;
14771
14772 if (OldDmask == 0) {
    // These are folded out, but in case it happens don't assert.
14774 return Node;
14775 }
14776
14777 unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
14778 // Work out which is the TFE/LWE lane if that is enabled.
14779 if (UsesTFC) {
14780 TFCLane = OldBitsSet;
14781 }
14782
14783 // Try to figure out the used register components
14784 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14785 I != E; ++I) {
14786
14787 // Don't look at users of the chain.
14788 if (I.getUse().getResNo() != 0)
14789 continue;
14790
14791 // Abort if we can't understand the usage
14792 if (!I->isMachineOpcode() ||
14793 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14794 return Node;
14795
14796 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14797 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14798 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14799 // set, etc.
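    // For example, with OldDmask = 0b1010 (components Y and W enabled),
    // Lane == 0 (sub0) refers to component Y and Lane == 1 (sub1) to W.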
14800 Lane = SubIdx2Lane(Idx: I->getConstantOperandVal(Num: 1));
14801 if (Lane == ~0u)
14802 return Node;
14803
14804 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14805 if (UsesTFC && Lane == TFCLane) {
14806 Users[Lane] = *I;
14807 } else {
14808 // Set which texture component corresponds to the lane.
14809 unsigned Comp;
14810 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14811 Comp = llvm::countr_zero(Val: Dmask);
14812 Dmask &= ~(1 << Comp);
14813 }
14814
14815 // Abort if we have more than one user per component.
14816 if (Users[Lane])
14817 return Node;
14818
14819 Users[Lane] = *I;
14820 NewDmask |= 1 << Comp;
14821 }
14822 }
14823
14824 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14825 bool NoChannels = !NewDmask;
14826 if (NoChannels) {
14827 if (!UsesTFC) {
14828 // No uses of the result and not using TFC. Then do nothing.
14829 return Node;
14830 }
    // If the original dmask has only one channel, there is nothing to do.
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary non-zero dmask, required for the instruction to work.
14835 NewDmask = 1;
14836 }
14837 // Abort if there's no change
14838 if (NewDmask == OldDmask)
14839 return Node;
14840
14841 unsigned BitsSet = llvm::popcount(Value: NewDmask);
14842
14843 // Check for TFE or LWE - increase the number of channels by one to account
14844 // for the extra return value
14845 // This will need adjustment for D16 if this is also included in
14846 // adjustWriteMask (this function) but at present D16 are excluded.
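  // For example, two remaining data channels plus TFE gives NewChannels == 3:
  // two data registers followed by the TFE/LWE status register.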
14847 unsigned NewChannels = BitsSet + UsesTFC;
14848
14849 int NewOpcode =
14850 AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
14851 assert(NewOpcode != -1 &&
14852 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14853 "failed to find equivalent MIMG op");
14854
14855 // Adjust the writemask in the node
14856 SmallVector<SDValue, 12> Ops;
14857 Ops.insert(I: Ops.end(), From: Node->op_begin(), To: Node->op_begin() + DmaskIdx);
14858 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14859 Ops.insert(I: Ops.end(), From: Node->op_begin() + DmaskIdx + 1, To: Node->op_end());
14860
14861 MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT();
14862
14863 MVT ResultVT = NewChannels == 1 ?
14864 SVT : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4 :
14865 NewChannels == 5 ? 8 : NewChannels);
14866 SDVTList NewVTList = HasChain ?
14867 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14868
14869
14870 MachineSDNode *NewNode = DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node),
14871 VTs: NewVTList, Ops);
14872
14873 if (HasChain) {
14874 // Update chain.
14875 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
14876 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1));
14877 }
14878
14879 if (NewChannels == 1) {
14880 assert(Node->hasNUsesOfValue(1, 0));
14881 SDNode *Copy = DAG.getMachineNode(Opcode: TargetOpcode::COPY,
14882 dl: SDLoc(Node), VT: Users[Lane]->getValueType(ResNo: 0),
14883 Op1: SDValue(NewNode, 0));
14884 DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
14885 return nullptr;
14886 }
14887
14888 // Update the users of the node with the new indices
14889 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14890 SDNode *User = Users[i];
14891 if (!User) {
14892 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14893 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14894 if (i || !NoChannels)
14895 continue;
14896 } else {
14897 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14898 SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op);
14899 if (NewUser != User) {
14900 DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0));
14901 DAG.RemoveDeadNode(N: User);
14902 }
14903 }
14904
14905 switch (Idx) {
14906 default: break;
14907 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14908 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14909 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14910 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14911 }
14912 }
14913
14914 DAG.RemoveDeadNode(N: Node);
14915 return nullptr;
14916}
14917
14918static bool isFrameIndexOp(SDValue Op) {
14919 if (Op.getOpcode() == ISD::AssertZext)
14920 Op = Op.getOperand(i: 0);
14921
14922 return isa<FrameIndexSDNode>(Val: Op);
14923}
14924
14925/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14926/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
14928SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14929 SelectionDAG &DAG) const {
14930 if (Node->getOpcode() == ISD::CopyToReg) {
14931 RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1));
14932 SDValue SrcVal = Node->getOperand(Num: 2);
14933
14934 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
14935 // to try understanding copies to physical registers.
14936 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
14937 SDLoc SL(Node);
14938 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
14939 SDValue VReg = DAG.getRegister(
14940 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
14941
14942 SDNode *Glued = Node->getGluedNode();
14943 SDValue ToVReg
14944 = DAG.getCopyToReg(Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal,
14945 Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
14946 SDValue ToResultReg
14947 = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0),
14948 N: VReg, Glue: ToVReg.getValue(R: 1));
14949 DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
14950 DAG.RemoveDeadNode(N: Node);
14951 return ToResultReg.getNode();
14952 }
14953 }
14954
14955 SmallVector<SDValue, 8> Ops;
14956 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
14957 if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
14958 Ops.push_back(Elt: Node->getOperand(Num: i));
14959 continue;
14960 }
14961
14962 SDLoc DL(Node);
14963 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
14964 Node->getOperand(i).getValueType(),
14965 Node->getOperand(i)), 0));
14966 }
14967
14968 return DAG.UpdateNodeOperands(N: Node, Ops);
14969}
14970
14971/// Fold the instructions after selecting them.
14972/// Returns null if users were already updated.
14973SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
14974 SelectionDAG &DAG) const {
14975 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14976 unsigned Opcode = Node->getMachineOpcode();
14977
14978 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
14979 !TII->isGather4(Opcode) &&
14980 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
14981 return adjustWritemask(Node, DAG);
14982 }
14983
14984 if (Opcode == AMDGPU::INSERT_SUBREG ||
14985 Opcode == AMDGPU::REG_SEQUENCE) {
14986 legalizeTargetIndependentNode(Node, DAG);
14987 return Node;
14988 }
14989
14990 switch (Opcode) {
14991 case AMDGPU::V_DIV_SCALE_F32_e64:
14992 case AMDGPU::V_DIV_SCALE_F64_e64: {
14993 // Satisfy the operand register constraint when one of the inputs is
14994 // undefined. Ordinarily each undef value will have its own implicit_def of
14995 // a vreg, so force these to use a single register.
14996 SDValue Src0 = Node->getOperand(Num: 1);
14997 SDValue Src1 = Node->getOperand(Num: 3);
14998 SDValue Src2 = Node->getOperand(Num: 5);
14999
15000 if ((Src0.isMachineOpcode() &&
15001 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15002 (Src0 == Src1 || Src0 == Src2))
15003 break;
15004
15005 MVT VT = Src0.getValueType().getSimpleVT();
15006 const TargetRegisterClass *RC =
15007 getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());
15008
15009 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15010 SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);
15011
15012 SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node),
15013 Reg: UndefReg, N: Src0, Glue: SDValue());
15014
15015 // src0 must be the same register as src1 or src2, even if the value is
15016 // undefined, so make sure we don't violate this constraint.
15017 if (Src0.isMachineOpcode() &&
15018 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15019 if (Src1.isMachineOpcode() &&
15020 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15021 Src0 = Src1;
15022 else if (Src2.isMachineOpcode() &&
15023 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15024 Src0 = Src2;
15025 else {
15026 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15027 Src0 = UndefReg;
15028 Src1 = UndefReg;
15029 }
15030 } else
15031 break;
15032
15033 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15034 Ops[1] = Src0;
15035 Ops[3] = Src1;
15036 Ops[5] = Src2;
15037 Ops.push_back(Elt: ImpDef.getValue(R: 1));
15038 return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops);
15039 }
15040 default:
15041 break;
15042 }
15043
15044 return Node;
15045}
15046
15047// Any MIMG instructions that use tfe or lwe require an initialization of the
15048// result register that will be written in the case of a memory access failure.
15049// The required code is also added to tie this init code to the result of the
15050// img instruction.
15051void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15052 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15053 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15054 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15055 MachineBasicBlock &MBB = *MI.getParent();
15056
15057 int DstIdx =
15058 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15059 unsigned InitIdx = 0;
15060
15061 if (TII->isImage(MI)) {
15062 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15063 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15064 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15065
15066 if (!TFE && !LWE) // intersect_ray
15067 return;
15068
15069 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15070 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15071 unsigned D16Val = D16 ? D16->getImm() : 0;
15072
15073 if (!TFEVal && !LWEVal)
15074 return;
15075
    // At least one of TFE or LWE is non-zero.
15077 // We have to insert a suitable initialization of the result value and
15078 // tie this to the dest of the image instruction.
15079
15080 // Calculate which dword we have to initialize to 0.
15081 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15082
15083 // check that dmask operand is found.
15084 assert(MO_Dmask && "Expected dmask operand in instruction");
15085
15086 unsigned dmask = MO_Dmask->getImm();
15087 // Determine the number of active lanes taking into account the
15088 // Gather4 special case
15089 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask);
15090
15091 bool Packed = !Subtarget->hasUnpackedD16VMem();
15092
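    // One extra dword is needed for the TFE/LWE status value; with packed
    // D16 two 16-bit components share each data dword, hence the halving
    // below.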
15093 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15094
    // Abandon the attempt if the dst size isn't large enough. This is in
    // fact an error, but it is picked up elsewhere and reported correctly.
15098 uint32_t DstSize =
15099 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
15100 if (DstSize < InitIdx)
15101 return;
15102 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
15103 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
15104 } else {
15105 return;
15106 }
15107
15108 const DebugLoc &DL = MI.getDebugLoc();
15109
15110 // Create a register for the initialization value.
15111 Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
15112 unsigned NewDst = 0; // Final initialized value will be in here
15113
15114 // If PRTStrictNull feature is enabled (the default) then initialize
15115 // all the result registers to 0, otherwise just the error indication
15116 // register (VGPRn+1)
15117 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15118 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15119
15120 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15121 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15122 NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
15123 // Initialize dword
15124 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15125 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15126 .addImm(0);
15127 // Insert into the super-reg
15128 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15129 .addReg(PrevDst)
15130 .addReg(SubReg)
15131 .addImm(SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));
15132
15133 PrevDst = NewDst;
15134 }
15135
15136 // Add as an implicit operand
15137 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));
15138
15139 // Tie the just added implicit operand to the dst
15140 MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1);
15141}
15142
15143/// Assign the register class depending on the number of
15144/// bits set in the writemask
15145void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15146 SDNode *Node) const {
15147 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15148
15149 MachineFunction *MF = MI.getParent()->getParent();
15150 MachineRegisterInfo &MRI = MF->getRegInfo();
15151 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15152
15153 if (TII->isVOP3(Opcode: MI.getOpcode())) {
15154 // Make sure constant bus requirements are respected.
15155 TII->legalizeOperandsVOP3(MRI, MI);
15156
15157 // Prefer VGPRs over AGPRs in mAI instructions where possible.
    // This saves a chain-copy of registers and better balances register
    // use between vgpr and agpr as agpr tuples tend to be big.
15160 if (!MI.getDesc().operands().empty()) {
15161 unsigned Opc = MI.getOpcode();
15162 bool HasAGPRs = Info->mayNeedAGPRs();
15163 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15164 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15165 for (auto I :
15166 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15167 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15168 if (I == -1)
15169 break;
15170 if ((I == Src2Idx) && (HasAGPRs))
15171 break;
15172 MachineOperand &Op = MI.getOperand(I);
15173 if (!Op.isReg() || !Op.getReg().isVirtual())
15174 continue;
15175 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15176 if (!TRI->hasAGPRs(RC))
15177 continue;
15178 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15179 if (!Src || !Src->isCopy() ||
15180 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15181 continue;
15182 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15183 // All uses of agpr64 and agpr32 can also accept vgpr except for
15184 // v_accvgpr_read, but we do not produce agpr reads during selection,
15185 // so no use checks are needed.
15186 MRI.setRegClass(Op.getReg(), NewRC);
15187 }
15188
15189 if (!HasAGPRs)
15190 return;
15191
15192 // Resolve the rest of AV operands to AGPRs.
15193 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15194 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15195 auto *RC = TRI->getRegClassForReg(MRI, Reg: Src2->getReg());
15196 if (TRI->isVectorSuperClass(RC: RC)) {
15197 auto *NewRC = TRI->getEquivalentAGPRClass(SRC: RC);
15198 MRI.setRegClass(Reg: Src2->getReg(), RC: NewRC);
15199 if (Src2->isTied())
15200 MRI.setRegClass(Reg: MI.getOperand(i: 0).getReg(), RC: NewRC);
15201 }
15202 }
15203 }
15204 }
15205
15206 return;
15207 }
15208
15209 if (TII->isImage(MI))
15210 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15211}
15212
15213static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15214 uint64_t Val) {
15215 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15216 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15217}
15218
15219MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15220 const SDLoc &DL,
15221 SDValue Ptr) const {
15222 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15223
15224 // Build the half of the subregister with the constants before building the
15225 // full 128-bit register. If we are building multiple resource descriptors,
15226 // this will allow CSEing of the 2-component register.
15227 const SDValue Ops0[] = {
15228 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15229 buildSMovImm32(DAG, DL, 0),
15230 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15231 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15232 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15233 };
15234
15235 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15236 MVT::v2i32, Ops0), 0);
15237
15238 // Combine the constants and the pointer.
15239 const SDValue Ops1[] = {
15240 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15241 Ptr,
15242 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15243 SubRegHi,
15244 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15245 };
15246
15247 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15248}
15249
15250/// Return a resource descriptor with the 'Add TID' bit enabled
15251/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15252/// of the resource descriptor) to create an offset, which is added to
15253/// the resource pointer.
15254MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15255 SDValue Ptr, uint32_t RsrcDword1,
15256 uint64_t RsrcDword2And3) const {
15257 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15258 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15259 if (RsrcDword1) {
15260 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15261 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15262 0);
15263 }
15264
15265 SDValue DataLo = buildSMovImm32(DAG, DL,
15266 Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15267 SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32);
15268
15269 const SDValue Ops[] = {
15270 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15271 PtrLo,
15272 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15273 PtrHi,
15274 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15275 DataLo,
15276 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15277 DataHi,
15278 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15279 };
15280
15281 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15282}
15283
15284//===----------------------------------------------------------------------===//
15285// SI Inline Assembly Support
15286//===----------------------------------------------------------------------===//
15287
15288std::pair<unsigned, const TargetRegisterClass *>
15289SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15290 StringRef Constraint,
15291 MVT VT) const {
15292 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15293
15294 const TargetRegisterClass *RC = nullptr;
15295 if (Constraint.size() == 1) {
15296 const unsigned BitWidth = VT.getSizeInBits();
15297 switch (Constraint[0]) {
15298 default:
15299 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15300 case 's':
15301 case 'r':
15302 switch (BitWidth) {
15303 case 16:
15304 RC = &AMDGPU::SReg_32RegClass;
15305 break;
15306 case 64:
15307 RC = &AMDGPU::SGPR_64RegClass;
15308 break;
15309 default:
15310 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
15311 if (!RC)
15312 return std::pair(0U, nullptr);
15313 break;
15314 }
15315 break;
15316 case 'v':
15317 switch (BitWidth) {
15318 case 16:
15319 RC = &AMDGPU::VGPR_32RegClass;
15320 break;
15321 default:
15322 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15323 if (!RC)
15324 return std::pair(0U, nullptr);
15325 break;
15326 }
15327 break;
15328 case 'a':
15329 if (!Subtarget->hasMAIInsts())
15330 break;
15331 switch (BitWidth) {
15332 case 16:
15333 RC = &AMDGPU::AGPR_32RegClass;
15334 break;
15335 default:
15336 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15337 if (!RC)
15338 return std::pair(0U, nullptr);
15339 break;
15340 }
15341 break;
15342 }
15343 // We actually support i128, i16 and f16 as inline parameters
15344 // even if they are not reported as legal
15345 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15346 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15347 return std::pair(0U, RC);
15348 }
15349
15350 if (Constraint.starts_with(Prefix: "{") && Constraint.ends_with(Suffix: "}")) {
15351 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15352 if (RegName.consume_front(Prefix: "v")) {
15353 RC = &AMDGPU::VGPR_32RegClass;
15354 } else if (RegName.consume_front(Prefix: "s")) {
15355 RC = &AMDGPU::SGPR_32RegClass;
15356 } else if (RegName.consume_front(Prefix: "a")) {
15357 RC = &AMDGPU::AGPR_32RegClass;
15358 }
15359
15360 if (RC) {
15361 uint32_t Idx;
15362 if (RegName.consume_front(Prefix: "[")) {
15363 uint32_t End;
15364 bool Failed = RegName.consumeInteger(Radix: 10, Result&: Idx);
15365 Failed |= !RegName.consume_front(Prefix: ":");
15366 Failed |= RegName.consumeInteger(Radix: 10, Result&: End);
15367 Failed |= !RegName.consume_back(Suffix: "]");
15368 if (!Failed) {
15369 uint32_t Width = (End - Idx + 1) * 32;
15370 MCRegister Reg = RC->getRegister(i: Idx);
15371 if (SIRegisterInfo::isVGPRClass(RC))
15372 RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
15373 else if (SIRegisterInfo::isSGPRClass(RC))
15374 RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
15375 else if (SIRegisterInfo::isAGPRClass(RC))
15376 RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
15377 if (RC) {
15378 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15379 return std::pair(Reg, RC);
15380 }
15381 }
15382 } else {
15383 bool Failed = RegName.getAsInteger(Radix: 10, Result&: Idx);
15384 if (!Failed && Idx < RC->getNumRegs())
15385 return std::pair(RC->getRegister(i: Idx), RC);
15386 }
15387 }
15388 }
15389
15390 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15391 if (Ret.first)
15392 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15393
15394 return Ret;
15395}
15396
15397static bool isImmConstraint(StringRef Constraint) {
15398 if (Constraint.size() == 1) {
15399 switch (Constraint[0]) {
15400 default: break;
15401 case 'I':
15402 case 'J':
15403 case 'A':
15404 case 'B':
15405 case 'C':
15406 return true;
15407 }
15408 } else if (Constraint == "DA" ||
15409 Constraint == "DB") {
15410 return true;
15411 }
15412 return false;
15413}
15414
15415SITargetLowering::ConstraintType
15416SITargetLowering::getConstraintType(StringRef Constraint) const {
15417 if (Constraint.size() == 1) {
15418 switch (Constraint[0]) {
15419 default: break;
15420 case 's':
15421 case 'v':
15422 case 'a':
15423 return C_RegisterClass;
15424 }
15425 }
15426 if (isImmConstraint(Constraint)) {
15427 return C_Other;
15428 }
15429 return TargetLowering::getConstraintType(Constraint);
15430}
15431
15432static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15433 if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) {
15434 Val = Val & maskTrailingOnes<uint64_t>(N: Size);
15435 }
15436 return Val;
15437}
15438
15439void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15440 StringRef Constraint,
15441 std::vector<SDValue> &Ops,
15442 SelectionDAG &DAG) const {
15443 if (isImmConstraint(Constraint)) {
15444 uint64_t Val;
15445 if (getAsmOperandConstVal(Op, Val) &&
15446 checkAsmConstraintVal(Op, Constraint, Val)) {
15447 Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits());
15448 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15449 }
15450 } else {
15451 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15452 }
15453}
15454
15455bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15456 unsigned Size = Op.getScalarValueSizeInBits();
15457 if (Size > 64)
15458 return false;
15459
15460 if (Size == 16 && !Subtarget->has16BitInsts())
15461 return false;
15462
15463 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
15464 Val = C->getSExtValue();
15465 return true;
15466 }
15467 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
15468 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15469 return true;
15470 }
15471 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
15472 if (Size != 16 || Op.getNumOperands() != 2)
15473 return false;
15474 if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef())
15475 return false;
15476 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15477 Val = C->getSExtValue();
15478 return true;
15479 }
15480 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15481 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15482 return true;
15483 }
15484 }
15485
15486 return false;
15487}
15488
15489bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15490 uint64_t Val) const {
15491 if (Constraint.size() == 1) {
15492 switch (Constraint[0]) {
15493 case 'I':
15494 return AMDGPU::isInlinableIntLiteral(Literal: Val);
15495 case 'J':
15496 return isInt<16>(x: Val);
15497 case 'A':
15498 return checkAsmConstraintValA(Op, Val);
15499 case 'B':
15500 return isInt<32>(x: Val);
15501 case 'C':
15502 return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) ||
15503 AMDGPU::isInlinableIntLiteral(Literal: Val);
15504 default:
15505 break;
15506 }
15507 } else if (Constraint.size() == 2) {
15508 if (Constraint == "DA") {
15509 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15510 int64_t LoBits = static_cast<int32_t>(Val);
15511 return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) &&
15512 checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32);
15513 }
15514 if (Constraint == "DB") {
15515 return true;
15516 }
15517 }
15518 llvm_unreachable("Invalid asm constraint");
15519}
15520
15521bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15522 unsigned MaxSize) const {
15523 unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize);
15524 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15525 if (Size == 16) {
15526 MVT VT = Op.getSimpleValueType();
15527 switch (VT.SimpleTy) {
15528 default:
15529 return false;
15530 case MVT::i16:
15531 return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi);
15532 case MVT::f16:
15533 return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi);
15534 case MVT::bf16:
15535 return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi);
15536 case MVT::v2i16:
15537 return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value();
15538 case MVT::v2f16:
15539 return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value();
15540 case MVT::v2bf16:
15541 return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value();
15542 }
15543 }
15544 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) ||
15545 (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi)))
15546 return true;
15547 return false;
15548}
15549
15550static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15551 switch (UnalignedClassID) {
15552 case AMDGPU::VReg_64RegClassID:
15553 return AMDGPU::VReg_64_Align2RegClassID;
15554 case AMDGPU::VReg_96RegClassID:
15555 return AMDGPU::VReg_96_Align2RegClassID;
15556 case AMDGPU::VReg_128RegClassID:
15557 return AMDGPU::VReg_128_Align2RegClassID;
15558 case AMDGPU::VReg_160RegClassID:
15559 return AMDGPU::VReg_160_Align2RegClassID;
15560 case AMDGPU::VReg_192RegClassID:
15561 return AMDGPU::VReg_192_Align2RegClassID;
15562 case AMDGPU::VReg_224RegClassID:
15563 return AMDGPU::VReg_224_Align2RegClassID;
15564 case AMDGPU::VReg_256RegClassID:
15565 return AMDGPU::VReg_256_Align2RegClassID;
15566 case AMDGPU::VReg_288RegClassID:
15567 return AMDGPU::VReg_288_Align2RegClassID;
15568 case AMDGPU::VReg_320RegClassID:
15569 return AMDGPU::VReg_320_Align2RegClassID;
15570 case AMDGPU::VReg_352RegClassID:
15571 return AMDGPU::VReg_352_Align2RegClassID;
15572 case AMDGPU::VReg_384RegClassID:
15573 return AMDGPU::VReg_384_Align2RegClassID;
15574 case AMDGPU::VReg_512RegClassID:
15575 return AMDGPU::VReg_512_Align2RegClassID;
15576 case AMDGPU::VReg_1024RegClassID:
15577 return AMDGPU::VReg_1024_Align2RegClassID;
15578 case AMDGPU::AReg_64RegClassID:
15579 return AMDGPU::AReg_64_Align2RegClassID;
15580 case AMDGPU::AReg_96RegClassID:
15581 return AMDGPU::AReg_96_Align2RegClassID;
15582 case AMDGPU::AReg_128RegClassID:
15583 return AMDGPU::AReg_128_Align2RegClassID;
15584 case AMDGPU::AReg_160RegClassID:
15585 return AMDGPU::AReg_160_Align2RegClassID;
15586 case AMDGPU::AReg_192RegClassID:
15587 return AMDGPU::AReg_192_Align2RegClassID;
15588 case AMDGPU::AReg_256RegClassID:
15589 return AMDGPU::AReg_256_Align2RegClassID;
15590 case AMDGPU::AReg_512RegClassID:
15591 return AMDGPU::AReg_512_Align2RegClassID;
15592 case AMDGPU::AReg_1024RegClassID:
15593 return AMDGPU::AReg_1024_Align2RegClassID;
15594 default:
15595 return -1;
15596 }
15597}
15598
15599// Figure out which registers should be reserved for stack access. Only after
15600// the function is legalized do we know all of the non-spill stack objects or if
15601// calls are present.
15602void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15603 MachineRegisterInfo &MRI = MF.getRegInfo();
15604 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15605 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15606 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15607 const SIInstrInfo *TII = ST.getInstrInfo();
15608
15609 if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access; only
    // entry functions need to reserve the private memory registers here.
15611 reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info);
15612 }
15613
15614 // TODO: Move this logic to getReservedRegs()
15615 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15616 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15617 Register SReg = ST.isWave32()
15618 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15619 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15620 &AMDGPU::SGPR_64RegClass);
15621 Info->setSGPRForEXECCopy(SReg);
15622
15623 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15624 Info->getStackPtrOffsetReg()));
15625 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15626 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15627
15628 // We need to worry about replacing the default register with itself in case
15629 // of MIR testcases missing the MFI.
15630 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15631 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15632
15633 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15634 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15635
15636 Info->limitOccupancy(MF);
15637
15638 if (ST.isWave32() && !MF.empty()) {
15639 for (auto &MBB : MF) {
15640 for (auto &MI : MBB) {
15641 TII->fixImplicitOperands(MI);
15642 }
15643 }
15644 }
15645
15646 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15647 // classes if required. Ideally the register class constraints would differ
15648 // per-subtarget, but there's no easy way to achieve that right now. This is
15649 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15650 // from using them as the register class for legal types.
15651 if (ST.needsAlignedVGPRs()) {
15652 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15653 const Register Reg = Register::index2VirtReg(Index: I);
15654 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15655 if (!RC)
15656 continue;
15657 int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
15658 if (NewClassID != -1)
15659 MRI.setRegClass(Reg, RC: TRI->getRegClass(RCID: NewClassID));
15660 }
15661 }
15662
15663 TargetLoweringBase::finalizeLowering(MF);
15664}
15665
15666void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15667 KnownBits &Known,
15668 const APInt &DemandedElts,
15669 const SelectionDAG &DAG,
15670 unsigned Depth) const {
15671 Known.resetAll();
15672 unsigned Opc = Op.getOpcode();
15673 switch (Opc) {
15674 case ISD::INTRINSIC_WO_CHAIN: {
15675 unsigned IID = Op.getConstantOperandVal(i: 0);
15676 switch (IID) {
15677 case Intrinsic::amdgcn_mbcnt_lo:
15678 case Intrinsic::amdgcn_mbcnt_hi: {
15679 const GCNSubtarget &ST =
15680 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
      // These return at most (wavefront size - 1) + src1.
      // As long as src1 is an immediate we can compute its known bits.
15683 KnownBits Src1Known = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
15684 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15685 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15686 // Cater for potential carry
15687 MaxActiveBits += Src1ValBits ? 1 : 0;
15688 unsigned Size = Op.getValueType().getSizeInBits();
15689 if (MaxActiveBits < Size)
15690 Known.Zero.setHighBits(Size - MaxActiveBits);
15691 return;
15692 }
15693 }
15694 break;
15695 }
15696 }
15697 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
15698 Op, Known, DemandedElts, DAG, Depth);
15699}
15700
15701void SITargetLowering::computeKnownBitsForFrameIndex(
15702 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15703 TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);
15704
15705 // Set the high bits to zero based on the maximum allowed scratch size per
15706 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15707 // calculation won't overflow, so assume the sign bit is never set.
15708 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15709}
15710
15711static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
15712 KnownBits &Known, unsigned Dim) {
15713 unsigned MaxValue =
15714 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15715 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
15716}
15717
15718void SITargetLowering::computeKnownBitsForTargetInstr(
15719 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15720 const MachineRegisterInfo &MRI, unsigned Depth) const {
15721 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
15722 switch (MI->getOpcode()) {
15723 case AMDGPU::G_INTRINSIC:
15724 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15725 switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
15726 case Intrinsic::amdgcn_workitem_id_x:
15727 knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 0);
15728 break;
15729 case Intrinsic::amdgcn_workitem_id_y:
15730 knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 1);
15731 break;
15732 case Intrinsic::amdgcn_workitem_id_z:
15733 knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 2);
15734 break;
15735 case Intrinsic::amdgcn_mbcnt_lo:
15736 case Intrinsic::amdgcn_mbcnt_hi: {
15737 // These return at most the wavefront size - 1.
15738 unsigned Size = MRI.getType(Reg: R).getSizeInBits();
15739 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15740 break;
15741 }
15742 case Intrinsic::amdgcn_groupstaticsize: {
15743 // We can report everything over the maximum size as 0. We can't report
15744 // based on the actual size because we don't know if it's accurate or not
15745 // at any given point.
15746 Known.Zero.setHighBits(
15747 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15748 break;
15749 }
15750 }
15751 break;
15752 }
15753 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15754 Known.Zero.setHighBits(24);
15755 break;
15756 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15757 Known.Zero.setHighBits(16);
15758 break;
15759 case AMDGPU::G_AMDGPU_SMED3:
15760 case AMDGPU::G_AMDGPU_UMED3: {
15761 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15762
15763 KnownBits Known2;
15764 KB.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1);
15765 if (Known2.isUnknown())
15766 break;
15767
15768 KnownBits Known1;
15769 KB.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1);
15770 if (Known1.isUnknown())
15771 break;
15772
15773 KnownBits Known0;
15774 KB.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1);
15775 if (Known0.isUnknown())
15776 break;
15777
15778 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15779 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15780 Known.One = Known0.One & Known1.One & Known2.One;
15781 break;
15782 }
15783 }
15784}
15785
15786Align SITargetLowering::computeKnownAlignForTargetInstr(
15787 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
15788 unsigned Depth) const {
15789 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
15790 if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) {
15791 // FIXME: Can this move to generic code? What about the case where the call
15792 // site specifies a lower alignment?
15793 Intrinsic::ID IID = GI->getIntrinsicID();
15794 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
15795 AttributeList Attrs = Intrinsic::getAttributes(C&: Ctx, id: IID);
15796 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15797 return *RetAlign;
15798 }
15799 return Align(1);
15800}
15801
15802Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
15803 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
15804 const Align CacheLineAlign = Align(64);
15805
  // Pre-GFX10 targets did not benefit from loop alignment.
15807 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15808 getSubtarget()->hasInstFwdPrefetchBug())
15809 return PrefAlign;
15810
  // On GFX10 the I$ consists of 4 x 64 byte cache lines.
  // By default the prefetcher keeps one cache line behind and reads two
  // ahead. We can modify it with S_INST_PREFETCH for larger loops to have
  // two lines behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes.
  // If the loop fits in 64 bytes it always spans no more than two cache
  // lines and does not need alignment.
  // Otherwise, if the loop is at most 128 bytes we do not need to modify the
  // prefetch; if it is at most 192 bytes we need two lines behind.
15820
15821 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15822 const MachineBasicBlock *Header = ML->getHeader();
15823 if (Header->getAlignment() != PrefAlign)
15824 return Header->getAlignment(); // Already processed.
15825
15826 unsigned LoopSize = 0;
15827 for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume on average half of the
    // alignment size is added as nops.
15830 if (MBB != Header)
15831 LoopSize += MBB->getAlignment().value() / 2;
15832
15833 for (const MachineInstr &MI : *MBB) {
15834 LoopSize += TII->getInstSizeInBytes(MI);
15835 if (LoopSize > 192)
15836 return PrefAlign;
15837 }
15838 }
15839
15840 if (LoopSize <= 64)
15841 return PrefAlign;
15842
15843 if (LoopSize <= 128)
15844 return CacheLineAlign;
15845
  // If any of the parent loops is surrounded by prefetch instructions, do not
  // insert new ones for the inner loop, as that would reset the parent's
  // settings.
15848 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15849 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15850 auto I = Exit->getFirstNonDebugInstr();
15851 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15852 return CacheLineAlign;
15853 }
15854 }
15855
15856 MachineBasicBlock *Pre = ML->getLoopPreheader();
15857 MachineBasicBlock *Exit = ML->getExitBlock();
15858
15859 if (Pre && Exit) {
15860 auto PreTerm = Pre->getFirstTerminator();
15861 if (PreTerm == Pre->begin() ||
15862 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15863 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15864 .addImm(1); // prefetch 2 lines behind PC
15865
15866 auto ExitHead = Exit->getFirstNonDebugInstr();
15867 if (ExitHead == Exit->end() ||
15868 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15869 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15870 .addImm(2); // prefetch 1 line behind PC
15871 }
15872
15873 return CacheLineAlign;
15874}
15875
15876LLVM_ATTRIBUTE_UNUSED
15877static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15878 assert(N->getOpcode() == ISD::CopyFromReg);
15879 do {
15880 // Follow the chain until we find an INLINEASM node.
15881 N = N->getOperand(Num: 0).getNode();
15882 if (N->getOpcode() == ISD::INLINEASM ||
15883 N->getOpcode() == ISD::INLINEASM_BR)
15884 return true;
15885 } while (N->getOpcode() == ISD::CopyFromReg);
15886 return false;
15887}
15888
15889bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
15890 FunctionLoweringInfo *FLI,
15891 UniformityInfo *UA) const {
15892 switch (N->getOpcode()) {
15893 case ISD::CopyFromReg: {
15894 const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1));
15895 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15896 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15897 Register Reg = R->getReg();
15898
15899 // FIXME: Why does this need to consider isLiveIn?
15900 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15901 return !TRI->isSGPRReg(MRI, Reg);
15902
15903 if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
15904 return UA->isDivergent(V);
15905
15906 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15907 return !TRI->isSGPRReg(MRI, Reg);
15908 }
15909 case ISD::LOAD: {
15910 const LoadSDNode *L = cast<LoadSDNode>(Val: N);
15911 unsigned AS = L->getAddressSpace();
15912 // A flat load may access private memory.
15913 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
15914 }
15915 case ISD::CALLSEQ_END:
15916 return true;
15917 case ISD::INTRINSIC_WO_CHAIN:
15918 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0));
15919 case ISD::INTRINSIC_W_CHAIN:
15920 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1));
15921 case AMDGPUISD::ATOMIC_CMP_SWAP:
15922 case AMDGPUISD::ATOMIC_LOAD_FMIN:
15923 case AMDGPUISD::ATOMIC_LOAD_FMAX:
15924 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
15925 case AMDGPUISD::BUFFER_ATOMIC_ADD:
15926 case AMDGPUISD::BUFFER_ATOMIC_SUB:
15927 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
15928 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
15929 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
15930 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
15931 case AMDGPUISD::BUFFER_ATOMIC_AND:
15932 case AMDGPUISD::BUFFER_ATOMIC_OR:
15933 case AMDGPUISD::BUFFER_ATOMIC_XOR:
15934 case AMDGPUISD::BUFFER_ATOMIC_INC:
15935 case AMDGPUISD::BUFFER_ATOMIC_DEC:
15936 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
15937 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
15938 case AMDGPUISD::BUFFER_ATOMIC_FADD:
15939 case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16:
15940 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
15941 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
15942 // Target-specific read-modify-write atomics are sources of divergence.
15943 return true;
15944 default:
15945 if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
15946 // Generic read-modify-write atomics are sources of divergence.
15947 return A->readMem() && A->writeMem();
15948 }
15949 return false;
15950 }
15951}
15952
15953bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
15954 EVT VT) const {
15955 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
15956 case MVT::f32:
15957 return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
15958 case MVT::f64:
15959 case MVT::f16:
15960 return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
15961 default:
15962 return false;
15963 }
15964}
15965
15966bool SITargetLowering::denormalsEnabledForType(
15967 LLT Ty, const MachineFunction &MF) const {
15968 switch (Ty.getScalarSizeInBits()) {
15969 case 32:
15970 return !denormalModeIsFlushAllF32(MF);
15971 case 64:
15972 case 16:
15973 return !denormalModeIsFlushAllF64F16(MF);
15974 default:
15975 return false;
15976 }
15977}
15978
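// With the DX10 clamp mode enabled, AMDGPUISD::CLAMP maps NaN inputs to 0.0,
// so its result can never be NaN; otherwise it is NaN-free only if its source
// operand is.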
15979bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
15980 const SelectionDAG &DAG,
15981 bool SNaN,
15982 unsigned Depth) const {
15983 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
15984 const MachineFunction &MF = DAG.getMachineFunction();
15985 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15986
15987 if (Info->getMode().DX10Clamp)
15988 return true; // Clamped to 0.
15989 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
15990 }
15991
15992 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
15993 SNaN, Depth);
15994}
15995
15996#if 0
15997// FIXME: This should be checked before unsafe fp atomics are enabled.
15998// Global FP atomic instructions have a hardcoded FP mode and do not support
15999// FP32 denormals; they only support v2f16 denormals.
16000static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16001 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16002 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16003 if (&Flt == &APFloat::IEEEsingle())
16004 return DenormMode == DenormalMode::getPreserveSign();
16005 return DenormMode == DenormalMode::getIEEE();
16006}
16007#endif
16008
16009// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16010// floating point atomic instructions. May generate more efficient code,
16011// but may not respect rounding and denormal modes, and may give incorrect
16012// results for certain memory destinations.
16013static bool unsafeFPAtomicsDisabled(Function *F) {
16014 return F->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics").getValueAsString() !=
16015 "true";
16016}
16017
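// Build an optimization remark reporting that a hardware atomic instruction
// was selected for this atomicrmw, including the operation and memory scope.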
16018static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16019 LLVMContext &Ctx = RMW->getContext();
16020 SmallVector<StringRef> SSNs;
16021 Ctx.getSyncScopeNames(SSNs);
16022 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16023 ? "system"
16024 : SSNs[RMW->getSyncScopeID()];
16025
16026 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16027 << "Hardware instruction generated for atomic "
16028 << RMW->getOperationName(Op: RMW->getOperation())
16029 << " operation at memory scope " << MemScope;
16030}
16031
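// Decide how an atomicrmw should be lowered: keep the native instruction
// (None), rewrite it as a cmpxchg loop (CmpXChg), expand it with custom IR
// (Expand, handled by emitExpandAtomicRMW), or drop atomicity entirely for
// private memory, which other threads cannot access (NotAtomic).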
16032TargetLowering::AtomicExpansionKind
16033SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16034 unsigned AS = RMW->getPointerAddressSpace();
16035 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16036 return AtomicExpansionKind::NotAtomic;
16037
16038 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16039 OptimizationRemarkEmitter ORE(RMW->getFunction());
16040 ORE.emit(RemarkBuilder: [=]() {
16041 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16042 });
16043 return Kind;
16044 };
16045
16046 auto SSID = RMW->getSyncScopeID();
16047 bool HasSystemScope =
16048 SSID == SyncScope::System ||
16049 SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as");
16050
16051 switch (RMW->getOperation()) {
16052 case AtomicRMWInst::Sub:
16053 case AtomicRMWInst::Or:
16054 case AtomicRMWInst::Xor: {
16055 // Atomic sub/or/xor do not work over PCI express, but atomic add
16056 // does. InstCombine transforms these with 0 to or, so undo that.
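    // For example, "atomicrmw or ptr %p, i32 0 seq_cst" is turned back into
    // "atomicrmw add ptr %p, i32 0 seq_cst" by emitExpandAtomicRMW().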
16057 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16058 if (Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
16059 ConstVal && ConstVal->isNullValue())
16060 return AtomicExpansionKind::Expand;
16061 }
16062
16063 break;
16064 }
16065 case AtomicRMWInst::FAdd: {
16066 Type *Ty = RMW->getType();
16067
16068 // TODO: Handle REGION_ADDRESS
16069 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16070 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16071 // is fixed to round-to-nearest-even.
16072 //
16073 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16074 // round-to-nearest-even.
16075 //
16076 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16077 // suggests it is OK if the floating-point mode may not match the calling
16078 // thread.
16079 if (Ty->isFloatTy()) {
16080 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16081 : AtomicExpansionKind::CmpXChg;
16082 }
16083
16084 if (Ty->isDoubleTy()) {
16085 // Ignores denormal mode, but we don't consider flushing mandatory.
16086 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16087 : AtomicExpansionKind::CmpXChg;
16088 }
16089
16090 // TODO: Handle v2f16/v2bf16 cases for gfx940
16091 return AtomicExpansionKind::CmpXChg;
16092 }
16093
16094 if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16095 AS != AMDGPUAS::BUFFER_FAT_POINTER)
16096 return AtomicExpansionKind::CmpXChg;
16097
16098 // TODO: gfx940 supports v2f16 and v2bf16
16099 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16100 return AtomicExpansionKind::None;
16101
16102 if (unsafeFPAtomicsDisabled(F: RMW->getFunction()))
16103 return AtomicExpansionKind::CmpXChg;
16104
16105 // Always expand system scope fp atomics.
16106 if (HasSystemScope)
16107 return AtomicExpansionKind::CmpXChg;
16108
16109 // global and flat atomic fadd f64: gfx90a, gfx940.
16110 if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
16111 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16112
16113 if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16114 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16115 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16116 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16117 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16118 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16119 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16120 }
16121
16122 // flat atomic fadd f32: gfx940, gfx11+.
16123 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16124 if (Subtarget->hasFlatAtomicFaddF32Inst())
16125 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16126
16127 // If the pointer is in the flat address space and the type is float, try
16128 // to expand the operation when the target supports both global and LDS
16129 // atomic fadd. The expansion emits a runtime address space check: a global
16130 // address uses the global atomic fadd, and a shared address uses the LDS
16131 // atomic fadd.
16132 if (Subtarget->hasLDSFPAtomicAddF32()) {
16133 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16134 return AtomicExpansionKind::Expand;
16135 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16136 return AtomicExpansionKind::Expand;
16137 }
16138 }
16139
16140 return AtomicExpansionKind::CmpXChg;
16141 }
16142 case AtomicRMWInst::FMin:
16143 case AtomicRMWInst::FMax:
16144 case AtomicRMWInst::Min:
16145 case AtomicRMWInst::Max:
16146 case AtomicRMWInst::UMin:
16147 case AtomicRMWInst::UMax: {
16148 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16149 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16150 if (RMW->getType()->isFloatTy() &&
16151 unsafeFPAtomicsDisabled(F: RMW->getFunction()))
16152 return AtomicExpansionKind::CmpXChg;
16153
16154 // Always expand system scope min/max atomics.
16155 if (HasSystemScope)
16156 return AtomicExpansionKind::CmpXChg;
16157 }
16158 break;
16159 }
16160 default:
16161 break;
16162 }
16163
16164 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
16165}
16166
16167TargetLowering::AtomicExpansionKind
16168SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16169 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16170 ? AtomicExpansionKind::NotAtomic
16171 : AtomicExpansionKind::None;
16172}
16173
16174TargetLowering::AtomicExpansionKind
16175SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16176 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16177 ? AtomicExpansionKind::NotAtomic
16178 : AtomicExpansionKind::None;
16179}
16180
16181TargetLowering::AtomicExpansionKind
16182SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16183 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16184 ? AtomicExpansionKind::NotAtomic
16185 : AtomicExpansionKind::None;
16186}
16187
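// Pick a register class for values of type VT: uniform values are mapped to
// SGPR classes and divergent values to VGPR classes. A uniform VReg_1 (lane
// mask) becomes SReg_64 or SReg_32 depending on the wavefront size.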
16188const TargetRegisterClass *
16189SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16190 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false);
16191 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16192 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16193 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16194 : &AMDGPU::SReg_32RegClass;
16195 if (!TRI->isSGPRClass(RC) && !isDivergent)
16196 return TRI->getEquivalentSGPRClass(VRC: RC);
16197 else if (TRI->isSGPRClass(RC) && isDivergent)
16198 return TRI->getEquivalentVGPRClass(SRC: RC);
16199
16200 return RC;
16201}
16202
16203// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16204// uniform values (as produced by the mask results of control flow intrinsics)
16205// used outside of divergent blocks. The phi users need to also be treated as
16206// always uniform.
16207//
16208// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16209static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16210 unsigned WaveSize) {
16211 // FIXME: We assume the mask results of a control flow intrinsic are never
16212 // cast to another type.
16213 // As a compile-time shortcut, exit early if the type cannot be a wave mask.
16214 IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
16215 if (!IT || IT->getBitWidth() != WaveSize)
16216 return false;
16217
16218 if (!isa<Instruction>(Val: V))
16219 return false;
16220 if (!Visited.insert(Ptr: V).second)
16221 return false;
16222 bool Result = false;
16223 for (const auto *U : V->users()) {
16224 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
16225 if (V == U->getOperand(i: 1)) {
16226 switch (Intrinsic->getIntrinsicID()) {
16227 default:
16228 Result = false;
16229 break;
16230 case Intrinsic::amdgcn_if_break:
16231 case Intrinsic::amdgcn_if:
16232 case Intrinsic::amdgcn_else:
16233 Result = true;
16234 break;
16235 }
16236 }
16237 if (V == U->getOperand(i: 0)) {
16238 switch (Intrinsic->getIntrinsicID()) {
16239 default:
16240 Result = false;
16241 break;
16242 case Intrinsic::amdgcn_end_cf:
16243 case Intrinsic::amdgcn_loop:
16244 Result = true;
16245 break;
16246 }
16247 }
16248 } else {
16249 Result = hasCFUser(V: U, Visited, WaveSize);
16250 }
16251 if (Result)
16252 break;
16253 }
16254 return Result;
16255}
16256
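// Return true if \p V must live in a uniform (SGPR) register: inline asm
// calls with an SGPR output constraint, and wave-mask sized values that feed
// the control flow intrinsics (amdgcn.if/else/if.break/loop/end.cf).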
16257bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16258 const Value *V) const {
16259 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
16260 if (CI->isInlineAsm()) {
16261 // FIXME: This cannot give a correct answer. This should only trigger in
16262 // the case where inline asm returns mixed SGPR and VGPR results, used
16263 // outside the defining block. We don't have a specific result to
16264 // consider, so this assumes if any value is SGPR, the overall register
16265 // also needs to be SGPR.
16266 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16267 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16268 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16269 for (auto &TC : TargetConstraints) {
16270 if (TC.Type == InlineAsm::isOutput) {
16271 ComputeConstraintToUse(TC, SDValue());
16272 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
16273 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16274 if (RC && SIRI->isSGPRClass(RC))
16275 return true;
16276 }
16277 }
16278 }
16279 }
16280 SmallPtrSet<const Value *, 16> Visited;
16281 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16282}
16283
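// Return true if any user of \p N is a memory node that uses N as its base
// pointer operand.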
16284bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16285 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16286 for (; I != E; ++I) {
16287 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: *I)) {
16288 if (getBasePtrIndex(N: M) == I.getOperandNo())
16289 return true;
16290 }
16291 }
16292 return false;
16293}
16294
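// Reassociation is profitable only if N0 has a single use, and either the
// reassociation cannot make a uniform N0 divergent or (N0 + constant) is
// likely to fold into the addressing mode of N0's memory user.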
16295bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16296 SDValue N1) const {
16297 if (!N0.hasOneUse())
16298 return false;
16299 // Allow reassociation when it cannot make a uniform N0 divergent.
16300 if (N0->isDivergent() || !N1->isDivergent())
16301 return true;
16302 // Check whether there is a good chance of forming a (base + constant
16303 // offset) memory access pattern with a memory user of N0.
16304 return (DAG.isBaseWithConstantOffset(Op: N0) &&
16305 hasMemSDNodeUser(N: *N0->use_begin()));
16306}
16307
16308bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16309 Register N0, Register N1) const {
16310 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
16311}
16312
16313MachineMemOperand::Flags
16314SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16315 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16316 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16317 if (I.getMetadata(Kind: "amdgpu.noclobber"))
16318 Flags |= MONoClobber;
16319 if (I.getMetadata(Kind: "amdgpu.last.use"))
16320 Flags |= MOLastUse;
16321 return Flags;
16322}
16323
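// Report a physical register dependency when a compare that implicitly
// defines SCC feeds an i1 CopyToReg, so scheduling can account for the cost
// of copying SCC.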
16324bool SITargetLowering::checkForPhysRegDependency(
16325 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16326 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16327 if (User->getOpcode() != ISD::CopyToReg)
16328 return false;
16329 if (!Def->isMachineOpcode())
16330 return false;
16331 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Val: Def);
16332 if (!MDef)
16333 return false;
16334
16335 unsigned ResNo = User->getOperand(Num: Op).getResNo();
16336 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16337 return false;
16338 const MCInstrDesc &II = TII->get(Opcode: MDef->getMachineOpcode());
16339 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16340 PhysReg = AMDGPU::SCC;
16341 const TargetRegisterClass *RC =
16342 TRI->getMinimalPhysRegClass(Reg: PhysReg, VT: Def->getSimpleValueType(ResNo));
16343 Cost = RC->getCopyCost();
16344 return true;
16345 }
16346 return false;
16347}
16348
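// Perform the custom IR expansion requested by returning
// AtomicExpansionKind::Expand from shouldExpandAtomicRMWInIR.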
16349void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16350 AtomicRMWInst::BinOp Op = AI->getOperation();
16351
16352 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16353 Op == AtomicRMWInst::Xor) {
16354 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16355 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16356 "this cannot be replaced with add");
16357 AI->setOperation(AtomicRMWInst::Add);
16358 return;
16359 }
16360
16361 assert(Subtarget->hasAtomicFaddInsts() &&
16362 "target should have atomic fadd instructions");
16363 assert(AI->getType()->isFloatTy() &&
16364 AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16365 "generic atomicrmw expansion only supports FP32 operand in flat "
16366 "address space");
16367 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16368
16369 // Given: atomicrmw fadd ptr %addr, float %val ordering
16370 //
16371 // With this expansion we produce the following code:
16372 // [...]
16373 // br label %atomicrmw.check.shared
16374 //
16375 // atomicrmw.check.shared:
16376 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16377 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16378 //
16379 // atomicrmw.shared:
16380 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16381 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16382 // float %val ordering
16383 // br label %atomicrmw.phi
16384 //
16385 // atomicrmw.check.private:
16386 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16387 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16388 //
16389 // atomicrmw.private:
16390 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16391 // %loaded.private = load float, ptr addrspace(5) %cast.private
16392 // %val.new = fadd float %loaded.private, %val
16393 // store float %val.new, ptr addrspace(5) %cast.private
16394 // br label %atomicrmw.phi
16395 //
16396 // atomicrmw.global:
16397 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16398 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16399 // float %val ordering
16400 // br label %atomicrmw.phi
16401 //
16402 // atomicrmw.phi:
16403 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16404 // [ %loaded.private, %atomicrmw.private ],
16405 // [ %loaded.global, %atomicrmw.global ]
16406 // br label %atomicrmw.end
16407 //
16408 // atomicrmw.end:
16409 // [...]
16410
16411 IRBuilder<> Builder(AI);
16412 LLVMContext &Ctx = Builder.getContext();
16413
16414 BasicBlock *BB = Builder.GetInsertBlock();
16415 Function *F = BB->getParent();
16416 BasicBlock *ExitBB =
16417 BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
16418 BasicBlock *CheckSharedBB =
16419 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.shared", Parent: F, InsertBefore: ExitBB);
16420 BasicBlock *SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared", Parent: F, InsertBefore: ExitBB);
16421 BasicBlock *CheckPrivateBB =
16422 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private", Parent: F, InsertBefore: ExitBB);
16423 BasicBlock *PrivateBB =
16424 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private", Parent: F, InsertBefore: ExitBB);
16425 BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global", Parent: F, InsertBefore: ExitBB);
16426 BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi", Parent: F, InsertBefore: ExitBB);
16427
16428 Value *Val = AI->getValOperand();
16429 Type *ValTy = Val->getType();
16430 Value *Addr = AI->getPointerOperand();
16431
16432 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16433 Value *Val) -> Value * {
16434 AtomicRMWInst *OldVal =
16435 Builder.CreateAtomicRMW(Op: AI->getOperation(), Ptr: Addr, Val, Align: AI->getAlign(),
16436 Ordering: AI->getOrdering(), SSID: AI->getSyncScopeID());
16437 SmallVector<std::pair<unsigned, MDNode *>> MDs;
16438 AI->getAllMetadata(MDs);
16439 for (auto &P : MDs)
16440 OldVal->setMetadata(KindID: P.first, Node: P.second);
16441 return OldVal;
16442 };
16443
16444 std::prev(x: BB->end())->eraseFromParent();
16445 Builder.SetInsertPoint(BB);
16446 Builder.CreateBr(Dest: CheckSharedBB);
16447
16448 Builder.SetInsertPoint(CheckSharedBB);
16449 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16450 {Addr}, nullptr, "is.shared");
16451 Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);
16452
16453 Builder.SetInsertPoint(SharedBB);
16454 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16455 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));
16456 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16457 Builder.CreateBr(Dest: PhiBB);
16458
16459 Builder.SetInsertPoint(CheckPrivateBB);
16460 CallInst *IsPrivate = Builder.CreateIntrinsic(
16461 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16462 Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);
16463
16464 Builder.SetInsertPoint(PrivateBB);
16465 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16466 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));
16467 Value *LoadedPrivate =
16468 Builder.CreateLoad(Ty: ValTy, Ptr: CastToPrivate, Name: "loaded.private");
16469 Value *NewVal = Builder.CreateFAdd(L: LoadedPrivate, R: Val, Name: "val.new");
16470 Builder.CreateStore(Val: NewVal, Ptr: CastToPrivate);
16471 Builder.CreateBr(Dest: PhiBB);
16472
16473 Builder.SetInsertPoint(GlobalBB);
16474 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16475 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
16476 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16477 Builder.CreateBr(Dest: PhiBB);
16478
16479 Builder.SetInsertPoint(PhiBB);
16480 PHINode *Loaded = Builder.CreatePHI(Ty: ValTy, NumReservedValues: 3, Name: "loaded.phi");
16481 Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
16482 Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
16483 Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
16484 Builder.CreateBr(Dest: ExitBB);
16485
16486 AI->replaceAllUsesWith(V: Loaded);
16487 AI->eraseFromParent();
16488}
16489
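// Replace an idempotent atomicrmw (one whose stored value equals the loaded
// value, e.g. "atomicrmw or ptr %p, i32 0") with an equivalent atomic load,
// unless the ordering has release semantics.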
16490LoadInst *
16491SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16492 IRBuilder<> Builder(AI);
16493 auto Order = AI->getOrdering();
16494
16495 // The optimization removes the store aspect of the atomicrmw, so the cache
16496 // must be flushed if the atomic ordering has release semantics. That flush
16497 // is not necessarily a fence, but a release fence happens to perform it, so
16498 // avoid replacing an atomicrmw that has release semantics.
16499 if (isReleaseOrStronger(AO: Order))
16500 return nullptr;
16501
16502 LoadInst *LI = Builder.CreateAlignedLoad(
16503 Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign());
16504 LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID());
16505 LI->copyMetadata(SrcInst: *AI);
16506 LI->takeName(V: AI);
16507 AI->replaceAllUsesWith(V: LI);
16508 AI->eraseFromParent();
16509 return LI;
16510}
16511
